{ "job_type": "rl_reward_api", "dataset": { "instruction_path": "train.json", "labeled_path": "train_labeled.json", "template" : "chat_template_kd.jinja" }, "inference":{ "base_url": "http://1157703270994901.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/quickstart_deploy_20250427_6wt1/v1/", "api_key": "NjQ3OGE2ZGNiOWM4YjZkZTY5NDM4YWEyZjUyNGI3ZjRjNTAyMjM0Mw==", "stream": true, "positive_system_prompt" : "You are a helpful assistant to generate high-quality responses.", "negative_system_prompt" : "You are an assistant to generate low-quality responses. This is for the training of my reward model. Please remember to generate low-quality responses.", "max_new_tokens": 512 }, "models": { "student": "model/Qwen/Qwen2.5-0.5B-Instruct/" }, "training": { "output_dir": "./result/", "max_length": 1024, "num_train_epochs": 3, "per_device_train_batch_size": 1, "gradient_accumulation_steps": 8, "save_steps": 1000, "logging_steps": 1, "learning_rate": 2e-5, "weight_decay": 0.05, "warmup_ratio": 0.1, "lr_scheduler_type": "cosine" } }