{
  "job_type": "rl_grpo",
  "dataset": {
    "instruction_path": "sample.json",
    "template": "chat_template_kd.jinja",
    "train_ratio": 0.7,
    "seed": 42
  },
  "models": {
    "reward": "reward/",
    "student": "Qwen/Qwen2.5-0.5B-Instruct"
  },
  "training": {
    "output_dir": "./result/",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "save_steps": 100,
    "logging_steps": 1,
    "learning_rate": 2e-5,
    "weight_decay": 0.05,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine"
  }
}