distillation/configs/rl_ppo.json

{
  "job_type": "rl_ppo",
  "dataset": {
    "instruction_path": "sample.json",
    "template": "chat_template_kd.jinja",
    "train_ratio": 0.7,
    "seed": 42
  },
  "models": {
    "reward": "reward/",
    "student": "Qwen/Qwen2.5-0.5B-Instruct"
  },
  "training": {
    "output_dir": "./result/",
    "total_episodes": 1000,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "save_steps": 100,
    "logging_steps": 1,
    "learning_rate": 2e-5,
    "weight_decay": 0.05,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
    "missing_eos_penalty": 1.0,
    "stop_token": "eos",
    "response_length": 512
  }
}