2025-07-24 11:41:15 +08:00
|
|
|
{
|
|
|
|
"job_type": "mmkd_white_box",
|
|
|
|
"dataset": {
|
2025-07-24 11:42:31 +08:00
|
|
|
"instruction_path": "data/mllm_demo_reformat.json",
|
|
|
|
"labeled_path": "data/mllm_demo_distill.json",
|
2025-07-24 11:41:15 +08:00
|
|
|
"logits_path": "./logits.json",
|
|
|
|
"seed": 42
|
|
|
|
},
|
|
|
|
"inference": {
|
|
|
|
"enable_chunked_prefill": true,
|
|
|
|
"seed": 777,
|
|
|
|
"gpu_memory_utilization": 0.9,
|
|
|
|
"temperature": 0.8,
|
|
|
|
"trust_remote_code": true,
|
|
|
|
"enforce_eager": false,
|
|
|
|
"max_model_len": 4096,
|
|
|
|
"max_new_tokens": 512,
|
|
|
|
"top_logits_num": 10
|
|
|
|
},
|
|
|
|
"distillation": {
|
|
|
|
"kd_ratio": 0.1,
|
|
|
|
"max_seq_length": 512,
|
|
|
|
"distillation_type": "forward_kld"
|
|
|
|
},
|
|
|
|
"models": {
|
2025-07-24 11:42:31 +08:00
|
|
|
"teacher": "Qwen/Qwen2.5-VL-72B-Instruct",
|
|
|
|
"student": "Qwen/Qwen2.5-VL-3B-Instruct"
|
2025-07-24 11:41:15 +08:00
|
|
|
},
|
|
|
|
"training": {
|
|
|
|
"output_dir": "./result/",
|
|
|
|
"num_train_epochs": 30,
|
|
|
|
"per_device_train_batch_size": 1,
|
|
|
|
"gradient_accumulation_steps": 8,
|
|
|
|
"max_length": 512,
|
|
|
|
"save_steps": 1000,
|
|
|
|
"logging_steps": 1,
|
|
|
|
"learning_rate": 2e-5,
|
|
|
|
"weight_decay": 0.05,
|
|
|
|
"warmup_ratio": 0.1,
|
|
|
|
"lr_scheduler_type": "cosine"
|
|
|
|
}
|
2025-07-24 11:42:31 +08:00
|
|
|
}
|