# Recovered from git patch a4b27d083507497eb0595f12a61d1e983cf992da
# ("Add files via upload"), which creates
# recipes/distilqwen_series/distillqwen2.5-thoughtX/filter.py.
"""Filter a scored JSON dataset down to instruction/output pairs."""

import json


def _select_records(data, rv_condition, cd_condition):
    """Return instruction/output pairs for the records that pass every filter.

    A record is kept when its ``logical_correctness`` equals 1 and both
    predicates accept its ``reasoning_verbosity`` and ``cognitive_difficulty``
    values. A missing score key is passed to the checks as 0.
    """
    return [
        # Only these two fields are carried over; score fields are dropped.
        {'instruction': item['instruction'], 'output': item['output']}
        for item in data
        if item.get('logical_correctness', 0) == 1
        and rv_condition(item.get('reasoning_verbosity', 0))
        and cd_condition(item.get('cognitive_difficulty', 0))
    ]


def filter_dataset(input_file, output_file, rv_condition, cd_condition):
    """Filter the records in ``input_file`` and write the result to ``output_file``.

    Args:
        input_file: Path of a UTF-8 JSON file; its top-level value is iterated
            and every element is read with ``.get`` / ``[...]``.
        output_file: Path the filtered records are written to as indented,
            non-ASCII-escaped JSON.
        rv_condition: Callable taking the ``reasoning_verbosity`` value and
            returning a truthy value to keep the record.
        cd_condition: Callable taking the ``cognitive_difficulty`` value and
            returning a truthy value to keep the record.

    Raises:
        KeyError: If a record that passes all filters has no ``instruction``
            or ``output`` key. Selection finishes before the output file is
            opened, so nothing is written in that case.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_data = _select_records(data, rv_condition, cd_condition)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)

    print(f"筛选完成,共找到{len(filtered_data)}条符合条件的记录,已保存到{output_file}")


def main():
    """Run the filter with the thresholds and file names fixed by this script."""

    def rv_condition(score):
        # Keep reasoning-verbosity scores from 3 to 5 inclusive.
        return 3 <= score <= 5

    def cd_condition(score):
        # Keep only cognitive-difficulty scores equal to 4.
        return score == 4

    filter_dataset(
        input_file='input.json',
        output_file='filtered_output.json',
        rv_condition=rv_condition,
        cd_condition=cd_condition,
    )


if __name__ == "__main__":
    main()