This commit is contained in:
echo840
2023-05-23 18:24:16 +08:00
parent da758a9ca7
commit b388fba03e
470 changed files with 2523750 additions and 7307 deletions

View File

@@ -0,0 +1,58 @@
import argparse
import json
import pathlib
# Prompt templates taken from Stanford Alpaca's training script.
# "prompt_input" is filled for examples that carry a non-empty "input" field,
# "prompt_no_input" for all other examples.
PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
)
def main(args):
    """Convert an Alpaca-style JSON dataset into the conversation format.

    Each input example is a dict with an ``instruction`` key, an ``output``
    key and an optional ``input`` key. It becomes one record with a string
    ``id`` (numbered from 1) and a two-turn ``conversations`` list: the
    formatted prompt from ``human`` and the example output from ``gpt``.

    Args:
        args: Parsed command-line arguments providing ``data_path`` (JSON file
            to read) and ``output_path`` (JSON file to write).

    Raises:
        KeyError: If an example lacks ``instruction`` or ``output``.
    """
    data_path = pathlib.Path(args.data_path)
    with data_path.open(encoding="utf-8") as f:
        data = json.load(f)

    prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]

    new_data = []
    for cnt, example in enumerate(data, start=1):
        # Examples without a (non-empty) "input" use the shorter template.
        template = prompt_input if example.get("input", "") != "" else prompt_no_input
        new_data.append({
            'id': str(cnt),
            'conversations': [
                {
                    'from': 'human',
                    'value': template.format_map(example),
                },
                {
                    'from': 'gpt',
                    'value': example['output'],
                },
            ],
        })

    # Use a context manager so the output is flushed and closed deterministically.
    with open(args.output_path, 'w', encoding="utf-8") as f:
        json.dump(new_data, f, indent=2)
if __name__ == '__main__':
    # Command-line entry point: both paths are optional and fall back to the
    # default Alpaca file names.
    cli = argparse.ArgumentParser()
    cli.add_argument('--data_path', type=str, default='alpaca-data.json')
    cli.add_argument('--output_path', type=str, default='alpaca-data-conversation.json')
    main(cli.parse_args())

View File

@@ -0,0 +1,195 @@
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.
Usage:
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import logging
import re
from typing import Dict, Union
import bs4
import markdownify # == 0.11.6
from tqdm import tqdm
# Opening <div ...> and <span ...> tags (non-greedy up to the first ">").
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")
# A fenced block of the form ```<language>Copy code<code>```.
# Group 1 captures the text before "Copy code", group 2 the code itself.
# Raw strings are used so that "\s" is not treated as an (invalid) string escape.
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template for code_lang_pattern. The backslashes of the group
# references are escaped explicitly; "\n" intentionally stays a real newline.
code_lang_format = "```\\g<1>\n\\g<2>\n```"
# "[number] / [number]" text.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "Copy[number] chars / [number] words" text.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# A fenced block that ends with "Copy code" and only whitespace before the closing fence.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
def reformat_code(val: str) -> str:
    """Rewrite scraped code blocks into regular fenced markdown.

    The incoming code blocks look like::

        ```
        $<language>Copy code$<exact_code_here>

        ```

    Every such block is replaced according to ``code_lang_format``: the
    language goes right after the opening fence and the code on its own lines.
    """
    return code_lang_pattern.sub(code_lang_format, val)
def html_to_markdown(val: str) -> str:
    """Convert one HTML message to markdown and strip known scraping noise."""
    # Drop all opening <div> tags; needed for indentation to work in code blocks.
    val = div_pattern.sub("", val)
    # Drop all opening <span> tags; needed for underscores to work in code blocks.
    val = span_pattern.sub("", val)

    # HTML to markdown.
    val = markdownify.markdownify(val).strip()

    # Bring code blocks into the regular fenced format.
    val = reformat_code(val)

    # Cut off a noisy "[number] / [number]" only when it sits at the very beginning.
    leading_noise = regenerate_pattern.match(val)
    if leading_noise:
        val = val[leading_noise.end():]

    # Remove noisy "Copy[number] chars / [number] words".
    val = copy_chars_pattern.sub("", val)
    # Remove empty code blocks of the form ```\nCopy code\n```.
    val = copy_code_pattern.sub("", val)

    # Replace each triple newline with a single one, then strip the ends.
    return val.replace("\n\n\n", "\n").strip()
def contain_blocked_words(val: str) -> bool:
    """Return True if *val* contains any blocked word, ignoring case.

    Args:
        val: Text to check.

    Returns:
        True when "openai" or "chatgpt" occurs anywhere in the lower-cased
        text, False otherwise (including for the empty string).
    """
    blocked_words = ("openai", "chatgpt")
    # Lower-case once instead of once per blocked word.
    lowered = val.lower()
    return any(w in lowered for w in blocked_words)
def clean_html_one_sample(sample):
    """Clean one conversation sample in place and report how it went.

    Returns:
        A ``(sample, error_code)`` tuple where ``error_code`` is
        0 (cleaned), 1 (at most one turn left), 2 (turns do not alternate
        human/gpt), 3 (a turn contains blocked words) or 4 (the HTML of a
        turn could not be converted).
    """
    expected_roles = ("human", "gpt")

    if len(sample["conversations"]) < 2:
        return sample, 1

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4:
    # drop a leading turn that is not from the human.
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if len(sample["conversations"]) < 2:
        return sample, 1

    # Drop a trailing human turn.
    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if len(sample["conversations"]) < 2:
        return sample, 1

    for turn_idx, turn in enumerate(sample["conversations"]):
        if turn["from"] != expected_roles[turn_idx % 2]:
            return sample, 2

        if contain_blocked_words(turn["value"]):
            return sample, 3

        try:
            markdown = html_to_markdown(turn["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return sample, 4
        turn["value"] = markdown

    return sample, 0
def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Every sample in ``content[begin:end]`` is cleaned with
    ``clean_html_one_sample`` in a process pool. Samples that come back with a
    non-zero error code are dropped, and so are duplicates: a sample is a
    duplicate when its id was already kept, or when the pair (value of its
    second turn, number of turns) matches a sample that was already kept.

    Args:
        content: List of samples, each with an ``id`` and ``conversations``.
        begin: Start index of the slice to process (``None`` for the start).
        end: End index of the slice to process (``None`` for the end).

    Returns:
        The list of cleaned samples that were kept, in input order.

    Raises:
        ValueError: If a sample comes back with an unknown error code.
    """
    content = content[begin:end]

    with ProcessPoolExecutor() as executor:
        processed = list(
            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
        )

    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0

    # Maps both ids and (second-turn value, turn count) keys to the id that
    # was kept first.
    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
            continue

        if cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            continue

        # Built once per sample; a sample with error code 0 has at least two turns.
        key = (sample["conversations"][1]["value"], len(sample["conversations"]))
        if key in visited:
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
            continue

        visited[cid] = visited[key] = cid
        new_content.append(sample)

    # Every processed sample was either kept or skipped.
    cnt_skip = len(content) - len(new_content)

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content
def main(args):
    """Load the raw JSON, clean it and write the result.

    Args:
        args: Dict of command-line options with the keys ``in_file``,
            ``out_file``, ``begin`` and ``end``.
    """
    # Context managers make sure both files are closed (and the output flushed).
    with open(args["in_file"], "r", encoding="utf-8") as fin:
        content = json.load(fin)
    content = clean_html_all(content, args["begin"], args["end"])
    with open(args["out_file"], "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2)
if __name__ == "__main__":
    # Command-line entry point; main() receives the options as a plain dict.
    cli = argparse.ArgumentParser()
    cli.add_argument("--in-file", type=str, required=True)
    cli.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    cli.add_argument("--begin", type=int)
    cli.add_argument("--end", type=int)
    cli.add_argument("--debug", action="store_true")
    main(vars(cli.parse_args()))

View File

@@ -0,0 +1,23 @@
"""
Usage:
python3 -m fastchat.data.inspect --in sharegpt_20230322_clean_lang_split.json
"""
import argparse
import json
import tqdm
# Script entry point: print the samples of a JSON dataset to the terminal for manual review.
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str, required=True)
# Optional start offset; when omitted args.begin is None and the slice below starts at the first sample.
parser.add_argument("--begin", type=int)
args = parser.parse_args()
# NOTE(review): the handle returned by open() is never closed explicitly.
content = json.load(open(args.in_file, "r"))
for sample in tqdm.tqdm(content[args.begin:]):
print(f"id: {sample['id']}")
for conv in sample["conversations"]:
print(conv["from"] + ": ")
print(conv["value"])
# Block until the user presses Enter before printing more.
# NOTE(review): indentation is not visible here, so it is unclear whether this pause runs per message or per sample — confirm.
input()

View File

@@ -0,0 +1,80 @@
"""
Usage:
python3 -m fastchat.data.optional_clean --lang en --reduce-rep --in sharegpt_clean.json --out output.json
python3 -m fastchat.data.optional_clean --skip-lang en --reduce-rep --in sharegpt_clean.json --out output.json
"""
import argparse
import json
import re
import polyglot
from polyglot.detect import Detector
import pycld2
from tqdm import tqdm
def skip(conv, args):
    """Tell whether a conversation should be filtered out.

    Args:
        conv: Sample with a ``conversations`` list of turns (each has a ``value``).
        args: Options with ``lang``, ``skip_lang`` and ``reduce_rep``.

    Returns:
        True if the conversation is rejected by the language filter or by the
        repeated-digit filter, False otherwise.
    """
    # Language filter: only runs the detector when one of the options asks for it.
    language_filter_on = args.lang != "all" or args.skip_lang is not None
    if language_filter_on:
        text = "\n".join([turn["value"] for turn in conv["conversations"]])
        try:
            lang_code = Detector(text).language.code
        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
            lang_code = "unknown"

        wrong_language = args.lang != "all" and lang_code != args.lang
        if wrong_language or lang_code == args.skip_lang:
            return True

    # Repetition filter: reject when any turn holds the same digit nine times in a row.
    if args.reduce_rep:
        return any(
            re.search(r"(\d)\1{8}", turn["value"]) is not None
            for turn in conv["conversations"]
        )

    return False
if __name__ == "__main__":
    # Command-line entry point: filter a cleaned dataset by language and/or
    # repeated digits and write the surviving conversations to a JSON file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="")
    parser.add_argument("--lang", type=str, default="all",
                        choices=["all", "en"])
    parser.add_argument("--skip-lang", type=str)
    # NOTE: Be careful about reduce_rep which may remove some good data.
    # For example, addresses could have long consecutive 0's
    parser.add_argument("--reduce-rep", action="store_true")
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    lang = args.lang
    skip_lang = args.skip_lang
    reduce_rep = args.reduce_rep

    # --lang and --skip-lang are mutually exclusive. Report it as a usage
    # error rather than with assert, which is stripped under "python -O".
    if lang != "all" and skip_lang is not None:
        parser.error("--skip-lang cannot be combined with a --lang other than 'all'")

    # Without an explicit output name, derive one from the active filters.
    if out_file == "":
        out_file = "sharegpt_clean"
        if lang != "all":
            out_file += "_" + lang
        if skip_lang is not None:
            out_file += "_skip_" + skip_lang
        if reduce_rep:
            out_file += "_reduce_rep"
        out_file += ".json"

    with open(in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    new_content = [conv for conv in tqdm(content) if not skip(conv, args)]

    print(f"return {len(new_content)} out of {len(content)}, start dump ...")
    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(new_content, fout, indent=2)

View File

@@ -0,0 +1,20 @@
"""
Usage:
python3 pretty_json.py --in in.json --out out.json
"""
import argparse
import json
if __name__ == "__main__":
    # Command-line entry point: re-serialize a JSON file with 2-space indentation.
    cli = argparse.ArgumentParser()
    cli.add_argument("--in-file", type=str, required=True)
    cli.add_argument("--out-file", type=str, required=True)
    opts = cli.parse_args()

    with open(opts.in_file, "r") as src:
        payload = json.loads(src.read())

    with open(opts.out_file, "w") as dst:
        dst.write(json.dumps(payload, indent=2))

View File

@@ -0,0 +1,99 @@
"""
Split long conversations based on certain max length.
Usage: python3 -m fastchat.data.split_long_conversation \
--in sharegpt_clean.json \
--out sharegpt_split.json \
--model-name-or-path $<model-name>
"""
import argparse
import json
from typing import Dict, Sequence, Optional
import transformers
import tqdm
from llava import conversation as conversation_lib
# Pad token that main() registers on the tokenizer when it does not define one.
DEFAULT_PAD_TOKEN = "[PAD]"
# Prefix and suffix that split_contents() puts around each "<role>: <text>"
# sentence before measuring its token length.
BEGIN_SIGNAL = "### "
END_SIGNAL = "\n"
def split_sample(sample, start_idx, end_idx):
    """Cut one chunk out of a conversation sample.

    Args:
        sample: Dict with an ``id`` and a ``conversations`` list of turns.
        start_idx: Index of the first turn of the chunk.
        end_idx: Index of the turn at which the chunk ends. That turn is
            included unless it comes from ``"human"``.

    Returns:
        A new dict whose ``id`` is ``"<original id>_<start_idx>"`` and whose
        ``conversations`` is the selected slice of turns.
    """
    turns = sample["conversations"]
    # only ends in the bot because otherwise the last human part is useless.
    if turns[end_idx]["from"] != "human":
        end_idx += 1
    return {
        # f-string instead of "+" so that non-string ids (e.g. integers) work too.
        "id": f"{sample['id']}_{start_idx}",
        "conversations": turns[start_idx:end_idx],
    }
def split_contents(content, begin, end, tokenizer, max_length):
"""
Keep the maximum round of conversations within the max token length constraint

Only the samples in content[begin:end] are processed. For each sample the
token length of every turn is measured with `tokenizer`, and the sample is
cut into chunks via split_sample() whenever adding the next turn would push
the running token count above `max_length`.

Returns the list of chunks produced by split_sample().
"""
content = content[begin:end]
new_content = []
for sample in tqdm.tqdm(content):
# Token length of each turn, in conversation order.
tokenized_lens = []
for c in sample["conversations"]:
# Map the dataset's speaker tag to the role name of the default conversation template.
from_str = c["from"]
if from_str.lower() == "human":
from_str = conversation_lib.default_conversation.roles[0]
elif from_str.lower() == "gpt":
from_str = conversation_lib.default_conversation.roles[1]
else:
from_str = 'unknown'
sentence = (BEGIN_SIGNAL + from_str + ": " + c["value"] +
END_SIGNAL)
# Count the input ids that are not the pad token.
length = tokenizer(sentence, return_tensors="pt", padding="longest"
).input_ids.ne(tokenizer.pad_token_id).sum().item()
tokenized_lens.append(length)
# Running token count and first turn of the chunk currently being built.
num_tokens = 0
start_idx = 0
for idx, l in enumerate(tokenized_lens):
# TODO: shall we also only starts from a specific speaker?
if num_tokens + l > max_length:
# Adding this turn would exceed the budget: emit the chunk and start a new one at this turn.
# NOTE(review): split_sample() includes turn `idx` when it is not from "human", so the emitted
# chunk may contain the overflowing turn, which then also opens the next chunk — confirm intended.
new_content.append(split_sample(sample, start_idx, idx))
start_idx = idx
num_tokens = l
else:
num_tokens += l
# NOTE(review): indentation is not visible here; if this check is nested under the `else`,
# no final chunk is emitted when the last turn itself triggers a split — confirm.
if idx == len(tokenized_lens) - 1:
new_content.append(split_sample(sample, start_idx, idx))
print(f"total: {len(content)}, new: {len(new_content)}")
return new_content
def main(args):
    """Load the dataset, split over-long conversations and write the result.

    Args:
        args: Parsed command-line arguments providing ``in_file``,
            ``out_file``, ``begin``, ``end``, ``model_name_or_path`` and
            ``max_length``.
    """
    # Context managers make sure both files are closed (and the output flushed).
    with open(args.in_file, "r", encoding="utf-8") as fin:
        content = json.load(fin)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        model_max_length=args.max_length,
        padding_side="right",
        use_fast=False,
    )
    # split_contents() compares against tokenizer.pad_token_id, so a pad
    # token is registered when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))

    content = split_contents(content, args.begin, args.end,
                             tokenizer, args.max_length)

    with open(args.out_file, "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2)
if __name__ == "__main__":
    # Command-line entry point for splitting long conversations.
    cli = argparse.ArgumentParser()
    cli.add_argument("--in-file", type=str, required=True)
    cli.add_argument("--out-file", type=str, default="sharegpt_split.json")
    cli.add_argument("--begin", type=int)
    cli.add_argument("--end", type=int)
    cli.add_argument("--model-name-or-path", type=str, required=True)
    cli.add_argument("--max-length", type=int, default=2304)
    main(cli.parse_args())