add

models/LLaVA/build/lib/llava/data/clean_sharegpt.py (new file, 195 lines)
@@ -0,0 +1,195 @@
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import logging
import re
from typing import Dict, Union

import bs4
import markdownify  # == 0.11.6
from tqdm import tqdm

div_pattern = re.compile("<div.*?>")
span_pattern = re.compile("<span.*?>")
code_lang_pattern = re.compile(
    r"```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + r"\s*?```", re.DOTALL
)
code_lang_format = r"```\g<1>\n\g<2>\n```"
regenerate_pattern = re.compile(r"\d+ / \d+")
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")

def reformat_code(val: str) -> str:
    # Input code format is:
    # ```
    # $<language>Copy code$<exact_code_here>
    #
    # ```
    # This function converts it into the correct markdown format.
    return re.sub(code_lang_pattern, code_lang_format, val)
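# For example, reformat_code("```\npythonCopy codeprint('hi')\n```") should
# return "```python\nprint('hi')\n```" (illustrative input, not from the dataset).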

def html_to_markdown(val: str) -> str:
    # Remove all <div>. This is required to make indentation work in code blocks.
    val = re.sub(div_pattern, "", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = re.sub(span_pattern, "", val)
    # Convert html to markdown
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]
    # Remove noisy "Copy[number] chars / [number] words"
    val = re.sub(copy_chars_pattern, "", val)
    # Remove empty code block ```\nCopy code\n```
    val = re.sub(copy_code_pattern, "", val)

    # Strip
    val = val.replace("\n\n\n", "\n").strip()

    return val
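# For example, html_to_markdown("<div><p>Hello <b>world</b>!</p></div>") should
# yield roughly "Hello **world**!" (illustrative snippet, not from the dataset).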

def contain_blocked_words(val: str) -> bool:
    blocked_words = ["openai", "chatgpt"]
    for w in blocked_words:
        if w in val.lower():
            return True
    return False

def clean_html_one_sample(sample):
    """Clean one sample in place and return (sample, error_code).

    Error codes: 0 = ok, 1 = too short, 2 = roles do not alternate human/gpt,
    3 = contains blocked words, 4 = html parser error.
    """
    roles = ["human", "gpt"]

    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    for i, c in enumerate(sample["conversations"]):
        if c["from"] != roles[i % 2]:
            return (sample, 2)

        if contain_blocked_words(c["value"]):
            return (sample, 3)

        try:
            new_val = html_to_markdown(c["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        c["value"] = new_val

    return (sample, 0)

def clean_html_all(content, begin, end):
    """
    Clean the source html files.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_tag = 0

    content = content[begin:end]
    processed = []
    with ProcessPoolExecutor() as executor:
        for result in tqdm(
            executor.map(clean_html_one_sample, content), total=len(content)
        ):
            processed.append(result)

    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        elif (
            sample["conversations"][1]["value"],
            len(sample["conversations"]),
        ) in visited:
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
        else:
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            visited[cid] = visited[key] = cid
            skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content

def main(args):
    content = json.load(open(args["in_file"], "r"))
    content = clean_html_all(content, args["begin"], args["end"])
    json.dump(content, open(args["out_file"], "w"), indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    main(vars(args))
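# Note: when run directly as a script rather than as a module, the same flags apply, e.g.
#   python3 clean_sharegpt.py --in-file sharegpt_html.json --out-file sharegpt_clean.json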