add
0
models/LLaVA/build/lib/llava/data/__init__.py
Normal file
58
models/LLaVA/build/lib/llava/data/alpaca-converter.py
Normal file
@@ -0,0 +1,58 @@
import argparse
import json
import pathlib

# Prompt from stanford alpaca's training script
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


def main(args):
    data_path = pathlib.Path(args.data_path)
    with data_path.open() as f:
        data = json.load(f)

    prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
    sources = [
        prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
        for example in data
    ]
    targets = [example['output'] for example in data]

    new_data = []
    cnt = 1
    for s, t in zip(sources, targets):
        new_data.append({
            'id': str(cnt),
            'conversations': [
                {
                    'from': 'human',
                    'value': s,
                },
                {
                    'from': 'gpt',
                    'value': t,
                }
            ]
        })
        cnt += 1

    json.dump(new_data, open(args.output_path, 'w'), indent=2)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='alpaca-data.json')
    parser.add_argument('--output_path', type=str, default='alpaca-data-conversation.json')
    args = parser.parse_args()
    main(args)
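For reference, here is a minimal sketch of what alpaca-converter.py does to a single record, using a made-up Alpaca example (the instruction/input/output values are illustrative, not taken from the real alpaca-data.json):

# --- illustrative sketch, not part of the commit ---
# A hypothetical Alpaca record and the conversation entry the script above
# would emit for it.
example = {
    "instruction": "Rewrite the sentence in the past tense.",
    "input": "She walks to school.",
    "output": "She walked to school.",
}

prompt = PROMPT_DICT["prompt_input"].format_map(example)  # "input" is non-empty
converted = {
    "id": "1",
    "conversations": [
        {"from": "human", "value": prompt},
        {"from": "gpt", "value": example["output"]},
    ],
}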
195
models/LLaVA/build/lib/llava/data/clean_sharegpt.py
Normal file
@@ -0,0 +1,195 @@
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
"""
import argparse
from concurrent.futures import ProcessPoolExecutor
import json
import logging
import re
from typing import Dict, Union

import bs4
import markdownify  # == 0.11.6
from tqdm import tqdm


div_pattern = re.compile("<div.*?>")
span_pattern = re.compile("<span.*?>")
code_lang_pattern = re.compile(
    "```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + "\s*?```", re.DOTALL
)
code_lang_format = "```\g<1>\n\g<2>\n```"
regenerate_pattern = re.compile("\d+ / \d+")
copy_chars_pattern = re.compile("Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile("```(.*?)Copy code\s*```")


def reformat_code(val: str) -> str:
    # Input code format is:
    # ```
    # $<language>Copy code$<exact_code_here>
    #
    # ```
    # This function converts it into the correct markdown format.
    return re.sub(code_lang_pattern, code_lang_format, val)


def html_to_markdown(val: str) -> str:
    # Remove all <div>. This is required to make indentation work in code blocks.
    val = re.sub(div_pattern, "", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = re.sub(span_pattern, "", val)
    # HTML to markdown
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]
    # Remove noisy "Copy[number] chars / [number] words"
    val = re.sub(copy_chars_pattern, "", val)
    # Remove empty code block ```\nCopy code\n```
    val = re.sub(copy_code_pattern, "", val)

    # Strip
    val = val.replace("\n\n\n", "\n").strip()

    return val


def contain_blocked_words(val: str) -> bool:
    blocked_words = ["openai", "chatgpt"]
    for w in blocked_words:
        if w in val.lower():
            return True
    return False


def clean_html_one_sample(sample):
    roles = ["human", "gpt"]

    if len(sample["conversations"]) <= 1:
        return (sample, 1)

    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
    if sample["conversations"][0]["from"] != "human":
        sample["conversations"] = sample["conversations"][1:]
        if len(sample["conversations"]) <= 1:
            return (sample, 1)

    if sample["conversations"][-1]["from"] == "human":
        sample["conversations"] = sample["conversations"][:-1]
        if len(sample["conversations"]) <= 1:
            return (sample, 1)

    for i, c in enumerate(sample["conversations"]):
        if c["from"] != roles[i % 2]:
            return (sample, 2)

        if contain_blocked_words(c["value"]):
            return (sample, 3)

        try:
            new_val = html_to_markdown(c["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        c["value"] = new_val

    return (sample, 0)


def clean_html_all(content, begin, end):
    """
    Clean the source html files.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_tag = 0

    content = content[begin:end]
    processed = []
    with ProcessPoolExecutor() as executor:
        for result in tqdm(
            executor.map(clean_html_one_sample, content), total=len(content)
        ):
            processed.append(result)

    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        elif (
            sample["conversations"][1]["value"],
            len(sample["conversations"]),
        ) in visited:
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
        else:
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            visited[cid] = visited[key] = cid
            skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content


def main(args):
    content = json.load(open(args["in_file"], "r"))
    content = clean_html_all(content, args["begin"], args["end"])
    json.dump(content, open(args["out_file"], "w"), indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    main(vars(args))
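As a rough usage sketch of the per-sample cleaning path: the conversation below is invented, bs4 and markdownify must be installed, and the exact markdown output depends on the markdownify version (the file pins 0.11.6).

# --- illustrative sketch, not part of the commit ---
sample = {
    "id": "demo-1",
    "conversations": [
        {"from": "human", "value": "<p>Say <b>hello</b> in Python</p>"},
        {"from": "gpt", "value": "<p>Use <code>print('hello')</code>.</p>"},
    ],
}

cleaned, error_code = clean_html_one_sample(sample)
# Error codes: 0 = kept, 1 = too short, 2 = wrong role order,
# 3 = blocked words, 4 = HTML parser error.
assert error_code == 0
print(cleaned["conversations"][1]["value"])  # roughly: "Use `print('hello')`."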
23
models/LLaVA/build/lib/llava/data/inspect.py
Normal file
@@ -0,0 +1,23 @@
"""
Usage:
python3 -m fastchat.data.inspect --in sharegpt_20230322_clean_lang_split.json
"""
import argparse
import json

import tqdm


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--begin", type=int)
    args = parser.parse_args()

    content = json.load(open(args.in_file, "r"))
    for sample in tqdm.tqdm(content[args.begin:]):
        print(f"id: {sample['id']}")
        for conv in sample["conversations"]:
            print(conv["from"] + ": ")
            print(conv["value"])
        input()
80
models/LLaVA/build/lib/llava/data/optional_clean.py
Normal file
@@ -0,0 +1,80 @@
"""
Usage:
python3 -m fastchat.data.optional_clean --lang en --reduce-rep --in sharegpt_clean.json --out output.json
python3 -m fastchat.data.optional_clean --skip-lang en --reduce-rep --in sharegpt_clean.json --out output.json
"""
import argparse
import json
import re

import polyglot
from polyglot.detect import Detector
import pycld2
from tqdm import tqdm


def skip(conv, args):
    # Remove certain languages
    if args.lang != "all" or args.skip_lang is not None:
        text = "\n".join([x["value"] for x in conv["conversations"]])
        try:
            lang_code = Detector(text).language.code
        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
            lang_code = "unknown"

        if args.lang != "all" and lang_code != args.lang:
            return True

        if lang_code == args.skip_lang:
            return True

    # Remove repetitive numbers
    if args.reduce_rep:
        for sentence in conv["conversations"]:
            val = sentence["value"]
            sub = re.search(r"(\d)\1{8}", val)
            if sub is not None:
                return True

    return False


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="")
    parser.add_argument("--lang", type=str, default="all",
                        choices=["all", "en"])
    parser.add_argument("--skip-lang", type=str)
    # NOTE: Be careful about reduce_rep which may remove some good data.
    # For example, addresses could have long consecutive 0's
    parser.add_argument("--reduce-rep", action="store_true")
    args = parser.parse_args()

    in_file = args.in_file
    out_file = args.out_file
    lang = args.lang
    skip_lang = args.skip_lang
    reduce_rep = args.reduce_rep
    assert (lang == "all" or skip_lang is None)

    if out_file == "":
        out_file = "sharegpt_clean"
        if lang != "all":
            out_file += "_" + lang
        if skip_lang is not None:
            out_file += "_skip_" + skip_lang
        if reduce_rep:
            out_file += "_reduce_rep"
        out_file += ".json"

    content = json.load(open(in_file, "r"))
    num_conv = len(content)

    new_content = []
    for conv in tqdm(content):
        if not skip(conv, args):
            new_content.append(conv)

    print(f"return {len(new_content)} out of {len(content)}, start dump ...")
    json.dump(new_content, open(out_file, "w"), indent=2)
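To make the --reduce-rep heuristic concrete, here is a small check of the regex it relies on (the strings are invented; the full skip() path additionally needs polyglot and pycld2 for language detection):

# --- illustrative sketch, not part of the commit ---
import re

# The heuristic drops a conversation when any turn contains nine or more
# identical consecutive digits, e.g. a degenerate run of zeros,
# while leaving ordinary numbers alone.
assert re.search(r"(\d)\1{8}", "balance: 0000000000") is not None     # would be skipped
assert re.search(r"(\d)\1{8}", "call me at 123-456-7890") is None     # would be kept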
20
models/LLaVA/build/lib/llava/data/pretty_json.py
Normal file
@@ -0,0 +1,20 @@
"""
Usage:
python3 pretty_json.py --in in.json --out out.json
"""

import argparse
import json


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, required=True)
    args = parser.parse_args()

    with open(args.in_file, "r") as fin:
        data = json.load(fin)

    with open(args.out_file, "w") as fout:
        json.dump(data, fout, indent=2)
99
models/LLaVA/build/lib/llava/data/split_long_conversation.py
Normal file
@@ -0,0 +1,99 @@
"""
Split long conversations based on certain max length.

Usage: python3 -m fastchat.data.split_long_conversation \
    --in sharegpt_clean.json \
    --out sharegpt_split.json \
    --model-name-or-path $<model-name>
"""
import argparse
import json
from typing import Dict, Sequence, Optional

import transformers
import tqdm

from llava import conversation as conversation_lib

DEFAULT_PAD_TOKEN = "[PAD]"
BEGIN_SIGNAL = "### "
END_SIGNAL = "\n"


def split_sample(sample, start_idx, end_idx):
    # Only end on a bot turn, because otherwise the trailing human turn is useless.
    end_speaker = sample["conversations"][end_idx]["from"]
    end_idx = end_idx + 1 if end_speaker != "human" else end_idx
    return {
        "id": sample["id"] + "_" + str(start_idx),
        "conversations": sample["conversations"][start_idx:end_idx]
    }


def split_contents(content, begin, end, tokenizer, max_length):
    """
    Keep the maximum number of conversation rounds within the max token length constraint.
    """
    content = content[begin:end]
    new_content = []

    for sample in tqdm.tqdm(content):
        tokenized_lens = []

        for c in sample["conversations"]:
            from_str = c["from"]
            if from_str.lower() == "human":
                from_str = conversation_lib.default_conversation.roles[0]
            elif from_str.lower() == "gpt":
                from_str = conversation_lib.default_conversation.roles[1]
            else:
                from_str = 'unknown'

            sentence = (BEGIN_SIGNAL + from_str + ": " + c["value"] +
                        END_SIGNAL)
            length = tokenizer(sentence, return_tensors="pt", padding="longest"
                               ).input_ids.ne(tokenizer.pad_token_id).sum().item()
            tokenized_lens.append(length)

        num_tokens = 0
        start_idx = 0
        for idx, l in enumerate(tokenized_lens):
            # TODO: should we also only start from a specific speaker?
            if num_tokens + l > max_length:
                new_content.append(split_sample(sample, start_idx, idx))
                start_idx = idx
                num_tokens = l
            else:
                num_tokens += l
                if idx == len(tokenized_lens) - 1:
                    new_content.append(split_sample(sample, start_idx, idx))

    print(f"total: {len(content)}, new: {len(new_content)}")
    return new_content


def main(args):
    content = json.load(open(args.in_file, "r"))
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        model_max_length=args.max_length,
        padding_side="right",
        use_fast=False,
    )
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
    content = split_contents(content, args.begin, args.end,
                             tokenizer, args.max_length)
    json.dump(content, open(args.out_file, "w"), indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_split.json")
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    parser.add_argument("--model-name-or-path", type=str, required=True)
    parser.add_argument("--max-length", type=int, default=2304)
    args = parser.parse_args()
    main(args)
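A small sketch of how split_sample() trims a chunk so it never ends on a human turn (the conversation is made up; split_contents() itself additionally needs a tokenizer and the llava conversation template):

# --- illustrative sketch, not part of the commit ---
sample = {
    "id": "demo",
    "conversations": [
        {"from": "human", "value": "Hi"},
        {"from": "gpt", "value": "Hello!"},
        {"from": "human", "value": "Tell me a story"},
        {"from": "gpt", "value": "Once upon a time..."},
    ],
}

# end_idx points at a human turn, so the chunk is cut before it and keeps
# only the first human/gpt exchange.
chunk = split_sample(sample, start_idx=0, end_idx=2)
assert chunk["id"] == "demo_0"
assert [c["from"] for c in chunk["conversations"]] == ["human", "gpt"]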