add

2023-05-23 18:24:16 +08:00
parent da758a9ca7
commit b388fba03e
470 changed files with 2523750 additions and 7307 deletions
--- a/models/LLaVA/build/lib/llava/data/init.py
+++ b/models/LLaVA/build/lib/llava/data/init.py
--- a/models/LLaVA/build/lib/llava/data/alpaca-converter.py
+++ b/models/LLaVA/build/lib/llava/data/alpaca-converter.py
@@ -0,0 +1,58 @@
+import argparse
+import json
+import pathlib
+
+# Prompt templates taken from Stanford Alpaca's training script.
+# "prompt_input" has {instruction} and {input} placeholders and is used for
+# examples that carry a non-empty "input"; "prompt_no_input" has only the
+# {instruction} placeholder and is used for every other example.
+PROMPT_DICT = {
+    "prompt_input": (
+        "Below is an instruction that describes a task, paired with an input that provides further context. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+    ),
+    "prompt_no_input": (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Response:"
+    ),
+}
+
+
+def main(args):
+    """Convert an Alpaca-style instruction dataset into conversation format.
+
+    Reads the JSON list at ``args.data_path``. Every example must provide
+    ``instruction`` and ``output`` and may provide ``input``. Each example
+    becomes one record with a 1-based string ``id`` and a two-turn
+    ``conversations`` list: the formatted prompt from ``human`` followed by
+    the expected output from ``gpt``. The records are written to
+    ``args.output_path`` as indented JSON.
+
+    Args:
+        args: Object exposing ``data_path`` and ``output_path`` attributes.
+    """
+    data_path = pathlib.Path(args.data_path)
+    with data_path.open(encoding="utf-8") as f:
+        data = json.load(f)
+
+    prompt_input = PROMPT_DICT["prompt_input"]
+    prompt_no_input = PROMPT_DICT["prompt_no_input"]
+
+    new_data = []
+    for cnt, example in enumerate(data, start=1):
+        # Examples with a missing or empty "input" use the shorter prompt.
+        has_input = example.get("input", "") != ""
+        template = prompt_input if has_input else prompt_no_input
+        new_data.append({
+            'id': str(cnt),
+            'conversations': [
+                {
+                    'from': 'human',
+                    'value': template.format_map(example),
+                },
+                {
+                    'from': 'gpt',
+                    'value': example['output'],
+                },
+            ],
+        })
+
+    # Write through a context manager so the file is flushed and closed
+    # deterministically instead of relying on garbage collection.
+    with open(args.output_path, 'w', encoding="utf-8") as f:
+        json.dump(new_data, f, indent=2)
+
+if __name__ == '__main__':
+    # Command-line entry point: both paths are optional and default to files
+    # in the current working directory.
+    cli = argparse.ArgumentParser()
+    for flag, default_path in (
+        ('--data_path', 'alpaca-data.json'),
+        ('--output_path', 'alpaca-data-conversation.json'),
+    ):
+        cli.add_argument(flag, type=str, default=default_path)
+    main(cli.parse_args())
+
--- a/models/LLaVA/build/lib/llava/data/clean_sharegpt.py
+++ b/models/LLaVA/build/lib/llava/data/clean_sharegpt.py
@@ -0,0 +1,195 @@
+"""
+- Convert html to markdown with basic data cleaning.
+- Deduplication.
+
+Usage:
+python3 -m llava.data.clean_sharegpt --in-file sharegpt_html.json --out-file sharegpt_clean.json
+"""
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+import json
+import logging
+import re
+from typing import Dict, Union
+
+import bs4
+import markdownify  # == 0.11.6
+from tqdm import tqdm
+
+
+# Precompiled patterns used to clean ShareGPT HTML exports.
+# Regex fragments are raw strings so that \s and \d reach the regex engine
+# without being parsed as (invalid) Python string escapes.
+div_pattern = re.compile(r"<div.*?>")
+span_pattern = re.compile(r"<span.*?>")
+# A code fence whose body is "<language>Copy code<code>": group 1 captures the
+# language label and group 2 the code itself.
+code_lang_pattern = re.compile(
+    r"```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + r"\s*?```", re.DOTALL
+)
+# Replacement template for code_lang_pattern: language on the fence line,
+# code on its own lines. The backslashes before "g" are escaped explicitly.
+code_lang_format = "```\\g<1>\n\\g<2>\n```"
+regenerate_pattern = re.compile(r"\d+ / \d+")
+copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
+copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
+
+
+def reformat_code(val: str) -> str:
+    """Rewrite ShareGPT-style code fences into regular markdown fences."""
+    # The incoming fence looks like:
+    # ```
+    # $<language>Copy code$<exact_code_here>
+    #
+    # ```
+    # The substitution puts the language on the opening fence line and the
+    # code on the lines that follow it.
+    return code_lang_pattern.sub(code_lang_format, val)
+
+
+def html_to_markdown(val: str) -> str:
+    """Convert one HTML message to markdown and strip ShareGPT UI residue."""
+    # Drop every opening <div> (needed for indentation inside code blocks) and
+    # every opening <span> (needed for underscores inside code blocks).
+    for tag_pattern in (div_pattern, span_pattern):
+        val = tag_pattern.sub("", val)
+
+    # HTML to markdown, then repair the code fences.
+    val = reformat_code(markdownify.markdownify(val).strip())
+
+    # Remove a noisy "[number] / [number]" only when it opens the text.
+    leading_noise = regenerate_pattern.match(val)
+    if leading_noise:
+        val = val[leading_noise.end():]
+
+    # Remove noisy "Copy[number] chars / [number] words", then code blocks
+    # that contain nothing but a "Copy code" label.
+    for junk_pattern in (copy_chars_pattern, copy_code_pattern):
+        val = junk_pattern.sub("", val)
+
+    # Collapse triple newlines and trim the ends.
+    return val.replace("\n\n\n", "\n").strip()
+
+
+def contain_blocked_words(val: str) -> bool:
+    """Return True when ``val`` mentions a blocked word, ignoring case."""
+    lowered = val.lower()
+    return any(word in lowered for word in ("openai", "chatgpt"))
+
+
+def clean_html_one_sample(sample):
+    """Clean one sample in place and return ``(sample, error_code)``.
+
+    Error codes:
+        0: every turn was converted to markdown.
+        1: at most one turn is present, before or after trimming.
+        2: the turns do not alternate human, gpt, human, ...
+        3: a turn contains a blocked word.
+        4: the HTML of a turn could not be parsed.
+    """
+    expected_roles = ("human", "gpt")
+
+    turns = sample["conversations"]
+    if len(turns) <= 1:
+        return sample, 1
+
+    # Adjust the offset for cases like https://sharegpt.com/c/VyaZlh4
+    if turns[0]["from"] != "human":
+        turns = sample["conversations"] = turns[1:]
+        if len(turns) <= 1:
+            return sample, 1
+
+    # A trailing human turn has no reply, so it is dropped.
+    if turns[-1]["from"] == "human":
+        turns = sample["conversations"] = turns[:-1]
+        if len(turns) <= 1:
+            return sample, 1
+
+    for position, turn in enumerate(turns):
+        if turn["from"] != expected_roles[position % 2]:
+            return sample, 2
+
+        if contain_blocked_words(turn["value"]):
+            return sample, 3
+
+        try:
+            markdown = html_to_markdown(turn["value"])
+        except (bs4.builder.ParserRejectedMarkup, AssertionError):
+            return sample, 4
+        else:
+            turn["value"] = markdown
+
+    return sample, 0
+
+
+def clean_html_all(content, begin, end):
+    """
+    Clean the source html files.
+
+    Only ``content[begin:end]`` is processed. Every sample is cleaned in a
+    worker process, then rejected samples and duplicates are filtered out.
+    A sample counts as a duplicate when its id was already kept, or when the
+    pair (value of its second turn, number of turns) was already kept.
+    A line is printed for every skipped sample, followed by a summary.
+    Returns the list of kept samples.
+    """
+    content = content[begin:end]
+
+    with ProcessPoolExecutor() as executor:
+        processed = list(
+            tqdm(executor.map(clean_html_one_sample, content), total=len(content))
+        )
+
+    # error code -> (message suffix, name of the counter to bump)
+    rejections = {
+        1: ("is too short", "cnt_too_short"),
+        2: ("has a wrong format", "cnt_wrong_format"),
+        3: ("contains blocked words", "cnt_blocked_words"),
+        4: ("contains parser errors", "cnt_parser_error"),
+    }
+    stats = {
+        "cnt_blocked_words": 0,
+        "cnt_wrong_format": 0,
+        "cnt_parser_error": 0,
+        "cnt_too_short": 0,
+        "cnt_id_duplication": 0,
+        "cnt_value_duplication": 0,
+    }
+
+    # Maps both sample ids and (second turn value, turn count) keys to the id
+    # of the first sample that was kept with them.
+    visited = {}
+    new_content = []
+    for sample, error_code in tqdm(processed):
+        cid = sample["id"]
+
+        if error_code != 0:
+            if error_code not in rejections:
+                raise ValueError(f"Invalid error_code: {error_code}")
+            reason, counter = rejections[error_code]
+            print(f"id {cid} {reason}")
+            stats[counter] += 1
+            continue
+
+        if cid in visited:
+            print(f"id {cid} is an id duplication of {visited[cid]}")
+            stats["cnt_id_duplication"] += 1
+            continue
+
+        key = (sample["conversations"][1]["value"], len(sample["conversations"]))
+        if key in visited:
+            print(f"id {cid} is a value duplication of {visited[key]}")
+            stats["cnt_value_duplication"] += 1
+            continue
+
+        visited[cid] = visited[key] = cid
+        new_content.append(sample)
+
+    # Everything that was not kept was skipped.
+    cnt_skip = len(processed) - len(new_content)
+
+    print(
+        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
+        f"cnt_blocked_words: {stats['cnt_blocked_words']}, cnt_parser_error: {stats['cnt_parser_error']}, "
+        f"cnt_wrong_format: {stats['cnt_wrong_format']}, "
+        f"cnt_too_short: {stats['cnt_too_short']}, cnt_id_duplication: {stats['cnt_id_duplication']}, "
+        f"cnt_value_duplication: {stats['cnt_value_duplication']}, "
+    )
+
+    return new_content
+
+
+def main(args):
+    """Load the raw samples, clean them, and write the cleaned list as JSON.
+
+    Args:
+        args: Mapping with the keys ``in_file``, ``out_file``, ``begin`` and
+            ``end``. ``begin`` and ``end`` bound the slice of samples that is
+            cleaned.
+    """
+    # Both files are opened through context managers so the handles are
+    # closed (and the output flushed) as soon as each step finishes.
+    with open(args["in_file"], "r") as fin:
+        content = json.load(fin)
+    content = clean_html_all(content, args["begin"], args["end"])
+    with open(args["out_file"], "w") as fout:
+        json.dump(content, fout, indent=2)
+
+
+if __name__ == "__main__":
+    # Command-line entry point. main() receives the parsed options as a plain
+    # dict, so the options become keys such as "in_file" and "out_file".
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--in-file", type=str, required=True)
+    cli.add_argument("--out-file", type=str, default="sharegpt_clean.json")
+    # Optional bounds of the slice of samples to clean.
+    for bound in ("--begin", "--end"):
+        cli.add_argument(bound, type=int)
+    # Accepted on the command line; main() does not read it.
+    cli.add_argument("--debug", action="store_true")
+    main(vars(cli.parse_args()))
--- a/models/LLaVA/build/lib/llava/data/inspect.py
+++ b/models/LLaVA/build/lib/llava/data/inspect.py
@@ -0,0 +1,23 @@
+"""
+Usage:
+python3 -m llava.data.inspect --in-file sharegpt_20230322_clean_lang_split.json
+"""
+import argparse
+import json
+
+import tqdm
+
+
+if __name__ == "__main__":
+    # Interactive viewer: print the conversations one turn at a time.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--in-file", type=str, required=True)
+    parser.add_argument("--begin", type=int)
+    args = parser.parse_args()
+
+    # Close the input file as soon as it is parsed instead of leaking the
+    # handle for the whole interactive session.
+    with open(args.in_file, "r") as fin:
+        content = json.load(fin)
+
+    # Without --begin the slice bound is None, so every sample is shown.
+    for sample in tqdm.tqdm(content[args.begin:]):
+        print(f"id: {sample['id']}")
+        for conv in sample["conversations"]:
+            print(conv["from"] + ": ")
+            print(conv["value"])
+            # Wait for the user to press Enter before showing the next turn.
+            input()
--- a/models/LLaVA/build/lib/llava/data/optional_clean.py
+++ b/models/LLaVA/build/lib/llava/data/optional_clean.py
@@ -0,0 +1,80 @@
+"""
+Usage:
+python3 -m llava.data.optional_clean --lang en --reduce-rep --in-file sharegpt_clean.json --out-file output.json
+python3 -m llava.data.optional_clean --skip-lang en --reduce-rep --in-file sharegpt_clean.json --out-file output.json
+"""
+import argparse
+import json
+import re
+
+import polyglot
+from polyglot.detect import Detector
+import pycld2
+from tqdm import tqdm
+
+
+def skip(conv, args):
+    """Return True when ``conv`` should be dropped from the dataset.
+
+    A conversation is dropped when language filtering is requested and its
+    detected language is not ``args.lang`` or equals ``args.skip_lang``, or
+    when ``args.reduce_rep`` is set and a turn contains the same digit nine
+    times in a row.
+    """
+    # Remove certain languages
+    language_filter_on = args.lang != "all" or args.skip_lang is not None
+    if language_filter_on:
+        text = "\n".join(turn["value"] for turn in conv["conversations"])
+        try:
+            lang_code = Detector(text).language.code
+        except (pycld2.error, polyglot.detect.base.UnknownLanguage):
+            # Detection failures are treated as their own "language".
+            lang_code = "unknown"
+
+        wrong_language = args.lang != "all" and lang_code != args.lang
+        if wrong_language or lang_code == args.skip_lang:
+            return True
+
+    # Remove repetitive numbers
+    if args.reduce_rep:
+        return any(
+            re.search(r"(\d)\1{8}", turn["value"]) is not None
+            for turn in conv["conversations"]
+        )
+
+    return False
+ 
+
+if __name__ == "__main__":
+    # Command-line entry point: filter conversations by language and/or by
+    # repeated digits, then write the survivors as JSON.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--in-file", type=str, required=True)
+    parser.add_argument("--out-file", type=str, default="")
+    parser.add_argument("--lang", type=str, default="all",
+                        choices=["all", "en"])
+    parser.add_argument("--skip-lang", type=str)
+    # NOTE: Be careful about reduce_rep which may remove some good data.
+    # For example, addresses could have long consecutive 0's
+    parser.add_argument("--reduce-rep", action="store_true")
+    args = parser.parse_args()
+
+    in_file = args.in_file
+    out_file = args.out_file
+    lang = args.lang
+    skip_lang = args.skip_lang
+    reduce_rep = args.reduce_rep
+    # --lang and --skip-lang cannot be combined. This is reported through
+    # argparse rather than ``assert``, which is stripped under ``python -O``.
+    if lang != "all" and skip_lang is not None:
+        parser.error("--skip-lang can only be used when --lang is 'all'")
+
+    # Derive the output name from the active filters when none was given.
+    if out_file == "":
+        out_file = "sharegpt_clean"
+        if lang != "all":
+            out_file += "_" + lang
+        if skip_lang is not None:
+            out_file += "_skip_" + skip_lang
+        if reduce_rep:
+            out_file += "_reduce_rep"
+        out_file += ".json"
+
+    # Context managers close both files instead of leaking the handles.
+    with open(in_file, "r") as fin:
+        content = json.load(fin)
+
+    new_content = [conv for conv in tqdm(content) if not skip(conv, args)]
+
+    print(f"return {len(new_content)} out of {len(content)}, start dump ...")
+    with open(out_file, "w") as fout:
+        json.dump(new_content, fout, indent=2)
--- a/models/LLaVA/build/lib/llava/data/pretty_json.py
+++ b/models/LLaVA/build/lib/llava/data/pretty_json.py
@@ -0,0 +1,20 @@
+"""
+Usage:
+python3 pretty_json.py --in-file in.json --out-file out.json
+"""
+
+import argparse
+import json
+
+
+if __name__ == "__main__":
+    # Re-serialize a JSON file with two-space indentation.
+    cli = argparse.ArgumentParser()
+    for flag in ("--in-file", "--out-file"):
+        cli.add_argument(flag, type=str, required=True)
+    options = cli.parse_args()
+
+    with open(options.in_file, "r") as src:
+        parsed = json.load(src)
+
+    with open(options.out_file, "w") as dst:
+        json.dump(parsed, dst, indent=2)
--- a/models/LLaVA/build/lib/llava/data/split_long_conversation.py
+++ b/models/LLaVA/build/lib/llava/data/split_long_conversation.py
@@ -0,0 +1,99 @@
+"""
+Split long conversations based on certain max length.
+
+Usage: python3 -m llava.data.split_long_conversation \
+    --in-file sharegpt_clean.json \
+    --out-file sharegpt_split.json \
+    --model-name-or-path $<model-name>
+"""
+import argparse
+import json
+from typing import Dict, Sequence, Optional
+
+import transformers
+import tqdm
+
+from llava import conversation as conversation_lib
+
+# Pad token registered on the tokenizer when it does not define one.
+DEFAULT_PAD_TOKEN = "[PAD]"
+# Prefix and suffix wrapped around every rendered turn before its tokens are
+# counted.
+BEGIN_SIGNAL = "### "
+END_SIGNAL = "\n"
+
+
+def split_sample(sample, start_idx, end_idx):
+    """Build a sub-sample covering the turns from ``start_idx`` onwards.
+
+    The slice stops before ``end_idx`` when that turn is spoken by the human
+    and includes it otherwise. The new id is the original id followed by
+    ``_<start_idx>``.
+    """
+    turns = sample["conversations"]
+    # only ends in the bot because otherwise the last human part is useless.
+    stop = end_idx if turns[end_idx]["from"] == "human" else end_idx + 1
+    return {
+        "id": "_".join((sample["id"], str(start_idx))),
+        "conversations": turns[start_idx:stop],
+    }
+
+
+def split_contents(content, begin, end, tokenizer, max_length):
+    """
+    Keep the maximum round of conversations within the max token length constraint
+
+    Only ``content[begin:end]`` is processed. For every sample the token
+    length of each turn is measured, then turns are grouped greedily into
+    chunks whose summed length stays within ``max_length``. Each chunk is
+    built by ``split_sample``. Prints the number of input samples and output
+    chunks and returns the list of chunks.
+    """
+    content = content[begin:end]
+    new_content = []
+
+    for sample in tqdm.tqdm(content):
+        # Token length of every turn, in conversation order.
+        tokenized_lens = []
+
+        for c in sample["conversations"]:
+            # Map the dataset speaker tag (case-insensitive) to the role name
+            # of the default conversation template; other tags become 'unknown'.
+            from_str = c["from"]
+            if from_str.lower() == "human":
+                from_str = conversation_lib.default_conversation.roles[0]
+            elif from_str.lower() == "gpt":
+                from_str = conversation_lib.default_conversation.roles[1]
+            else:
+                from_str = 'unknown'
+
+            # Render the turn as "### <role>: <text>\n" and count the tokens
+            # that are not the pad token.
+            sentence = (BEGIN_SIGNAL + from_str + ": " + c["value"] +
+                        END_SIGNAL)
+            length = tokenizer(sentence, return_tensors="pt", padding="longest"
+                ).input_ids.ne(tokenizer.pad_token_id).sum().item()
+            tokenized_lens.append(length)
+
+        # Running token count and first turn index of the chunk being built.
+        num_tokens = 0
+        start_idx = 0
+        for idx, l in enumerate(tokenized_lens):
+            # TODO: shall we also only starts from a specific speaker?
+            if num_tokens + l > max_length:
+                # Turn idx does not fit: close the current chunk and start a
+                # new one at idx.
+                # NOTE(review): split_sample includes turn idx in the closed
+                # chunk when it is not a human turn, and the next chunk also
+                # starts at idx, so that turn can appear in both chunks and
+                # the closed chunk can exceed max_length - confirm intent.
+                # NOTE(review): when idx == start_idx and the turn is from the
+                # human (e.g. an oversized first turn), the emitted chunk has
+                # an empty "conversations" list - confirm intent.
+                new_content.append(split_sample(sample, start_idx, idx))
+                start_idx = idx
+                num_tokens = l
+            else:
+                num_tokens += l
+                # The trailing chunk is only emitted from this branch.
+                # NOTE(review): if the last turn is the one that overflows,
+                # the chunk starting at that turn is never emitted.
+                if idx == len(tokenized_lens) - 1:
+                    new_content.append(split_sample(sample, start_idx, idx))
+
+    print(f"total: {len(content)}, new: {len(new_content)}")
+    return new_content
+
+
+def main(args):
+    """Split the conversations of a JSON file by token budget and save them.
+
+    Args:
+        args: Object exposing ``in_file``, ``out_file``, ``begin``, ``end``,
+            ``model_name_or_path`` and ``max_length`` attributes.
+    """
+    # Read through a context manager so the input handle is closed right away.
+    with open(args.in_file, "r") as fin:
+        content = json.load(fin)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        args.model_name_or_path,
+        model_max_length=args.max_length,
+        padding_side="right",
+        use_fast=False,
+    )
+    # split_contents counts non-pad tokens, so a pad token must be defined.
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))
+    content = split_contents(content, args.begin, args.end,
+        tokenizer, args.max_length)
+    # Write through a context manager so the output is flushed and closed.
+    with open(args.out_file, "w") as fout:
+        json.dump(content, fout, indent=2)
+
+
+if __name__ == "__main__":
+    # Command-line entry point for splitting long conversations.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--in-file", type=str, required=True)
+    cli.add_argument("--out-file", type=str, default="sharegpt_split.json")
+    # Optional bounds of the slice of samples to process.
+    for bound in ("--begin", "--end"):
+        cli.add_argument(bound, type=int)
+    cli.add_argument("--model-name-or-path", type=str, required=True)
+    cli.add_argument("--max-length", type=int, default=2304)
+    main(cli.parse_args())