From f5cbec66817b643f732f11cdc7b3a550963b70dc Mon Sep 17 00:00:00 2001
From: 99Franklin <358622371@qq.com>
Date: Mon, 30 Dec 2024 19:30:31 +0800
Subject: [PATCH] add OCRBench v2
---
OCRBench/README.md | 58 +
example.py => OCRBench/example.py | 0
{images => OCRBench/images}/GPT4V_Gemini.png | Bin
{images => OCRBench/images}/all_data.png | Bin
OCRBench/{ => json_files}/FullTest.json | 0
OCRBench/{ => json_files}/OCRBench.json | 0
{scripts => OCRBench/scripts}/GPT4V.py | 0
{scripts => OCRBench/scripts}/Genimi.py | 0
{scripts => OCRBench/scripts}/LLaVA1_5.py | 0
{scripts => OCRBench/scripts}/MiniMonkey.py | 0
{scripts => OCRBench/scripts}/blip2.py | 0
.../scripts}/blip2_vicuna_instruct.py | 0
{scripts => OCRBench/scripts}/bliva.py | 0
{scripts => OCRBench/scripts}/interlm.py | 0
{scripts => OCRBench/scripts}/interlm2.py | 0
{scripts => OCRBench/scripts}/internvl2_s | 0
{scripts => OCRBench/scripts}/intervl.py | 0
{scripts => OCRBench/scripts}/llavar.py | 0
.../scripts}/mPLUG-DocOwl15.py | 0
{scripts => OCRBench/scripts}/mPLUG-owl.py | 0
{scripts => OCRBench/scripts}/mPLUG-owl2.py | 0
{scripts => OCRBench/scripts}/minigpt4v2.py | 0
{scripts => OCRBench/scripts}/monkey.py | 0
{scripts => OCRBench/scripts}/qwenvl.py | 0
{scripts => OCRBench/scripts}/qwenvl_api.py | 0
OCRBench_v2/README.md | 84 +
OCRBench_v2/eval_scripts/IoUscore_metric.py | 91 +
OCRBench_v2/eval_scripts/TEDS_metric.py | 931 +
.../IoUscore_metric.cpython-310.pyc | Bin 0 -> 2739 bytes
.../__pycache__/TEDS_metric.cpython-310.pyc | Bin 0 -> 27162 bytes
.../page_ocr_metric.cpython-310.pyc | Bin 0 -> 1416 bytes
.../__pycache__/parallel.cpython-310.pyc | Bin 0 -> 2166 bytes
.../spotting_metric.cpython-310.pyc | Bin 0 -> 4248 bytes
.../__pycache__/vqa_metric.cpython-310.pyc | Bin 0 -> 5347 bytes
OCRBench_v2/eval_scripts/eval.py | 381 +
OCRBench_v2/eval_scripts/get_score.py | 125 +
OCRBench_v2/eval_scripts/page_ocr_metric.py | 50 +
OCRBench_v2/eval_scripts/parallel.py | 50 +
.../eval_scripts/spotting_eval/__init__.py | 0
.../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 178 bytes
.../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 162 bytes
.../rrc_evaluation_funcs_1_1.cpython-310.pyc | Bin 0 -> 15564 bytes
.../rrc_evaluation_funcs_1_1.cpython-39.pyc | Bin 0 -> 15753 bytes
.../__pycache__/script.cpython-310.pyc | Bin 0 -> 10895 bytes
.../__pycache__/script.cpython-39.pyc | Bin 0 -> 10798 bytes
OCRBench_v2/eval_scripts/spotting_eval/gt.zip | Bin 0 -> 236 bytes
.../spotting_eval/gt/gt_img_0.txt | 6 +
.../eval_scripts/spotting_eval/readme.txt | 26 +
.../eval_scripts/spotting_eval/results.zip | Bin 0 -> 1596 bytes
.../spotting_eval/rrc_evaluation_funcs_1_1.py | 456 +
.../eval_scripts/spotting_eval/script.py | 451 +
.../script_test_ch4_t4_e1-1577983164.zip | Bin 0 -> 113280 bytes
.../eval_scripts/spotting_eval/submit.zip | Bin 0 -> 158 bytes
.../spotting_eval/submit/res_img_0.txt | 1 +
OCRBench_v2/eval_scripts/spotting_metric.py | 184 +
OCRBench_v2/eval_scripts/vqa_metric.py | 282 +
OCRBench_v2/pred_folder/internvl2_5_26b.json | 149267 +++++++++++++++
OCRBench_v2/requirements.txt | 12 +
README.md | 48 +-
59 files changed, 152469 insertions(+), 34 deletions(-)
create mode 100644 OCRBench/README.md
rename example.py => OCRBench/example.py (100%)
rename {images => OCRBench/images}/GPT4V_Gemini.png (100%)
rename {images => OCRBench/images}/all_data.png (100%)
rename OCRBench/{ => json_files}/FullTest.json (100%)
rename OCRBench/{ => json_files}/OCRBench.json (100%)
rename {scripts => OCRBench/scripts}/GPT4V.py (100%)
rename {scripts => OCRBench/scripts}/Genimi.py (100%)
rename {scripts => OCRBench/scripts}/LLaVA1_5.py (100%)
rename {scripts => OCRBench/scripts}/MiniMonkey.py (100%)
rename {scripts => OCRBench/scripts}/blip2.py (100%)
rename {scripts => OCRBench/scripts}/blip2_vicuna_instruct.py (100%)
rename {scripts => OCRBench/scripts}/bliva.py (100%)
rename {scripts => OCRBench/scripts}/interlm.py (100%)
rename {scripts => OCRBench/scripts}/interlm2.py (100%)
rename {scripts => OCRBench/scripts}/internvl2_s (100%)
rename {scripts => OCRBench/scripts}/intervl.py (100%)
rename {scripts => OCRBench/scripts}/llavar.py (100%)
rename {scripts => OCRBench/scripts}/mPLUG-DocOwl15.py (100%)
rename {scripts => OCRBench/scripts}/mPLUG-owl.py (100%)
rename {scripts => OCRBench/scripts}/mPLUG-owl2.py (100%)
rename {scripts => OCRBench/scripts}/minigpt4v2.py (100%)
rename {scripts => OCRBench/scripts}/monkey.py (100%)
rename {scripts => OCRBench/scripts}/qwenvl.py (100%)
rename {scripts => OCRBench/scripts}/qwenvl_api.py (100%)
create mode 100644 OCRBench_v2/README.md
create mode 100644 OCRBench_v2/eval_scripts/IoUscore_metric.py
create mode 100644 OCRBench_v2/eval_scripts/TEDS_metric.py
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/IoUscore_metric.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/TEDS_metric.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/page_ocr_metric.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/parallel.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/spotting_metric.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/__pycache__/vqa_metric.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/eval.py
create mode 100644 OCRBench_v2/eval_scripts/get_score.py
create mode 100644 OCRBench_v2/eval_scripts/page_ocr_metric.py
create mode 100644 OCRBench_v2/eval_scripts/parallel.py
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__init__.py
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/__init__.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/__init__.cpython-39.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/rrc_evaluation_funcs_1_1.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/rrc_evaluation_funcs_1_1.cpython-39.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/script.cpython-310.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/__pycache__/script.cpython-39.pyc
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/gt.zip
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/gt/gt_img_0.txt
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/readme.txt
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/results.zip
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/rrc_evaluation_funcs_1_1.py
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/script.py
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/script_test_ch4_t4_e1-1577983164.zip
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/submit.zip
create mode 100644 OCRBench_v2/eval_scripts/spotting_eval/submit/res_img_0.txt
create mode 100644 OCRBench_v2/eval_scripts/spotting_metric.py
create mode 100644 OCRBench_v2/eval_scripts/vqa_metric.py
create mode 100644 OCRBench_v2/pred_folder/internvl2_5_26b.json
create mode 100644 OCRBench_v2/requirements.txt
diff --git a/OCRBench/README.md b/OCRBench/README.md
new file mode 100644
index 0000000..440308c
--- /dev/null
+++ b/OCRBench/README.md
@@ -0,0 +1,58 @@
+# OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models
+
+
+> Large models have recently played a dominant role in natural language processing and multimodal vision-language learning. However, their effectiveness in text-related visual tasks remains relatively unexplored. In this paper, we conducted a comprehensive evaluation of Large Multimodal Models, such as GPT4V and Gemini, in various text-related visual tasks including Text Recognition, Scene Text-Centric Visual Question Answering (VQA), Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten Mathematical Expression Recognition (HMER). To facilitate the assessment of Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we propose OCRBench, a comprehensive evaluation benchmark. Our study encompasses 29 datasets, making it the most comprehensive OCR evaluation benchmark available. Furthermore, our study reveals both the strengths and weaknesses of these models, particularly in handling multilingual text, handwritten text, non-semantic text, and mathematical expression recognition. Most importantly, the baseline results showcased in this study could provide a foundational framework for the conception and assessment of innovative strategies targeted at enhancing zero-shot multimodal techniques.
+
+**[Project Page [This Page]](https://github.com/Yuliang-Liu/MultimodalOCR)** | **[Paper](https://arxiv.org/abs/2305.07895)** |**[OCRBench Leaderboard](https://huggingface.co/spaces/echo840/ocrbench-leaderboard)**|**[Opencompass Leaderboard](https://rank.opencompass.org.cn/leaderboard-multimodal)**|
+
+
+# Data
+| Data | Link | Description |
+| --- | --- | --- |
+| Full Test Json | [Full Test](./json_files/FullTest.json) | This file contains the test data used in Table 1 and Table 2 from [Paper](https://arxiv.org/abs/2305.07895). |
+| OCRBench Json | [OCRBench](./json_files/OCRBench.json) | This file contains the test data in OCRBench used in Table 3 from [Paper](https://arxiv.org/abs/2305.07895). |
+| All Test Images |[All Images](https://drive.google.com/file/d/1U5AtLoJ7FrJe9yfcbssfeLmlKb7dTosc/view?usp=drive_link) | This file contains all the testing images used in [Paper](https://arxiv.org/abs/2305.07895), including OCRBench Images.|
+| OCRBench Images | [OCRBench Images](https://drive.google.com/file/d/1a3VRJx3V3SdOmPr7499Ky0Ug8AwqGUHO/view?usp=drive_link) | This file only contains the images used in OCRBench. |
+| Test Results | [Test Results](https://drive.google.com/drive/folders/15XlHCuNTavI1Ihqm4G7u3J34BHpkaqyE?usp=drive_link) | This file contains the result files for the tested models. |
+
+
+# OCRBench
+
+OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, Scene Text-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1,000 question-answer pairs, and all answers undergo manual verification and correction to ensure a precise evaluation.
+
+You can find the results of Large Multimodal Models on the **[OCRBench Leaderboard](https://huggingface.co/spaces/echo840/ocrbench-leaderboard)**. If you would like to include your model in the leaderboard, please follow the evaluation instructions below and contact us via email at zhangli123@hust.edu.cn. We will update the leaderboard in a timely manner.
+
+
+
+# Evaluation
+The code used to evaluate the models in the paper can be found in [scripts](./scripts). Before running an evaluation, you need to set up the model weights and environment following the official code link provided in each script. If you want to evaluate other models, please fill in the "TODO" sections in [example](./example.py).
+
+You can also use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) and [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) for evaluation.
+
+Example evaluation scripts:
+```bash
+python ./scripts/monkey.py --image_folder ./OCRBench_Images --OCRBench_file ./json_files/OCRBench.json --save_name Monkey_OCRBench --num_workers GPU_Nums # Test on OCRBench
+python ./scripts/monkey.py --image_folder ./OCRBench_Images --OCRBench_file ./json_files/FullTest.json --save_name Monkey_FullTest --num_workers GPU_Nums # Full Test
+```
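+
+If you adapt [example](./example.py) to a new model, the sketch below shows the kind of loop the evaluation expects. It is only a minimal sketch: `load_model` and `generate` are placeholders for your own model code, and the field names (`image_path`, `question`, `predict`) are assumptions that mirror the OCRBench v2 example in this repository, so double-check them against the actual JSON files.
+
+```python
+import json
+import argparse
+
+def load_model():
+    raise NotImplementedError("TODO: load your model and processor here")
+
+def generate(model, image_path, question):
+    raise NotImplementedError("TODO: run inference and return the answer string")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--image_folder", type=str, default="./OCRBench_Images")
+    parser.add_argument("--OCRBench_file", type=str, default="./json_files/OCRBench.json")
+    parser.add_argument("--save_name", type=str, default="my_model_OCRBench")
+    args = parser.parse_args()
+
+    with open(args.OCRBench_file, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    model = load_model()
+    for item in data:
+        # Run the model on each question and store the prediction alongside the annotation.
+        image_path = f"{args.image_folder}/{item['image_path']}"
+        item["predict"] = generate(model, image_path, item["question"])
+
+    with open(f"./{args.save_name}.json", "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+```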
+
+# Citation
+If you wish to refer to the baseline results published here, please use the following BibTeX entries:
+```BibTeX
+@article{Liu_2024,
+ title={OCRBench: on the hidden mystery of OCR in large multimodal models},
+ volume={67},
+ ISSN={1869-1919},
+ url={http://dx.doi.org/10.1007/s11432-024-4235-6},
+ DOI={10.1007/s11432-024-4235-6},
+ number={12},
+ journal={Science China Information Sciences},
+ publisher={Springer Science and Business Media LLC},
+ author={Liu, Yuliang and Li, Zhang and Huang, Mingxin and Yang, Biao and Yu, Wenwen and Li, Chunyuan and Yin, Xu-Cheng and Liu, Cheng-Lin and Jin, Lianwen and Bai, Xiang},
+ year={2024},
+ month=dec }
+```
+
+
+
diff --git a/example.py b/OCRBench/example.py
similarity index 100%
rename from example.py
rename to OCRBench/example.py
diff --git a/images/GPT4V_Gemini.png b/OCRBench/images/GPT4V_Gemini.png
similarity index 100%
rename from images/GPT4V_Gemini.png
rename to OCRBench/images/GPT4V_Gemini.png
diff --git a/images/all_data.png b/OCRBench/images/all_data.png
similarity index 100%
rename from images/all_data.png
rename to OCRBench/images/all_data.png
diff --git a/OCRBench/FullTest.json b/OCRBench/json_files/FullTest.json
similarity index 100%
rename from OCRBench/FullTest.json
rename to OCRBench/json_files/FullTest.json
diff --git a/OCRBench/OCRBench.json b/OCRBench/json_files/OCRBench.json
similarity index 100%
rename from OCRBench/OCRBench.json
rename to OCRBench/json_files/OCRBench.json
diff --git a/scripts/GPT4V.py b/OCRBench/scripts/GPT4V.py
similarity index 100%
rename from scripts/GPT4V.py
rename to OCRBench/scripts/GPT4V.py
diff --git a/scripts/Genimi.py b/OCRBench/scripts/Genimi.py
similarity index 100%
rename from scripts/Genimi.py
rename to OCRBench/scripts/Genimi.py
diff --git a/scripts/LLaVA1_5.py b/OCRBench/scripts/LLaVA1_5.py
similarity index 100%
rename from scripts/LLaVA1_5.py
rename to OCRBench/scripts/LLaVA1_5.py
diff --git a/scripts/MiniMonkey.py b/OCRBench/scripts/MiniMonkey.py
similarity index 100%
rename from scripts/MiniMonkey.py
rename to OCRBench/scripts/MiniMonkey.py
diff --git a/scripts/blip2.py b/OCRBench/scripts/blip2.py
similarity index 100%
rename from scripts/blip2.py
rename to OCRBench/scripts/blip2.py
diff --git a/scripts/blip2_vicuna_instruct.py b/OCRBench/scripts/blip2_vicuna_instruct.py
similarity index 100%
rename from scripts/blip2_vicuna_instruct.py
rename to OCRBench/scripts/blip2_vicuna_instruct.py
diff --git a/scripts/bliva.py b/OCRBench/scripts/bliva.py
similarity index 100%
rename from scripts/bliva.py
rename to OCRBench/scripts/bliva.py
diff --git a/scripts/interlm.py b/OCRBench/scripts/interlm.py
similarity index 100%
rename from scripts/interlm.py
rename to OCRBench/scripts/interlm.py
diff --git a/scripts/interlm2.py b/OCRBench/scripts/interlm2.py
similarity index 100%
rename from scripts/interlm2.py
rename to OCRBench/scripts/interlm2.py
diff --git a/scripts/internvl2_s b/OCRBench/scripts/internvl2_s
similarity index 100%
rename from scripts/internvl2_s
rename to OCRBench/scripts/internvl2_s
diff --git a/scripts/intervl.py b/OCRBench/scripts/intervl.py
similarity index 100%
rename from scripts/intervl.py
rename to OCRBench/scripts/intervl.py
diff --git a/scripts/llavar.py b/OCRBench/scripts/llavar.py
similarity index 100%
rename from scripts/llavar.py
rename to OCRBench/scripts/llavar.py
diff --git a/scripts/mPLUG-DocOwl15.py b/OCRBench/scripts/mPLUG-DocOwl15.py
similarity index 100%
rename from scripts/mPLUG-DocOwl15.py
rename to OCRBench/scripts/mPLUG-DocOwl15.py
diff --git a/scripts/mPLUG-owl.py b/OCRBench/scripts/mPLUG-owl.py
similarity index 100%
rename from scripts/mPLUG-owl.py
rename to OCRBench/scripts/mPLUG-owl.py
diff --git a/scripts/mPLUG-owl2.py b/OCRBench/scripts/mPLUG-owl2.py
similarity index 100%
rename from scripts/mPLUG-owl2.py
rename to OCRBench/scripts/mPLUG-owl2.py
diff --git a/scripts/minigpt4v2.py b/OCRBench/scripts/minigpt4v2.py
similarity index 100%
rename from scripts/minigpt4v2.py
rename to OCRBench/scripts/minigpt4v2.py
diff --git a/scripts/monkey.py b/OCRBench/scripts/monkey.py
similarity index 100%
rename from scripts/monkey.py
rename to OCRBench/scripts/monkey.py
diff --git a/scripts/qwenvl.py b/OCRBench/scripts/qwenvl.py
similarity index 100%
rename from scripts/qwenvl.py
rename to OCRBench/scripts/qwenvl.py
diff --git a/scripts/qwenvl_api.py b/OCRBench/scripts/qwenvl_api.py
similarity index 100%
rename from scripts/qwenvl_api.py
rename to OCRBench/scripts/qwenvl_api.py
diff --git a/OCRBench_v2/README.md b/OCRBench_v2/README.md
new file mode 100644
index 0000000..f4bd189
--- /dev/null
+++ b/OCRBench_v2/README.md
@@ -0,0 +1,84 @@
+# OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning
+
+> Scoring the Optical Character Recognition (OCR) capabilities of Large Multimodal Models (LMMs) has witnessed growing interest recently. Existing benchmarks have highlighted the impressive performance of LMMs in text recognition; however, their abilities in certain challenging tasks, such as text localization, handwritten content extraction, and logical reasoning, remain underexplored. To bridge this gap, we introduce OCRBench v2, a large-scale bilingual text-centric benchmark with currently the most comprehensive set of tasks (4X more tasks than the previous multi-scene benchmark OCRBench), the widest coverage of scenarios (31 diverse scenarios including street scene, receipt, formula, diagram, and so on), and thorough evaluation metrics, with a total of 10,000 human-verified question-answering pairs and a high proportion of difficult samples. After carefully benchmarking state-of-the-art LMMs on OCRBench v2, we find that 36 out of 38 LMMs score below 50 (100 in total) and suffer from five types of limitations, including less frequently encountered text recognition, fine-grained perception, layout perception, complex element parsing, and logical reasoning.
+
+**[Project Page](https://github.com/Yuliang-Liu/MultimodalOCR)** | **Paper(Coming soon)** | **[OCRBench Leaderboard](https://huggingface.co/spaces/ling99/OCRBench-v2-leaderboard)**
+
+
+
+
+
+# Data
+You can download OCRBench v2 from [Google Drive](https://drive.google.com/file/d/1Hk1TMu--7nr5vJ7iaNwMQZ_Iw9W_KI3C/view?usp=sharing).
+After downloading and extracting the dataset, the directory structure is as follows:
+```
+OCRBench_v2/
+├── EN_part/
+├── CN_part/
+├── OCRBench_v2.json
+```
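+
+As a quick sanity check after extraction, you can confirm that the annotation file loads and count the samples per task. The sketch below assumes the annotation file is a list of entries with the `type` field shown in the Inference section; adjust the path and keys if your copy differs.
+
+```python
+import json
+from collections import Counter
+
+# Path follows the directory layout shown above.
+with open("OCRBench_v2/OCRBench_v2.json", "r", encoding="utf-8") as f:
+    samples = json.load(f)
+
+print(f"Total samples: {len(samples)}")
+# Show the ten most frequent task types as a quick overview.
+print(Counter(sample.get("type", "unknown") for sample in samples).most_common(10))
+```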
+# Evaluation
+
+## Environment
+All Python dependencies required for the evaluation are listed in **requirements.txt**.
+To set up the environment, simply run the following commands in the project directory:
+```bash
+conda create -n ocrbench_v2 python==3.10 -y
+conda activate ocrbench_v2
+pip install -r requirements.txt
+```
+
+## Inference
+To evaluate a model on OCRBench v2, please save the model's inference result for each sample in the `predict` field of the corresponding entry in the JSON file.
+
+Example structure of the JSON file:
+
+```json
+[
+    {
+        "dataset_name": "xx",
+        "type": "xx",
+        "id": 0,
+        "image_path": "xx",
+        "question": "xx",
+        "answers": [
+            "xx"
+        ],
+        "predict": "xx"
+    },
+    ...
+]
+```
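+
+A minimal sketch of how the `predict` field might be filled in is shown below. It assumes the annotation file is a list of such entries, and `run_model` is a placeholder for your own inference call; the output file name is only an example.
+
+```python
+import json
+
+def run_model(image_path, question):
+    # Placeholder: call your model here and return the answer string.
+    raise NotImplementedError
+
+with open("OCRBench_v2.json", "r", encoding="utf-8") as f:
+    samples = json.load(f)
+
+for sample in samples:
+    # Store the model output in the field the evaluation scripts expect.
+    sample["predict"] = run_model(sample["image_path"], sample["question"])
+
+with open("./pred_folder/my_model.json", "w", encoding="utf-8") as f:
+    json.dump(samples, f, ensure_ascii=False, indent=4)
+```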
+
+## Evaluation Scripts
+After obtaining the inference results of a model, you can use the scripts below to obtain the final OCRBench v2 score. `./pred_folder/internvl2_5_26b.json` is an example inference result of InternVL2.5-26B produced with [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). You can use `./eval_scripts/eval.py` to get the score for each sample; the results are saved under `./res_folder`.
+
+```bash
+python ./eval_scripts/eval.py --input_path ./pred_folder/internvl2_5_26b.json --output_path ./res_folder/internvl2_5_26b.json
+```
+
+After obtaining the scores for all samples, you can use `./eval_scripts/get_score.py` to get the metrics for OCRBench v2.
+
+```bash
+python ./eval_scripts/get_score.py --json_file ./res_folder/internvl2_5_26b.json
+```
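+
+If you want to inspect the per-sample results before aggregating them, a small snippet along these lines can be used. It assumes each entry written by `eval.py` carries a per-sample `score` field; please check the actual keys in your `./res_folder` file if they differ.
+
+```python
+import json
+from collections import defaultdict
+
+with open("./res_folder/internvl2_5_26b.json", "r", encoding="utf-8") as f:
+    results = json.load(f)
+
+# Group the (assumed) per-sample scores by task type for a quick overview.
+scores_by_type = defaultdict(list)
+for item in results:
+    if "score" in item:
+        scores_by_type[item.get("type", "unknown")].append(item["score"])
+
+for task_type, scores in sorted(scores_by_type.items()):
+    print(f"{task_type}: {sum(scores) / len(scores):.3f} ({len(scores)} samples)")
+```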
+
+# Leaderboard
+
+## Performance of LMMs on English subsets
+
+
+
+
+
+## Performance of LMMs on Chinese subsets
+
+
+
+
+
+# Copyright Statement
+The data are collected from public datasets and community user contributions. This dataset is for research purposes only and not for commercial use. If there are any copyright concerns, please contact ling_fu@hust.edu.cn.
+
+# Citation
+Coming soon
diff --git a/OCRBench_v2/eval_scripts/IoUscore_metric.py b/OCRBench_v2/eval_scripts/IoUscore_metric.py
new file mode 100644
index 0000000..6af265e
--- /dev/null
+++ b/OCRBench_v2/eval_scripts/IoUscore_metric.py
@@ -0,0 +1,91 @@
+import os
+import re
+import ast
+import ipdb
+from vqa_metric import vqa_evaluation
+
+
+def calculate_iou(box1, box2):
+
+    try:
+        box1 = [int(coordinate) for coordinate in box1]
+        box2 = [int(coordinate) for coordinate in box2]
+    except:
+        return 0
+
+    x1_inter = max(box1[0], box2[0])
+    y1_inter = max(box1[1], box2[1])
+    x2_inter = min(box1[2], box2[2])
+    y2_inter = min(box1[3], box2[3])
+
+    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    union_area = box1_area + box2_area - inter_area
+
+    iou = inter_area / union_area if union_area != 0 else 0
+
+    return iou
+
+
+def vqa_with_position_evaluation(predict, img_metas):
+
+    score_content, score_bbox = .0, .0
+    if "answer" in predict.keys():
+        score_content = vqa_evaluation(predict["answer"], img_metas["answers"])
+    if "bbox" in predict.keys():
+        gt_bbox = img_metas["bbox"]
+        try:
+            predict_bbox_list = ast.literal_eval(predict["bbox"])
+            score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+        except:
+            score_bbox = 0
+    return 0.5 * score_content + 0.5 * score_bbox
+
+
+def extract_coordinates(text):
+    # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+    pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+    matches = list(re.finditer(pattern, text))
+    coords_list = []
+    coords_set = set()
+    for match in matches:
+
+        x1, y1, x2, y2 = map(int, match.groups())
+
+        if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+            coords = (x1, y1, x2, y2)
+
+            if coords in coords_set:
+                coords_list = [c for c in coords_list if c != coords]
+
+            coords_list.append(coords)
+            coords_set.add(coords)
+    if coords_list:
+        last_coords = coords_list[-1]
+        return list(last_coords)
+    else:
+        return None
+
+
+if __name__ == "__main__":
+
+    print("Example for Text Grounding task.")
+    box1 = [50, 50, 150, 150]
+    box2 = [60, 60, 140, 140]
+    iou_score = calculate_iou(box1, box2)
+    print(f"IoU score: {iou_score}")
+
+    print("Example for VQA with position task.")
+    pred = {"content": "The content is Hello Buddies", "bbox": box1}
+    gt = {"content": "Hello Buddies", "bbox": box2}
+
+    vqa_score = vqa_evaluation(pred["content"], gt["content"])
+    iou_score = calculate_iou(pred["bbox"], gt["bbox"])
+
+    print(f"VQA score: {vqa_score}")
+    print(f"IoU score: {iou_score}")
diff --git a/OCRBench_v2/eval_scripts/TEDS_metric.py b/OCRBench_v2/eval_scripts/TEDS_metric.py
new file mode 100644
index 0000000..7e7ccf6
--- /dev/null
+++ b/OCRBench_v2/eval_scripts/TEDS_metric.py
@@ -0,0 +1,931 @@
+# Copyright 2020 IBM
+# Author: peter.zhong@au1.ibm.com
+#
+# This is free software; you can redistribute it and/or modify
+# it under the terms of the Apache 2.0 License.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Apache 2.0 License for more details.
+
+import re
+import ast
+import json
+import ipdb
+import distance
+from apted import APTED, Config
+from itertools import product
+from apted.helpers import Tree
+from lxml import etree, html
+from collections import deque
+from parallel import parallel_process
+from tqdm import tqdm
+from zss import simple_distance, Node
+import string
+from typing import Any, Callable, Optional, Sequence
+import numpy as np
+import Levenshtein
+import editdistance
+
+
+class TableTree(Tree):
+    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
+        self.tag = tag
+        self.colspan = colspan
+        self.rowspan = rowspan
+        self.content = content
+        self.children = list(children)
+
+    def bracket(self):
+        """Show tree using brackets notation"""
+        if self.tag == 'td':
+            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
+                     (self.tag, self.colspan, self.rowspan, self.content)
+        else:
+            result = '"tag": %s' % self.tag
+        for child in self.children:
+            result += child.bracket()
+        return "{{{}}}".format(result)
+
+
+class CustomConfig(Config):
+    @staticmethod
+    def maximum(*sequences):
+        """Get maximum possible value
+        """
+        return max(map(len, sequences))
+
+    def normalized_distance(self, *sequences):
+        """Get distance from 0 to 1
+        """
+        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
+
+    def rename(self, node1, node2):
+        """Compares attributes of trees"""
+        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
+            return 1.
+        if node1.tag == 'td':
+            if node1.content or node2.content:
+                return self.normalized_distance(node1.content, node2.content)
+        return 0.
+
+
+class TEDS(object):
+    ''' Tree Edit Distance basead Similarity
+    '''
+    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
+        assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1'
+        self.structure_only = structure_only
+        self.n_jobs = n_jobs
+        self.ignore_nodes = ignore_nodes
+        self.__tokens__ = []
+
+    def tokenize(self, node):
+        ''' Tokenizes table cells
+        '''
+        self.__tokens__.append('<%s>' % node.tag)
+        if node.text is not None:
+            self.__tokens__ += list(node.text)
+        for n in node.getchildren():
+            self.tokenize(n)
+        if node.tag != 'unk':
+            self.__tokens__.append('</%s>' % node.tag)
+        if node.tag != 'td' and node.tail is not None:
+            self.__tokens__ += list(node.tail)
+
+    def load_html_tree(self, node, parent=None):
+        ''' Converts HTML tree to the format required by apted
+        '''
+        global __tokens__
+        if node.tag == 'td':
+            if self.structure_only:
+                cell = []
+            else:
+                self.__tokens__ = []
+                self.tokenize(node)
+                cell = self.__tokens__[1:-1].copy()
+            new_node = TableTree(node.tag,
+                                 int(node.attrib.get('colspan', '1')),
+                                 int(node.attrib.get('rowspan', '1')),
+                                 cell, *deque())
+        else:
+            new_node = TableTree(node.tag, None, None, None, *deque())
+        if parent is not None:
+            parent.children.append(new_node)
+        if node.tag != 'td':
+            for n in node.getchildren():
+                self.load_html_tree(n, new_node)
+        if parent is None:
+            return new_node
+
+    def evaluate(self, pred, true):
+        ''' Computes TEDS score between the prediction and the ground truth of a
+            given sample
+        '''
+        if (not pred) or (not true):
+            return 0.0
+        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
+        pred = html.fromstring(pred, parser=parser)
+        true = html.fromstring(true, parser=parser)
+        #print("pred:",pred)
+        #print("true:",true)
+        if pred.xpath('body/table') and true.xpath('body/table'):
+            pred = pred.xpath('body/table')[0]
+            true = true.xpath('body/table')[0]
+            if self.ignore_nodes:
+                etree.strip_tags(pred, *self.ignore_nodes)
+                etree.strip_tags(true, *self.ignore_nodes)
+            n_nodes_pred = len(pred.xpath(".//*"))
+            n_nodes_true = len(true.xpath(".//*"))
+            n_nodes = max(n_nodes_pred, n_nodes_true)
+            tree_pred = self.load_html_tree(pred)
+            tree_true = self.load_html_tree(true)
+            distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
+            return 1.0 - (float(distance) / n_nodes)
+        else:
+            return 0.0
+
+    def batch_evaluate(self, pred_json, true_json):
+        ''' Computes TEDS score between the prediction and the ground truth of
+            a batch of samples
+            @params pred_json: {'FILENAME': 'HTML CODE', ...}
+            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
+            @output: {'FILENAME': 'TEDS SCORE', ...}
+        '''
+        samples = true_json.keys()
+        if self.n_jobs == 1:
+            scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
+        else:
+            #inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
+            inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]} for filename in samples]
+            scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
+        scores = dict(zip(samples, scores))
+        return scores
+
+
+def convert_table_to_html_str(table_row_list=[]):
+    """
+    Given a list of table rows, build the corresponding html string, which is used to compute the TEDS score.
+    We use the official code of PubTabNet to compute TEDS score, it does not consider '