update PaddleOCR result & docTR result

2025-08-09 22:29:33 +07:00
parent f63589a10a
commit 028e3237bb
15 changed files with 12838 additions and 0 deletions
--- a/src/model/text_detector/docTR/.gitignore
+++ b/src/model/text_detector/docTR/.gitignore
@@ -0,0 +1,32 @@
+# Re-include all image and JSON files in this folder and its subfolders (negation rules override broader ignore patterns)
+
+# PNG
+!*.png
+!**/*.png
+
+# JPG/JPEG
+!*.jpg
+!**/*.jpg
+!*.jpeg
+!**/*.jpeg
+
+# BMP/GIF/TIFF/WEBP
+!*.bmp
+!**/*.bmp
+!*.gif
+!**/*.gif
+!*.tif
+!**/*.tif
+!*.tiff
+!**/*.tiff
+!*.webp
+!**/*.webp
+
+# JSON
+!*.json
+!**/*.json
+
+# Ensure this file itself is tracked
+!.gitignore
+
+
--- a/src/model/text_detector/docTR/CNI.png
+++ b/src/model/text_detector/docTR/CNI.png
--- a/src/model/text_detector/docTR/CNI_doctr.json
+++ b/src/model/text_detector/docTR/CNI_doctr.json
--- a/src/model/text_detector/docTR/CNI_doctr_vis.png
+++ b/src/model/text_detector/docTR/CNI_doctr_vis.png
--- a/src/model/text_detector/docTR/form_doctr.json
+++ b/src/model/text_detector/docTR/form_doctr.json
--- a/src/model/text_detector/docTR/form_doctr_vis.png
+++ b/src/model/text_detector/docTR/form_doctr_vis.png
--- a/src/model/text_detector/docTR/run.py
+++ b/src/model/text_detector/docTR/run.py
@@ -0,0 +1,95 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import matplotlib
+
+# Use non-interactive backend for headless execution/environments
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+import cv2
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from doctr.utils.visualization import visualize_page
+
+
+def run_doctr(
+    image_path: str,
+    output_dir: str = "docTR_outputs",
+    det_arch: str = "db_resnet50",
+    reco_arch: str = "crnn_vgg16_bn",
+) -> Path:
+    image_path = Path(image_path)
+    if not image_path.is_file():
+        raise FileNotFoundError(f"Image not found: {image_path}")
+
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+
+    # Load image for visualization
+    bgr = cv2.imread(str(image_path))
+    if bgr is None:
+        raise RuntimeError(f"Failed to read image with OpenCV: {image_path}")
+    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+
+    # Build predictor
+    predictor = ocr_predictor(det_arch=det_arch, reco_arch=reco_arch, pretrained=True)
+
+    # Inference
+    doc = DocumentFile.from_images(str(image_path))
+    result = predictor(doc)
+
+    # Export structured result to JSON
+    export = result.export()
+    json_path = output_dir_path / f"{image_path.stem}_doctr.json"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(export, f, ensure_ascii=False, indent=2)
+
+    # Visualization
+    page = result.pages[0]
+    page_dict = export["pages"][0]
+    fig = visualize_page(page_dict, image=image)
+    vis_path = output_dir_path / f"{image_path.stem}_doctr_vis.png"
+    fig.savefig(vis_path, dpi=200, bbox_inches="tight")
+    plt.close(fig)
+
+    # Write a simple text file with detected lines (if any)
+    lines = []
+    for block in page.blocks:
+        for line in block.lines:
+            text = " ".join([word.value for word in line.words])
+            if text:
+                lines.append(text)
+    if lines:
+        txt_path = output_dir_path / f"{image_path.stem}_doctr.txt"
+        txt_path.write_text("\n".join(lines), encoding="utf-8")
+
+    return vis_path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run docTR OCR on an image")
+    parser.add_argument("--image", required=True, help="Path to input image")
+    parser.add_argument(
+        "--output-dir",
+        default="docTR_outputs",
+        help="Directory to store outputs (JSON/visualization)",
+    )
+    parser.add_argument("--det-arch", default="db_resnet50", help="Detection architecture")
+    parser.add_argument("--reco-arch", default="crnn_vgg16_bn", help="Recognition architecture")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    vis_path = run_doctr(
+        image_path=args.image,
+        output_dir=args.output_dir,
+        det_arch=args.det_arch,
+        reco_arch=args.reco_arch,
+    )
+    print(f"Saved visualization to: {vis_path}")
+
+