update PaddleOCR result & docTR result

Nguyễn Phước Thành
2025-08-09 22:29:33 +07:00
parent f63589a10a
commit 028e3237bb
15 changed files with 12838 additions and 0 deletions


@@ -0,0 +1,32 @@
# Re-include all image files and JSON within this folder (and subfolders)
# PNG
!*.png
!**/*.png
# JPG/JPEG
!*.jpg
!**/*.jpg
!*.jpeg
!**/*.jpeg
# BMP/GIF/TIFF/WEBP
!*.bmp
!**/*.bmp
!*.gif
!**/*.gif
!*.tif
!**/*.tif
!*.tiff
!**/*.tiff
!*.webp
!**/*.webp
# JSON
!*.json
!**/*.json
# Ensure this file itself is tracked
!.gitignore

Binary image file not shown (added, 1.9 MiB).

File diff suppressed because it is too large.

Binary image file not shown (added, 1.6 MiB).

Binary image file not shown (added, 5.6 MiB).


@@ -0,0 +1,643 @@
import argparse
import json
import os
import sys
from typing import List, Tuple, Any, Optional, Dict

from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageOps
import numpy as np
import cv2
from paddleocr import PaddleOCR

try:  # Some paddleocr builds do not expose draw_ocr at top-level
    from paddleocr import draw_ocr as paddle_draw_ocr  # type: ignore
except Exception:  # pragma: no cover - environment dependent
    paddle_draw_ocr = None


def resolve_font_path() -> str:
    """Return a system font path that supports Latin accents (Windows-friendly)."""
    candidate_fonts: List[str] = []
    # Prefer Arial Unicode or Arial on Windows
    windir = os.environ.get("WINDIR", r"C:\\Windows")
    candidate_fonts.extend(
        [
            os.path.join(windir, "Fonts", "arialuni.ttf"),
            os.path.join(windir, "Fonts", "arial.ttf"),
            os.path.join(windir, "Fonts", "tahoma.ttf"),
            os.path.join(windir, "Fonts", "seguiemj.ttf"),
        ]
    )
    # Common fallbacks on other platforms
    candidate_fonts.extend(
        [
            "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",  # macOS
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # Linux
        ]
    )
    for font in candidate_fonts:
        if os.path.exists(font):
            return font
    return ""  # paddleocr will still draw, but non-ASCII may not render correctly
def build_argparser() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Simple PaddleOCR runner for single image"
    )
    parser.add_argument(
        "--image",
        default="im1.png",
        help="Path to input image (default: im1.png)",
    )
    parser.add_argument(
        "--lang",
        default="fr",
        help="Language code for PaddleOCR (try: fr, en, vi, etc.). Default: fr",
    )
    parser.add_argument(
        "--out-json",
        default="ocr_result.json",
        help="Path to save raw OCR results as JSON",
    )
    parser.add_argument(
        "--out-image",
        default="ocr_vis.png",
        help="Path to save visualization image with boxes and text",
    )
    parser.add_argument(
        "--poly-source",
        choices=["dt", "rec", "auto"],
        default="auto",
        help="Which polygons to visualize: detection(dt), recognition(rec) or auto (prefer dt)",
    )
    parser.add_argument(
        "--box-color",
        default="#FF0000",
        help="Outline color for polygons (hex like #RRGGBB or 'R,G,B').",
    )
    parser.add_argument(
        "--fill-color",
        default="#FF000033",
        help="Fill color with alpha for polygons (e.g., #FF000033).",
    )
    parser.add_argument(
        "--box-width",
        type=int,
        default=3,
        help="Outline width for polygons.",
    )
    parser.add_argument(
        "--scale-ratio",
        type=float,
        default=None,
        help="Scale polygons around centroid (e.g., 0.96 to shrink, 1.05 to expand).",
    )
    parser.add_argument(
        "--shrink-ratio",
        type=float,
        default=None,
        help="Deprecated: use --scale-ratio. If given, will be used when --scale-ratio is not set.",
    )
    parser.add_argument(
        "--offset-x",
        type=int,
        default=0,
        help="Shift polygons horizontally (pixels). Positive moves right.",
    )
    parser.add_argument(
        "--offset-y",
        type=int,
        default=0,
        help="Shift polygons vertically (pixels). Positive moves down.",
    )
    parser.add_argument(
        "--scale-x",
        type=float,
        default=None,
        help="Non-uniform scale along X around centroid (e.g., 1.1). Overrides --scale-ratio on X if set.",
    )
    parser.add_argument(
        "--scale-y",
        type=float,
        default=None,
        help="Non-uniform scale along Y around centroid (e.g., 1.05). Overrides --scale-ratio on Y if set.",
    )
    parser.add_argument(
        "--pad-x",
        type=int,
        default=0,
        help="Pixels to pad left/right after scaling.",
    )
    parser.add_argument(
        "--pad-y",
        type=int,
        default=0,
        help="Pixels to pad top/bottom after scaling.",
    )
    parser.add_argument(
        "--rect",
        choices=["poly", "axis", "rotated"],
        default="poly",
        help="Visualization mode: polygon, axis-aligned rectangle, or rotated min-area rectangle.",
    )
    parser.add_argument(
        "--enhance",
        action="store_true",
        help="Apply light image enhancement (contrast/sharpen) before OCR.",
    )
    parser.add_argument(
        "--rec-model",
        default="",
        help=(
            "Recognition model name to force (e.g. latin_PP-OCRv5_server_rec, "
            "latin_PP-OCRv5_mobile_rec, en_PP-OCRv4_rec, en_PP-OCRv3_rec)."
        ),
    )
    parser.add_argument(
        "--det-model",
        default="",
        help=(
            "Detection model name to force (e.g. PP-OCRv5_server_det, "
            "PP-OCRv5_mobile_det)."
        ),
    )
    parser.add_argument(
        "--ocr-version",
        default="",
        help="Optional OCR version hint (e.g., PP-OCRv5).",
    )
    parser.add_argument(
        "--rec-score-thresh",
        type=float,
        default=0.0,
        help="Minimum recognition score to keep a text (default 0.0 to keep all)",
    )
    parser.add_argument(
        "--det-db-thresh",
        type=float,
        default=None,
        help="DB detector binary threshold (e.g., 0.2).",
    )
    parser.add_argument(
        "--det-db-box-thresh",
        type=float,
        default=None,
        help="DB detector box threshold (e.g., 0.3-0.6). Lower to get more boxes.",
    )
    parser.add_argument(
        "--det-db-unclip",
        type=float,
        default=None,
        help="DB detector unclip ratio (e.g., 1.5).",
    )
    parser.add_argument(
        "--det-limit-side",
        type=int,
        default=None,
        help="text_det_limit_side_len. Smaller may speed up and merge boxes.",
    )
    return parser.parse_args()
def initialize_ocr(preferred_lang: str, args: argparse.Namespace) -> Tuple[PaddleOCR, str]:
    """Initialize PaddleOCR, trying the preferred language first then fallbacks."""
    tried_errors: List[str] = []
    candidate_langs: List[str] = []
    # Keep order but deduplicate
    for code in [preferred_lang, "fr", "en"]:
        if code not in candidate_langs:
            candidate_langs.append(code)
    for code in candidate_langs:
        # First, try with requested model overrides (if any)
        try:
            init_kwargs: Dict[str, Any] = {
                "lang": code,
                "use_textline_orientation": True,
            }
            if args.rec_model:
                init_kwargs["text_recognition_model_name"] = args.rec_model
            if args.det_model:
                init_kwargs["text_detection_model_name"] = args.det_model
            if args.ocr_version:
                init_kwargs["ocr_version"] = args.ocr_version
            # thresholds & limits (kwargs accepted by PaddleOCR >= 3.x)
            if args.rec_score_thresh is not None:
                init_kwargs["text_rec_score_thresh"] = float(args.rec_score_thresh)
            if args.det_db_thresh is not None:
                init_kwargs["text_det_db_thresh"] = float(args.det_db_thresh)
            if args.det_db_box_thresh is not None:
                init_kwargs["text_det_db_box_thresh"] = float(args.det_db_box_thresh)
            # Some builds may not support unclip ratio as kwarg; skip if unsupported
            if args.det_limit_side is not None:
                init_kwargs["text_det_limit_side_len"] = int(args.det_limit_side)
            ocr = PaddleOCR(**init_kwargs)
            return ocr, code
        except Exception as exc_with_models:  # pragma: no cover
            tried_errors.append(f"{code} (with models): {exc_with_models}")
        # Fallback: try default models for this language
        try:
            base_kwargs: Dict[str, Any] = {
                "lang": code,
                "use_textline_orientation": True,
            }
            if args.rec_score_thresh is not None:
                base_kwargs["text_rec_score_thresh"] = float(args.rec_score_thresh)
            if args.det_limit_side is not None:
                base_kwargs["text_det_limit_side_len"] = int(args.det_limit_side)
            if args.det_db_thresh is not None:
                base_kwargs["text_det_db_thresh"] = float(args.det_db_thresh)
            if args.det_db_box_thresh is not None:
                base_kwargs["text_det_db_box_thresh"] = float(args.det_db_box_thresh)
            # Skip setting unclip if unsupported in this build
            ocr = PaddleOCR(**base_kwargs)
            return ocr, code
        except Exception as exc_default:
            tried_errors.append(f"{code} (default): {exc_default}")
    raise RuntimeError(
        "Failed to initialize PaddleOCR. Tried languages: "
        + ", ".join(candidate_langs)
        + "\nErrors: "
        + " | ".join(tried_errors)
    )
def main() -> None:
    args = build_argparser()
    if not os.path.exists(args.image):
        print(f"[ERROR] Image not found: {args.image}")
        sys.exit(1)
    try:
        ocr, used_lang = initialize_ocr(args.lang, args)
    except Exception as init_exc:
        print(f"[ERROR] {init_exc}")
        sys.exit(2)
    chosen_rec = args.rec_model if args.rec_model else "auto"
    chosen_det = args.det_model if args.det_model else "auto"
    print(
        f"[INFO] Running OCR on '{args.image}' with lang='{used_lang}', "
        f"rec='{chosen_rec}', det='{chosen_det}' ..."
    )

    # Optional light enhancement for Latin documents to make OCR tighter
    def maybe_enhance_image(path: str) -> str:
        if not args.enhance:
            return path
        try:
            img = Image.open(path).convert("RGB")
            # Slight auto-contrast and sharpening
            img = ImageOps.autocontrast(img, cutoff=1)
            img = ImageEnhance.Sharpness(img).enhance(1.4)
            img = ImageEnhance.Contrast(img).enhance(1.1)
            tmp_path = os.path.splitext(path)[0] + "_enh.png"
            img.save(tmp_path)
            return tmp_path
        except Exception:
            return path

    # Apply the optional enhancement before OCR so --enhance actually takes effect
    ocr_input = maybe_enhance_image(args.image)

    # Prefer new API for PaddleOCR >=3.x, fallback to legacy .ocr
    try:
        result: Any = ocr.predict(ocr_input)  # type: ignore[assignment]
    except Exception:
        result = ocr.ocr(ocr_input)  # type: ignore[assignment]
    # Prefer to draw detection polygons only (no text) for clarity
    def extract_polygons(res: Any) -> List[List[Tuple[int, int]]]:
        def to_tuple_list(poly_any: Any) -> List[Tuple[int, int]]:
            try:
                return [(int(p[0]), int(p[1])) for p in list(poly_any)]
            except Exception:
                return []

        # Common outputs in new pipelines
        if isinstance(res, dict):
            if args.poly_source in ("auto", "dt"):
                polys = res.get("dt_polys") or res.get("det_polygons")
            else:
                polys = None
            if polys is None and args.poly_source in ("auto", "rec"):
                polys = res.get("rec_polys")
            if polys is None:
                polys = res.get("polygons") or res.get("boxes")
            if polys is None:
                return []
            return [to_tuple_list(poly) for poly in list(polys)]
        if isinstance(res, list) and len(res) > 0:
            # Often result is a list with a single dict
            if isinstance(res[0], dict):
                return extract_polygons(res[0])
        return []

    # Normalize result across PaddleOCR versions
    def poly_to_list(poly_any: Any) -> List[List[int]]:
        try:
            # numpy array path
            if hasattr(poly_any, "tolist"):
                lst = poly_any.tolist()
                return [[int(p[0]), int(p[1])] for p in lst]
        except Exception:
            pass
        try:
            return [[int(p[0]), int(p[1])] for p in list(poly_any)]
        except Exception:
            return []
    def parse_lines(res: Any) -> List[Dict[str, Any]]:
        items: List[Dict[str, Any]] = []
        # Case 0: top-level dict output (some 3.x pipelines)
        if isinstance(res, dict):
            polys = res.get("det_polygons") or res.get("boxes") or res.get("polygons")
            texts = res.get("rec_texts") or res.get("rec_text") or res.get("texts") or []
            scores = res.get("rec_scores") or res.get("scores") or []
            if isinstance(texts, str):
                texts = [texts]
            if polys is None:
                polys = [None] * len(texts)
            if not isinstance(scores, list):
                try:
                    scores = list(scores)
                except Exception:
                    scores = [None] * len(texts)
            if len(scores) < len(texts):
                scores = list(scores) + [None] * (len(texts) - len(scores))
            for poly, text, score in zip(polys, texts, scores):
                try:
                    score_val = float(score) if score is not None else None
                except Exception:
                    score_val = None
                items.append({"text": str(text), "score": score_val, "box": poly_to_list(poly) if poly is not None else None})
            return items
        if isinstance(res, list) and len(res) > 0:
            # Special: list with a single dict that holds batched arrays (rec_texts, rec_scores, dt_polys, ...)
            if len(res) == 1 and isinstance(res[0], dict) and (
                "rec_texts" in res[0] or "texts" in res[0]
            ):
                obj = res[0]
                texts = obj.get("rec_texts") or obj.get("texts") or []
                scores = obj.get("rec_scores") or obj.get("scores") or []
                boxes = obj.get("rec_polys") or obj.get("dt_polys") or []
                # Normalize lengths
                n = min(len(texts), len(scores) if hasattr(scores, "__len__") else len(texts), len(boxes) if hasattr(boxes, "__len__") else len(texts))
                out: List[Dict[str, Any]] = []
                for i in range(n):
                    txt = texts[i]
                    try:
                        sc = float(scores[i])
                    except Exception:
                        sc = None
                    bx = boxes[i] if i < len(boxes) else None
                    out.append({"text": str(txt), "score": sc, "box": poly_to_list(bx) if bx is not None else None})
                return out
            # Case A: legacy format [[ [poly], (text, score) ], ...] wrapped by [ ... ]
            if isinstance(res[0], list) and len(res[0]) > 0 and isinstance(res[0][0], list):
                lines_local = res[0]
                for line in lines_local:
                    if not isinstance(line, (list, tuple)) or len(line) < 2:
                        continue
                    box = line[0]
                    text: str = ""
                    score: Optional[float] = None
                    payload = line[1]
                    if isinstance(payload, (list, tuple)) and len(payload) >= 1:
                        text = str(payload[0])
                        if len(payload) >= 2:
                            try:
                                score = float(payload[1])
                            except Exception:
                                score = None
                    elif isinstance(payload, str):
                        text = payload
                        if len(line) >= 3:
                            try:
                                score = float(line[2])
                            except Exception:
                                score = None
                    items.append({"text": text, "score": score, "box": box})
                return items
            # Case B: new format already a flat list of dicts or lists per detection
            # Try dict format first
            if isinstance(res[0], dict):
                for obj in res:
                    box = obj.get("box") or obj.get("poly") or obj.get("bbox") or obj.get("det_polygons")
                    text = obj.get("text") or obj.get("rec_text") or ""
                    score = obj.get("score") or obj.get("rec_score")
                    try:
                        score = float(score) if score is not None else None
                    except Exception:
                        score = None
                    items.append({"text": str(text), "score": score, "box": poly_to_list(box) if box is not None else None})
                return items
            # Case C: flat list of [poly, text, (maybe score)]
            if isinstance(res[0], (list, tuple)):
                for line in res:
                    if not isinstance(line, (list, tuple)) or len(line) < 2:
                        continue
                    box = line[0]
                    text = str(line[1])
                    score: Optional[float] = None
                    if len(line) >= 3:
                        try:
                            score = float(line[2])
                        except Exception:
                            score = None
                    items.append({"text": text, "score": score, "box": poly_to_list(box) if box is not None else None})
                return items
        return items
    parsed = parse_lines(result)
    det_polys = extract_polygons(result)

    # Additionally collect both dt and rec polygons for JSON output
    def extract_both(res: Any) -> Tuple[List[List[Tuple[int, int]]], List[List[Tuple[int, int]]]]:
        def to_tuple_list(poly_any: Any) -> List[Tuple[int, int]]:
            try:
                if hasattr(poly_any, "tolist"):
                    poly_any = poly_any.tolist()
                return [(int(p[0]), int(p[1])) for p in list(poly_any)]
            except Exception:
                return []

        if isinstance(res, dict):
            dt = res.get("dt_polys") or res.get("det_polygons") or []
            rc = res.get("rec_polys") or []
            return [to_tuple_list(p) for p in list(dt)], [to_tuple_list(p) for p in list(rc)]
        if isinstance(res, list) and len(res) > 0 and isinstance(res[0], dict):
            return extract_both(res[0])
        return [], []

    all_dt_polys, all_rec_polys = extract_both(result)

    # Print quick summary to console
    print("\n[TEXT]\n" + "\n".join([p["text"] for p in parsed]))

    # Save JSON
    with open(args.out_json, "w", encoding="utf-8") as f:
        json.dump(
            {
                "image": os.path.abspath(args.image),
                "language": used_lang,
                "num_items": len(parsed),
                "items": parsed,
                "poly_source": args.poly_source,
                "det_polygons": [[list(pt) for pt in poly] for poly in all_dt_polys],
                "rec_polygons": [[list(pt) for pt in poly] for poly in all_rec_polys],
                "enhance": bool(args.enhance),
                "recognition_model": chosen_rec,
                "detection_model": chosen_det,
                "ocr_version": args.ocr_version or "auto",
                "box_color": args.box_color,
                "fill_color": args.fill_color,
                "box_width": int(args.box_width),
                "scale_ratio": float(args.scale_ratio) if args.scale_ratio is not None else None,
                "offset_x": int(args.offset_x),
                "offset_y": int(args.offset_y),
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"[INFO] Saved JSON: {args.out_json}")

    # Also store raw result for debugging purposes
    try:
        with open("raw_result.txt", "w", encoding="utf-8") as rf:
            rf.write(repr(result))
    except Exception:
        pass
    # Draw and save visualization (only polygons)
    image = Image.open(args.image).convert("RGBA")
    canvas = image.copy()
    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)

    # Prefer detection polygons from pipeline; fallback to parsed boxes
    polygons: List[List[Tuple[int, int]]] = det_polys
    if not polygons:
        polygons = [
            [tuple(p) for p in (box or [])]  # type: ignore[misc]
            for box in [p.get("box") for p in parsed]
            if box
        ]

    def parse_color(color_str: str, default=(255, 0, 0, 255)) -> Tuple[int, int, int, int]:
        try:
            if color_str.startswith("#"):
                hexv = color_str.lstrip("#")
                if len(hexv) == 6:
                    r = int(hexv[0:2], 16)
                    g = int(hexv[2:4], 16)
                    b = int(hexv[4:6], 16)
                    return (r, g, b, 255)
                if len(hexv) == 8:
                    r = int(hexv[0:2], 16)
                    g = int(hexv[2:4], 16)
                    b = int(hexv[4:6], 16)
                    a = int(hexv[6:8], 16)
                    return (r, g, b, a)
            else:
                parts = [int(x) for x in color_str.split(",")]
                if len(parts) == 3:
                    return (parts[0], parts[1], parts[2], 255)
                if len(parts) == 4:
                    return (parts[0], parts[1], parts[2], parts[3])
        except Exception:
            pass
        return default

    outline_rgba = parse_color(args.box_color)
    fill_rgba = parse_color(args.fill_color, default=(255, 0, 0, 51))

    def transform_polygon(
        poly: List[Tuple[int, int]],
        scale: Optional[float],
        dx: int,
        dy: int,
        scale_x: Optional[float] = None,
        scale_y: Optional[float] = None,
        pad_x: int = 0,
        pad_y: int = 0,
    ) -> List[Tuple[int, int]]:
        if not poly:
            return poly
        cx = sum(p[0] for p in poly) / len(poly)
        cy = sum(p[1] for p in poly) / len(poly)
        out: List[Tuple[int, int]] = []
        for (x, y) in poly:
            sx = scale_x if scale_x is not None else scale
            sy = scale_y if scale_y is not None else scale
            if sx is not None:
                x = cx + sx * (x - cx)
            if sy is not None:
                y = cy + sy * (y - cy)
            out.append((int(round(x + dx)), int(round(y + dy))))
        # pad expands rect-like by moving points outwards along axes
        if pad_x or pad_y:
            out = [(x - pad_x if x < cx else x + pad_x, y - pad_y if y < cy else y + pad_y) for (x, y) in out]
        return out
    vis_polys: List[List[Tuple[int, int]]] = []

    def draw_axis_aligned(draw_obj, pts: List[Tuple[int, int]]):
        xs = [p[0] for p in pts]
        ys = [p[1] for p in pts]
        box = [(min(xs), min(ys)), (max(xs), min(ys)), (max(xs), max(ys)), (min(xs), max(ys))]
        draw_obj.polygon(box, outline=outline_rgba, fill=fill_rgba)
        draw_obj.line(box + [box[0]], fill=outline_rgba, width=args.box_width)

    def draw_rotated(draw_obj, pts: List[Tuple[int, int]]):
        cnt = np.array(pts, dtype=np.int32).reshape(-1, 1, 2)
        rect = cv2.minAreaRect(cnt)
        box = cv2.boxPoints(rect)
        box = box.astype(int)  # np.int0 was removed in recent NumPy; astype(int) is equivalent here
        poly = [(int(x), int(y)) for x, y in box.tolist()]
        draw_obj.polygon(poly, outline=outline_rgba, fill=fill_rgba)
        draw_obj.line(poly + [poly[0]], fill=outline_rgba, width=args.box_width)
    for poly in polygons:
        if len(poly) >= 3:
            # Backward-compat: if scale-ratio not provided, use shrink-ratio (<1.0)
            scale = args.scale_ratio if args.scale_ratio is not None else args.shrink_ratio
            sp = transform_polygon(
                poly,
                scale,
                args.offset_x,
                args.offset_y,
                args.scale_x,
                args.scale_y,
                args.pad_x,
                args.pad_y,
            )
            vis_polys.append(sp)
            if args.rect == "axis":
                draw_axis_aligned(draw, sp)
            elif args.rect == "rotated":
                draw_rotated(draw, sp)
            else:
                draw.polygon(sp, outline=outline_rgba, fill=fill_rgba)
                draw.line(sp + [sp[0]], fill=outline_rgba, width=args.box_width)

    out = Image.alpha_composite(canvas, overlay).convert("RGB")
    out.save(args.out_image)
    print(f"[INFO] Saved visualization: {args.out_image}")

    # Append the visualization polygons to JSON file for exact reproducibility
    try:
        with open(args.out_json, "r", encoding="utf-8") as fjson:
            data = json.load(fjson)
        data["vis_polygons"] = [[list(pt) for pt in poly] for poly in vis_polys]
        with open(args.out_json, "w", encoding="utf-8") as fjson:
            json.dump(data, fjson, ensure_ascii=False, indent=2)
    except Exception:
        pass


if __name__ == "__main__":
    main()
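
For reference, a minimal sketch of driving the same pipeline directly from Python, assuming PaddleOCR >= 3.x is installed. Only keyword arguments already used in initialize_ocr() above are shown; im1.png is just the script's default image name, not a required file.

# Minimal sketch (assumptions: PaddleOCR >= 3.x, input image present in the working directory).
from paddleocr import PaddleOCR

# Mirrors the base kwargs the script builds in initialize_ocr().
ocr = PaddleOCR(lang="fr", use_textline_orientation=True)

# Same call path the script tries first; ocr.ocr() is its legacy fallback.
result = ocr.predict("im1.png")
print(result)  # raw structure that the script's parse_lines() normalizes into items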

File diff suppressed because it is too large.

Binary image file not shown (added, 5.7 MiB).


@@ -0,0 +1,32 @@
# Re-include all image files and JSON within this folder (and subfolders)
# PNG
!*.png
!**/*.png
# JPG/JPEG
!*.jpg
!**/*.jpg
!*.jpeg
!**/*.jpeg
# BMP/GIF/TIFF/WEBP
!*.bmp
!**/*.bmp
!*.gif
!**/*.gif
!*.tif
!**/*.tif
!*.tiff
!**/*.tiff
!*.webp
!**/*.webp
# JSON
!*.json
!**/*.json
# Ensure this file itself is tracked
!.gitignore

Binary image file not shown (added, 1.9 MiB).

File diff suppressed because it is too large.

Binary image file not shown (added, 3.0 MiB).

File diff suppressed because it is too large.

Binary image file not shown (added, 1.6 MiB).


@@ -0,0 +1,95 @@
import argparse
import json
import os
from pathlib import Path

import matplotlib

# Use non-interactive backend for headless execution/environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.utils.visualization import visualize_page


def run_doctr(
    image_path: str,
    output_dir: str = "docTR_outputs",
    det_arch: str = "db_resnet50",
    reco_arch: str = "crnn_vgg16_bn",
) -> Path:
    image_path = Path(image_path)
    if not image_path.is_file():
        raise FileNotFoundError(f"Image not found: {image_path}")
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Load image for visualization
    bgr = cv2.imread(str(image_path))
    if bgr is None:
        raise RuntimeError(f"Failed to read image with OpenCV: {image_path}")
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Build predictor
    predictor = ocr_predictor(det_arch=det_arch, reco_arch=reco_arch, pretrained=True)

    # Inference
    doc = DocumentFile.from_images(str(image_path))
    result = predictor(doc)

    # Export structured result to JSON
    export = result.export()
    json_path = output_dir_path / f"{image_path.stem}_doctr.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(export, f, ensure_ascii=False, indent=2)

    # Visualization
    page = result.pages[0]
    page_dict = export["pages"][0]
    fig = visualize_page(page_dict, image=image)
    vis_path = output_dir_path / f"{image_path.stem}_doctr_vis.png"
    fig.savefig(vis_path, dpi=200, bbox_inches="tight")
    plt.close(fig)

    # Write a simple text file with detected lines (if any)
    lines = []
    for block in page.blocks:
        for line in block.lines:
            text = " ".join([word.value for word in line.words])
            if text:
                lines.append(text)
    if lines:
        txt_path = output_dir_path / f"{image_path.stem}_doctr.txt"
        txt_path.write_text("\n".join(lines), encoding="utf-8")
    return vis_path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run docTR OCR on an image")
    parser.add_argument("--image", required=True, help="Path to input image")
    parser.add_argument(
        "--output-dir",
        default="docTR_outputs",
        help="Directory to store outputs (JSON/visualization)",
    )
    parser.add_argument("--det-arch", default="db_resnet50", help="Detection architecture")
    parser.add_argument("--reco-arch", default="crnn_vgg16_bn", help="Recognition architecture")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    vis_path = run_doctr(
        image_path=args.image,
        output_dir=args.output_dir,
        det_arch=args.det_arch,
        reco_arch=args.reco_arch,
    )
    print(f"Saved visualization to: {vis_path}")