Update PaddleOCR and docTR results
32
src/model/text_detector/PaddleOCR/.gitignore
vendored
Normal file
@@ -0,0 +1,32 @@
# Re-include all image files and JSON within this folder (and subfolders)

# PNG
!*.png
!**/*.png

# JPG/JPEG
!*.jpg
!**/*.jpg
!*.jpeg
!**/*.jpeg

# BMP/GIF/TIFF/WEBP
!*.bmp
!**/*.bmp
!*.gif
!**/*.gif
!*.tif
!**/*.tif
!*.tiff
!**/*.tiff
!*.webp
!**/*.webp

# JSON
!*.json
!**/*.json

# Ensure this file itself is tracked
!.gitignore
BIN
src/model/text_detector/PaddleOCR/CNI.png
Normal file
After Width: | Height: | Size: 1.9 MiB |
1544
src/model/text_detector/PaddleOCR/CNI_r.json
Normal file
BIN
src/model/text_detector/PaddleOCR/CNI_r.png
Normal file
After Width: | Height: | Size: 1.6 MiB |
BIN
src/model/text_detector/PaddleOCR/im1.png
Normal file
After Width: | Height: | Size: 5.6 MiB |
643
src/model/text_detector/PaddleOCR/main.py
Normal file
@@ -0,0 +1,643 @@
import argparse
import json
import os
import sys
from typing import List, Tuple, Any, Optional, Dict

from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageOps
import numpy as np
import cv2
from paddleocr import PaddleOCR
try:  # Some paddleocr builds do not expose draw_ocr at top-level
    from paddleocr import draw_ocr as paddle_draw_ocr  # type: ignore
except Exception:  # pragma: no cover - environment dependent
    paddle_draw_ocr = None


def resolve_font_path() -> str:
    """Return a system font path that supports Latin accents (Windows-friendly)."""
    candidate_fonts: List[str] = []

    # Prefer Arial Unicode or Arial on Windows
    windir = os.environ.get("WINDIR", "C:\\Windows")
    candidate_fonts.extend(
        [
            os.path.join(windir, "Fonts", "arialuni.ttf"),
            os.path.join(windir, "Fonts", "arial.ttf"),
            os.path.join(windir, "Fonts", "tahoma.ttf"),
            os.path.join(windir, "Fonts", "seguiemj.ttf"),
        ]
    )

    # Common fallbacks on other platforms
    candidate_fonts.extend(
        [
            "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",  # macOS
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # Linux
        ]
    )

    for font in candidate_fonts:
        if os.path.exists(font):
            return font
    return ""  # paddleocr will still draw, but non-ASCII may not render correctly


def build_argparser() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Simple PaddleOCR runner for single image"
    )
    parser.add_argument(
        "--image",
        default="im1.png",
        help="Path to input image (default: im1.png)",
    )
    parser.add_argument(
        "--lang",
        default="fr",
        help="Language code for PaddleOCR (try: fr, en, vi, etc.). Default: fr",
    )
    parser.add_argument(
        "--out-json",
        default="ocr_result.json",
        help="Path to save raw OCR results as JSON",
    )
    parser.add_argument(
        "--out-image",
        default="ocr_vis.png",
        help="Path to save visualization image with boxes and text",
    )
    parser.add_argument(
        "--poly-source",
        choices=["dt", "rec", "auto"],
        default="auto",
        help="Which polygons to visualize: detection(dt), recognition(rec) or auto (prefer dt)",
    )
    parser.add_argument(
        "--box-color",
        default="#FF0000",
        help="Outline color for polygons (hex like #RRGGBB or 'R,G,B').",
    )
    parser.add_argument(
        "--fill-color",
        default="#FF000033",
        help="Fill color with alpha for polygons (e.g., #FF000033).",
    )
    parser.add_argument(
        "--box-width",
        type=int,
        default=3,
        help="Outline width for polygons.",
    )
    parser.add_argument(
        "--scale-ratio",
        type=float,
        default=None,
        help="Scale polygons around centroid (e.g., 0.96 to shrink, 1.05 to expand).",
    )
    parser.add_argument(
        "--shrink-ratio",
        type=float,
        default=None,
        help="Deprecated: use --scale-ratio. If given, will be used when --scale-ratio is not set.",
    )
    parser.add_argument(
        "--offset-x",
        type=int,
        default=0,
        help="Shift polygons horizontally (pixels). Positive moves right.",
    )
    parser.add_argument(
        "--offset-y",
        type=int,
        default=0,
        help="Shift polygons vertically (pixels). Positive moves down.",
    )
    parser.add_argument(
        "--scale-x",
        type=float,
        default=None,
        help="Non-uniform scale along X around centroid (e.g., 1.1). Overrides --scale-ratio on X if set.",
    )
    parser.add_argument(
        "--scale-y",
        type=float,
        default=None,
        help="Non-uniform scale along Y around centroid (e.g., 1.05). Overrides --scale-ratio on Y if set.",
    )
    parser.add_argument(
        "--pad-x",
        type=int,
        default=0,
        help="Pixels to pad left/right after scaling.",
    )
    parser.add_argument(
        "--pad-y",
        type=int,
        default=0,
        help="Pixels to pad top/bottom after scaling.",
    )
    parser.add_argument(
        "--rect",
        choices=["poly", "axis", "rotated"],
        default="poly",
        help="Visualization mode: polygon, axis-aligned rectangle, or rotated min-area rectangle.",
    )
    parser.add_argument(
        "--enhance",
        action="store_true",
        help="Apply light image enhancement (contrast/sharpen) before OCR.",
    )
    parser.add_argument(
        "--rec-model",
        default="",
        help=(
            "Recognition model name to force (e.g. latin_PP-OCRv5_server_rec, "
            "latin_PP-OCRv5_mobile_rec, en_PP-OCRv4_rec, en_PP-OCRv3_rec)."
        ),
    )
    parser.add_argument(
        "--det-model",
        default="",
        help=(
            "Detection model name to force (e.g. PP-OCRv5_server_det, "
            "PP-OCRv5_mobile_det)."
        ),
    )
    parser.add_argument(
        "--ocr-version",
        default="",
        help="Optional OCR version hint (e.g., PP-OCRv5).",
    )
    parser.add_argument(
        "--rec-score-thresh",
        type=float,
        default=0.0,
        help="Minimum recognition score to keep a text (default 0.0 to keep all)",
    )
    parser.add_argument(
        "--det-db-thresh",
        type=float,
        default=None,
        help="DB detector binary threshold (e.g., 0.2).",
    )
    parser.add_argument(
        "--det-db-box-thresh",
        type=float,
        default=None,
        help="DB detector box threshold (e.g., 0.3-0.6). Lower to get more boxes.",
    )
    parser.add_argument(
        "--det-db-unclip",
        type=float,
        default=None,
        help="DB detector unclip ratio (e.g., 1.5).",
    )
    parser.add_argument(
        "--det-limit-side",
        type=int,
        default=None,
        help="text_det_limit_side_len. Smaller may speed up and merge boxes.",
    )
    return parser.parse_args()


def initialize_ocr(preferred_lang: str, args: argparse.Namespace) -> Tuple[PaddleOCR, str]:
    """Initialize PaddleOCR, trying the preferred language first then fallbacks."""
    tried_errors: List[str] = []
    candidate_langs: List[str] = []
    # Keep order but deduplicate
    for code in [preferred_lang, "fr", "en"]:
        if code not in candidate_langs:
            candidate_langs.append(code)

    for code in candidate_langs:
        # First, try with requested model overrides (if any)
        try:
            init_kwargs: Dict[str, Any] = {
                "lang": code,
                "use_textline_orientation": True,
            }
            if args.rec_model:
                init_kwargs["text_recognition_model_name"] = args.rec_model
            if args.det_model:
                init_kwargs["text_detection_model_name"] = args.det_model
            if args.ocr_version:
                init_kwargs["ocr_version"] = args.ocr_version
            # thresholds & limits (kwargs accepted by PaddleOCR >=3.x)
            if args.rec_score_thresh is not None:
                init_kwargs["text_rec_score_thresh"] = float(args.rec_score_thresh)
            if args.det_db_thresh is not None:
                init_kwargs["text_det_db_thresh"] = float(args.det_db_thresh)
            if args.det_db_box_thresh is not None:
                init_kwargs["text_det_db_box_thresh"] = float(args.det_db_box_thresh)
            # Some builds may not support unclip ratio as kwarg; skip if unsupported
            if args.det_limit_side is not None:
                init_kwargs["text_det_limit_side_len"] = int(args.det_limit_side)
            ocr = PaddleOCR(**init_kwargs)
            return ocr, code
        except Exception as exc_with_models:  # pragma: no cover
            tried_errors.append(f"{code} (with models): {exc_with_models}")
        # Fallback: try default models for this language
        try:
            base_kwargs: Dict[str, Any] = {
                "lang": code,
                "use_textline_orientation": True,
            }
            if args.rec_score_thresh is not None:
                base_kwargs["text_rec_score_thresh"] = float(args.rec_score_thresh)
            if args.det_limit_side is not None:
                base_kwargs["text_det_limit_side_len"] = int(args.det_limit_side)
            if args.det_db_thresh is not None:
                base_kwargs["text_det_db_thresh"] = float(args.det_db_thresh)
            if args.det_db_box_thresh is not None:
                base_kwargs["text_det_db_box_thresh"] = float(args.det_db_box_thresh)
            # Skip setting unclip if unsupported in this build
            ocr = PaddleOCR(**base_kwargs)
            return ocr, code
        except Exception as exc_default:
            tried_errors.append(f"{code} (default): {exc_default}")

    raise RuntimeError(
        "Failed to initialize PaddleOCR. Tried languages: "
        + ", ".join(candidate_langs)
        + "\nErrors: "
        + " | ".join(tried_errors)
    )


def main() -> None:
    args = build_argparser()

    if not os.path.exists(args.image):
        print(f"[ERROR] Image not found: {args.image}")
        sys.exit(1)

    try:
        ocr, used_lang = initialize_ocr(args.lang, args)
    except Exception as init_exc:
        print(f"[ERROR] {init_exc}")
        sys.exit(2)

    chosen_rec = args.rec_model if args.rec_model else "auto"
    chosen_det = args.det_model if args.det_model else "auto"
    print(
        f"[INFO] Running OCR on '{args.image}' with lang='{used_lang}', "
        f"rec='{chosen_rec}', det='{chosen_det}' ..."
    )

    # Optional light enhancement for Latin documents to make OCR tighter
    def maybe_enhance_image(path: str) -> str:
        if not args.enhance:
            return path
        try:
            img = Image.open(path).convert("RGB")
            # Slight auto-contrast and sharpening
            img = ImageOps.autocontrast(img, cutoff=1)
            img = ImageEnhance.Sharpness(img).enhance(1.4)
            img = ImageEnhance.Contrast(img).enhance(1.1)
            tmp_path = os.path.splitext(path)[0] + "_enh.png"
            img.save(tmp_path)
            return tmp_path
        except Exception:
            return path

    ocr_input = maybe_enhance_image(args.image)

    # Prefer new API for PaddleOCR >=3.x, fallback to legacy .ocr
    try:
        result: Any = ocr.predict(ocr_input)  # type: ignore[assignment]
    except Exception:
        result = ocr.ocr(ocr_input)  # type: ignore[assignment]

    # Prefer to draw detection polygons only (no text) for clarity
    def extract_polygons(res: Any) -> List[List[Tuple[int, int]]]:
        def to_tuple_list(poly_any: Any) -> List[Tuple[int, int]]:
            try:
                return [(int(p[0]), int(p[1])) for p in list(poly_any)]
            except Exception:
                return []

        # Common outputs in new pipelines
        if isinstance(res, dict):
            if args.poly_source in ("auto", "dt"):
                polys = res.get("dt_polys") or res.get("det_polygons")
            else:
                polys = None
            if polys is None and args.poly_source in ("auto", "rec"):
                polys = res.get("rec_polys")
            if polys is None:
                polys = res.get("polygons") or res.get("boxes")
            if polys is None:
                return []
            return [to_tuple_list(poly) for poly in list(polys)]

        if isinstance(res, list) and len(res) > 0:
            # Often result is a list with a single dict
            if isinstance(res[0], dict):
                return extract_polygons(res[0])

        return []

    # Normalize result across PaddleOCR versions
    def poly_to_list(poly_any: Any) -> List[List[int]]:
        try:
            # numpy array path
            if hasattr(poly_any, "tolist"):
                lst = poly_any.tolist()
                return [[int(p[0]), int(p[1])] for p in lst]
        except Exception:
            pass
        try:
            return [[int(p[0]), int(p[1])] for p in list(poly_any)]
        except Exception:
            return []

    def parse_lines(res: Any) -> List[Dict[str, Any]]:
        items: List[Dict[str, Any]] = []
        # Case 0: top-level dict output (some 3.x pipelines)
        if isinstance(res, dict):
            polys = res.get("det_polygons") or res.get("boxes") or res.get("polygons")
            texts = res.get("rec_texts") or res.get("rec_text") or res.get("texts") or []
            scores = res.get("rec_scores") or res.get("scores") or []
            if isinstance(texts, str):
                texts = [texts]
            if polys is None:
                polys = [None] * len(texts)
            if not isinstance(scores, list):
                try:
                    scores = list(scores)
                except Exception:
                    scores = [None] * len(texts)
            if len(scores) < len(texts):
                scores = list(scores) + [None] * (len(texts) - len(scores))
            for poly, text, score in zip(polys, texts, scores):
                try:
                    score_val = float(score) if score is not None else None
                except Exception:
                    score_val = None
                items.append({"text": str(text), "score": score_val, "box": poly_to_list(poly) if poly is not None else None})
            return items
        if isinstance(res, list) and len(res) > 0:
            # Special: list with a single dict that holds batched arrays (rec_texts, rec_scores, dt_polys, ...)
            if len(res) == 1 and isinstance(res[0], dict) and (
                "rec_texts" in res[0] or "texts" in res[0]
            ):
                obj = res[0]
                texts = obj.get("rec_texts") or obj.get("texts") or []
                scores = obj.get("rec_scores") or obj.get("scores") or []
                boxes = obj.get("rec_polys") or obj.get("dt_polys") or []
                # Normalize lengths
                n = min(len(texts), len(scores) if hasattr(scores, "__len__") else len(texts), len(boxes) if hasattr(boxes, "__len__") else len(texts))
                out: List[Dict[str, Any]] = []
                for i in range(n):
                    txt = texts[i]
                    try:
                        sc = float(scores[i])
                    except Exception:
                        sc = None
                    bx = boxes[i] if i < len(boxes) else None
                    out.append({"text": str(txt), "score": sc, "box": poly_to_list(bx) if bx is not None else None})
                return out

            # Case A: legacy format [[ [poly], (text, score) ], ...] wrapped by [ ... ]
            if isinstance(res[0], list) and len(res[0]) > 0 and isinstance(res[0][0], list):
                lines_local = res[0]
                for line in lines_local:
                    if not isinstance(line, (list, tuple)) or len(line) < 2:
                        continue
                    box = line[0]
                    text: str = ""
                    score: Optional[float] = None
                    payload = line[1]
                    if isinstance(payload, (list, tuple)) and len(payload) >= 1:
                        text = str(payload[0])
                        if len(payload) >= 2:
                            try:
                                score = float(payload[1])
                            except Exception:
                                score = None
                    elif isinstance(payload, str):
                        text = payload
                        if len(line) >= 3:
                            try:
                                score = float(line[2])
                            except Exception:
                                score = None
                    items.append({"text": text, "score": score, "box": box})
                return items

            # Case B: new format already a flat list of dicts or lists per detection
            # Try dict format first
            if isinstance(res[0], dict):
                for obj in res:
                    box = obj.get("box") or obj.get("poly") or obj.get("bbox") or obj.get("det_polygons")
                    text = obj.get("text") or obj.get("rec_text") or ""
                    score = obj.get("score") or obj.get("rec_score")
                    try:
                        score = float(score) if score is not None else None
                    except Exception:
                        score = None
                    items.append({"text": str(text), "score": score, "box": poly_to_list(box) if box is not None else None})
                return items

            # Case C: flat list of [poly, text, (maybe score)]
            if isinstance(res[0], (list, tuple)):
                for line in res:
                    if not isinstance(line, (list, tuple)) or len(line) < 2:
                        continue
                    box = line[0]
                    text = str(line[1])
                    score: Optional[float] = None
                    if len(line) >= 3:
                        try:
                            score = float(line[2])
                        except Exception:
                            score = None
                    items.append({"text": text, "score": score, "box": poly_to_list(box) if box is not None else None})
                return items
        return items

    parsed = parse_lines(result)
    det_polys = extract_polygons(result)

    # Additionally collect both dt and rec polygons for JSON output
    def extract_both(res: Any) -> Tuple[List[List[Tuple[int, int]]], List[List[Tuple[int, int]]]]:
        def to_tuple_list(poly_any: Any) -> List[Tuple[int, int]]:
            try:
                if hasattr(poly_any, "tolist"):
                    poly_any = poly_any.tolist()
                return [(int(p[0]), int(p[1])) for p in list(poly_any)]
            except Exception:
                return []

        if isinstance(res, dict):
            dt = res.get("dt_polys") or res.get("det_polygons") or []
            rc = res.get("rec_polys") or []
            return [to_tuple_list(p) for p in list(dt)], [to_tuple_list(p) for p in list(rc)]
        if isinstance(res, list) and len(res) > 0 and isinstance(res[0], dict):
            return extract_both(res[0])
        return [], []

    all_dt_polys, all_rec_polys = extract_both(result)

    # Print quick summary to console
    print("\n[TEXT]\n" + "\n".join([p["text"] for p in parsed]))

    # Save JSON
    with open(args.out_json, "w", encoding="utf-8") as f:
        json.dump(
            {
                "image": os.path.abspath(args.image),
                "language": used_lang,
                "num_items": len(parsed),
                "items": parsed,
                "poly_source": args.poly_source,
                "det_polygons": [[list(pt) for pt in poly] for poly in all_dt_polys],
                "rec_polygons": [[list(pt) for pt in poly] for poly in all_rec_polys],
                "enhance": bool(args.enhance),
                "recognition_model": chosen_rec,
                "detection_model": chosen_det,
                "ocr_version": args.ocr_version or "auto",
                "box_color": args.box_color,
                "fill_color": args.fill_color,
                "box_width": int(args.box_width),
                "scale_ratio": float(args.scale_ratio) if args.scale_ratio is not None else None,
                "offset_x": int(args.offset_x),
                "offset_y": int(args.offset_y),
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"[INFO] Saved JSON: {args.out_json}")
    # Also store raw result for debugging purposes
    try:
        with open("raw_result.txt", "w", encoding="utf-8") as rf:
            rf.write(repr(result))
    except Exception:
        pass

    # Draw and save visualization (only polygons)
    image = Image.open(args.image).convert("RGBA")
    canvas = image.copy()
    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)
    # Prefer detection polygons from pipeline; fallback to parsed boxes
    polygons: List[List[Tuple[int, int]]] = det_polys
    if not polygons:
        polygons = [
            [tuple(p) for p in (box or [])]  # type: ignore[misc]
            for box in [p.get("box") for p in parsed]
            if box
        ]

    def parse_color(color_str: str, default=(255, 0, 0, 255)) -> Tuple[int, int, int, int]:
        try:
            if color_str.startswith("#"):
                hexv = color_str.lstrip("#")
                if len(hexv) == 6:
                    r = int(hexv[0:2], 16)
                    g = int(hexv[2:4], 16)
                    b = int(hexv[4:6], 16)
                    return (r, g, b, 255)
                if len(hexv) == 8:
                    r = int(hexv[0:2], 16)
                    g = int(hexv[2:4], 16)
                    b = int(hexv[4:6], 16)
                    a = int(hexv[6:8], 16)
                    return (r, g, b, a)
            else:
                parts = [int(x) for x in color_str.split(",")]
                if len(parts) == 3:
                    return (parts[0], parts[1], parts[2], 255)
                if len(parts) == 4:
                    return (parts[0], parts[1], parts[2], parts[3])
        except Exception:
            pass
        return default

    outline_rgba = parse_color(args.box_color)
    fill_rgba = parse_color(args.fill_color, default=(255, 0, 0, 51))

    def transform_polygon(
        poly: List[Tuple[int, int]],
        scale: Optional[float],
        dx: int,
        dy: int,
        scale_x: Optional[float] = None,
        scale_y: Optional[float] = None,
        pad_x: int = 0,
        pad_y: int = 0,
    ) -> List[Tuple[int, int]]:
        if not poly:
            return poly
        cx = sum(p[0] for p in poly) / len(poly)
        cy = sum(p[1] for p in poly) / len(poly)
        out: List[Tuple[int, int]] = []
        for (x, y) in poly:
            sx = scale_x if scale_x is not None else scale
            sy = scale_y if scale_y is not None else scale
            if sx is not None:
                x = cx + sx * (x - cx)
            if sy is not None:
                y = cy + sy * (y - cy)
            out.append((int(round(x + dx)), int(round(y + dy))))
        # pad expands rect-like by moving points outwards along axes
        if pad_x or pad_y:
            out = [(x - pad_x if x < cx else x + pad_x, y - pad_y if y < cy else y + pad_y) for (x, y) in out]
        return out

    vis_polys: List[List[Tuple[int, int]]] = []

    def draw_axis_aligned(draw_obj, pts: List[Tuple[int, int]]):
        xs = [p[0] for p in pts]
        ys = [p[1] for p in pts]
        box = [(min(xs), min(ys)), (max(xs), min(ys)), (max(xs), max(ys)), (min(xs), max(ys))]
        draw_obj.polygon(box, outline=outline_rgba, fill=fill_rgba)
        draw_obj.line(box + [box[0]], fill=outline_rgba, width=args.box_width)

    def draw_rotated(draw_obj, pts: List[Tuple[int, int]]):
        cnt = np.array(pts, dtype=np.int32).reshape(-1, 1, 2)
        rect = cv2.minAreaRect(cnt)
        box = cv2.boxPoints(rect)
        box = box.astype(int)  # plain int cast (np.int0 alias is removed in NumPy >= 2.0)
        poly = [(int(x), int(y)) for x, y in box.tolist()]
        draw_obj.polygon(poly, outline=outline_rgba, fill=fill_rgba)
        draw_obj.line(poly + [poly[0]], fill=outline_rgba, width=args.box_width)

    for poly in polygons:
        if len(poly) >= 3:
            # Backward-compat: if scale-ratio not provided, use shrink-ratio (<1.0)
            scale = args.scale_ratio if args.scale_ratio is not None else args.shrink_ratio
            sp = transform_polygon(
                poly,
                scale,
                args.offset_x,
                args.offset_y,
                args.scale_x,
                args.scale_y,
                args.pad_x,
                args.pad_y,
            )
            vis_polys.append(sp)
            if args.rect == "axis":
                draw_axis_aligned(draw, sp)
            elif args.rect == "rotated":
                draw_rotated(draw, sp)
            else:
                draw.polygon(sp, outline=outline_rgba, fill=fill_rgba)
                draw.line(sp + [sp[0]], fill=outline_rgba, width=args.box_width)

    out = Image.alpha_composite(canvas, overlay).convert("RGB")
    out.save(args.out_image)
    print(f"[INFO] Saved visualization: {args.out_image}")

    # Append the visualization polygons to JSON file for exact reproducibility
    try:
        with open(args.out_json, "r", encoding="utf-8") as fjson:
            data = json.load(fjson)
        data["vis_polygons"] = [[list(pt) for pt in poly] for poly in vis_polys]
        with open(args.out_json, "w", encoding="utf-8") as fjson:
            json.dump(data, fjson, ensure_ascii=False, indent=2)
    except Exception:
        pass


if __name__ == "__main__":
    main()
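For reference, a minimal sketch of driving main.py from another script. This is illustrative only: the flag values below are example choices, CNI.png is simply one of the images committed alongside the script, and the call assumes it is run from this folder.

    import subprocess

    # Run the PaddleOCR script on one of the committed sample images.
    # The flags mirror the argparse options defined in main.py above.
    subprocess.run(
        [
            "python", "main.py",
            "--image", "CNI.png",
            "--lang", "fr",
            "--rect", "axis",
            "--out-json", "ocr_result.json",
            "--out-image", "ocr_vis.png",
        ],
        check=True,
    )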
5040
src/model/text_detector/PaddleOCR/ocr_result.json
Normal file
BIN
src/model/text_detector/PaddleOCR/ocr_vis.png
Normal file
After Width: | Height: | Size: 5.7 MiB |
32
src/model/text_detector/docTR/.gitignore
vendored
Normal file
@@ -0,0 +1,32 @@
# Re-include all image files and JSON within this folder (and subfolders)

# PNG
!*.png
!**/*.png

# JPG/JPEG
!*.jpg
!**/*.jpg
!*.jpeg
!**/*.jpeg

# BMP/GIF/TIFF/WEBP
!*.bmp
!**/*.bmp
!*.gif
!**/*.gif
!*.tif
!**/*.tif
!*.tiff
!**/*.tiff
!*.webp
!**/*.webp

# JSON
!*.json
!**/*.json

# Ensure this file itself is tracked
!.gitignore
BIN
src/model/text_detector/docTR/CNI.png
Normal file
After Width: | Height: | Size: 1.9 MiB |
1116
src/model/text_detector/docTR/CNI_doctr.json
Normal file
BIN
src/model/text_detector/docTR/CNI_doctr_vis.png
Normal file
After Width: | Height: | Size: 3.0 MiB |
4336
src/model/text_detector/docTR/form_doctr.json
Normal file
BIN
src/model/text_detector/docTR/form_doctr_vis.png
Normal file
After Width: | Height: | Size: 1.6 MiB |
95
src/model/text_detector/docTR/run.py
Normal file
@@ -0,0 +1,95 @@
import argparse
import json
import os
from pathlib import Path

import matplotlib

# Use non-interactive backend for headless execution/environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.utils.visualization import visualize_page


def run_doctr(
    image_path: str,
    output_dir: str = "docTR_outputs",
    det_arch: str = "db_resnet50",
    reco_arch: str = "crnn_vgg16_bn",
) -> Path:
    image_path = Path(image_path)
    if not image_path.is_file():
        raise FileNotFoundError(f"Image not found: {image_path}")

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Load image for visualization
    bgr = cv2.imread(str(image_path))
    if bgr is None:
        raise RuntimeError(f"Failed to read image with OpenCV: {image_path}")
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Build predictor
    predictor = ocr_predictor(det_arch=det_arch, reco_arch=reco_arch, pretrained=True)

    # Inference
    doc = DocumentFile.from_images(str(image_path))
    result = predictor(doc)

    # Export structured result to JSON
    export = result.export()
    json_path = output_dir_path / f"{image_path.stem}_doctr.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(export, f, ensure_ascii=False, indent=2)

    # Visualization
    page = result.pages[0]
    page_dict = export["pages"][0]
    fig = visualize_page(page_dict, image=image)
    vis_path = output_dir_path / f"{image_path.stem}_doctr_vis.png"
    fig.savefig(vis_path, dpi=200, bbox_inches="tight")
    plt.close(fig)

    # Write a simple text file with detected lines (if any)
    lines = []
    for block in page.blocks:
        for line in block.lines:
            text = " ".join([word.value for word in line.words])
            if text:
                lines.append(text)
    if lines:
        txt_path = output_dir_path / f"{image_path.stem}_doctr.txt"
        txt_path.write_text("\n".join(lines), encoding="utf-8")

    return vis_path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run docTR OCR on an image")
    parser.add_argument("--image", required=True, help="Path to input image")
    parser.add_argument(
        "--output-dir",
        default="docTR_outputs",
        help="Directory to store outputs (JSON/visualization)",
    )
    parser.add_argument("--det-arch", default="db_resnet50", help="Detection architecture")
    parser.add_argument("--reco-arch", default="crnn_vgg16_bn", help="Recognition architecture")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    vis_path = run_doctr(
        image_path=args.image,
        output_dir=args.output_dir,
        det_arch=args.det_arch,
        reco_arch=args.reco_arch,
    )
    print(f"Saved visualization to: {vis_path}")
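run_doctr() can also be imported and called directly rather than through the CLI. A minimal sketch, assuming the snippet sits next to run.py and reuses the committed CNI.png sample; outputs land in the function's default docTR_outputs/ directory.

    from run import run_doctr

    # Runs detection + recognition, writes <stem>_doctr.json, a visualization PNG,
    # and (if any text was found) a plain-text dump of the recognized lines.
    vis_path = run_doctr("CNI.png")
    print(f"Visualization written to {vis_path}")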