update PaddleOCR result & docTR result

This commit is contained in:
Nguyễn Phước Thành
2025-08-09 22:29:33 +07:00
parent f63589a10a
commit 028e3237bb
15 changed files with 12838 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
# Re-include all image and JSON files in this folder and its subfolders
# PNG
!*.png
!**/*.png
# JPG/JPEG
!*.jpg
!**/*.jpg
!*.jpeg
!**/*.jpeg
# BMP/GIF/TIFF/WEBP
!*.bmp
!**/*.bmp
!*.gif
!**/*.gif
!*.tif
!**/*.tif
!*.tiff
!**/*.tiff
!*.webp
!**/*.webp
# JSON
!*.json
!**/*.json
# Ensure this file itself is tracked
!.gitignore

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 MiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

View File

@@ -0,0 +1,95 @@
import argparse
import json
import os
from pathlib import Path
import matplotlib
# Use non-interactive backend for headless execution/environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import cv2
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.utils.visualization import visualize_page
def run_doctr(
    image_path: str,
    output_dir: str = "docTR_outputs",
    det_arch: str = "db_resnet50",
    reco_arch: str = "crnn_vgg16_bn",
) -> Path:
    """Run docTR OCR on a single image and write the results to disk.

    Up to three files are written into ``output_dir``, each named after the
    stem of the input image:

    * ``<stem>_doctr.json`` -- the structured export of the OCR result.
    * ``<stem>_doctr_vis.png`` -- the page visualization rendered over the
      input image.
    * ``<stem>_doctr.txt`` -- one recognized line of text per row; only
      written when at least one non-empty line was recognized.

    Args:
        image_path: Path to the input image.
        output_dir: Directory that receives the outputs; created (including
            parents) if it does not exist.
        det_arch: Detection architecture name passed to ``ocr_predictor``.
        reco_arch: Recognition architecture name passed to ``ocr_predictor``.

    Returns:
        Path of the saved visualization PNG.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        RuntimeError: If OpenCV cannot decode the image.
    """
    # Keep the str-annotated parameter untouched; work with a Path copy.
    source = Path(image_path)
    if not source.is_file():
        raise FileNotFoundError(f"Image not found: {source}")

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load image for visualization (converted from BGR to RGB below).
    bgr = cv2.imread(str(source))
    if bgr is None:
        raise RuntimeError(f"Failed to read image with OpenCV: {source}")
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Build predictor
    predictor = ocr_predictor(det_arch=det_arch, reco_arch=reco_arch, pretrained=True)

    # Inference
    doc = DocumentFile.from_images(str(source))
    result = predictor(doc)

    # Export structured result to JSON
    export = result.export()
    json_path = out_dir / f"{source.stem}_doctr.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(export, f, ensure_ascii=False, indent=2)

    # Visualization: a single image was submitted, so only the first page is used.
    page = result.pages[0]
    page_dict = export["pages"][0]
    # interactive=False: the figure is only saved to a PNG, so hover tooltips
    # (and the mplcursors import they need) are pointless here.
    # add_labels=False keeps the rendering to bounding boxes only, matching
    # what the interactive mode draws in a saved image.
    fig = visualize_page(page_dict, image=image, interactive=False, add_labels=False)
    vis_path = out_dir / f"{source.stem}_doctr_vis.png"
    try:
        fig.savefig(vis_path, dpi=200, bbox_inches="tight")
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

    # Write a simple text file with detected lines (if any)
    lines = []
    for block in page.blocks:
        for line in block.lines:
            text = " ".join(word.value for word in line.words)
            if text:
                lines.append(text)
    if lines:
        txt_path = out_dir / f"{source.stem}_doctr.txt"
        txt_path.write_text("\n".join(lines), encoding="utf-8")

    return vis_path
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the docTR runner.

    Returns:
        Namespace with ``image`` (required), ``output_dir``, ``det_arch`` and
        ``reco_arch`` attributes.
    """
    cli = argparse.ArgumentParser(description="Run docTR OCR on an image")
    cli.add_argument("--image", help="Path to input image", required=True)

    # (flag, default, help) for every optional string option, in help order.
    optional_flags = (
        ("--output-dir", "docTR_outputs", "Directory to store outputs (JSON/visualization)"),
        ("--det-arch", "db_resnet50", "Detection architecture"),
        ("--reco-arch", "crnn_vgg16_bn", "Recognition architecture"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, default=fallback, help=description)

    namespace = cli.parse_args()
    return namespace
if __name__ == "__main__":
    # Script entry point: parse the CLI options, run OCR, report the output.
    cli_options = parse_args()
    saved_to = run_doctr(
        cli_options.image,
        cli_options.output_dir,
        cli_options.det_arch,
        cli_options.reco_arch,
    )
    print(f"Saved visualization to: {saved_to}")