update PaddleOCR result & docTR result
This commit is contained in:
32
src/model/text_detector/docTR/.gitignore
vendored
Normal file
32
src/model/text_detector/docTR/.gitignore
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
# Re-include all image and JSON files within this folder (and its subfolders)
|
||||
|
||||
# PNG
|
||||
!*.png
|
||||
!**/*.png
|
||||
|
||||
# JPG/JPEG
|
||||
!*.jpg
|
||||
!**/*.jpg
|
||||
!*.jpeg
|
||||
!**/*.jpeg
|
||||
|
||||
# BMP/GIF/TIFF/WEBP
|
||||
!*.bmp
|
||||
!**/*.bmp
|
||||
!*.gif
|
||||
!**/*.gif
|
||||
!*.tif
|
||||
!**/*.tif
|
||||
!*.tiff
|
||||
!**/*.tiff
|
||||
!*.webp
|
||||
!**/*.webp
|
||||
|
||||
# JSON
|
||||
!*.json
|
||||
!**/*.json
|
||||
|
||||
# Ensure this file itself is tracked
|
||||
!.gitignore
|
||||
|
||||
|
BIN
src/model/text_detector/docTR/CNI.png
Normal file
BIN
src/model/text_detector/docTR/CNI.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.9 MiB |
1116
src/model/text_detector/docTR/CNI_doctr.json
Normal file
1116
src/model/text_detector/docTR/CNI_doctr.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
src/model/text_detector/docTR/CNI_doctr_vis.png
Normal file
BIN
src/model/text_detector/docTR/CNI_doctr_vis.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 3.0 MiB |
4336
src/model/text_detector/docTR/form_doctr.json
Normal file
4336
src/model/text_detector/docTR/form_doctr.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
src/model/text_detector/docTR/form_doctr_vis.png
Normal file
BIN
src/model/text_detector/docTR/form_doctr_vis.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.6 MiB |
95
src/model/text_detector/docTR/run.py
Normal file
95
src/model/text_detector/docTR/run.py
Normal file
@@ -0,0 +1,95 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib
|
||||
|
||||
# Use non-interactive backend for headless execution/environments
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import cv2
|
||||
from doctr.io import DocumentFile
|
||||
from doctr.models import ocr_predictor
|
||||
from doctr.utils.visualization import visualize_page
|
||||
|
||||
|
||||
def run_doctr(
    image_path: str,
    output_dir: str = "docTR_outputs",
    det_arch: str = "db_resnet50",
    reco_arch: str = "crnn_vgg16_bn",
) -> Path:
    """Run docTR OCR on a single image and write the results to ``output_dir``.

    Files written (``<stem>`` is the input file name without extension):

    * ``<stem>_doctr.json`` - the structured result from ``result.export()``.
    * ``<stem>_doctr_vis.png`` - visualization of the first page.
    * ``<stem>_doctr.txt`` - one recognized line per row; only written when
      at least one non-empty line was recognized.

    Args:
        image_path: Path to the input image.
        output_dir: Directory for the outputs; created if missing.
        det_arch: Detection architecture name passed to ``ocr_predictor``.
        reco_arch: Recognition architecture name passed to ``ocr_predictor``.

    Returns:
        Path of the saved visualization PNG.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        RuntimeError: If OpenCV cannot decode the image.
    """
    image_path = Path(image_path)
    if not image_path.is_file():
        raise FileNotFoundError(f"Image not found: {image_path}")

    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Load the image for visualization before building the predictor, so an
    # unreadable file fails before the model is constructed.
    bgr = cv2.imread(str(image_path))
    if bgr is None:
        raise RuntimeError(f"Failed to read image with OpenCV: {image_path}")
    # OpenCV decodes to BGR; convert to RGB for plotting.
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Build predictor
    predictor = ocr_predictor(det_arch=det_arch, reco_arch=reco_arch, pretrained=True)

    # Inference
    doc = DocumentFile.from_images(str(image_path))
    result = predictor(doc)

    # Export structured result to JSON
    export = result.export()
    json_path = output_dir_path / f"{image_path.stem}_doctr.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(export, f, ensure_ascii=False, indent=2)

    # Visualization: a single input image is passed in, so only the first
    # page is rendered.
    page = result.pages[0]
    page_dict = export["pages"][0]
    vis_path = output_dir_path / f"{image_path.stem}_doctr_vis.png"
    fig = visualize_page(page_dict, image=image)
    try:
        fig.savefig(vis_path, dpi=200, bbox_inches="tight")
    finally:
        # Always release the figure, even if saving fails; otherwise pyplot
        # keeps it registered and its memory is never freed.
        plt.close(fig)

    # Write a simple text file with detected lines (if any)
    lines = []
    for block in page.blocks:
        for line in block.lines:
            text = " ".join(word.value for word in line.words)
            if text:
                lines.append(text)
    if lines:
        txt_path = output_dir_path / f"{image_path.stem}_doctr.txt"
        txt_path.write_text("\n".join(lines), encoding="utf-8")

    return vis_path
|
||||
|
||||
|
||||
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options for the docTR runner.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, so calling ``parse_args()``
            with no arguments behaves as before.

    Returns:
        Namespace with ``image``, ``output_dir``, ``det_arch`` and
        ``reco_arch`` attributes.
    """
    parser = argparse.ArgumentParser(description="Run docTR OCR on an image")
    parser.add_argument("--image", required=True, help="Path to input image")
    parser.add_argument(
        "--output-dir",
        default="docTR_outputs",
        help="Directory to store outputs (JSON/visualization)",
    )
    parser.add_argument("--det-arch", default="db_resnet50", help="Detection architecture")
    parser.add_argument("--reco-arch", default="crnn_vgg16_bn", help="Recognition architecture")
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the CLI options, run OCR once, and report
    # where the visualization image was written.
    cli = parse_args()
    vis_path = run_doctr(
        cli.image,
        output_dir=cli.output_dir,
        det_arch=cli.det_arch,
        reco_arch=cli.reco_arch,
    )
    print(f"Saved visualization to: {vis_path}")
|
||||
|
||||
|
Reference in New Issue
Block a user