update source code and pipeline

2025-09-04 14:39:02 +00:00
parent 9aabd991c5
commit 878310a551
82 changed files with 24373 additions and 0 deletions
--- a/filter/analyze_labels.py
+++ b/filter/analyze_labels.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""
+Analyze 'label' fields in a JSON dataset and produce summaries.
+
+- Handles entries where 'label' is either an object or a list of objects.
+- Computes distributions (is_bill, profession, currency, IDs presence, handwriting/rotation).
+- Computes numeric stats (total_billed, amount_paid, remaining_payment, coverages).
+- Parses dates and shows temporal distribution.
+- Analyzes items: count, sum of amounts and coverages, and mismatches vs total_billed.
+- Emits a concise stdout summary and writes CSVs, a Markdown report, and (when matplotlib is available and --no-plots is not given) plot images.
+
+Usage:
+  python analyze_labels.py --input 008_label_data_sample_seed_1997.json --out-dir .
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import math
+import re
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+from statistics import mean, median
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+
+# Label keys that main() coerces with to_float(); each one gets a numeric
+# summary in the report and is passed to produce_plots() for a histogram.
+NUMERIC_FIELDS = [
+    "total_billed",
+    "amount_paid",
+    "remaining_payment",
+    "client_part",
+    "mandatory_coverage",
+    "complementary_coverage",
+]
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the label analysis script.
+
+    Returns:
+        The namespace produced by ``argparse`` with the attributes ``input``,
+        ``out_dir``, ``max_professions``, ``no_plots``, ``plot_top_k`` and
+        ``plot_format``.
+    """
+    parser = argparse.ArgumentParser(description="Analyze 'label' fields in JSON dataset")
+    # (flag, keyword arguments) pairs, registered in this order so that the
+    # generated --help output keeps the same layout.
+    option_specs = [
+        ("--input", {"required": True, "help": "Path to JSON file (list of records)"}),
+        (
+            "--out-dir",
+            {"default": None, "help": "Output directory for reports (default: alongside input)"},
+        ),
+        (
+            "--max-professions",
+            {"type": int, "default": 50, "help": "Max professions to list in report"},
+        ),
+        (
+            "--no-plots",
+            {
+                "action": "store_true",
+                "help": "Disable generating plots (PNG) and embedding into report",
+            },
+        ),
+        (
+            "--plot-top-k",
+            {
+                "type": int,
+                "default": 20,
+                "help": "Top-K categories to visualize for profession/currency",
+            },
+        ),
+        (
+            "--plot-format",
+            {
+                "type": str,
+                "default": "png",
+                "choices": ["png", "jpg", "jpeg"],
+                "help": "Image format for plots",
+            },
+        ),
+    ]
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    return parser.parse_args()
+
+
+def load_json(path: Path) -> List[Dict[str, Any]]:
+    """Read the UTF-8 JSON file at *path* and return its top-level list.
+
+    Raises:
+        ValueError: if the decoded top-level value is not a list.
+    """
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    if isinstance(payload, list):
+        return payload
+    raise ValueError("Top-level JSON must be a list of records")
+
+
+def to_bool(value: Any) -> Optional[bool]:
+    """Interpret *value* as a boolean, returning None when it cannot be read as one.
+
+    Booleans pass through, ints and floats use their truthiness, and strings
+    are matched case-insensitively (after stripping whitespace) against a
+    fixed set of true/false spellings. Everything else, including None and
+    unrecognized strings, yields None.
+    """
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, (int, float)):
+        return bool(value)
+    if isinstance(value, str):
+        spellings = {
+            "true": True,
+            "t": True,
+            "1": True,
+            "yes": True,
+            "y": True,
+            "false": False,
+            "f": False,
+            "0": False,
+            "no": False,
+            "n": False,
+        }
+        # .get() gives None for any spelling that is not in the table.
+        return spellings.get(value.strip().lower())
+    return None
+
+
+def to_float(value: Any) -> Optional[float]:
+    """Convert *value* to a finite float, or return None when that is impossible.
+
+    Returns None for ``None``, the empty string, anything ``float()`` rejects,
+    integers too large to be represented as a float, and values that convert
+    to NaN or +/-infinity.
+    """
+    if value is None or value == "":
+        return None
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: an int too large to convert to a float.
+        return None
+    # float() accepts spellings such as "nan" and "inf"; treat those as
+    # unusable so they cannot leak into sums, means or histogram ranges.
+    return number if math.isfinite(number) else None
+
+
+def parse_date(value: Any) -> Optional[datetime]:
+    """Parse a date string, returning None when no supported date is found.
+
+    The stripped string is first matched as a whole against a fixed list of
+    formats (day-first variants and ISO ``YYYY-MM-DD``). If none fits, the
+    first date-like token with a four-digit year is extracted with a regular
+    expression and run through the same formats.
+    """
+    if not isinstance(value, str):
+        return None
+    text = value.strip()
+    if not text:
+        return None
+
+    # Common patterns (day-first), tried in this order.
+    known_formats = ("%d-%m-%Y", "%d/%m/%Y", "%Y-%m-%d", "%d-%m-%y", "%d/%m/%y")
+
+    def first_match(candidate: str) -> Optional[datetime]:
+        """Return the first successful strptime() result for *candidate*."""
+        for pattern in known_formats:
+            try:
+                return datetime.strptime(candidate, pattern)
+            except ValueError:
+                continue
+        return None
+
+    parsed = first_match(text)
+    if parsed is not None:
+        return parsed
+
+    # Fall back to a date-like token inside a longer string
+    # (e.g., 2025-02-07 or 07-02-2025).
+    found = re.search(r"(\d{2}[/-]\d{2}[/-]\d{4}|\d{4}-\d{2}-\d{2})", text)
+    if found is None:
+        return None
+    return first_match(found.group(1))
+
+
+def safe_get(d: Dict[str, Any], key: str, default=None):
+    """Return ``d[key]`` when *key* is present in *d*, otherwise *default*."""
+    if key in d:
+        return d[key]
+    return default
+
+
+def flatten_labels(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Return one shallow-copied dict per label found in *records*.
+
+    A record's ``label`` may be a single dict or a list of dicts; list entries
+    that are not dicts, and labels of any other type (including None), are
+    skipped. Each copy gains ``__source_image__`` (the record's ``image``, or
+    its ``image_files`` joined with commas) and, for labels that came from a
+    list, ``__multi_index__`` with the position in that list.
+    """
+    flattened: List[Dict[str, Any]] = []
+    for record in records:
+        source = record.get("image") or ",".join(record.get("image_files", []) or [])
+        label = record.get("label")
+        if isinstance(label, list):
+            candidates = list(enumerate(label))
+        elif isinstance(label, dict):
+            # A lone label has no list position; None marks "no index".
+            candidates = [(None, label)]
+        else:
+            continue
+        for position, candidate in candidates:
+            if not isinstance(candidate, dict):
+                continue
+            entry = dict(candidate)
+            entry["__source_image__"] = source
+            if position is not None:
+                entry["__multi_index__"] = position
+            flattened.append(entry)
+    return flattened
+
+
+def presence_counts(labels: List[Dict[str, Any]], fields: Iterable[str]) -> Dict[str, int]:
+    """Count, for each name in *fields*, the labels that carry a value for it.
+
+    A value counts as present unless it is None or the empty string.
+    """
+    return {
+        field: sum(1 for lbl in labels if safe_get(lbl, field) not in (None, ""))
+        for field in fields
+    }
+
+
+def numeric_summary(values: List[Optional[float]]) -> Dict[str, Any]:
+    """Summarize the usable numbers in *values*.
+
+    Entries that are not int/float, or that are NaN, are treated as missing.
+
+    Returns:
+        A dict that always contains ``count`` (usable entries) and ``missing``
+        (unusable entries). When at least one usable entry exists it also
+        contains ``min``, ``p25``, ``median``, ``p75``, ``max``, ``mean`` and
+        ``sum``.
+    """
+    clean = [v for v in values if isinstance(v, (int, float)) and not math.isnan(v)]
+    missing = len(values) - len(clean)
+    if not clean:
+        # Report the missing count here too, so a field that is absent from
+        # every label is distinguishable from an empty dataset.
+        return {"count": 0, "missing": missing}
+    return {
+        "count": len(clean),
+        "min": min(clean),
+        "p25": percentile(clean, 25),
+        "median": median(clean),
+        "p75": percentile(clean, 75),
+        "max": max(clean),
+        "mean": mean(clean),
+        "sum": sum(clean),
+        "missing": missing,
+    }
+
+
+def percentile(arr: List[float], p: float) -> float:
+    """Return the *p*-th percentile (0-100) of *arr* using linear interpolation.
+
+    The input is sorted into a new list; an empty input yields NaN.
+    """
+    if not arr:
+        return float("nan")
+    ordered = sorted(arr)
+    # Fractional index of the requested percentile in the sorted data.
+    position = (len(ordered) - 1) * (p / 100.0)
+    lower = math.floor(position)
+    upper = math.ceil(position)
+    if lower == upper:
+        # The position lands exactly on an element; no interpolation needed.
+        return ordered[int(position)]
+    return ordered[lower] * (upper - position) + ordered[upper] * (position - lower)
+
+
+def write_csv(path: Path, headers: List[str], rows: Iterable[Iterable[Any]]) -> None:
+    """Write *headers* followed by every row of *rows* to a UTF-8 CSV at *path*.
+
+    Missing parent directories are created; an existing file is overwritten.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.writer(handle)
+        writer.writerow(headers)
+        writer.writerows(rows)
+
+
+def try_import_matplotlib():
+    """Return ``matplotlib.pyplot`` set up with the Agg backend, or None.
+
+    Any exception raised while importing or selecting the backend is
+    swallowed and reported as None.
+    """
+    try:
+        import matplotlib  # type: ignore[import-not-found]
+        matplotlib.use("Agg")  # headless backend
+        import matplotlib.pyplot as pyplot  # type: ignore[import-not-found]
+    except Exception:
+        return None
+    else:
+        return pyplot
+
+
+def save_bar_plot(plt, x_labels: List[str], values: List[float], title: str, out_path: Path, rotation: int = 0):
+    """Draw a bar chart of *values* labelled with *x_labels* and save it to *out_path*.
+
+    Args:
+        plt: the pyplot module used for drawing.
+        x_labels: tick labels, one per bar.
+        values: bar heights.
+        title: chart title.
+        out_path: destination image file; parent directories are created.
+        rotation: tick-label rotation in degrees; a non-zero value also
+            right-aligns the labels.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # Figure width grows with the number of labels, clamped to 6..14.
+    width = max(6, min(14, 0.4 * len(x_labels) + 3))
+    fig, ax = plt.subplots(figsize=(width, 4))
+    ax.bar(range(len(values)), values, color="#4C78A8")
+    ax.set_title(title)
+    ax.set_ylabel("count")
+    ax.set_xticks(range(len(x_labels)))
+    if rotation:
+        alignment = "right"
+    else:
+        alignment = "center"
+    ax.set_xticklabels(x_labels, rotation=rotation, ha=alignment)
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    plt.close(fig)
+
+
+def save_hist_plot(plt, values: List[float], title: str, out_path: Path, bins: int = 30):
+    """Draw a histogram of *values* with *bins* bins and save it to *out_path*.
+
+    *plt* is the pyplot module used for drawing; parent directories of
+    *out_path* are created when missing.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    figure, axes = plt.subplots(figsize=(7, 4))
+    axes.hist(values, bins=bins, color="#72B7B2", edgecolor="white")
+    axes.set_title(title)
+    axes.set_ylabel("count")
+    axes.set_xlabel("value")
+    figure.tight_layout()
+    figure.savefig(out_path, dpi=150)
+    # Close the figure so repeated calls do not accumulate open figures.
+    plt.close(figure)
+
+
+def produce_plots(
+    out_dir: Path,
+    args: argparse.Namespace,
+    is_bill_counter: Counter,
+    bill_paid_counter: Counter,
+    handwriting_counter: Counter,
+    rotation_counter: Counter,
+    profession_counter: Counter,
+    currency_counter: Counter,
+    year_month_counter: Counter,
+    numeric_data: Dict[str, List[Optional[float]]],
+    items_per_label: List[int],
+) -> List[Path]:
+    """Render every available plot under ``out_dir / "plots"``.
+
+    Returns the created file paths in creation order. The result is empty when
+    ``args.no_plots`` is set or matplotlib cannot be imported; an input that
+    is empty simply produces no plot of its kind.
+    """
+    if args.no_plots:
+        return []
+    plt = try_import_matplotlib()
+    if plt is None:
+        # matplotlib not available; skip plotting gracefully
+        return []
+
+    created: List[Path] = []
+    plots_dir = out_dir / "plots"
+    ext = args.plot_format
+
+    # Plain bar charts that share one recipe: (counter, file stem, title).
+    simple_bars = (
+        (is_bill_counter, "is_bill", "is_bill distribution"),
+        (bill_paid_counter, "bill_paid", "bill_paid distribution"),
+        (handwriting_counter, "is_handwriting", "is_handwriting"),
+        (rotation_counter, "is_rotated", "is_rotated"),
+    )
+    for counter, stem, title in simple_bars:
+        if not counter:
+            continue
+        names = [str(key) for key in counter]
+        counts = list(counter.values())
+        target = plots_dir / f"{stem}.{ext}"
+        save_bar_plot(plt, names, counts, title, target)
+        created.append(target)
+
+    # Professions (top-K), with long names shortened to 17 characters + ellipsis.
+    if profession_counter:
+        limit = max(1, min(args.plot_top_k, len(profession_counter)))
+        ranked = profession_counter.most_common(limit)
+        names = [
+            name if len(str(name)) <= 20 else str(name)[:17] + "…"
+            for name, _ in ranked
+        ]
+        counts = [count for _, count in ranked]
+        target = plots_dir / f"professions_top{len(names)}.{ext}"
+        save_bar_plot(plt, names, counts, f"Top {len(names)} professions", target, rotation=45)
+        created.append(target)
+
+    # Currency (top-K)
+    if currency_counter:
+        limit = max(1, min(args.plot_top_k, len(currency_counter)))
+        ranked = currency_counter.most_common(limit)
+        names = [str(name) for name, _ in ranked]
+        counts = [count for _, count in ranked]
+        target = plots_dir / f"currency.{ext}"
+        save_bar_plot(plt, names, counts, "Currency distribution", target)
+        created.append(target)
+
+    # Year-month, in chronological order
+    if year_month_counter:
+        timeline = sorted(
+            year_month_counter.items(),
+            key=lambda entry: (entry[0][0], entry[0][1]),
+        )
+        names = [f"{year:04d}-{month:02d}" for (year, month), _ in timeline]
+        counts = [count for _, count in timeline]
+        target = plots_dir / f"invoice_year_month.{ext}"
+        save_bar_plot(plt, names, counts, "Invoices by year-month", target, rotation=45)
+        created.append(target)
+
+    # Items per label; bin count follows sqrt(n), clamped to 5..30.
+    if items_per_label:
+        bin_count = min(30, max(5, int(len(items_per_label) ** 0.5)))
+        target = plots_dir / f"items_per_label.{ext}"
+        save_hist_plot(plt, items_per_label, "Items per label (histogram)", target, bins=bin_count)
+        created.append(target)
+
+    # Numeric fields histograms; fields without usable numbers are skipped.
+    for field, raw_values in numeric_data.items():
+        usable = [
+            float(v)
+            for v in raw_values
+            if isinstance(v, (int, float)) and not math.isnan(v)
+        ]
+        if not usable:
+            continue
+        target = plots_dir / f"hist_{field}.{ext}"
+        save_hist_plot(plt, usable, f"{field} (histogram)", target)
+        created.append(target)
+
+    return created
+
+
+def main() -> None:
+    """Run the full label analysis: load, normalize, summarize and write outputs.
+
+    Reads the JSON file named by ``--input``, flattens every record's
+    ``label`` into one dict per label, and writes into the output directory:
+
+    - seven CSV files (profession, currency, is_bill and bill_paid counts,
+      field presence, total_billed-vs-items mismatches, data-quality issues),
+    - ``label_analysis_report.md``,
+    - plots under ``plots/`` unless ``--no-plots`` is given or matplotlib
+      cannot be imported.
+
+    A short summary is printed to stdout at the end.
+    """
+    args = parse_args()
+    in_path = Path(args.input).resolve()
+    out_dir = Path(args.out_dir).resolve() if args.out_dir else in_path.parent
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    records = load_json(in_path)
+    labels = flatten_labels(records)
+    n_total_rec = len(records)
+    n_labels = len(labels)
+
+    # Normalize some fields in place. to_bool() maps everything it cannot
+    # interpret to None, so record which labels carried an is_bill value that
+    # failed to parse *before* overwriting it; afterwards that fact is lost.
+    unparsed_is_bill: List[bool] = []
+    for lbl in labels:
+        raw_is_bill = lbl.get("is_bill")
+        parsed_is_bill = to_bool(raw_is_bill)
+        unparsed_is_bill.append(raw_is_bill not in (None, "") and parsed_is_bill is None)
+        lbl["is_bill"] = parsed_is_bill
+        lbl["bill_paid"] = to_bool(lbl.get("bill_paid"))
+        # Normalize numeric fields in-place for ease of stats
+        for k in NUMERIC_FIELDS:
+            lbl[k] = to_float(lbl.get(k))
+
+    # Basic distributions
+    is_bill_counter = Counter(lbl.get("is_bill") for lbl in labels)
+    bill_paid_counter = Counter(lbl.get("bill_paid") for lbl in labels)
+    currency_counter = Counter(lbl.get("currency") for lbl in labels if lbl.get("currency"))
+    # str() keeps a non-string profession value from crashing .strip().
+    profession_counter = Counter(
+        str(lbl.get("profession") or "").strip() or "(missing)" for lbl in labels
+    )
+
+    # Presence of identifiers and key fields
+    id_presence = presence_counts(labels, [
+        "adeli_number",
+        "rpps_number",
+        "finess_number",
+        "prescripteur_finess_number",
+        "doctor_name",
+        "invoice_issuer",
+        "insured_name",
+        "beneficiary_name",
+        "security_number",
+        "currency",
+    ])
+
+    # Handwriting/rotation flags
+    handwriting_counter = Counter(to_bool(lbl.get("is_handwriting")) for lbl in labels)
+    rotation_counter = Counter(to_bool(lbl.get("is_rotated")) for lbl in labels)
+
+    # Numeric stats
+    numeric_stats: Dict[str, Dict[str, Any]] = {}
+    for k in NUMERIC_FIELDS:
+        numeric_stats[k] = numeric_summary([lbl.get(k) for lbl in labels])
+    # Keep raw numeric data for histograms
+    numeric_raw: Dict[str, List[Optional[float]]] = {k: [lbl.get(k) for lbl in labels] for k in NUMERIC_FIELDS}
+
+    # Dates
+    invoice_dates = [parse_date(lbl.get("invoice_date")) for lbl in labels]
+    invoice_dates_clean = [d for d in invoice_dates if d is not None]
+    year_month_counter = Counter((d.year, d.month) for d in invoice_dates_clean)
+
+    # Items analysis
+    items_per_label: List[int] = []
+    sum_item_amount: List[Optional[float]] = []
+    sum_item_mandatory: List[Optional[float]] = []
+    mismatch_records: List[Tuple[str, Optional[float], Optional[float], Optional[float]]] = []
+
+    for lbl in labels:
+        items = lbl.get("items") or []
+        if not isinstance(items, list):
+            items = []
+        items_per_label.append(len(items))
+        # The sums stay None when the label has no dict items, so "no items"
+        # is not confused with "items summing to zero".
+        s_amount = None
+        s_mand = None
+        for it in items:
+            if not isinstance(it, dict):
+                continue
+            a = to_float(it.get("amount"))
+            m = to_float(it.get("mandatory_coverage"))
+            s_amount = (s_amount or 0.0) + (a or 0.0)
+            s_mand = (s_mand or 0.0) + (m or 0.0)
+        sum_item_amount.append(s_amount)
+        sum_item_mandatory.append(s_mand)
+        total_billed = lbl.get("total_billed")  # already normalized above
+        if total_billed is not None and s_amount is not None:
+            diff = total_billed - s_amount
+            # Small tolerance absorbs floating-point rounding in the sum.
+            if abs(diff) > 1e-6:
+                mismatch_records.append((
+                    str(lbl.get("__source_image__")), total_billed, s_amount, diff
+                ))
+
+    # Data quality issues
+    issues: List[Dict[str, Any]] = []
+    for lbl, bad_is_bill in zip(labels, unparsed_is_bill):
+        src = str(lbl.get("__source_image__"))
+        # is_bill was present but could not be interpreted as a boolean
+        if bad_is_bill:
+            issues.append({"source": src, "issue": "is_bill not boolean"})
+        # bill_paid True but amount_paid missing
+        if lbl.get("bill_paid") is True and lbl.get("amount_paid") is None:
+            issues.append({"source": src, "issue": "bill_paid True but amount_paid missing"})
+        # remaining_payment > 0 but bill_paid True
+        rp = lbl.get("remaining_payment")
+        if lbl.get("bill_paid") is True and (rp or 0) > 0:
+            issues.append({"source": src, "issue": "bill_paid True but remaining_payment > 0"})
+        # Negative amounts on items
+        items = lbl.get("items") or []
+        if isinstance(items, list):
+            for idx, it in enumerate(items):
+                if not isinstance(it, dict):
+                    continue
+                a = to_float(it.get("amount"))
+                if a is not None and a < 0:
+                    issues.append({"source": src, "issue": f"item[{idx}].amount negative: {a}"})
+                q = to_float(it.get("quantity"))
+                if q is None:
+                    # Not strictly an issue, but mark for completeness
+                    issues.append({"source": src, "issue": f"item[{idx}].quantity missing"})
+        # Missing currency on bill
+        if lbl.get("is_bill") is True and not lbl.get("currency"):
+            issues.append({"source": src, "issue": "currency missing for bill"})
+
+    # Outputs
+    # 1) CSVs
+    write_csv(out_dir / "professions_counts.csv", ["profession", "count"], profession_counter.most_common())
+    write_csv(out_dir / "currency_counts.csv", ["currency", "count"], currency_counter.most_common())
+    write_csv(out_dir / "is_bill_counts.csv", ["is_bill", "count"], is_bill_counter.items())
+    write_csv(out_dir / "bill_paid_counts.csv", ["bill_paid", "count"], bill_paid_counter.items())
+    write_csv(out_dir / "id_presence.csv", ["field", "present_count"], id_presence.items())
+    write_csv(out_dir / "item_total_billed_mismatches.csv", ["source_image", "total_billed", "sum_item_amount", "diff"], mismatch_records)
+    write_csv(out_dir / "issues.csv", ["source", "issue"], ((i["source"], i["issue"]) for i in issues))
+
+    # 2) Markdown report
+    md = []
+    md.append("# Label Analysis Report\n")
+    md.append(f"Input: `{in_path.name}`\n")
+    md.append("")
+    md.append("## Overview\n")
+    md.append(f"- Total records: {n_total_rec}")
+    md.append(f"- Total labels (flattened): {n_labels}")
+    md.append(f"- is_bill distribution: {dict(is_bill_counter)}")
+    md.append(f"- bill_paid distribution: {dict(bill_paid_counter)}")
+    if invoice_dates_clean:
+        md.append(
+            f"- Invoice dates span: {min(invoice_dates_clean).date()} .. {max(invoice_dates_clean).date()}"
+        )
+        md.append(f"- Unique year-month pairs: {len(year_month_counter)}")
+    else:
+        md.append("- Invoice dates: none parseable")
+
+    md.append("\n## Professions (top)\n")
+    for prof, cnt in profession_counter.most_common(args.max_professions):
+        md.append(f"- {prof}: {cnt}")
+
+    md.append("\n## Currency distribution\n")
+    for cur, cnt in currency_counter.most_common():
+        md.append(f"- {cur}: {cnt}")
+
+    md.append("\n## Identifier and key field presence\n")
+    for k, v in id_presence.items():
+        md.append(f"- {k}: {v} present")
+
+    md.append("\n## Flags\n")
+    md.append(f"- is_handwriting: {dict(handwriting_counter)}")
+    md.append(f"- is_rotated: {dict(rotation_counter)}")
+
+    md.append("\n## Numeric summaries\n")
+    for k, stats in numeric_stats.items():
+        md.append(f"- {k}: {stats}")
+
+    if items_per_label:
+        md.append("\n## Items analysis\n")
+        md.append(f"- Items per label: count={len(items_per_label)}, min={min(items_per_label)}, max={max(items_per_label)}, mean={mean(items_per_label):.2f}")
+        # Per-label item sums; labels without items count as missing.
+        md.append(f"- sum(items.amount) per label: {numeric_summary(sum_item_amount)}")
+        md.append(f"- sum(items.mandatory_coverage) per label: {numeric_summary(sum_item_mandatory)}")
+        n_mismatch = len(mismatch_records)
+        md.append(f"- total_billed vs sum(items.amount) mismatches: {n_mismatch}")
+
+    if issues:
+        md.append("\n## Data quality issues (sample)\n")
+        for row in issues[:50]:
+            md.append(f"- {row['source']}: {row['issue']}")
+
+    # 3) Plots (if enabled)
+    created_plots = produce_plots(
+        out_dir=out_dir,
+        args=args,
+        is_bill_counter=is_bill_counter,
+        bill_paid_counter=bill_paid_counter,
+        handwriting_counter=handwriting_counter,
+        rotation_counter=rotation_counter,
+        profession_counter=profession_counter,
+        currency_counter=currency_counter,
+        year_month_counter=year_month_counter,
+        numeric_data=numeric_raw,
+        items_per_label=items_per_label,
+    )
+    if created_plots:
+        md.append("\n## Plots\n")
+        for p in created_plots:
+            # Link relative to the report so the output directory can be moved.
+            rel = p.relative_to(out_dir)
+            md.append(f"- {p.stem}")
+            md.append(f"![]({rel.as_posix()})\n")
+    elif not args.no_plots:
+        md.append("\n## Plots\n")
+        md.append("- matplotlib not available or no data to plot.")
+
+    report_path = out_dir / "label_analysis_report.md"
+    report_path.write_text("\n".join(md), encoding="utf-8")
+
+    # Console summary
+    print("Label analysis complete.")
+    print(f"- Records: {n_total_rec}, Labels: {n_labels}")
+    print(f"- is_bill: {dict(is_bill_counter)} | bill_paid: {dict(bill_paid_counter)}")
+    print(f"- Professions (top 10): {profession_counter.most_common(10)}")
+    print(f"- Currency: {dict(currency_counter)}")
+    print(f"Report written to: {report_path}")
+    if created_plots:
+        print(f"- Plots saved under: {(out_dir / 'plots').as_posix()} ({len(created_plots)} files)")
+
+
+# Run the analysis only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()