update ressult

2025-09-11 09:39:02 +00:00
parent 3cbf74149a
commit 0d6091fc0c
33 changed files with 1429 additions and 875 deletions
--- a/scripts/compute_hallucination_from_anls.py
+++ b/scripts/compute_hallucination_from_anls.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+"""
+Compute ANLS and hallucination scores from per-sample evaluation JSONs and plot results.
+
+Inputs: one or more JSON files with schema like:
+[
+  {
+    "image": "image (22)",
+    "num_pred_fields": 10,
+    "num_gt_fields": 12,
+    "num_correct": 9,
+    "all_correct": false,
+    "fields": [
+      {"field": "address", "pred": "...", "gt": "...", "correct": true},
+      ...
+    ]
+  },
+  ...
+]
+
+ANLS definition: average normalized Levenshtein similarity over fields present in ground truth.
+Here we approximate per-field similarity as:
+  sim = 1 - (levenshtein_distance(pred, gt) / max(len(pred), len(gt)))
+  clipped into [0, 1], and treat empty max length as exact match (1.0).
+
+Per-image ANLS is the mean of field similarities for that image. Hallucination is 1 - ANLS.
+
+Outputs:
+- CSV per input JSON placed next to it: per_image_anls.csv with columns [image, anls, hallucination_score, num_fields]
+- PNG bar chart per input JSON: hallucination_per_image.png with mean line and title.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def levenshtein_distance(a: str, b: str) -> int:
+    """Compute Levenshtein distance between two strings (iterative DP)."""
+    if a == b:
+        return 0
+    if len(a) == 0:
+        return len(b)
+    if len(b) == 0:
+        return len(a)
+    previous_row = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        current_row = [i]
+        for j, cb in enumerate(b, start=1):
+            insertions = previous_row[j] + 1
+            deletions = current_row[j - 1] + 1
+            substitutions = previous_row[j - 1] + (0 if ca == cb else 1)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+    return previous_row[-1]
+
+
+def normalized_similarity(pred: str, gt: str) -> float:
+    """Return 1 - normalized edit distance in [0, 1]."""
+    pred = pred or ""
+    gt = gt or ""
+    max_len = max(len(pred), len(gt))
+    if max_len == 0:
+        return 1.0
+    dist = levenshtein_distance(pred, gt)
+    sim = 1.0 - (dist / max_len)
+    if sim < 0.0:
+        return 0.0
+    if sim > 1.0:
+        return 1.0
+    return sim
+
+
+def compute_anls_for_record(record: Dict) -> Tuple[float, int]:
+    """Compute ANLS and number of fields for a single record object."""
+    fields = record.get("fields") or []
+    if not isinstance(fields, list) or len(fields) == 0:
+        return 0.0, 0
+    sims: List[float] = []
+    for f in fields:
+        pred = str(f.get("pred", ""))
+        gt = str(f.get("gt", ""))
+        sims.append(normalized_similarity(pred, gt))
+    anls = float(sum(sims) / len(sims)) if sims else 0.0
+    return anls, len(sims)
+
+
+def process_json(json_path: Path) -> Path:
+    with json_path.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    rows = []
+    for rec in data:
+        image_name = rec.get("image")
+        anls, num_fields = compute_anls_for_record(rec)
+        hallucination = 1.0 - anls
+        rows.append({
+            "image": image_name,
+            "anls": anls,
+            "hallucination_score": hallucination,
+            "num_fields": int(num_fields),
+        })
+
+    df = pd.DataFrame(rows)
+    out_csv = json_path.parent / "per_image_anls.csv"
+    df.to_csv(out_csv, index=False)
+
+    # Plot hallucination bar chart with mean line
+    if len(df) > 0:
+        sorted_df = df.sort_values("hallucination_score", ascending=False).reset_index(drop=True)
+        plt.figure(figsize=(max(8, len(sorted_df) * 0.12), 5))
+        plt.bar(range(len(sorted_df)), sorted_df["hallucination_score"].values, color="#1f77b4")
+        mean_val = float(sorted_df["hallucination_score"].mean())
+        plt.axhline(mean_val, color="red", linestyle="--", label=f"Mean={mean_val:.3f}")
+        plt.xlabel("Image (sorted by hallucination)")
+        plt.ylabel("Hallucination = 1 - ANLS")
+        plt.title(f"Hallucination per image: {json_path.parent.name}")
+        plt.legend()
+        plt.tight_layout()
+        out_png = json_path.parent / "hallucination_per_image.png"
+        plt.savefig(out_png, dpi=150)
+        plt.close()
+
+    return out_csv
+
+
+def common_parent(paths: List[Path]) -> Path:
+    if not paths:
+        return Path.cwd()
+    common = Path(Path(paths[0]).anchor)
+    parts = list(Path(paths[0]).resolve().parts)
+    for i in range(1, len(paths)):
+        other_parts = list(Path(paths[i]).resolve().parts)
+        # shrink parts to common prefix
+        new_parts: List[str] = []
+        for a, b in zip(parts, other_parts):
+            if a == b:
+                new_parts.append(a)
+            else:
+                break
+        parts = new_parts
+    if not parts:
+        return Path.cwd()
+    return Path(*parts)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Compute ANLS and hallucination from per-sample JSONs and plot results.")
+    parser.add_argument("inputs", nargs="+", help="Paths to per_sample_eval.json files")
+    args = parser.parse_args()
+
+    any_error = False
+    combined_rows: List[Dict] = []
+    input_paths: List[Path] = []
+    for in_path_str in args.inputs:
+        path = Path(in_path_str)
+        if not path.exists():
+            print(f"[WARN] File does not exist: {path}", file=sys.stderr)
+            any_error = True
+            continue
+        try:
+            out_csv = process_json(path)
+            print(f"Processed: {path} -> {out_csv}")
+            # Load just-written CSV to aggregate and tag method
+            df = pd.read_csv(out_csv)
+            method_name = path.parent.name
+            df["method"] = method_name
+            combined_rows.extend(df.to_dict(orient="records"))
+            input_paths.append(path)
+        except Exception as exc:
+            print(f"[ERROR] Failed to process {path}: {exc}", file=sys.stderr)
+            any_error = True
+    # Create combined outputs if we have multiple inputs
+    if combined_rows:
+        combo_df = pd.DataFrame(combined_rows)
+        # Reorder columns
+        cols = ["image", "method", "anls", "hallucination_score", "num_fields"]
+        combo_df = combo_df[cols]
+        base_outdir = common_parent(input_paths)
+        combined_dir = base_outdir / "combined_anls"
+        combined_dir.mkdir(parents=True, exist_ok=True)
+        combined_csv = combined_dir / "combined_per_image_anls.csv"
+        combo_df.to_csv(combined_csv, index=False)
+
+        # Mean hallucination per method (bar chart)
+        means = combo_df.groupby("method")["hallucination_score"].mean().sort_values(ascending=False)
+        stds = combo_df.groupby("method")["hallucination_score"].std().reindex(means.index)
+        plt.figure(figsize=(max(6, len(means) * 1.2), 5))
+        plt.bar(means.index, means.values, yerr=stds.values, capsize=4, color="#2ca02c")
+        overall_mean = float(combo_df["hallucination_score"].mean())
+        plt.axhline(overall_mean, color="red", linestyle="--", label=f"Overall mean={overall_mean:.3f}")
+        plt.ylabel("Mean hallucination (1 - ANLS)")
+        plt.title("Mean hallucination by method")
+        plt.xticks(rotation=20, ha="right")
+        plt.legend()
+        plt.tight_layout()
+        bar_png = combined_dir / "mean_hallucination_by_method.png"
+        plt.savefig(bar_png, dpi=160)
+        plt.close()
+
+        # Heatmap: images x methods (hallucination)
+        pivot = combo_df.pivot_table(index="image", columns="method", values="hallucination_score", aggfunc="mean")
+        # Sort images by average hallucination descending for readability
+        pivot = pivot.reindex(pivot.mean(axis=1).sort_values(ascending=False).index)
+        plt.figure(figsize=(max(8, len(pivot.columns) * 1.0), max(6, len(pivot.index) * 0.25)))
+        im = plt.imshow(pivot.values, aspect="auto", cmap="viridis")
+        plt.colorbar(im, label="Hallucination (1 - ANLS)")
+        plt.xticks(range(len(pivot.columns)), pivot.columns, rotation=30, ha="right")
+        plt.yticks(range(len(pivot.index)), pivot.index)
+        plt.title("Hallucination per image across methods")
+        plt.tight_layout()
+        heatmap_png = combined_dir / "hallucination_heatmap.png"
+        plt.savefig(heatmap_png, dpi=160)
+        plt.close()
+
+        print(f"Combined CSV: {combined_csv}")
+        print(f"Saved: {bar_png}")
+        print(f"Saved: {heatmap_png}")
+
+        # Line chart: 1 line per method over images, hide image names
+        # Use same image order as pivot
+        methods = list(pivot.columns)
+        x = list(range(len(pivot.index)))
+        plt.figure(figsize=(max(10, len(x) * 0.12), 5))
+        colors = plt.rcParams['axes.prop_cycle'].by_key().get('color', ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd'])
+        for idx, method in enumerate(methods):
+            y = pivot[method].to_numpy()
+            plt.plot(x, y, label=method, linewidth=1.8, color=colors[idx % len(colors)])
+        plt.ylim(0.0, 1.0)
+        plt.xlabel("Images (sorted by overall hallucination)")
+        plt.ylabel("Hallucination (1 - ANLS)")
+        plt.title("Hallucination across images by method")
+        plt.xticks([], [])  # hide image names
+        # Mean note box
+        mean_lines = []
+        for method in methods:
+            m = float(combo_df[combo_df["method"] == method]["hallucination_score"].mean())
+            mean_lines.append(f"{method}: {m:.3f}")
+        text = "\n".join(mean_lines)
+        plt.gca().text(0.99, 0.01, text, transform=plt.gca().transAxes,
+                       fontsize=9, va='bottom', ha='right',
+                       bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, edgecolor='gray'))
+        plt.legend(loc="upper right", ncol=min(3, len(methods)))
+        plt.tight_layout()
+        line_png = combined_dir / "hallucination_lines_by_method.png"
+        plt.savefig(line_png, dpi=160)
+        plt.close()
+
+        # Grouped-by-image interlocking line chart with image labels
+        # Build a consistent x position per image, with small offsets per method
+        base_x = list(range(len(pivot.index)))
+        offsets = {
+            m: ((i - (len(methods) - 1) / 2) * 0.12) for i, m in enumerate(methods)
+        }
+        # Cap width to avoid extremely long images; dynamic but limited
+        width = min(16, max(10, len(base_x) * 0.12))
+        plt.figure(figsize=(width, 6))
+        for idx, method in enumerate(methods):
+            # Fill missing values with 0 to connect lines seamlessly
+            y = pivot[method].fillna(0.0).to_numpy()
+            x_shifted = [bx + offsets[method] for bx in base_x]
+            plt.plot(x_shifted, y, label=method, linewidth=1.8, marker='o', markersize=3,
+                     color=colors[idx % len(colors)])
+        plt.ylim(0.0, 1.0)
+        plt.xlim(-0.5, len(base_x) - 0.5)
+        # Hide image names; keep index ticks sparse for readability
+        plt.xticks([], [])
+        plt.xlabel("Images (index)")
+        plt.ylabel("Hallucination (1 - ANLS)")
+        plt.title("Hallucination by image (interlocked methods)")
+        plt.grid(axis='y', linestyle='--', alpha=0.3)
+        # Add box with per-method mean
+        text2 = "\n".join([f"{m}: {float(combo_df[combo_df['method']==m]['hallucination_score'].mean()):.3f}" for m in methods])
+        plt.gca().text(0.99, 0.01, text2, transform=plt.gca().transAxes,
+                       fontsize=9, va='bottom', ha='right',
+                       bbox=dict(boxstyle='round', facecolor='white', alpha=0.8, edgecolor='gray'))
+        plt.legend(loc='upper right', ncol=min(3, len(methods)))
+        plt.tight_layout()
+        group_line_png = combined_dir / "hallucination_interlocked_by_image.png"
+        plt.savefig(group_line_png, dpi=160)
+        plt.close()
+
+        print(f"Combined CSV: {combined_csv}")
+        print(f"Saved: {bar_png}")
+        print(f"Saved: {heatmap_png}")
+        print(f"Saved: {line_png}")
+        print(f"Saved: {group_line_png}")
+
+    if any_error:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
+
+
--- a/scripts/pipeline_compare.py
+++ b/scripts/pipeline_compare.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+"""
+Compare hallucination across three pipelines over five preprocessing methods:
+  1) Raw: all images
+  2) DeQA-filtered: keep images with DeQA score >= threshold (default 2.6)
+  3) Human-filtered: keep images labeled High in CSV labels
+
+Inputs:
+  - One or more per_sample_eval.json files (or per_image_anls.csv already generated)
+  - DeQA score file (txt): lines like "3.9  - image (9)_0.png"
+  - Human labels CSV with columns: filename,label where label in {High,Low}
+
+Outputs:
+  - Combined means CSV: method vs mean hallucination for each pipeline
+  - Line chart (3 lines): hallucination mean per method across the three pipelines
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def canonical_key(name: str) -> str:
+    """Map various filenames to a canonical key used by per_sample_eval 'image' field.
+
+    Examples:
+      - "image (9)_0.png" -> "image (9)"
+      - "image (22)" -> "image (22)"
+      - "foo/bar/image (15)_3.jpg" -> "image (15)"
+      - other names -> stem without extension
+    """
+    if not name:
+        return name
+    # Keep only basename
+    base = Path(name).name
+    # Try pattern image (N)
+    m = re.search(r"(image \(\d+\))", base, flags=re.IGNORECASE)
+    if m:
+        return m.group(1)
+    # Fallback: remove extension
+    return Path(base).stem
+
+
+def read_deqa_scores(txt_path: Path) -> Dict[str, float]:
+    scores: Dict[str, float] = {}
+    with txt_path.open("r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            # Accept formats: "3.9  - filename" or "filename,3.9" etc.
+            m = re.match(r"\s*([0-9]+(?:\.[0-9]+)?)\s*[-,:]?\s*(.+)$", line)
+            if m:
+                score = float(m.group(1))
+                filename = m.group(2)
+            else:
+                parts = re.split(r"[,\t]", line)
+                if len(parts) >= 2:
+                    try:
+                        score = float(parts[1])
+                        filename = parts[0]
+                    except Exception:
+                        continue
+                else:
+                    continue
+            key = canonical_key(filename)
+            scores[key] = score
+    return scores
+
+
+def read_human_labels(csv_path: Path) -> Dict[str, str]:
+    labels: Dict[str, str] = {}
+    with csv_path.open("r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            filename = (row.get("filename") or row.get("file") or "").strip()
+            label = (row.get("label") or row.get("Label") or "").strip()
+            if not filename:
+                continue
+            key = canonical_key(filename)
+            if label:
+                labels[key] = label
+    return labels
+
+
+def levenshtein_distance(a: str, b: str) -> int:
+    if a == b:
+        return 0
+    if len(a) == 0:
+        return len(b)
+    if len(b) == 0:
+        return len(a)
+    previous_row = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        current_row = [i]
+        for j, cb in enumerate(b, start=1):
+            insertions = previous_row[j] + 1
+            deletions = current_row[j - 1] + 1
+            substitutions = previous_row[j - 1] + (0 if ca == cb else 1)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+    return previous_row[-1]
+
+
+def normalized_similarity(pred: str, gt: str) -> float:
+    pred = pred or ""
+    gt = gt or ""
+    max_len = max(len(pred), len(gt))
+    if max_len == 0:
+        return 1.0
+    dist = levenshtein_distance(pred, gt)
+    sim = 1.0 - (dist / max_len)
+    if sim < 0.0:
+        return 0.0
+    if sim > 1.0:
+        return 1.0
+    return sim
+
+
+def compute_anls_for_record(record: Dict) -> tuple[float, int]:
+    fields = record.get("fields") or []
+    if not isinstance(fields, list) or len(fields) == 0:
+        return 0.0, 0
+    sims: List[float] = []
+    for f in fields:
+        pred = str(f.get("pred", ""))
+        gt = str(f.get("gt", ""))
+        sims.append(normalized_similarity(pred, gt))
+    anls = float(sum(sims) / len(sims)) if sims else 0.0
+    return anls, len(sims)
+
+
+def load_per_image_anls(input_json: Path) -> pd.DataFrame:
+    # Prefer existing per_image_anls.csv, otherwise compute quickly
+    per_image_csv = input_json.parent / "per_image_anls.csv"
+    if per_image_csv.exists():
+        df = pd.read_csv(per_image_csv)
+        return df
+    # Fallback: compute minimal ANLS like in the other script
+    with input_json.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    rows = []
+    for rec in data:
+        anls, num_fields = compute_anls_for_record(rec)
+        rows.append({
+            "image": rec.get("image"),
+            "anls": anls,
+            "hallucination_score": 1.0 - anls,
+            "num_fields": int(num_fields),
+        })
+    return pd.DataFrame(rows)
+
+
+def main() -> None:
+    p = argparse.ArgumentParser(description="Compare hallucination across raw/DeQA/Human pipelines over methods")
+    p.add_argument("inputs", nargs="+", help="per_sample_eval.json files for each method")
+    p.add_argument("--deqa_txt", required=True, help="Path to DeQA scores txt (e.g., cni.txt)")
+    p.add_argument("--human_csv", required=True, help="Path to human labels CSV")
+    p.add_argument("--deqa_threshold", type=float, default=2.6, help="DeQA threshold (>=)")
+    args = p.parse_args()
+
+    # Load filters
+    deqa_scores = read_deqa_scores(Path(args.deqa_txt))
+    human_labels = read_human_labels(Path(args.human_csv))
+
+    # Aggregate per method
+    method_to_df: Dict[str, pd.DataFrame] = {}
+    for ip in args.inputs:
+        path = Path(ip)
+        df = load_per_image_anls(path)
+        df["method"] = path.parent.name
+        df["image_key"] = df["image"].apply(canonical_key)
+        method_to_df[path.parent.name] = df
+
+    # Compute means per pipeline (fair comparison: set excluded images to hallucination=0)
+    records = []
+    for method, df in method_to_df.items():
+        raw_mean = float(df["hallucination_score"].mean()) if len(df) else float("nan")
+
+        # DeQA filter: mark DeQA < threshold as hallucination=0, keep all images
+        df_deqa = df.copy()
+        mask_deqa = df_deqa["image_key"].map(lambda k: deqa_scores.get(k, None))
+        # Set hallucination=0 for images with DeQA < threshold (or missing DeQA)
+        df_deqa.loc[mask_deqa.isna() | (mask_deqa < args.deqa_threshold), "hallucination_score"] = 0.0
+        deqa_mean = float(df_deqa["hallucination_score"].mean()) if len(df_deqa) else float("nan")
+
+        # Human filter: mark Low labels as hallucination=0, keep all images
+        df_human = df.copy()
+        mask_human = df_human["image_key"].map(lambda k: human_labels.get(k, "").lower())
+        # Set hallucination=0 for images labeled Low (or missing label)
+        df_human.loc[mask_human != "high", "hallucination_score"] = 0.0
+        human_mean = float(df_human["hallucination_score"].mean()) if len(df_human) else float("nan")
+
+        records.append({
+            "method": method,
+            "raw_mean": raw_mean,
+            "deqa_mean": deqa_mean,
+            "human_mean": human_mean,
+            "raw_count": int(len(df)),
+            "deqa_count": int(len(df_deqa)),  # Now equal to raw_count
+            "human_count": int(len(df_human)),  # Now equal to raw_count
+        })
+
+    outdir = Path(args.inputs[0]).parent.parent / "combined_anls" / "pipeline"
+    outdir.mkdir(parents=True, exist_ok=True)
+    out_csv = outdir / "pipeline_means.csv"
+    means_df = pd.DataFrame(records).sort_values("method")
+    means_df.to_csv(out_csv, index=False)
+
+    # 3-line comparison plot over methods (narrower with score annotations)
+    x = range(len(means_df))
+    plt.figure(figsize=(7, 5))
+    
+    # Plot lines and add score annotations
+    raw_vals = means_df["raw_mean"].values
+    deqa_vals = means_df["deqa_mean"].values
+    human_vals = means_df["human_mean"].values
+    
+    plt.plot(x, raw_vals, marker="o", label="Raw", linewidth=2, markersize=6)
+    plt.plot(x, deqa_vals, marker="s", label=f"DeQA >= {args.deqa_threshold}", linewidth=2, markersize=6)
+    plt.plot(x, human_vals, marker="^", label="Human High", linewidth=2, markersize=6)
+    
+    # Annotate each point with its score
+    for i, (r, d, h) in enumerate(zip(raw_vals, deqa_vals, human_vals)):
+        plt.annotate(f"{r:.3f}", (i, r), textcoords="offset points", xytext=(0,8), ha='center', fontsize=8)
+        plt.annotate(f"{d:.3f}", (i, d), textcoords="offset points", xytext=(0,8), ha='center', fontsize=8)
+        plt.annotate(f"{h:.3f}", (i, h), textcoords="offset points", xytext=(0,8), ha='center', fontsize=8)
+    
+    plt.xticks(list(x), means_df["method"].tolist(), rotation=25, ha="right")
+    plt.ylabel("Mean hallucination (1 - ANLS)")
+    plt.title("Pipeline comparison over preprocessing methods")
+    plt.grid(axis="y", linestyle="--", alpha=0.3)
+    plt.legend()
+    plt.tight_layout()
+    out_png = outdir / "pipeline_comparison.png"
+    plt.savefig(out_png, dpi=160)
+    plt.close()
+
+    print(f"Saved: {out_csv}")
+    print(f"Saved: {out_png}")
+
+
+if __name__ == "__main__":
+    main()
+
+