update IQA results

2025-09-09 12:51:55 +07:00
parent eb465155d1
commit eb0a12448f
32 changed files with 3551 additions and 4 deletions
--- a/docs/task/cni/results/facture_confusion_matrix.png
+++ b/docs/task/cni/results/facture_confusion_matrix.png
--- a/docs/task/cni/results/facture_deqa_images.xlsx
+++ b/docs/task/cni/results/facture_deqa_images.xlsx
--- a/docs/task/cni/results/facture_metric_curves.png
+++ b/docs/task/cni/results/facture_metric_curves.png
--- a/docs/task/cni/results/facture_precision_recall_curve.png
+++ b/docs/task/cni/results/facture_precision_recall_curve.png
--- a/docs/task/cni/results/facture_roc_like_curve.png
+++ b/docs/task/cni/results/facture_roc_like_curve.png
--- a/docs/task/cni/results/facture_score_distributions.png
+++ b/docs/task/cni/results/facture_score_distributions.png
--- a/docs/task/cni/results/facture_sorted_scores_with_thr.png
+++ b/docs/task/cni/results/facture_sorted_scores_with_thr.png
--- a/docs/task/facture/200_samples/facture_confusion_matrix.png
+++ b/docs/task/facture/200_samples/facture_confusion_matrix.png
--- a/docs/task/facture/200_samples/facture_decisions.csv
+++ b/docs/task/facture/200_samples/facture_decisions.csv
--- a/docs/task/facture/200_samples/facture_deqa_images.xlsx
+++ b/docs/task/facture/200_samples/facture_deqa_images.xlsx
--- a/docs/task/facture/200_samples/facture_metric_curves.png
+++ b/docs/task/facture/200_samples/facture_metric_curves.png
--- a/docs/task/facture/200_samples/facture_precision_recall_curve.png
+++ b/docs/task/facture/200_samples/facture_precision_recall_curve.png
--- a/docs/task/facture/200_samples/facture_roc_like_curve.png
+++ b/docs/task/facture/200_samples/facture_roc_like_curve.png
--- a/docs/task/facture/200_samples/facture_score_distributions.png
+++ b/docs/task/facture/200_samples/facture_score_distributions.png
--- a/docs/task/facture/200_samples/facture_sorted_scores_with_thr.png
+++ b/docs/task/facture/200_samples/facture_sorted_scores_with_thr.png
--- a/docs/task/facture/200_samples/facture_thresholds_summary.json
+++ b/docs/task/facture/200_samples/facture_thresholds_summary.json
--- a/docs/task/facture/full_samples/facture_confusion_matrix.png
+++ b/docs/task/facture/full_samples/facture_confusion_matrix.png
--- a/docs/task/facture/full_samples/facture_decisions.csv
+++ b/docs/task/facture/full_samples/facture_decisions.csv
--- a/docs/task/facture/full_samples/facture_deqa_images.xlsx
+++ b/docs/task/facture/full_samples/facture_deqa_images.xlsx
--- a/docs/task/facture/full_samples/facture_metric_curves.png
+++ b/docs/task/facture/full_samples/facture_metric_curves.png
--- a/docs/task/facture/full_samples/facture_precision_recall_curve.png
+++ b/docs/task/facture/full_samples/facture_precision_recall_curve.png
--- a/docs/task/facture/full_samples/facture_roc_like_curve.png
+++ b/docs/task/facture/full_samples/facture_roc_like_curve.png
--- a/docs/task/facture/full_samples/facture_score_distributions.png
+++ b/docs/task/facture/full_samples/facture_score_distributions.png
--- a/docs/task/facture/full_samples/facture_sorted_scores_with_thr.png
+++ b/docs/task/facture/full_samples/facture_sorted_scores_with_thr.png
--- a/docs/task/facture/full_samples/facture_thresholds_summary.json
+++ b/docs/task/facture/full_samples/facture_thresholds_summary.json
@@ -0,0 +1,50 @@
+{
+  "positive_definition": "HIGH when score >= threshold",
+  "best_thresholds": {
+    "f1": {
+      "threshold": 1.6,
+      "value": 0.9249762583095917,
+      "confusion": {
+        "TP": 2922,
+        "FP": 474,
+        "FN": 0,
+        "TN": 0
+      }
+    },
+    "accuracy": {
+      "threshold": 1.6,
+      "value": 0.8604240282685512,
+      "confusion": {
+        "TP": 2922,
+        "FP": 474,
+        "FN": 0,
+        "TN": 0
+      }
+    },
+    "precision": {
+      "threshold": 4.2,
+      "value": 1.0,
+      "confusion": {
+        "TP": 3,
+        "FP": 0,
+        "FN": 2919,
+        "TN": 474
+      }
+    },
+    "recall": {
+      "threshold": 1.6,
+      "value": 1.0,
+      "confusion": {
+        "TP": 2922,
+        "FP": 474,
+        "FN": 0,
+        "TN": 0
+      }
+    }
+  },
+  "counts": {
+    "total": 3396,
+    "positives": 2922,
+    "negatives": 474
+  }
+}
--- a/docs/task/facture/results/facture_confusion_matrix.png
+++ b/docs/task/facture/results/facture_confusion_matrix.png
--- a/docs/task/facture/results/facture_metric_curves.png
+++ b/docs/task/facture/results/facture_metric_curves.png
--- a/docs/task/facture/results/facture_precision_recall_curve.png
+++ b/docs/task/facture/results/facture_precision_recall_curve.png
--- a/docs/task/facture/results/facture_roc_like_curve.png
+++ b/docs/task/facture/results/facture_roc_like_curve.png
--- a/docs/task/facture/results/facture_score_distributions.png
+++ b/docs/task/facture/results/facture_score_distributions.png
--- a/docs/task/facture/results/facture_sorted_scores_with_thr.png
+++ b/docs/task/facture/results/facture_sorted_scores_with_thr.png
--- a/scripts/threshold_analysis.py
+++ b/scripts/threshold_analysis.py
@@ -130,12 +130,102 @@ def compute_metric_curves(scores: np.ndarray, y_true: np.ndarray) -> pd.DataFram
    return pd.DataFrame(data).sort_values("threshold").reset_index(drop=True)


-def plot_distributions(df: pd.DataFrame, out_path: Path) -> None:
-    plt.figure(figsize=(8, 5))
-    sns.histplot(data=df, x="score", hue="label", bins=30, kde=True, stat="density", common_norm=False)
-    plt.title("DeQA score distributions by label")
-    plt.xlabel("DeQA score")
-    plt.ylabel("Density")
-    plt.tight_layout()
-    plt.savefig(out_path, dpi=150)
-    plt.close()
+def plot_distributions(
+    df: pd.DataFrame,
+    out_path: Path,
+    threshold: float | None = None,
+    acc_at_thr: float | None = None,
+    f1_at_thr: float | None = None,
+) -> None:
+    """Save a per-label histogram of DeQA scores with KDE and stats overlays.
+
+    The seaborn "white" style is applied only while this figure is drawn, so
+    plots rendered afterwards keep their own style.
+
+    Args:
+        df: Frame with a ``score`` column and a ``label`` column; only the
+            labels ``"High"`` and ``"Low"`` have a colour in the palette.
+        out_path: Destination image path; the figure is saved at 150 dpi.
+        threshold: Optional cut-off drawn as a dashed red vertical line.
+        acc_at_thr: Accuracy printed next to the line ("NA" when None).
+        f1_at_thr: F1 printed next to the line ("NA" when None).
+    """
+    import warnings  # function-scope import keeps this change self-contained
+
+    palette = {"High": "tab:blue", "Low": "tab:orange"}
+    # Scope the style to this figure: sns.set_style() would keep restyling
+    # every plot drawn afterwards in the same process.
+    with sns.axes_style("white"):
+        _, ax = plt.subplots(figsize=(10, 6))
+        # Side-by-side (dodged) density bars, normalised per label.
+        sns.histplot(
+            data=df, x="score", hue="label", ax=ax,
+            bins=None, binwidth=0.18, kde=False,
+            stat="density", common_norm=False, multiple="dodge",
+            palette=palette, element="bars", shrink=0.85, alpha=0.8,
+            edgecolor="white", linewidth=0.5,
+        )
+
+        # Convert once; the KDE lines and the stats box share these series.
+        try:
+            all_scores = df["score"].astype(float)
+        except (TypeError, ValueError) as exc:
+            warnings.warn(f"Non-numeric scores; overlays skipped: {exc}")
+            groups = []
+        else:
+            groups = [
+                ("High", all_scores[df["label"] == "High"], "tab:blue", 2.0),
+                ("Low", all_scores[df["label"] == "Low"], "tab:orange", 2.0),
+                ("All", all_scores, "black", 2.2),
+            ]
+
+        # KDE lines are best-effort (a degenerate sample can break the fit),
+        # but a failure is reported instead of being silently swallowed.
+        for name, values, color, width in groups:
+            if len(values) <= 1:
+                continue
+            try:
+                sns.kdeplot(
+                    values, ax=ax, color=color, linewidth=width,
+                    label=f"{name} density",
+                )
+            except Exception as exc:
+                warnings.warn(f"{name} KDE line skipped: {exc}")
+
+        if threshold is not None:
+            ax.axvline(threshold, color="red", linestyle=(0, (6, 4)), linewidth=2.0)
+            acc_str = "NA" if acc_at_thr is None else f"{acc_at_thr:.3f}"
+            f1_str = "NA" if f1_at_thr is None else f"{f1_at_thr:.3f}"
+            ax.text(
+                threshold + 0.02,
+                ax.get_ylim()[1] * 0.97,  # just below the top of the axes
+                f"threshold={threshold:.3f}  Accuracy={acc_str}  F1={f1_str}",
+                color="red", ha="left", va="top", fontsize=10,
+                bbox=dict(boxstyle="round,pad=0.3", facecolor="#ffecec", edgecolor="#ff9a9a", alpha=0.85),
+            )
+
+        # Bottom-right box: count, mean and sample std per class and overall.
+        # "All" counts every row so n always matches the rows behind mean/std.
+        stats_rows = []
+        for name, values, _color, _width in groups:
+            n = len(values)
+            mean = float(values.mean()) if n > 0 else float("nan")
+            std = float(values.std(ddof=1)) if n > 1 else float("nan")
+            head = f"{name}:".ljust(5)
+            stats_rows.append(f"{head} n={n}, \u03BC={mean:.3f}, \u03C3={std:.3f}")
+        if stats_rows:
+            ax.text(
+                0.99, 0.02, "\n".join(stats_rows),
+                transform=ax.transAxes, ha="right", va="bottom", fontsize=9,
+                bbox=dict(boxstyle="round,pad=0.5", facecolor="white", edgecolor="gray", alpha=0.95),
+            )
+
+        ax.set_title("DeQA score distributions by label")
+        ax.set_xlabel("DeQA score")
+        ax.set_ylabel("Density")
+        # NOTE(review): legend() rebuilds the legend from labelled artists;
+        # confirm the histogram's own hue legend is not needed alongside it.
+        ax.legend()
+        plt.tight_layout()
+        plt.savefig(out_path, dpi=150)
+        plt.close()
@@ -272,7 +362,17 @@ def main() -> None:

    # Metric curves and figures
    curves = compute_metric_curves(scores, y_true)
-    plot_distributions(df, outdir / "facture_score_distributions.png")
+    # Accuracy and F1 at selected threshold for annotation
+    tp_f1, fp_f1, fn_f1, tn_f1 = confusion_from_threshold(scores, y_true, thr_f1)
+    acc_at_thr = metric_from_confusion(tp_f1, fp_f1, fn_f1, tn_f1, "accuracy")
+    f1_at_thr = metric_from_confusion(tp_f1, fp_f1, fn_f1, tn_f1, "f1")
+    plot_distributions(
+        df,
+        outdir / "facture_score_distributions.png",
+        threshold=thr_f1,
+        acc_at_thr=acc_at_thr,
+        f1_at_thr=f1_at_thr,
+    )
    plot_metric_curves(curves, outdir / "facture_metric_curves.png")
    # Extra plots
    plot_sorted_scores_with_threshold(df, thr_f1, outdir / "facture_sorted_scores_with_thr.png")