update source code and pipeline
This commit is contained in:
560
filter/analyze_labels.py
Normal file
560
filter/analyze_labels.py
Normal file
@@ -0,0 +1,560 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze 'label' fields in a JSON dataset and produce summaries.
|
||||
|
||||
- Handles entries where 'label' is either an object or a list of objects.
|
||||
- Computes distributions (is_bill, profession, currency, IDs presence, handwriting/rotation).
|
||||
- Computes numeric stats (total_billed, amount_paid, remaining_payment, coverages).
|
||||
- Parses dates and shows temporal distribution.
|
||||
- Analyzes items: count, sum of amounts and coverages, and mismatches vs total_billed.
|
||||
- Emits a concise stdout summary and writes CSVs and a Markdown report.
|
||||
|
||||
Usage:
|
||||
python analyze_labels.py --input 008_label_data_sample_seed_1997.json --out-dir .
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from statistics import mean, median
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
|
||||
# Label keys that main() coerces to float with to_float() and then feeds into
# numeric_summary() and the per-field histograms.
NUMERIC_FIELDS = [
    "total_billed",
    "amount_paid",
    "remaining_payment",
    "client_part",
    "mandatory_coverage",
    "complementary_coverage",
]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the label analysis script.

    Returns:
        The namespace produced by ``argparse`` with the attributes ``input``,
        ``out_dir``, ``max_professions``, ``no_plots``, ``plot_top_k`` and
        ``plot_format``.
    """
    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output keeps its layout.
    option_table = [
        ("--input", {"required": True, "help": "Path to JSON file (list of records)"}),
        (
            "--out-dir",
            {"default": None, "help": "Output directory for reports (default: alongside input)"},
        ),
        (
            "--max-professions",
            {"type": int, "default": 50, "help": "Max professions to list in report"},
        ),
        (
            "--no-plots",
            {
                "action": "store_true",
                "help": "Disable generating plots (PNG) and embedding into report",
            },
        ),
        (
            "--plot-top-k",
            {
                "type": int,
                "default": 20,
                "help": "Top-K categories to visualize for profession/currency",
            },
        ),
        (
            "--plot-format",
            {
                "type": str,
                "default": "png",
                "choices": ["png", "jpg", "jpeg"],
                "help": "Image format for plots",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description="Analyze 'label' fields in JSON dataset")
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> List[Dict[str, Any]]:
    """Read a UTF-8 JSON file whose top-level value must be a list.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded top-level list.

    Raises:
        ValueError: If the decoded top-level value is not a list.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return payload
    raise ValueError("Top-level JSON must be a list of records")
|
||||
|
||||
|
||||
def to_bool(value: Any) -> Optional[bool]:
    """Coerce a loosely typed flag to ``True``, ``False`` or ``None``.

    Booleans are returned unchanged, other ints/floats go through ``bool()``,
    and strings are matched case-insensitively (after stripping whitespace)
    against a fixed set of true/false spellings. Anything else, including
    ``None`` and unrecognised strings, yields ``None``.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return bool(value)
    if isinstance(value, str):
        spellings = {
            "true": True,
            "t": True,
            "1": True,
            "yes": True,
            "y": True,
            "false": False,
            "f": False,
            "0": False,
            "no": False,
            "n": False,
        }
        # dict.get returns None for spellings that are not recognised.
        return spellings.get(value.strip().lower())
    return None
|
||||
|
||||
|
||||
def to_float(value: Any) -> Optional[float]:
    """Convert ``value`` to a finite float, or return ``None`` if that is impossible.

    Args:
        value: Any JSON-decoded value (number, string, ``None``, container...).

    Returns:
        ``float(value)`` when the conversion succeeds and the result is finite;
        ``None`` for ``None``, the empty string, unconvertible values, integers
        too large for a float, and NaN/infinity.
    """
    if value is None or value == "":
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() on an int beyond the float range, which a
        # JSON document can legitimately contain.
        return None
    # NaN/inf are not usable amounts: they would propagate through every sum
    # and comparison done downstream, so treat them as missing.
    return number if math.isfinite(number) else None
|
||||
|
||||
|
||||
def parse_date(value: Any) -> Optional[datetime]:
    """Parse a date string, trying day-first formats before ISO order.

    The stripped string is matched against a fixed list of ``strptime``
    formats. If none fits, the first ``dd-mm-yyyy`` / ``dd/mm/yyyy`` /
    ``yyyy-mm-dd`` looking token found inside the string is tried against the
    same formats.

    Returns:
        The parsed ``datetime``, or ``None`` for non-strings, blank strings and
        strings from which no date could be parsed.
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text:
        return None

    # Common patterns (day-first)
    patterns = (
        "%d-%m-%Y",
        "%d/%m/%Y",
        "%Y-%m-%d",
        "%d-%m-%y",
        "%d/%m/%y",
    )

    # First the whole string, then a date-like token embedded in it
    # (e.g., 2025-02-07 or 07-02-2025).
    candidates = [text]
    embedded = re.search(r"(\d{2}[/-]\d{2}[/-]\d{4}|\d{4}-\d{2}-\d{2})", text)
    if embedded is not None:
        candidates.append(embedded.group(1))

    for candidate in candidates:
        for pattern in patterns:
            try:
                return datetime.strptime(candidate, pattern)
            except ValueError:
                continue
    return None
|
||||
|
||||
|
||||
def safe_get(d: Dict[str, Any], key: str, default=None):
    """Return the value stored under ``key`` in ``d``, or ``default`` if the key is absent."""
    found = d.get(key, default)
    return found
|
||||
|
||||
|
||||
def flatten_labels(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand each record's ``label`` into a flat list of label dicts.

    A record's ``label`` may be a single dict or a list of dicts. Every label
    is shallow-copied (the input records are not modified) and annotated with:

    - ``__source_image__``: the record's ``image`` value, or, when that is
      empty, its ``image_files`` entries joined with commas.
    - ``__multi_index__``: position inside the list, only for list labels.

    Records that are not dicts, records without a label, and list entries
    that are not dicts are skipped.

    Args:
        records: Decoded top-level JSON list.

    Returns:
        The flattened, annotated label dicts in input order.
    """
    out: List[Dict[str, Any]] = []
    for rec in records:
        if not isinstance(rec, dict):
            # Malformed entry in the dataset; nothing to extract from it.
            continue

        src_image = rec.get("image")
        if not src_image:
            files = rec.get("image_files") or []
            if isinstance(files, (list, tuple)):
                # str() so that non-string entries do not break the join.
                src_image = ",".join(str(name) for name in files)
            else:
                # A bare string (or other scalar) is used as-is rather than
                # being joined character by character.
                src_image = str(files)

        label = rec.get("label")
        if label is None:
            continue
        if isinstance(label, list):
            for idx, lab in enumerate(label):
                if not isinstance(lab, dict):
                    continue
                o = dict(lab)
                o["__source_image__"] = src_image
                o["__multi_index__"] = idx
                out.append(o)
        elif isinstance(label, dict):
            o = dict(label)
            o["__source_image__"] = src_image
            out.append(o)
    return out
|
||||
|
||||
|
||||
def presence_counts(labels: List[Dict[str, Any]], fields: Iterable[str]) -> Dict[str, int]:
    """Count, for each field, the labels in which it has a non-empty value.

    A value counts as present unless it is ``None`` (which includes a missing
    key) or the empty string.

    Returns:
        A dict mapping each field name to its presence count.
    """
    return {
        name: sum(1 for entry in labels if entry.get(name) not in (None, ""))
        for name in fields
    }
|
||||
|
||||
|
||||
def numeric_summary(values: List[Optional[float]]) -> Dict[str, Any]:
    """Summarise a list of optional numbers.

    Entries that are not int/float, or that are NaN, are treated as missing.

    Args:
        values: Raw values, possibly containing ``None``.

    Returns:
        When no usable value exists: ``{"count": 0, "missing": len(values)}``.
        Otherwise a dict with ``count``, ``min``, ``p25``, ``median``, ``p75``,
        ``max``, ``mean``, ``sum`` and ``missing``.
    """
    clean = [v for v in values if isinstance(v, (int, float)) and not math.isnan(v)]
    missing = len(values) - len(clean)
    if not clean:
        # Report the missing count here too, so the all-missing case is
        # distinguishable from an empty input.
        return {"count": 0, "missing": missing}
    return {
        "count": len(clean),
        "min": min(clean),
        "p25": percentile(clean, 25),
        "median": median(clean),
        "p75": percentile(clean, 75),
        "max": max(clean),
        "mean": mean(clean),
        "sum": sum(clean),
        "missing": missing,
    }
|
||||
|
||||
|
||||
def percentile(arr: List[float], p: float) -> float:
    """Return the ``p``-th percentile of ``arr`` using linear interpolation.

    The rank is ``(len(arr) - 1) * p / 100`` over the sorted values. A whole
    rank returns that element directly; otherwise the two neighbouring
    elements are blended by their distance to the rank. An empty ``arr``
    gives NaN.
    """
    if not arr:
        return float("nan")
    ordered = sorted(arr)
    rank = (len(ordered) - 1) * (p / 100.0)
    below, above = math.floor(rank), math.ceil(rank)
    if below == above:
        return ordered[int(rank)]
    low_part = ordered[below] * (above - rank)
    high_part = ordered[above] * (rank - below)
    return low_part + high_part
|
||||
|
||||
|
||||
def write_csv(path: Path, headers: List[str], rows: Iterable[Iterable[Any]]) -> None:
    """Write ``headers`` followed by ``rows`` to a UTF-8 CSV file at ``path``.

    Missing parent directories are created first; an existing file is
    overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(headers)
        writer.writerows(rows)
|
||||
|
||||
|
||||
def try_import_matplotlib():
    """Return ``matplotlib.pyplot`` configured for the "Agg" backend, or ``None``.

    Any exception raised while importing or selecting the backend is swallowed
    and reported as ``None`` so that callers can skip plotting.
    """
    try:
        import matplotlib  # type: ignore[import-not-found]

        matplotlib.use("Agg")  # headless backend
        import matplotlib.pyplot as pyplot  # type: ignore[import-not-found]
    except Exception:
        return None
    return pyplot
|
||||
|
||||
|
||||
def save_bar_plot(plt, x_labels: List[str], values: List[float], title: str, out_path: Path, rotation: int = 0):
    """Draw a bar chart of ``values`` and save it to ``out_path``.

    Args:
        plt: The ``matplotlib.pyplot`` module (as returned by
            ``try_import_matplotlib``).
        x_labels: Tick label for each bar.
        values: Bar heights, in the same order as ``x_labels``.
        title: Chart title.
        out_path: Destination image file; parent directories are created.
        rotation: Tick label rotation in degrees. A non-zero rotation
            right-aligns the labels, otherwise they are centred.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Width grows with the number of bars, clamped to the 6..14 range.
    width = max(6, min(14, 0.4 * len(x_labels) + 3))
    fig, ax = plt.subplots(figsize=(width, 4))
    try:
        ax.bar(range(len(values)), values, color="#4C78A8")
        ax.set_title(title)
        ax.set_ylabel("count")
        ax.set_xticks(range(len(x_labels)))
        ax.set_xticklabels(x_labels, rotation=rotation, ha="right" if rotation else "center")
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Close the figure even when drawing or saving fails, so a failed
        # plot does not leave an open figure behind.
        plt.close(fig)
|
||||
|
||||
|
||||
def save_hist_plot(plt, values: List[float], title: str, out_path: Path, bins: int = 30):
    """Draw a histogram of ``values`` and save it to ``out_path``.

    Args:
        plt: The ``matplotlib.pyplot`` module (as returned by
            ``try_import_matplotlib``).
        values: Samples to bin.
        title: Chart title.
        out_path: Destination image file; parent directories are created.
        bins: Number of histogram bins.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(7, 4))
    try:
        ax.hist(values, bins=bins, color="#72B7B2", edgecolor="white")
        ax.set_title(title)
        ax.set_ylabel("count")
        ax.set_xlabel("value")
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Close the figure even when drawing or saving fails, so a failed
        # plot does not leave an open figure behind.
        plt.close(fig)
|
||||
|
||||
|
||||
def produce_plots(
    out_dir: Path,
    args: argparse.Namespace,
    is_bill_counter: Counter,
    bill_paid_counter: Counter,
    handwriting_counter: Counter,
    rotation_counter: Counter,
    profession_counter: Counter,
    currency_counter: Counter,
    year_month_counter: Counter,
    numeric_data: Dict[str, List[Optional[float]]],
    items_per_label: List[int],
) -> List[Path]:
    """Render all report charts and return the image paths in creation order.

    Nothing is drawn (and ``[]`` is returned) when ``args.no_plots`` is set or
    when ``try_import_matplotlib()`` returns ``None``. Images are written to
    ``out_dir / "plots"`` using ``args.plot_format`` as file extension. Empty
    counters and numeric fields without usable values are skipped.
    """
    if args.no_plots:
        return []
    plt = try_import_matplotlib()
    if plt is None:
        # matplotlib not available; skip plotting gracefully
        return []

    plots_dir = out_dir / "plots"
    ext = args.plot_format
    created: List[Path] = []

    # is_bill, bill_paid and the two flags: one bar per distinct value, in the
    # counter's own iteration order.
    simple_charts = (
        (is_bill_counter, "is_bill", "is_bill distribution"),
        (bill_paid_counter, "bill_paid", "bill_paid distribution"),
        (handwriting_counter, "is_handwriting", "is_handwriting"),
        (rotation_counter, "is_rotated", "is_rotated"),
    )
    for counter, stem, title in simple_charts:
        if not counter:
            continue
        target = plots_dir / f"{stem}.{ext}"
        save_bar_plot(plt, [str(key) for key in counter], list(counter.values()), title, target)
        created.append(target)

    # Professions (top-K), with long names shortened to keep ticks readable
    if profession_counter:
        how_many = max(1, min(args.plot_top_k, len(profession_counter)))
        ranked = profession_counter.most_common(how_many)
        names = [k if len(str(k)) <= 20 else str(k)[:17] + "…" for k, _ in ranked]
        counts = [v for _, v in ranked]
        target = plots_dir / f"professions_top{len(names)}.{ext}"
        save_bar_plot(plt, names, counts, f"Top {len(names)} professions", target, rotation=45)
        created.append(target)

    # Currency
    if currency_counter:
        how_many = max(1, min(args.plot_top_k, len(currency_counter)))
        ranked = currency_counter.most_common(how_many)
        target = plots_dir / f"currency.{ext}"
        save_bar_plot(
            plt,
            [str(k) for k, _ in ranked],
            [v for _, v in ranked],
            "Currency distribution",
            target,
        )
        created.append(target)

    # Year-month, in chronological order
    if year_month_counter:
        timeline = sorted(
            year_month_counter.items(), key=lambda pair: (pair[0][0], pair[0][1])
        )
        ticks = [f"{y:04d}-{m:02d}" for (y, m), _ in timeline]
        counts = [v for _, v in timeline]
        target = plots_dir / f"invoice_year_month.{ext}"
        save_bar_plot(plt, ticks, counts, "Invoices by year-month", target, rotation=45)
        created.append(target)

    # Items per label: bin count follows sqrt(n), clamped to 5..30
    if items_per_label:
        bin_count = min(30, max(5, int(len(items_per_label) ** 0.5)))
        target = plots_dir / f"items_per_label.{ext}"
        save_hist_plot(plt, items_per_label, "Items per label (histogram)", target, bins=bin_count)
        created.append(target)

    # Numeric fields histograms
    for field, raw_values in numeric_data.items():
        usable = [
            float(v) for v in raw_values if isinstance(v, (int, float)) and not math.isnan(v)
        ]
        if usable:
            target = plots_dir / f"hist_{field}.{ext}"
            save_hist_plot(plt, usable, f"{field} (histogram)", target)
            created.append(target)

    return created
|
||||
|
||||
|
||||
def _item_amount_mismatches(
    labels: List[Dict[str, Any]],
) -> Tuple[List[int], List[Tuple[str, Optional[float], Optional[float], Optional[float]]]]:
    """Count items per label and list labels whose item amounts do not add up to total_billed."""
    items_per_label: List[int] = []
    mismatches: List[Tuple[str, Optional[float], Optional[float], Optional[float]]] = []
    for lbl in labels:
        items = lbl.get("items") or []
        if not isinstance(items, list):
            items = []
        items_per_label.append(len(items))

        # Stays None when the label has no dict items, so such labels are
        # never reported as a mismatch.
        s_amount: Optional[float] = None
        for it in items:
            if not isinstance(it, dict):
                continue
            s_amount = (s_amount or 0.0) + (to_float(it.get("amount")) or 0.0)

        total_billed = to_float(lbl.get("total_billed"))
        if total_billed is None or s_amount is None:
            continue
        diff = total_billed - s_amount
        if abs(diff) > 1e-6:
            mismatches.append((str(lbl.get("__source_image__")), total_billed, s_amount, diff))
    return items_per_label, mismatches


def _collect_issues(labels: List[Dict[str, Any]], is_bill_unparsed: List[bool]) -> List[Dict[str, Any]]:
    """Return the data quality findings as ``{"source", "issue"}`` dicts, in label order."""
    issues: List[Dict[str, Any]] = []
    for lbl, unparsed in zip(labels, is_bill_unparsed):
        src = str(lbl.get("__source_image__"))
        # The raw is_bill value was present but could not be read as a boolean.
        if unparsed:
            issues.append({"source": src, "issue": "is_bill not boolean"})
        # bill_paid True but amount_paid missing
        if lbl.get("bill_paid") is True and to_float(lbl.get("amount_paid")) is None:
            issues.append({"source": src, "issue": "bill_paid True but amount_paid missing"})
        # remaining_payment > 0 but bill_paid True
        rp = to_float(lbl.get("remaining_payment"))
        if lbl.get("bill_paid") is True and (rp or 0) > 0:
            issues.append({"source": src, "issue": "bill_paid True but remaining_payment > 0"})
        # Negative amounts and missing quantities on items
        items = lbl.get("items") or []
        if isinstance(items, list):
            for idx, it in enumerate(items):
                if not isinstance(it, dict):
                    continue
                a = to_float(it.get("amount"))
                if a is not None and a < 0:
                    issues.append({"source": src, "issue": f"item[{idx}].amount negative: {a}"})
                q = to_float(it.get("quantity"))
                if q is None:
                    # Not strictly an issue, but mark for completeness
                    issues.append({"source": src, "issue": f"item[{idx}].quantity missing"})
        # Missing currency on bill
        if lbl.get("is_bill") is True and not lbl.get("currency"):
            issues.append({"source": src, "issue": "currency missing for bill"})
    return issues


def main() -> None:
    """Run the label analysis end to end.

    Steps: parse the CLI arguments, load and flatten the labels, normalise the
    boolean and numeric fields in place, compute distributions / numeric
    summaries / date spread / item mismatches / data quality issues, then
    write the CSV files, the Markdown report (with embedded plots when any
    were produced) and print a short console summary.
    """
    args = parse_args()
    in_path = Path(args.input).resolve()
    out_dir = Path(args.out_dir).resolve() if args.out_dir else in_path.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    records = load_json(in_path)
    labels = flatten_labels(records)
    n_total_rec = len(records)
    n_labels = len(labels)

    # Normalize some fields. Remember which labels carried an is_bill value
    # that could not be parsed: after normalization it is indistinguishable
    # from a missing value.
    is_bill_unparsed: List[bool] = []
    for lbl in labels:
        raw_is_bill = lbl.get("is_bill")
        lbl["is_bill"] = to_bool(raw_is_bill)
        is_bill_unparsed.append(raw_is_bill is not None and lbl["is_bill"] is None)
        lbl["bill_paid"] = to_bool(lbl.get("bill_paid"))
        # Normalize numeric fields in-place for ease of stats
        for k in NUMERIC_FIELDS:
            lbl[k] = to_float(lbl.get(k))

    # Basic distributions
    is_bill_counter = Counter(lbl.get("is_bill") for lbl in labels)
    bill_paid_counter = Counter(lbl.get("bill_paid") for lbl in labels)
    currency_counter = Counter(lbl.get("currency") for lbl in labels if lbl.get("currency"))
    # str() keeps a non-string profession value from crashing .strip().
    profession_counter = Counter(
        str(lbl.get("profession") or "").strip() or "(missing)" for lbl in labels
    )

    # Presence of identifiers and key fields
    id_presence = presence_counts(labels, [
        "adeli_number",
        "rpps_number",
        "finess_number",
        "prescripteur_finess_number",
        "doctor_name",
        "invoice_issuer",
        "insured_name",
        "beneficiary_name",
        "security_number",
        "currency",
    ])

    # Handwriting/rotation flags
    handwriting_counter = Counter(to_bool(lbl.get("is_handwriting")) for lbl in labels)
    rotation_counter = Counter(to_bool(lbl.get("is_rotated")) for lbl in labels)

    # Numeric stats, plus the raw per-field values for histograms
    numeric_raw: Dict[str, List[Optional[float]]] = {
        k: [lbl.get(k) for lbl in labels] for k in NUMERIC_FIELDS
    }
    numeric_stats: Dict[str, Dict[str, Any]] = {
        k: numeric_summary(vals) for k, vals in numeric_raw.items()
    }

    # Dates
    invoice_dates = [parse_date(lbl.get("invoice_date")) for lbl in labels]
    invoice_dates_clean = [d for d in invoice_dates if d is not None]
    year_month_counter = Counter((d.year, d.month) for d in invoice_dates_clean)

    # Items analysis and data quality issues
    items_per_label, mismatch_records = _item_amount_mismatches(labels)
    issues = _collect_issues(labels, is_bill_unparsed)

    # Outputs
    # 1) CSVs
    write_csv(out_dir / "professions_counts.csv", ["profession", "count"], profession_counter.most_common())
    write_csv(out_dir / "currency_counts.csv", ["currency", "count"], currency_counter.most_common())
    write_csv(out_dir / "is_bill_counts.csv", ["is_bill", "count"], is_bill_counter.items())
    write_csv(out_dir / "bill_paid_counts.csv", ["bill_paid", "count"], bill_paid_counter.items())
    write_csv(out_dir / "id_presence.csv", ["field", "present_count"], id_presence.items())
    write_csv(
        out_dir / "item_total_billed_mismatches.csv",
        ["source_image", "total_billed", "sum_item_amount", "diff"],
        mismatch_records,
    )
    write_csv(out_dir / "issues.csv", ["source", "issue"], ((i["source"], i["issue"]) for i in issues))

    # 2) Markdown report
    md: List[str] = []
    md.append("# Label Analysis Report\n")
    md.append(f"Input: `{in_path.name}`\n")
    md.append("")
    md.append("## Overview\n")
    md.append(f"- Total records: {n_total_rec}")
    md.append(f"- Total labels (flattened): {n_labels}")
    md.append(f"- is_bill distribution: {dict(is_bill_counter)}")
    md.append(f"- bill_paid distribution: {dict(bill_paid_counter)}")
    if invoice_dates_clean:
        md.append(
            f"- Invoice dates span: {min(invoice_dates_clean).date()} .. {max(invoice_dates_clean).date()}"
        )
        md.append(f"- Unique year-month pairs: {len(year_month_counter)}")
    else:
        md.append("- Invoice dates: none parseable")

    md.append("\n## Professions (top)\n")
    for prof, cnt in profession_counter.most_common(args.max_professions):
        md.append(f"- {prof}: {cnt}")

    md.append("\n## Currency distribution\n")
    for cur, cnt in currency_counter.most_common():
        md.append(f"- {cur}: {cnt}")

    md.append("\n## Identifier and key field presence\n")
    for k, v in id_presence.items():
        md.append(f"- {k}: {v} present")

    md.append("\n## Flags\n")
    md.append(f"- is_handwriting: {dict(handwriting_counter)}")
    md.append(f"- is_rotated: {dict(rotation_counter)}")

    md.append("\n## Numeric summaries\n")
    for k, stats in numeric_stats.items():
        md.append(f"- {k}: {stats}")

    if items_per_label:
        md.append("\n## Items analysis\n")
        md.append(
            f"- Items per label: count={len(items_per_label)}, min={min(items_per_label)}, "
            f"max={max(items_per_label)}, mean={mean(items_per_label):.2f}"
        )
        n_mismatch = len(mismatch_records)
        md.append(f"- total_billed vs sum(items.amount) mismatches: {n_mismatch}")

    if issues:
        md.append("\n## Data quality issues (sample)\n")
        for row in issues[:50]:
            md.append(f"- {row['source']}: {row['issue']}")

    # 3) Plots (if enabled)
    created_plots = produce_plots(
        out_dir=out_dir,
        args=args,
        is_bill_counter=is_bill_counter,
        bill_paid_counter=bill_paid_counter,
        handwriting_counter=handwriting_counter,
        rotation_counter=rotation_counter,
        profession_counter=profession_counter,
        currency_counter=currency_counter,
        year_month_counter=year_month_counter,
        numeric_data=numeric_raw,
        items_per_label=items_per_label,
    )
    if created_plots:
        md.append("\n## Plots\n")
        for p in created_plots:
            # Link relative to the report so the output folder can be moved.
            rel = p.relative_to(out_dir)
            md.append(f"- {p.stem}")
            md.append(f"![{p.stem}]({rel.as_posix()})\n")
    elif not args.no_plots:
        md.append("\n## Plots\n")
        md.append("- matplotlib not available or no data to plot.")

    report_path = out_dir / "label_analysis_report.md"
    report_path.write_text("\n".join(md), encoding="utf-8")

    # Console summary
    print("Label analysis complete.")
    print(f"- Records: {n_total_rec}, Labels: {n_labels}")
    print(f"- is_bill: {dict(is_bill_counter)} | bill_paid: {dict(bill_paid_counter)}")
    print(f"- Professions (top 10): {profession_counter.most_common(10)}")
    print(f"- Currency: {dict(currency_counter)}")
    print(f"Report written to: {report_path}")
    if created_plots:
        print(f"- Plots saved under: {(out_dir / 'plots').as_posix()} ({len(created_plots)} files)")
|
||||
|
||||
|
||||
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user