feat: add diff_truth.py CLI

2026-06-12 18:15:30 -05:00 · 2026-03-29 13:12:29 +02:00
parent 2aab7420d7
commit 5247512bea
2 changed files with 208 additions and 1 deletions
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""Compare scraped platform YAMLs against ground-truth YAMLs.
+
+Usage:
+    python scripts/diff_truth.py --all
+    python scripts/diff_truth.py --platform retroarch
+    python scripts/diff_truth.py --platform retroarch --json
+    python scripts/diff_truth.py --all --format markdown
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(__file__))
+from common import diff_platform_truth, list_registered_platforms, load_platform_config
+
+try:
+    import yaml
+except ImportError:
+    print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
+    sys.exit(1)
+
+
+def _load_truth(truth_dir: str, platform: str) -> dict | None:
+    """Load the ground-truth YAML document for one platform.
+
+    Args:
+        truth_dir: Directory searched for ``<platform>.yml``.
+        platform: Platform name, used verbatim as the file stem.
+
+    Returns:
+        The parsed document, ``{}`` when the file parses to a falsy value
+        (for example an empty file), or ``None`` when the file does not exist.
+    """
+    path = os.path.join(truth_dir, f"{platform}.yml")
+    if not os.path.exists(path):
+        return None
+    # Decode explicitly as UTF-8: without it, open() falls back to the locale
+    # encoding, which can mis-decode or reject non-ASCII YAML content.
+    with open(path, encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+def _format_terminal(report: dict) -> str:
+    """Render one platform diff report as indented plain text.
+
+    Args:
+        report: Mapping with a required ``"summary"`` entry (system counts and
+            ``total_*`` counters) and optional ``"platform"``,
+            ``"divergences"`` and ``"uncovered_systems"`` entries.
+
+    Returns:
+        The report lines joined with newlines, without a trailing newline.
+
+    Raises:
+        KeyError: If ``"summary"``, one of its counters, or a field read from
+            a divergence entry is absent.
+    """
+    summary = report["summary"]
+    out: list[str] = [f"=== {report.get('platform', 'unknown')} ==="]
+
+    coverage = ", ".join(
+        (
+            f"{summary['systems_fully_covered']} full",
+            f"{summary['systems_partially_covered']} partial",
+            f"{summary['systems_uncovered']} uncovered",
+        )
+    )
+    out.append(f"  {summary['systems_compared']} systems compared: {coverage}")
+
+    # Only non-zero counters are listed on the divergence summary line.
+    counters = (
+        ("total_missing", "missing"),
+        ("total_extra_phantom", "phantom"),
+        ("total_extra_unprofiled", "unprofiled"),
+        ("total_hash_mismatch", "hash"),
+        ("total_required_mismatch", "required"),
+    )
+    nonzero = [f"{summary[key]} {word}" for key, word in counters if summary[key]]
+    out.append(f"  divergences: {', '.join(nonzero)}" if nonzero else "  no divergences")
+
+    # Tag order on the per-system header line differs from the detail order.
+    tags = (
+        ("missing", "MISSING"),
+        ("extra_phantom", "PHANTOM"),
+        ("extra_unprofiled", "UNPROF"),
+        ("hash_mismatch", "HASH"),
+        ("required_mismatch", "REQ"),
+    )
+    for system, found in sorted(report.get("divergences", {}).items()):
+        header = " ".join(f"{tag}:{len(found[key])}" for key, tag in tags if found.get(key))
+        out.append(f"  {system}: {header}")
+
+        out.extend(
+            f"    + {item['name']}  [{', '.join(item.get('cores', []))}]"
+            for item in found.get("missing", [])
+        )
+        for item in found.get("hash_mismatch", []):
+            kind = item["hash_type"]
+            # The truth/scraped field names depend on the hash type.
+            out.append(
+                "    ~ {}  {}: {} != {}".format(
+                    item["name"], kind, item[f"truth_{kind}"], item[f"scraped_{kind}"]
+                )
+            )
+        out.extend(f"    - {item['name']}  (phantom)" for item in found.get("extra_phantom", []))
+        out.extend(f"    ? {item['name']}  (unprofiled)" for item in found.get("extra_unprofiled", []))
+        out.extend(
+            f"    ! {item['name']}  required: {item['truth_required']} != {item['scraped_required']}"
+            for item in found.get("required_mismatch", [])
+        )
+
+    skipped = report.get("uncovered_systems", [])
+    if skipped:
+        out.append(f"  uncovered ({len(skipped)}): {', '.join(skipped)}")
+
+    return "\n".join(out)
+
+
+def _format_markdown(report: dict) -> str:
+    """Render one platform diff report as a Markdown document.
+
+    Args:
+        report: Mapping with a required ``"summary"`` entry (system counts and
+            ``total_*`` counters) and optional ``"platform"``,
+            ``"divergences"`` and ``"uncovered_systems"`` entries.
+
+    Returns:
+        Markdown text: a title, two summary lines, one section per system
+        with divergences, and an optional "Uncovered systems" section. The
+        line list ends with an empty entry, so the text ends with a newline.
+
+    Raises:
+        KeyError: If ``"summary"``, one of its counters, or a field read from
+            a divergence entry is absent.
+    """
+    summary = report["summary"]
+
+    coverage = " | ".join(
+        (
+            f"**{summary['systems_compared']}** systems compared",
+            f"**{summary['systems_fully_covered']}** full",
+            f"**{summary['systems_partially_covered']}** partial",
+            f"**{summary['systems_uncovered']}** uncovered",
+        )
+    )
+    # Unlike the terminal view, every counter is shown even when it is zero.
+    counters = (
+        ("total_missing", "missing"),
+        ("total_extra_phantom", "phantom"),
+        ("total_extra_unprofiled", "unprofiled"),
+        ("total_hash_mismatch", "hash"),
+        ("total_required_mismatch", "required"),
+    )
+    totals = " | ".join(f"**{summary[key]}** {word}" for key, word in counters)
+
+    out: list[str] = [f"# {report.get('platform', 'unknown')}", "", coverage, totals, ""]
+
+    for system, found in sorted(report.get("divergences", {}).items()):
+        out += [f"## {system}", ""]
+        for item in found.get("missing", []):
+            refs = item.get("source_refs")
+            # Source references, when present, follow the name as code spans.
+            suffix = (" " + " ".join(f"`{ref}`" for ref in refs)) if refs else ""
+            out.append(f"- **Add** `{item['name']}`{suffix}")
+        for item in found.get("hash_mismatch", []):
+            kind = item["hash_type"]
+            out.append(
+                "- **Fix hash** `{}` {}: `{}` != `{}`".format(
+                    item["name"], kind, item[f"truth_{kind}"], item[f"scraped_{kind}"]
+                )
+            )
+        out.extend(f"- **Remove** `{item['name']}` (phantom)" for item in found.get("extra_phantom", []))
+        out.extend(
+            f"- **Check** `{item['name']}` (unprofiled cores)"
+            for item in found.get("extra_unprofiled", [])
+        )
+        out.extend(
+            f"- **Fix required** `{item['name']}`: truth={item['truth_required']}, scraped={item['scraped_required']}"
+            for item in found.get("required_mismatch", [])
+        )
+        out.append("")
+
+    skipped = report.get("uncovered_systems", [])
+    if skipped:
+        out += ["## Uncovered systems", ""]
+        out += [f"- {system}" for system in skipped]
+        out.append("")
+
+    return "\n".join(out)
+
+
+def main() -> None:
+    """Parse CLI arguments, diff each selected platform, and print the result.
+
+    Exactly one of ``--all`` or ``--platform`` must be given. Platforms with
+    no truth YAML, or whose scraped config raises ``FileNotFoundError``, are
+    skipped; the skip notice goes to stderr and is suppressed under
+    ``--json``. With ``--json`` all reports are written to stdout as a single
+    JSON array after the loop; otherwise each report is printed as soon as it
+    is computed, using the formatter chosen by ``--format``.
+    """
+    parser = argparse.ArgumentParser(description="Compare scraped vs truth YAMLs")
+    selection = parser.add_mutually_exclusive_group(required=True)
+    selection.add_argument("--all", action="store_true", help="diff all registered platforms")
+    selection.add_argument("--platform", help="diff a single platform")
+    parser.add_argument("--json", action="store_true", dest="json_output", help="JSON output")
+    parser.add_argument("--format", choices=["terminal", "markdown"], default="terminal")
+    parser.add_argument("--truth-dir", default="dist/truth")
+    parser.add_argument("--platforms-dir", default="platforms")
+    parser.add_argument("--include-archived", action="store_true")
+    opts = parser.parse_args()
+
+    as_json = opts.json_output
+    names = (
+        list_registered_platforms(opts.platforms_dir, include_archived=opts.include_archived)
+        if opts.all
+        else [opts.platform]
+    )
+    render = {"markdown": _format_markdown}.get(opts.format, _format_terminal)
+    collected: list[dict] = []
+
+    for name in names:
+        truth = _load_truth(opts.truth_dir, name)
+        if truth is None:
+            if not as_json:
+                print(f"skip {name}: no truth YAML in {opts.truth_dir}/", file=sys.stderr)
+            continue
+
+        try:
+            scraped = load_platform_config(name, opts.platforms_dir)
+        except FileNotFoundError:
+            if not as_json:
+                print(f"skip {name}: no scraped config", file=sys.stderr)
+            continue
+
+        report = diff_platform_truth(truth, scraped)
+        report["platform"] = name
+
+        if as_json:
+            # JSON mode defers all output so stdout holds one valid array.
+            collected.append(report)
+            continue
+
+        print(render(report))
+        print()
+
+    if as_json:
+        json.dump(collected, sys.stdout, indent=2)
+        print()
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()