feat: add diff_truth.py CLI

This commit is contained in:
Abdessamad Derraz
2026-03-29 13:12:29 +02:00
parent 2aab7420d7
commit 5247512bea
2 changed files with 208 additions and 1 deletions

200
scripts/diff_truth.py Normal file
View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""Compare scraped platform YAMLs against ground-truth YAMLs.
Usage:
python scripts/diff_truth.py --all
python scripts/diff_truth.py --platform retroarch
python scripts/diff_truth.py --platform retroarch --json
python scripts/diff_truth.py --all --format markdown
"""
from __future__ import annotations
import argparse
import json
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
from common import diff_platform_truth, list_registered_platforms, load_platform_config
try:
import yaml
except ImportError:
print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
sys.exit(1)
def _load_truth(truth_dir: str, platform: str) -> dict | None:
path = os.path.join(truth_dir, f"{platform}.yml")
if not os.path.exists(path):
return None
with open(path) as f:
return yaml.safe_load(f) or {}
def _format_terminal(report: dict) -> str:
lines: list[str] = []
platform = report.get("platform", "unknown")
s = report["summary"]
lines.append(f"=== {platform} ===")
lines.append(
f" {s['systems_compared']} systems compared: "
f"{s['systems_fully_covered']} full, "
f"{s['systems_partially_covered']} partial, "
f"{s['systems_uncovered']} uncovered"
)
totals = []
if s["total_missing"]:
totals.append(f"{s['total_missing']} missing")
if s["total_extra_phantom"]:
totals.append(f"{s['total_extra_phantom']} phantom")
if s["total_extra_unprofiled"]:
totals.append(f"{s['total_extra_unprofiled']} unprofiled")
if s["total_hash_mismatch"]:
totals.append(f"{s['total_hash_mismatch']} hash")
if s["total_required_mismatch"]:
totals.append(f"{s['total_required_mismatch']} required")
if totals:
lines.append(f" divergences: {', '.join(totals)}")
else:
lines.append(" no divergences")
for sys_id, div in sorted(report.get("divergences", {}).items()):
labels: list[str] = []
if div.get("missing"):
labels.append(f"MISSING:{len(div['missing'])}")
if div.get("extra_phantom"):
labels.append(f"PHANTOM:{len(div['extra_phantom'])}")
if div.get("extra_unprofiled"):
labels.append(f"UNPROF:{len(div['extra_unprofiled'])}")
if div.get("hash_mismatch"):
labels.append(f"HASH:{len(div['hash_mismatch'])}")
if div.get("required_mismatch"):
labels.append(f"REQ:{len(div['required_mismatch'])}")
lines.append(f" {sys_id}: {' '.join(labels)}")
for m in div.get("missing", []):
cores = ", ".join(m.get("cores", []))
lines.append(f" + {m['name']} [{cores}]")
for h in div.get("hash_mismatch", []):
ht = h["hash_type"]
lines.append(f" ~ {h['name']} {ht}: {h[f'truth_{ht}']} != {h[f'scraped_{ht}']}")
for p in div.get("extra_phantom", []):
lines.append(f" - {p['name']} (phantom)")
for u in div.get("extra_unprofiled", []):
lines.append(f" ? {u['name']} (unprofiled)")
for r in div.get("required_mismatch", []):
lines.append(f" ! {r['name']} required: {r['truth_required']} != {r['scraped_required']}")
uncovered = report.get("uncovered_systems", [])
if uncovered:
lines.append(f" uncovered ({len(uncovered)}): {', '.join(uncovered)}")
return "\n".join(lines)
def _format_markdown(report: dict) -> str:
lines: list[str] = []
platform = report.get("platform", "unknown")
s = report["summary"]
lines.append(f"# {platform}")
lines.append("")
lines.append(
f"**{s['systems_compared']}** systems compared | "
f"**{s['systems_fully_covered']}** full | "
f"**{s['systems_partially_covered']}** partial | "
f"**{s['systems_uncovered']}** uncovered"
)
lines.append(
f"**{s['total_missing']}** missing | "
f"**{s['total_extra_phantom']}** phantom | "
f"**{s['total_extra_unprofiled']}** unprofiled | "
f"**{s['total_hash_mismatch']}** hash | "
f"**{s['total_required_mismatch']}** required"
)
lines.append("")
for sys_id, div in sorted(report.get("divergences", {}).items()):
lines.append(f"## {sys_id}")
lines.append("")
for m in div.get("missing", []):
refs = ""
if m.get("source_refs"):
refs = " " + " ".join(f"`{r}`" for r in m["source_refs"])
lines.append(f"- **Add** `{m['name']}`{refs}")
for h in div.get("hash_mismatch", []):
ht = h["hash_type"]
lines.append(f"- **Fix hash** `{h['name']}` {ht}: `{h[f'truth_{ht}']}` != `{h[f'scraped_{ht}']}`")
for p in div.get("extra_phantom", []):
lines.append(f"- **Remove** `{p['name']}` (phantom)")
for u in div.get("extra_unprofiled", []):
lines.append(f"- **Check** `{u['name']}` (unprofiled cores)")
for r in div.get("required_mismatch", []):
lines.append(f"- **Fix required** `{r['name']}`: truth={r['truth_required']}, scraped={r['scraped_required']}")
lines.append("")
uncovered = report.get("uncovered_systems", [])
if uncovered:
lines.append("## Uncovered systems")
lines.append("")
for u in uncovered:
lines.append(f"- {u}")
lines.append("")
return "\n".join(lines)
def main() -> None:
    """Command-line entry point: diff scraped platform configs against truth.

    Parses the command line (exactly one of ``--all`` / ``--platform`` is
    required), then for each selected platform loads the truth YAML and the
    scraped config, computes the diff report, and either prints it with the
    chosen formatter or, with ``--json``, collects it and dumps all reports
    as one JSON array on stdout at the end.

    Platforms with no truth YAML, or whose scraped config raises
    ``FileNotFoundError``, are skipped; a notice goes to stderr unless
    ``--json`` is active.
    """
    cli = argparse.ArgumentParser(description="Compare scraped vs truth YAMLs")
    target = cli.add_mutually_exclusive_group(required=True)
    target.add_argument("--all", action="store_true", help="diff all registered platforms")
    target.add_argument("--platform", help="diff a single platform")
    cli.add_argument("--json", action="store_true", dest="json_output", help="JSON output")
    cli.add_argument("--format", choices=["terminal", "markdown"], default="terminal")
    cli.add_argument("--truth-dir", default="dist/truth")
    cli.add_argument("--platforms-dir", default="platforms")
    cli.add_argument("--include-archived", action="store_true")
    opts = cli.parse_args()

    as_json = opts.json_output

    def note_skip(message: str) -> None:
        # Skip notices are suppressed in JSON mode.
        if not as_json:
            print(message, file=sys.stderr)

    platforms = (
        list_registered_platforms(opts.platforms_dir, include_archived=opts.include_archived)
        if opts.all
        else [opts.platform]
    )

    collected: list[dict] = []
    if opts.format == "markdown":
        render = _format_markdown
    else:
        render = _format_terminal

    for name in platforms:
        truth = _load_truth(opts.truth_dir, name)
        if truth is None:
            note_skip(f"skip {name}: no truth YAML in {opts.truth_dir}/")
            continue

        try:
            scraped = load_platform_config(name, opts.platforms_dir)
        except FileNotFoundError:
            note_skip(f"skip {name}: no scraped config")
            continue

        report = diff_platform_truth(truth, scraped)
        report["platform"] = name

        if as_json:
            # Deferred: all reports are emitted together after the loop.
            collected.append(report)
            continue

        print(render(report))
        print()

    if as_json:
        json.dump(collected, sys.stdout, indent=2)
        print()
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()