From 5247512beaf6f7148b5d94ba31d717769b8f49fd Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Sun, 29 Mar 2026 13:12:29 +0200
Subject: [PATCH] feat: add diff_truth.py CLI

---
NOTE(review): indentation of the scripts/common.py hunk was reconstructed
from whitespace-collapsed text; verify it against the target file before
applying. The blob id on the new-file index line predates the edits below.

 scripts/common.py     |   9 +-
 scripts/diff_truth.py | 236 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100644 scripts/diff_truth.py

diff --git a/scripts/common.py b/scripts/common.py
index 7ccc95e4..93762666 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -1349,7 +1349,14 @@ def _diff_system(truth_sys: dict, scraped_sys: dict) -> dict:
         for h in ("sha1", "md5", "crc32"):
             t_hash = t_entry.get(h, "")
             s_hash = s_entry.get(h, "")
-            if t_hash and s_hash and t_hash.lower() != s_hash.lower():
+            if not t_hash or not s_hash:
+                continue
+            # Normalize to list for multi-hash support
+            t_list = t_hash if isinstance(t_hash, list) else [t_hash]
+            s_list = s_hash if isinstance(s_hash, list) else [s_hash]
+            t_set = {v.lower() for v in t_list}
+            s_set = {v.lower() for v in s_list}
+            if not t_set & s_set:
                 hash_mismatch.append({
                     "name": s_entry["name"],
                     "hash_type": h,
diff --git a/scripts/diff_truth.py b/scripts/diff_truth.py
new file mode 100644
index 00000000..56bd3f54
--- /dev/null
+++ b/scripts/diff_truth.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""Compare scraped platform YAMLs against ground-truth YAMLs.
+
+Usage:
+    python scripts/diff_truth.py --all
+    python scripts/diff_truth.py --platform retroarch
+    python scripts/diff_truth.py --platform retroarch --json
+    python scripts/diff_truth.py --all --format markdown
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(__file__))
+from common import diff_platform_truth, list_registered_platforms, load_platform_config
+
+try:
+    import yaml
+except ImportError:
+    print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
+    sys.exit(1)
+
+
+def _load_truth(truth_dir: str, platform: str) -> dict | None:
+    """Load the truth YAML for *platform* from *truth_dir*.
+
+    Returns None when ``{truth_dir}/{platform}.yml`` does not exist, and an
+    empty dict when the parsed document is empty or otherwise falsy.
+    """
+    path = os.path.join(truth_dir, f"{platform}.yml")
+    if not os.path.exists(path):
+        return None
+    # Read as UTF-8 explicitly so parsing does not depend on the locale.
+    with open(path, encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+def _fmt_hash(value: object) -> str:
+    """Render a hash field that is either one value or a list of values.
+
+    Lists are joined with ", " instead of being printed as a Python repr;
+    any other value is passed through ``str`` unchanged.
+    """
+    if isinstance(value, (list, tuple)):
+        return ", ".join(str(v) for v in value)
+    return str(value)
+
+
+def _format_terminal(report: dict) -> str:
+    """Format one platform report as indented plain text.
+
+    Emits a header, the summary counters, one line per diverging system
+    followed by its individual entries, and finally the uncovered systems.
+    """
+    lines: list[str] = []
+    platform = report.get("platform", "unknown")
+    s = report["summary"]
+
+    lines.append(f"=== {platform} ===")
+    lines.append(
+        f" {s['systems_compared']} systems compared: "
+        f"{s['systems_fully_covered']} full, "
+        f"{s['systems_partially_covered']} partial, "
+        f"{s['systems_uncovered']} uncovered"
+    )
+
+    totals = []
+    if s["total_missing"]:
+        totals.append(f"{s['total_missing']} missing")
+    if s["total_extra_phantom"]:
+        totals.append(f"{s['total_extra_phantom']} phantom")
+    if s["total_extra_unprofiled"]:
+        totals.append(f"{s['total_extra_unprofiled']} unprofiled")
+    if s["total_hash_mismatch"]:
+        totals.append(f"{s['total_hash_mismatch']} hash")
+    if s["total_required_mismatch"]:
+        totals.append(f"{s['total_required_mismatch']} required")
+    if totals:
+        lines.append(f" divergences: {', '.join(totals)}")
+    else:
+        lines.append(" no divergences")
+
+    for sys_id, div in sorted(report.get("divergences", {}).items()):
+        labels: list[str] = []
+        if div.get("missing"):
+            labels.append(f"MISSING:{len(div['missing'])}")
+        if div.get("extra_phantom"):
+            labels.append(f"PHANTOM:{len(div['extra_phantom'])}")
+        if div.get("extra_unprofiled"):
+            labels.append(f"UNPROF:{len(div['extra_unprofiled'])}")
+        if div.get("hash_mismatch"):
+            labels.append(f"HASH:{len(div['hash_mismatch'])}")
+        if div.get("required_mismatch"):
+            labels.append(f"REQ:{len(div['required_mismatch'])}")
+        lines.append(f" {sys_id}: {' '.join(labels)}")
+
+        for m in div.get("missing", []):
+            cores = ", ".join(m.get("cores", []))
+            lines.append(f" + {m['name']} [{cores}]")
+        for h in div.get("hash_mismatch", []):
+            ht = h["hash_type"]
+            truth_val = _fmt_hash(h[f"truth_{ht}"])
+            scraped_val = _fmt_hash(h[f"scraped_{ht}"])
+            lines.append(f" ~ {h['name']} {ht}: {truth_val} != {scraped_val}")
+        for p in div.get("extra_phantom", []):
+            lines.append(f" - {p['name']} (phantom)")
+        for u in div.get("extra_unprofiled", []):
+            lines.append(f" ? {u['name']} (unprofiled)")
+        for r in div.get("required_mismatch", []):
+            lines.append(f" ! {r['name']} required: {r['truth_required']} != {r['scraped_required']}")
+
+    uncovered = report.get("uncovered_systems", [])
+    if uncovered:
+        lines.append(f" uncovered ({len(uncovered)}): {', '.join(uncovered)}")
+
+    return "\n".join(lines)
+
+
+def _format_markdown(report: dict) -> str:
+    """Format one platform report as a Markdown document.
+
+    Emits a top-level heading, two summary lines, one section per diverging
+    system listing the suggested actions, and an uncovered-systems section.
+    """
+    lines: list[str] = []
+    platform = report.get("platform", "unknown")
+    s = report["summary"]
+
+    lines.append(f"# {platform}")
+    lines.append("")
+    lines.append(
+        f"**{s['systems_compared']}** systems compared | "
+        f"**{s['systems_fully_covered']}** full | "
+        f"**{s['systems_partially_covered']}** partial | "
+        f"**{s['systems_uncovered']}** uncovered"
+    )
+    lines.append(
+        f"**{s['total_missing']}** missing | "
+        f"**{s['total_extra_phantom']}** phantom | "
+        f"**{s['total_extra_unprofiled']}** unprofiled | "
+        f"**{s['total_hash_mismatch']}** hash | "
+        f"**{s['total_required_mismatch']}** required"
+    )
+    lines.append("")
+
+    for sys_id, div in sorted(report.get("divergences", {}).items()):
+        lines.append(f"## {sys_id}")
+        lines.append("")
+        for m in div.get("missing", []):
+            refs = ""
+            if m.get("source_refs"):
+                refs = " " + " ".join(f"`{r}`" for r in m["source_refs"])
+            lines.append(f"- **Add** `{m['name']}`{refs}")
+        for h in div.get("hash_mismatch", []):
+            ht = h["hash_type"]
+            truth_val = _fmt_hash(h[f"truth_{ht}"])
+            scraped_val = _fmt_hash(h[f"scraped_{ht}"])
+            lines.append(f"- **Fix hash** `{h['name']}` {ht}: `{truth_val}` != `{scraped_val}`")
+        for p in div.get("extra_phantom", []):
+            lines.append(f"- **Remove** `{p['name']}` (phantom)")
+        for u in div.get("extra_unprofiled", []):
+            lines.append(f"- **Check** `{u['name']}` (unprofiled cores)")
+        for r in div.get("required_mismatch", []):
+            lines.append(f"- **Fix required** `{r['name']}`: truth={r['truth_required']}, scraped={r['scraped_required']}")
+        lines.append("")
+
+    uncovered = report.get("uncovered_systems", [])
+    if uncovered:
+        lines.append("## Uncovered systems")
+        lines.append("")
+        for u in uncovered:
+            lines.append(f"- {u}")
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """Parse CLI arguments, diff each selected platform and print the result.
+
+    Platforms lacking a truth YAML or a scraped config are skipped; the skip
+    notice goes to stderr and is suppressed in JSON mode.
+    """
+    parser = argparse.ArgumentParser(description="Compare scraped vs truth YAMLs")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--all", action="store_true", help="diff all registered platforms")
+    group.add_argument("--platform", help="diff a single platform")
+    parser.add_argument("--json", action="store_true", dest="json_output", help="JSON output")
+    parser.add_argument("--format", choices=["terminal", "markdown"], default="terminal")
+    parser.add_argument("--truth-dir", default="dist/truth")
+    parser.add_argument("--platforms-dir", default="platforms")
+    parser.add_argument("--include-archived", action="store_true")
+    args = parser.parse_args()
+
+    if args.all:
+        platforms = list_registered_platforms(args.platforms_dir, include_archived=args.include_archived)
+    else:
+        platforms = [args.platform]
+
+    reports: list[dict] = []
+    formatter = _format_markdown if args.format == "markdown" else _format_terminal
+
+    for platform in platforms:
+        truth = _load_truth(args.truth_dir, platform)
+        if truth is None:
+            if not args.json_output:
+                print(f"skip {platform}: no truth YAML in {args.truth_dir}/", file=sys.stderr)
+            continue
+
+        try:
+            scraped = load_platform_config(platform, args.platforms_dir)
+        except FileNotFoundError:
+            if not args.json_output:
+                print(f"skip {platform}: no scraped config", file=sys.stderr)
+            continue
+
+        report = diff_platform_truth(truth, scraped)
+        report["platform"] = platform
+
+        if args.json_output:
+            reports.append(report)
+        else:
+            print(formatter(report))
+            print()
+
+    if args.json_output:
+        json.dump(reports, sys.stdout, indent=2)
+        print()
+
+
+if __name__ == "__main__":
+    main()