From 2dd2b724c153f95d067f3d82584c4171c1f182a9 Mon Sep 17 00:00:00 2001 From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com> Date: Sun, 29 Mar 2026 13:05:51 +0200 Subject: [PATCH] feat: add generate_truth.py CLI --- scripts/common.py | 18 ++++-- scripts/generate_truth.py | 123 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 6 deletions(-) create mode 100644 scripts/generate_truth.py diff --git a/scripts/common.py b/scripts/common.py index 059400da..fce07427 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -1171,7 +1171,8 @@ def _enrich_hashes(entry: dict, db: dict) -> None: record = db["files"].get(sha1) if record is None and md5: by_md5 = db.get("by_md5", {}) - ref_sha1 = by_md5.get(md5.lower()) + md5_str = md5 if isinstance(md5, str) else md5[0] if md5 else "" + ref_sha1 = by_md5.get(md5_str.lower()) if md5_str else None if ref_sha1 and db.get("files"): record = db["files"].get(ref_sha1) if record is None: @@ -1197,9 +1198,12 @@ def _merge_file_into_system( if existing is not None: existing["_cores"] = existing.get("_cores", set()) | {emu_name} - existing["_source_refs"] = existing.get("_source_refs", set()) | ( - {file_entry["source_ref"]} if file_entry.get("source_ref") else set() - ) + sr = file_entry.get("source_ref") + if sr is not None: + sr_key = str(sr) if not isinstance(sr, str) else sr + existing["_source_refs"] = existing.get("_source_refs", set()) | {sr_key} + else: + existing.setdefault("_source_refs", set()) if file_entry.get("required") and not existing.get("required"): existing["required"] = True for h in ("sha1", "md5", "sha256", "crc32"): @@ -1226,8 +1230,10 @@ def _merge_file_into_system( if val is not None: entry[field] = val entry["_cores"] = {emu_name} - if file_entry.get("source_ref"): - entry["_source_refs"] = {file_entry["source_ref"]} + sr = file_entry.get("source_ref") + if sr is not None: + sr_key = str(sr) if not isinstance(sr, str) else sr + entry["_source_refs"] = {sr_key} else: 
entry["_source_refs"] = set()
diff --git a/scripts/generate_truth.py b/scripts/generate_truth.py
new file mode 100644
index 00000000..352b90a5
--- /dev/null
+++ b/scripts/generate_truth.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""Generate ground-truth YAML files per platform from emulator profiles.
+
+Usage:
+    python scripts/generate_truth.py --platform retroarch --output-dir dist/truth
+    python scripts/generate_truth.py --all --output-dir dist/truth
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(__file__))
+from common import (
+    generate_platform_truth,
+    list_registered_platforms,
+    load_database,
+    load_emulator_profiles,
+    load_target_config,
+)
+
+try:
+    import yaml
+except ImportError:
+    print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
+    sys.exit(1)
+
+DEFAULT_OUTPUT_DIR = "dist/truth"
+DEFAULT_PLATFORMS_DIR = "platforms"
+DEFAULT_EMULATORS_DIR = "emulators"
+DEFAULT_DB_FILE = "database.json"
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse CLI options; ``argv=None`` makes argparse read ``sys.argv[1:]``."""
+    parser = argparse.ArgumentParser(
+        description="Generate ground-truth YAML from emulator profiles",
+    )
+    # Exactly one of --all / --platform must be supplied.
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--all", action="store_true", help="all registered platforms")
+    group.add_argument("--platform", help="single platform name")
+    parser.add_argument(
+        "--output-dir", default=DEFAULT_OUTPUT_DIR, help="output directory",
+    )
+    parser.add_argument("--target", "-t", default=None, help="hardware target filter")
+    parser.add_argument(
+        "--include-archived", action="store_true",
+        help="include archived platforms with --all",
+    )
+    parser.add_argument("--platforms-dir", default=DEFAULT_PLATFORMS_DIR)
+    parser.add_argument("--emulators-dir", default=DEFAULT_EMULATORS_DIR)
+    parser.add_argument("--db", default=DEFAULT_DB_FILE, help="database.json path")
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> None:
+    """Write ``<output-dir>/<platform>.yml`` for each selected platform.
+
+    With ``--target``, a platform whose target config cannot be found is
+    reported and skipped instead of aborting the whole run.
+    """
+    args = parse_args(argv)
+
+    # Load registry (read as UTF-8 rather than the locale's default encoding)
+    registry_path = os.path.join(args.platforms_dir, "_registry.yml")
+    with open(registry_path, encoding="utf-8") as f:
+        # Tolerate both an empty file and an empty/null ``platforms:`` key.
+        registry = (yaml.safe_load(f) or {}).get("platforms") or {}
+
+    # Load emulator profiles
+    profiles = load_emulator_profiles(args.emulators_dir)
+
+    # Load database (optional)
+    db: dict | None = None
+    if os.path.exists(args.db):
+        db = load_database(args.db)
+
+    # Determine platforms
+    if args.all:
+        platforms = list_registered_platforms(
+            args.platforms_dir, include_archived=args.include_archived,
+        )
+    else:
+        platforms = [args.platform]
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    for name in platforms:
+        # Resolve target cores
+        target_cores: set[str] | None = None
+        if args.target:
+            try:
+                target_cores = load_target_config(
+                    name, args.target, args.platforms_dir,
+                )
+            except FileNotFoundError:
+                print(f" {name}: no target config, skipped")
+                continue
+
+        result = generate_platform_truth(
+            name, registry, profiles, db=db, target_cores=target_cores,
+        )
+
+        out_path = os.path.join(args.output_dir, f"{name}.yml")
+        # allow_unicode=True emits raw non-ASCII, so the file must be UTF-8.
+        with open(out_path, "w", encoding="utf-8") as f:
+            yaml.dump(
+                result, f,
+                default_flow_style=False, sort_keys=False, allow_unicode=True,
+            )
+
+        systems = result.get("systems", {})
+        n_files = sum(len(sys_data.get("files", {})) for sys_data in systems.values())
+        print(f" {name}: {len(systems)} systems, {n_files} files -> {out_path}")
+
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()