From 617f5cc3642e2ab709e24c297ebe1bba251c8a05 Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Sat, 18 Apr 2026 09:00:31 +0200
Subject: [PATCH] refactor: hash-anchor truth enrichment

---
 scripts/truth.py  | 163 ++++++++++++++++++++++++++++++++++++++++------
 tests/test_e2e.py |   9 ++-
 2 files changed, 150 insertions(+), 22 deletions(-)

diff --git a/scripts/truth.py b/scripts/truth.py
index b66b406f..f61fc14f 100644
--- a/scripts/truth.py
+++ b/scripts/truth.py
@@ -12,6 +12,16 @@ from common import _norm_system_id, resolve_platform_cores
 from validation import filter_files_by_mode
 
 
+def _serialize_source_ref(sr: object) -> str:
+    """Convert a source_ref value to a clean string for serialization."""
+    if isinstance(sr, str):
+        return sr
+    if isinstance(sr, dict):
+        parts = [f"{k}: {v}" for k, v in sr.items()]
+        return "; ".join(parts)
+    return str(sr)
+
+
 def _determine_core_mode(
     emu_name: str,
     profile: dict,
@@ -35,31 +45,78 @@ def _determine_core_mode(
 
 
 def _enrich_hashes(entry: dict, db: dict) -> None:
-    """Fill missing hash fields from the database."""
-    sha1 = entry.get("sha1", "")
-    md5 = entry.get("md5", "")
+    """Fill missing sibling hashes from the database, ground-truth preserving.
 
-    # Hashes can be lists (multi-hash) — use first string value
-    if isinstance(sha1, list):
-        sha1 = sha1[0] if sha1 else ""
-    if isinstance(md5, list):
-        md5 = md5[0] if md5 else ""
+    The profile's hashes come from the emulator source code (ground truth).
+    Any hash of a given file set of bytes is a projection of that same
+    ground truth — sha1, md5, crc32 all identify the same bytes. If the
+    profile has ONE ground-truth hash, the DB can supply its siblings.
+
+    Lookup order (all are hash-anchored, never name-based):
+      1. SHA1 direct
+      2. MD5 -> SHA1 via indexes.by_md5
+      3. CRC32 -> SHA1 via indexes.by_crc32 (weaker 32-bit anchor,
+         requires size match when profile has size)
+
+    Name-based enrichment is NEVER used: a name alone has no ground-truth
+    anchor, the file in bios/ may not match what the source code expects.
+
+    Multi-hash entries (lists of accepted variants) are left untouched to
+    preserve variant information.
+    """
+    # Skip multi-hash entries — they express ground truth as "any of these N
+    # variants", enriching with a single sibling would lose that information.
+    for h in ("sha1", "md5", "crc32"):
+        if isinstance(entry.get(h), list):
+            return
+
+    files_db = db.get("files", {})
+    indexes = db.get("indexes", {})
 
     record = None
-    if sha1 and isinstance(sha1, str) and db.get("files"):
-        record = db["files"].get(sha1)
-    if record is None and md5:
-        by_md5 = db.get("by_md5", {})
-        md5_str = md5 if isinstance(md5, str) else md5[0] if md5 else ""
-        ref_sha1 = by_md5.get(md5_str.lower()) if md5_str else None
-        if ref_sha1 and db.get("files"):
-            record = db["files"].get(ref_sha1)
+
+    # Anchor 1: SHA1 (strongest)
+    sha1 = entry.get("sha1")
+    if sha1 and isinstance(sha1, str):
+        record = files_db.get(sha1)
+
+    # Anchor 2: MD5 (strong)
+    if record is None:
+        md5 = entry.get("md5")
+        if md5 and isinstance(md5, str):
+            by_md5 = indexes.get("by_md5", {})
+            ref = by_md5.get(md5.lower())
+            if ref:
+                ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None)
+                if ref_sha1:
+                    record = files_db.get(ref_sha1)
+
+    # Anchor 3: CRC32 (32-bit, collisions theoretically possible).
+    # Require size match when profile has a size to guard against collisions.
+    if record is None:
+        crc = entry.get("crc32")
+        if crc and isinstance(crc, str):
+            by_crc32 = indexes.get("by_crc32", {})
+            ref = by_crc32.get(crc.lower())
+            if ref:
+                ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None)
+                if ref_sha1:
+                    candidate = files_db.get(ref_sha1)
+                    if candidate is not None:
+                        profile_size = entry.get("size")
+                        if not profile_size or candidate.get("size") == profile_size:
+                            record = candidate
+
     if record is None:
         return
 
+    # Copy sibling hashes and size from the anchored record.
+    # These are projections of the same ground-truth bytes.
     for field in ("sha1", "md5", "sha256", "crc32"):
         if not entry.get(field) and record.get(field):
             entry[field] = record[field]
+    if not entry.get("size") and record.get("size"):
+        entry["size"] = record["size"]
 
 
 def _merge_file_into_system(
@@ -82,7 +139,7 @@ def _merge_file_into_system(
             existing["_cores"] = existing.get("_cores", set()) | {emu_name}
             sr = file_entry.get("source_ref")
             if sr is not None:
-                sr_key = str(sr) if not isinstance(sr, str) else sr
+                sr_key = _serialize_source_ref(sr)
                 existing["_source_refs"] = existing.get("_source_refs", set()) | {sr_key}
             else:
                 existing.setdefault("_source_refs", set())
@@ -91,14 +148,41 @@ def _merge_file_into_system(
             for h in ("sha1", "md5", "sha256", "crc32"):
                 theirs = file_entry.get(h, "")
                 ours = existing.get(h, "")
-                if theirs and ours and theirs.lower() != ours.lower():
+                # Skip empty strings
+                if not theirs or theirs == "":
+                    continue
+                if not ours or ours == "":
+                    existing[h] = theirs
+                    continue
+                # Normalize to sets for multi-hash comparison
+                t_list = theirs if isinstance(theirs, list) else [theirs]
+                o_list = ours if isinstance(ours, list) else [ours]
+                t_set = {str(v).lower() for v in t_list}
+                o_set = {str(v).lower() for v in o_list}
+                if not t_set & o_set:
                     print(
                         f"WARNING: hash conflict for {file_entry['name']} "
                         f"({h}: {ours} vs {theirs}, core {emu_name})",
                         file=sys.stderr,
                     )
-                elif theirs and not ours:
-                    existing[h] = theirs
+            # Merge non-hash data fields if existing lacks them.
+            # A core that creates an entry without size/path/validation may be
+            # enriched by a sibling core that has those fields.
+            for field in (
+                "size",
+                "min_size",
+                "max_size",
+                "path",
+                "validation",
+                "description",
+                "category",
+                "hle_fallback",
+                "note",
+                "aliases",
+                "contents",
+            ):
+                if file_entry.get(field) is not None and existing.get(field) is None:
+                    existing[field] = file_entry[field]
             return
 
     entry: dict = {"name": file_entry["name"]}
@@ -119,14 +203,25 @@ def _merge_file_into_system(
         "min_size",
         "max_size",
         "aliases",
+        "contents",
     ):
         val = file_entry.get(field)
         if val is not None:
             entry[field] = val
+    # Strip empty string hashes (profile says "" when hash is unknown)
+    for h in ("sha1", "md5", "sha256", "crc32"):
+        if entry.get(h) == "":
+            del entry[h]
+    # Normalize CRC32: strip 0x prefix, lowercase
+    crc = entry.get("crc32")
+    if isinstance(crc, str) and crc.lower().startswith("0x"):
+        entry["crc32"] = crc[2:].lower()
+    elif isinstance(crc, str) and crc != crc.lower():
+        entry["crc32"] = crc.lower()
     entry["_cores"] = {emu_name}
     sr = file_entry.get("source_ref")
     if sr is not None:
-        sr_key = str(sr) if not isinstance(sr, str) else sr
+        sr_key = _serialize_source_ref(sr)
         entry["_source_refs"] = {sr_key}
     else:
         entry["_source_refs"] = set()
@@ -137,6 +232,23 @@ def _merge_file_into_system(
     files.append(entry)
 
 
+def _has_exploitable_data(entry: dict) -> bool:
+    """Check if an entry has any data beyond its name that can drive verification.
+
+    Applied AFTER merging all cores so entries benefit from enrichment by
+    sibling cores before being judged empty.
+    """
+    return bool(
+        any(entry.get(h) for h in ("sha1", "md5", "sha256", "crc32"))
+        or entry.get("path")
+        or entry.get("size")
+        or entry.get("min_size")
+        or entry.get("max_size")
+        or entry.get("validation")
+        or entry.get("contents")
+    )
+
+
 def generate_platform_truth(
     platform_name: str,
     config: dict,
@@ -274,6 +386,15 @@ def generate_platform_truth(
                 )
                 sys_cov["unprofiled"].add(emu_name)
 
+    # Drop files with no exploitable data AFTER all cores have contributed.
+    # A file declared by one core without hash/size/path may be enriched by
+    # another core that has the same entry with data — the filter must run
+    # once at the end, not per-core at creation time.
+    for sys_data in systems.values():
+        files_list = sys_data.get("files", [])
+        if files_list:
+            sys_data["files"] = [fe for fe in files_list if _has_exploitable_data(fe)]
+
     # Convert sets to sorted lists for serialization
     for sys_id, sys_data in systems.items():
         for fe in sys_data.get("files", []):
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 321afdb3..b3435c91 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -3095,20 +3095,23 @@ class TestE2E(unittest.TestCase):
                     "system": "test-system",
                     "required": True,
                     "mode": "both",
+                    "sha1": "aaaa",
                 },
                 {
                     "name": "lr_only.bin",
                     "system": "test-system",
                     "required": True,
                     "mode": "libretro",
+                    "sha1": "bbbb",
                 },
                 {
                     "name": "sa_only.bin",
                     "system": "test-system",
                     "required": True,
                     "mode": "standalone",
+                    "sha1": "cccc",
                 },
-                {"name": "nomode.bin", "system": "test-system", "required": True},
+                {"name": "nomode.bin", "system": "test-system", "required": True, "sha1": "dddd"},
             ],
         }
         with open(os.path.join(self.emulators_dir, "dualmode.yml"), "w") as f:
@@ -3142,12 +3145,14 @@ class TestE2E(unittest.TestCase):
                     "system": "test-system",
                     "required": True,
                     "mode": "libretro",
+                    "sha1": "aaaa",
                 },
                 {
                     "name": "sa_file.bin",
                     "system": "test-system",
                     "required": True,
                     "mode": "standalone",
+                    "sha1": "bbbb",
                 },
             ],
         }
@@ -3183,6 +3188,7 @@ class TestE2E(unittest.TestCase):
                     "system": "test-system",
                     "required": False,
                     "source_ref": "a.cpp:10",
+                    "sha1": "aaaa",
                 },
             ],
         }
@@ -3197,6 +3203,7 @@ class TestE2E(unittest.TestCase):
                     "system": "test-system",
                     "required": True,
                     "source_ref": "b.cpp:20",
+                    "sha1": "aaaa",
                 },
             ],
         }