mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-18 23:02:33 -05:00
refactor: hash-anchor truth enrichment
This commit is contained in:
163
scripts/truth.py
163
scripts/truth.py
@@ -12,6 +12,16 @@ from common import _norm_system_id, resolve_platform_cores
|
|||||||
from validation import filter_files_by_mode
|
from validation import filter_files_by_mode
|
||||||
|
|
||||||
|
|
||||||
|
def _serialize_source_ref(sr: object) -> str:
|
||||||
|
"""Convert a source_ref value to a clean string for serialization."""
|
||||||
|
if isinstance(sr, str):
|
||||||
|
return sr
|
||||||
|
if isinstance(sr, dict):
|
||||||
|
parts = [f"{k}: {v}" for k, v in sr.items()]
|
||||||
|
return "; ".join(parts)
|
||||||
|
return str(sr)
|
||||||
|
|
||||||
|
|
||||||
def _determine_core_mode(
|
def _determine_core_mode(
|
||||||
emu_name: str,
|
emu_name: str,
|
||||||
profile: dict,
|
profile: dict,
|
||||||
@@ -35,31 +45,78 @@ def _determine_core_mode(
|
|||||||
|
|
||||||
|
|
||||||
def _enrich_hashes(entry: dict, db: dict) -> None:
|
def _enrich_hashes(entry: dict, db: dict) -> None:
|
||||||
"""Fill missing hash fields from the database."""
|
"""Fill missing sibling hashes from the database, ground-truth preserving.
|
||||||
sha1 = entry.get("sha1", "")
|
|
||||||
md5 = entry.get("md5", "")
|
|
||||||
|
|
||||||
# Hashes can be lists (multi-hash) — use first string value
|
The profile's hashes come from the emulator source code (ground truth).
|
||||||
if isinstance(sha1, list):
|
Any hash of a given file set of bytes is a projection of that same
|
||||||
sha1 = sha1[0] if sha1 else ""
|
ground truth — sha1, md5, crc32 all identify the same bytes. If the
|
||||||
if isinstance(md5, list):
|
profile has ONE ground-truth hash, the DB can supply its siblings.
|
||||||
md5 = md5[0] if md5 else ""
|
|
||||||
|
Lookup order (all are hash-anchored, never name-based):
|
||||||
|
1. SHA1 direct
|
||||||
|
2. MD5 -> SHA1 via indexes.by_md5
|
||||||
|
3. CRC32 -> SHA1 via indexes.by_crc32 (weaker 32-bit anchor,
|
||||||
|
requires size match when profile has size)
|
||||||
|
|
||||||
|
Name-based enrichment is NEVER used: a name alone has no ground-truth
|
||||||
|
anchor, the file in bios/ may not match what the source code expects.
|
||||||
|
|
||||||
|
Multi-hash entries (lists of accepted variants) are left untouched to
|
||||||
|
preserve variant information.
|
||||||
|
"""
|
||||||
|
# Skip multi-hash entries — they express ground truth as "any of these N
|
||||||
|
# variants", enriching with a single sibling would lose that information.
|
||||||
|
for h in ("sha1", "md5", "crc32"):
|
||||||
|
if isinstance(entry.get(h), list):
|
||||||
|
return
|
||||||
|
|
||||||
|
files_db = db.get("files", {})
|
||||||
|
indexes = db.get("indexes", {})
|
||||||
|
|
||||||
record = None
|
record = None
|
||||||
if sha1 and isinstance(sha1, str) and db.get("files"):
|
|
||||||
record = db["files"].get(sha1)
|
# Anchor 1: SHA1 (strongest)
|
||||||
if record is None and md5:
|
sha1 = entry.get("sha1")
|
||||||
by_md5 = db.get("by_md5", {})
|
if sha1 and isinstance(sha1, str):
|
||||||
md5_str = md5 if isinstance(md5, str) else md5[0] if md5 else ""
|
record = files_db.get(sha1)
|
||||||
ref_sha1 = by_md5.get(md5_str.lower()) if md5_str else None
|
|
||||||
if ref_sha1 and db.get("files"):
|
# Anchor 2: MD5 (strong)
|
||||||
record = db["files"].get(ref_sha1)
|
if record is None:
|
||||||
|
md5 = entry.get("md5")
|
||||||
|
if md5 and isinstance(md5, str):
|
||||||
|
by_md5 = indexes.get("by_md5", {})
|
||||||
|
ref = by_md5.get(md5.lower())
|
||||||
|
if ref:
|
||||||
|
ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None)
|
||||||
|
if ref_sha1:
|
||||||
|
record = files_db.get(ref_sha1)
|
||||||
|
|
||||||
|
# Anchor 3: CRC32 (32-bit, collisions theoretically possible).
|
||||||
|
# Require size match when profile has a size to guard against collisions.
|
||||||
|
if record is None:
|
||||||
|
crc = entry.get("crc32")
|
||||||
|
if crc and isinstance(crc, str):
|
||||||
|
by_crc32 = indexes.get("by_crc32", {})
|
||||||
|
ref = by_crc32.get(crc.lower())
|
||||||
|
if ref:
|
||||||
|
ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None)
|
||||||
|
if ref_sha1:
|
||||||
|
candidate = files_db.get(ref_sha1)
|
||||||
|
if candidate is not None:
|
||||||
|
profile_size = entry.get("size")
|
||||||
|
if not profile_size or candidate.get("size") == profile_size:
|
||||||
|
record = candidate
|
||||||
|
|
||||||
if record is None:
|
if record is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Copy sibling hashes and size from the anchored record.
|
||||||
|
# These are projections of the same ground-truth bytes.
|
||||||
for field in ("sha1", "md5", "sha256", "crc32"):
|
for field in ("sha1", "md5", "sha256", "crc32"):
|
||||||
if not entry.get(field) and record.get(field):
|
if not entry.get(field) and record.get(field):
|
||||||
entry[field] = record[field]
|
entry[field] = record[field]
|
||||||
|
if not entry.get("size") and record.get("size"):
|
||||||
|
entry["size"] = record["size"]
|
||||||
|
|
||||||
|
|
||||||
def _merge_file_into_system(
|
def _merge_file_into_system(
|
||||||
@@ -82,7 +139,7 @@ def _merge_file_into_system(
|
|||||||
existing["_cores"] = existing.get("_cores", set()) | {emu_name}
|
existing["_cores"] = existing.get("_cores", set()) | {emu_name}
|
||||||
sr = file_entry.get("source_ref")
|
sr = file_entry.get("source_ref")
|
||||||
if sr is not None:
|
if sr is not None:
|
||||||
sr_key = str(sr) if not isinstance(sr, str) else sr
|
sr_key = _serialize_source_ref(sr)
|
||||||
existing["_source_refs"] = existing.get("_source_refs", set()) | {sr_key}
|
existing["_source_refs"] = existing.get("_source_refs", set()) | {sr_key}
|
||||||
else:
|
else:
|
||||||
existing.setdefault("_source_refs", set())
|
existing.setdefault("_source_refs", set())
|
||||||
@@ -91,14 +148,41 @@ def _merge_file_into_system(
|
|||||||
for h in ("sha1", "md5", "sha256", "crc32"):
|
for h in ("sha1", "md5", "sha256", "crc32"):
|
||||||
theirs = file_entry.get(h, "")
|
theirs = file_entry.get(h, "")
|
||||||
ours = existing.get(h, "")
|
ours = existing.get(h, "")
|
||||||
if theirs and ours and theirs.lower() != ours.lower():
|
# Skip empty strings
|
||||||
|
if not theirs or theirs == "":
|
||||||
|
continue
|
||||||
|
if not ours or ours == "":
|
||||||
|
existing[h] = theirs
|
||||||
|
continue
|
||||||
|
# Normalize to sets for multi-hash comparison
|
||||||
|
t_list = theirs if isinstance(theirs, list) else [theirs]
|
||||||
|
o_list = ours if isinstance(ours, list) else [ours]
|
||||||
|
t_set = {str(v).lower() for v in t_list}
|
||||||
|
o_set = {str(v).lower() for v in o_list}
|
||||||
|
if not t_set & o_set:
|
||||||
print(
|
print(
|
||||||
f"WARNING: hash conflict for {file_entry['name']} "
|
f"WARNING: hash conflict for {file_entry['name']} "
|
||||||
f"({h}: {ours} vs {theirs}, core {emu_name})",
|
f"({h}: {ours} vs {theirs}, core {emu_name})",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
elif theirs and not ours:
|
# Merge non-hash data fields if existing lacks them.
|
||||||
existing[h] = theirs
|
# A core that creates an entry without size/path/validation may be
|
||||||
|
# enriched by a sibling core that has those fields.
|
||||||
|
for field in (
|
||||||
|
"size",
|
||||||
|
"min_size",
|
||||||
|
"max_size",
|
||||||
|
"path",
|
||||||
|
"validation",
|
||||||
|
"description",
|
||||||
|
"category",
|
||||||
|
"hle_fallback",
|
||||||
|
"note",
|
||||||
|
"aliases",
|
||||||
|
"contents",
|
||||||
|
):
|
||||||
|
if file_entry.get(field) is not None and existing.get(field) is None:
|
||||||
|
existing[field] = file_entry[field]
|
||||||
return
|
return
|
||||||
|
|
||||||
entry: dict = {"name": file_entry["name"]}
|
entry: dict = {"name": file_entry["name"]}
|
||||||
@@ -119,14 +203,25 @@ def _merge_file_into_system(
|
|||||||
"min_size",
|
"min_size",
|
||||||
"max_size",
|
"max_size",
|
||||||
"aliases",
|
"aliases",
|
||||||
|
"contents",
|
||||||
):
|
):
|
||||||
val = file_entry.get(field)
|
val = file_entry.get(field)
|
||||||
if val is not None:
|
if val is not None:
|
||||||
entry[field] = val
|
entry[field] = val
|
||||||
|
# Strip empty string hashes (profile says "" when hash is unknown)
|
||||||
|
for h in ("sha1", "md5", "sha256", "crc32"):
|
||||||
|
if entry.get(h) == "":
|
||||||
|
del entry[h]
|
||||||
|
# Normalize CRC32: strip 0x prefix, lowercase
|
||||||
|
crc = entry.get("crc32")
|
||||||
|
if isinstance(crc, str) and crc.startswith("0x"):
|
||||||
|
entry["crc32"] = crc[2:].lower()
|
||||||
|
elif isinstance(crc, str) and crc != crc.lower():
|
||||||
|
entry["crc32"] = crc.lower()
|
||||||
entry["_cores"] = {emu_name}
|
entry["_cores"] = {emu_name}
|
||||||
sr = file_entry.get("source_ref")
|
sr = file_entry.get("source_ref")
|
||||||
if sr is not None:
|
if sr is not None:
|
||||||
sr_key = str(sr) if not isinstance(sr, str) else sr
|
sr_key = _serialize_source_ref(sr)
|
||||||
entry["_source_refs"] = {sr_key}
|
entry["_source_refs"] = {sr_key}
|
||||||
else:
|
else:
|
||||||
entry["_source_refs"] = set()
|
entry["_source_refs"] = set()
|
||||||
@@ -137,6 +232,23 @@ def _merge_file_into_system(
|
|||||||
files.append(entry)
|
files.append(entry)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_exploitable_data(entry: dict) -> bool:
|
||||||
|
"""Check if an entry has any data beyond its name that can drive verification.
|
||||||
|
|
||||||
|
Applied AFTER merging all cores so entries benefit from enrichment by
|
||||||
|
sibling cores before being judged empty.
|
||||||
|
"""
|
||||||
|
return bool(
|
||||||
|
any(entry.get(h) for h in ("sha1", "md5", "sha256", "crc32"))
|
||||||
|
or entry.get("path")
|
||||||
|
or entry.get("size")
|
||||||
|
or entry.get("min_size")
|
||||||
|
or entry.get("max_size")
|
||||||
|
or entry.get("validation")
|
||||||
|
or entry.get("contents")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def generate_platform_truth(
|
def generate_platform_truth(
|
||||||
platform_name: str,
|
platform_name: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
@@ -274,6 +386,15 @@ def generate_platform_truth(
|
|||||||
)
|
)
|
||||||
sys_cov["unprofiled"].add(emu_name)
|
sys_cov["unprofiled"].add(emu_name)
|
||||||
|
|
||||||
|
# Drop files with no exploitable data AFTER all cores have contributed.
|
||||||
|
# A file declared by one core without hash/size/path may be enriched by
|
||||||
|
# another core that has the same entry with data — the filter must run
|
||||||
|
# once at the end, not per-core at creation time.
|
||||||
|
for sys_data in systems.values():
|
||||||
|
files_list = sys_data.get("files", [])
|
||||||
|
if files_list:
|
||||||
|
sys_data["files"] = [fe for fe in files_list if _has_exploitable_data(fe)]
|
||||||
|
|
||||||
# Convert sets to sorted lists for serialization
|
# Convert sets to sorted lists for serialization
|
||||||
for sys_id, sys_data in systems.items():
|
for sys_id, sys_data in systems.items():
|
||||||
for fe in sys_data.get("files", []):
|
for fe in sys_data.get("files", []):
|
||||||
|
|||||||
@@ -3095,20 +3095,23 @@ class TestE2E(unittest.TestCase):
|
|||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"mode": "both",
|
"mode": "both",
|
||||||
|
"sha1": "aaaa",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "lr_only.bin",
|
"name": "lr_only.bin",
|
||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"mode": "libretro",
|
"mode": "libretro",
|
||||||
|
"sha1": "bbbb",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "sa_only.bin",
|
"name": "sa_only.bin",
|
||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"mode": "standalone",
|
"mode": "standalone",
|
||||||
|
"sha1": "cccc",
|
||||||
},
|
},
|
||||||
{"name": "nomode.bin", "system": "test-system", "required": True},
|
{"name": "nomode.bin", "system": "test-system", "required": True, "sha1": "dddd"},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
with open(os.path.join(self.emulators_dir, "dualmode.yml"), "w") as f:
|
with open(os.path.join(self.emulators_dir, "dualmode.yml"), "w") as f:
|
||||||
@@ -3142,12 +3145,14 @@ class TestE2E(unittest.TestCase):
|
|||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"mode": "libretro",
|
"mode": "libretro",
|
||||||
|
"sha1": "aaaa",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "sa_file.bin",
|
"name": "sa_file.bin",
|
||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"mode": "standalone",
|
"mode": "standalone",
|
||||||
|
"sha1": "bbbb",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@@ -3183,6 +3188,7 @@ class TestE2E(unittest.TestCase):
|
|||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": False,
|
"required": False,
|
||||||
"source_ref": "a.cpp:10",
|
"source_ref": "a.cpp:10",
|
||||||
|
"sha1": "aaaa",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@@ -3197,6 +3203,7 @@ class TestE2E(unittest.TestCase):
|
|||||||
"system": "test-system",
|
"system": "test-system",
|
||||||
"required": True,
|
"required": True,
|
||||||
"source_ref": "b.cpp:20",
|
"source_ref": "b.cpp:20",
|
||||||
|
"sha1": "aaaa",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user