feat: add by_sha256 index, fix reporting attribution

generate_db: add by_sha256 index for O(1) variant lookup.
verify: _find_best_variant uses indexed sha256 instead of O(n) scan.
validation: check_file_validation returns a (reason, emulators) tuple,
attributing each mismatch only to the emulators whose check actually failed.
beetle_psx: remove incorrect size field for ps1_rom.bin (the code does
not validate size; swanstation is the sole size authority).
This commit is contained in:
Abdessamad Derraz
2026-04-02 00:59:01 +02:00
parent 95b7a9813c
commit 0401d058a1
7 changed files with 7382 additions and 51 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -92,10 +92,9 @@ files:
note: "override_bios=1. Falls back to region BIOS if not found." note: "override_bios=1. Falls back to region BIOS if not found."
- name: "ps1_rom.bin" - name: "ps1_rom.bin"
description: "PS3 embedded PS1 BIOS (region-free override)" description: "PS3 embedded PS1 BIOS, first 512KB extracted (region-free override)"
region: "Auto" region: "Auto"
required: false required: false
size: 524288
sha1: "c40146361eb8cf670b19fdc9759190257803cab7" sha1: "c40146361eb8cf670b19fdc9759190257803cab7"
md5: "81bbe60ba7a3d1cea1d48c14cbcc647b" md5: "81bbe60ba7a3d1cea1d48c14cbcc647b"
validation: [sha1] validation: [sha1]

View File

@@ -168,6 +168,7 @@ def build_indexes(files: dict, aliases: dict) -> dict:
by_md5 = {} by_md5 = {}
by_name = {} by_name = {}
by_crc32 = {} by_crc32 = {}
by_sha256 = {}
by_path_suffix = {} by_path_suffix = {}
for sha1, entry in files.items(): for sha1, entry in files.items():
@@ -179,6 +180,7 @@ def build_indexes(files: dict, aliases: dict) -> dict:
by_name[name].append(sha1) by_name[name].append(sha1)
by_crc32[entry["crc32"]] = sha1 by_crc32[entry["crc32"]] = sha1
by_sha256[entry["sha256"]] = sha1
# Path suffix index for regional variant resolution # Path suffix index for regional variant resolution
suffix = _path_suffix(entry["path"]) suffix = _path_suffix(entry["path"])
@@ -208,6 +210,7 @@ def build_indexes(files: dict, aliases: dict) -> dict:
"by_md5": by_md5, "by_md5": by_md5,
"by_name": by_name, "by_name": by_name,
"by_crc32": by_crc32, "by_crc32": by_crc32,
"by_sha256": by_sha256,
"by_path_suffix": by_path_suffix, "by_path_suffix": by_path_suffix,
} }

View File

@@ -1308,10 +1308,11 @@ def generate_pack(
and validation_index and validation_index
): ):
fname = file_entry.get("name", "") fname = file_entry.get("name", "")
reason = check_file_validation( check = check_file_validation(
local_path, fname, validation_index, bios_dir local_path, fname, validation_index, bios_dir
) )
if reason: if check:
reason, emus_list = check
better = _find_candidate_satisfying_both( better = _find_candidate_satisfying_both(
file_entry, file_entry,
db, db,
@@ -1322,8 +1323,7 @@ def generate_pack(
if better: if better:
local_path = better local_path = better
else: else:
ventry = validation_index.get(fname, {}) emus = ", ".join(emus_list)
emus = ", ".join(ventry.get("emulators", []))
file_reasons.setdefault( file_reasons.setdefault(
dedup_key, dedup_key,
f"{platform_display} says OK but {emus} says {reason}", f"{platform_display} says OK but {emus} says {reason}",

View File

@@ -204,35 +204,56 @@ def build_ground_truth(filename: str, validation_index: dict[str, dict]) -> list
return result return result
def _emulators_for_check(
check_type: str, per_emulator: dict[str, dict],
) -> list[str]:
"""Return emulator names that validate a specific check type."""
result = []
for emu, detail in per_emulator.items():
emu_checks = detail.get("checks", [])
if check_type in emu_checks:
result.append(emu)
# adler32 is stored as known_hash, not always in validation list
if check_type == "adler32" and detail.get("expected", {}).get("adler32"):
if emu not in result:
result.append(emu)
return sorted(result)
def check_file_validation( def check_file_validation(
local_path: str, local_path: str,
filename: str, filename: str,
validation_index: dict[str, dict], validation_index: dict[str, dict],
bios_dir: str = "bios", bios_dir: str = "bios",
) -> str | None: ) -> tuple[str, list[str]] | None:
"""Check emulator-level validation on a resolved file. """Check emulator-level validation on a resolved file.
Supports: size (exact/min/max), crc32, md5, sha1, adler32, Supports: size (exact/min/max), crc32, md5, sha1, adler32,
signature (RSA-2048 PKCS1v15 SHA256), crypto (AES-128-CBC + SHA256). signature (RSA-2048 PKCS1v15 SHA256), crypto (AES-128-CBC + SHA256).
Returns None if all checks pass or no validation applies. Returns None if all checks pass or no validation applies.
Returns a reason string if a check fails. Returns (reason, emulators) tuple on failure, where *emulators*
lists only those cores whose check actually failed.
""" """
entry = validation_index.get(filename) entry = validation_index.get(filename)
if not entry: if not entry:
return None return None
checks = entry["checks"] checks = entry["checks"]
pe = entry.get("per_emulator", {})
# Size checks -sizes is a set of accepted values # Size checks -sizes is a set of accepted values
if "size" in checks: if "size" in checks:
actual_size = os.path.getsize(local_path) actual_size = os.path.getsize(local_path)
if entry["sizes"] and actual_size not in entry["sizes"]: if entry["sizes"] and actual_size not in entry["sizes"]:
expected = ",".join(str(s) for s in sorted(entry["sizes"])) expected = ",".join(str(s) for s in sorted(entry["sizes"]))
return f"size mismatch: got {actual_size}, accepted [{expected}]" emus = _emulators_for_check("size", pe)
return f"size mismatch: got {actual_size}, accepted [{expected}]", emus
if entry["min_size"] is not None and actual_size < entry["min_size"]: if entry["min_size"] is not None and actual_size < entry["min_size"]:
return f"size too small: min {entry['min_size']}, got {actual_size}" emus = _emulators_for_check("size", pe)
return f"size too small: min {entry['min_size']}, got {actual_size}", emus
if entry["max_size"] is not None and actual_size > entry["max_size"]: if entry["max_size"] is not None and actual_size > entry["max_size"]:
return f"size too large: max {entry['max_size']}, got {actual_size}" emus = _emulators_for_check("size", pe)
return f"size too large: max {entry['max_size']}, got {actual_size}", emus
# Hash checks -compute once, reuse for all hash types. # Hash checks -compute once, reuse for all hash types.
# Each hash field is a set of accepted values (multiple valid ROM versions). # Each hash field is a set of accepted values (multiple valid ROM versions).
@@ -245,14 +266,23 @@ def check_file_validation(
if hash_type in checks and entry[hash_type]: if hash_type in checks and entry[hash_type]:
if hashes[hash_type].lower() not in entry[hash_type]: if hashes[hash_type].lower() not in entry[hash_type]:
expected = ",".join(sorted(entry[hash_type])) expected = ",".join(sorted(entry[hash_type]))
return f"{hash_type} mismatch: got {hashes[hash_type]}, accepted [{expected}]" emus = _emulators_for_check(hash_type, pe)
return (
f"{hash_type} mismatch: got {hashes[hash_type]}, "
f"accepted [{expected}]",
emus,
)
if entry["adler32"]: if entry["adler32"]:
actual_adler = hashes["adler32"].lower() actual_adler = hashes["adler32"].lower()
if entry.get("adler32_byteswap"): if entry.get("adler32_byteswap"):
actual_adler = _adler32_byteswapped(local_path) actual_adler = _adler32_byteswapped(local_path)
if actual_adler not in entry["adler32"]: if actual_adler not in entry["adler32"]:
expected = ",".join(sorted(entry["adler32"])) expected = ",".join(sorted(entry["adler32"]))
return f"adler32 mismatch: got 0x{actual_adler}, accepted [{expected}]" emus = _emulators_for_check("adler32", pe)
return (
f"adler32 mismatch: got 0x{actual_adler}, accepted [{expected}]",
emus,
)
# Signature/crypto checks (3DS RSA, AES) # Signature/crypto checks (3DS RSA, AES)
if entry["crypto_only"]: if entry["crypto_only"]:
@@ -260,7 +290,8 @@ def check_file_validation(
crypto_reason = check_crypto_validation(local_path, filename, bios_dir) crypto_reason = check_crypto_validation(local_path, filename, bios_dir)
if crypto_reason: if crypto_reason:
return crypto_reason emus = sorted(entry.get("emulators", []))
return crypto_reason, emus
return None return None

View File

@@ -103,8 +103,9 @@ def verify_entry_existence(
return {"name": name, "status": Status.MISSING, "required": required} return {"name": name, "status": Status.MISSING, "required": required}
result = {"name": name, "status": Status.OK, "required": required} result = {"name": name, "status": Status.OK, "required": required}
if validation_index: if validation_index:
reason = check_file_validation(local_path, name, validation_index) check = check_file_validation(local_path, name, validation_index)
if reason: if check:
reason, emus_list = check
suppressed = False suppressed = False
if db: if db:
better = _find_best_variant( better = _find_best_variant(
@@ -113,8 +114,7 @@ def verify_entry_existence(
if better: if better:
suppressed = True suppressed = True
if not suppressed: if not suppressed:
ventry = validation_index.get(name, {}) emus = ", ".join(emus_list)
emus = ", ".join(ventry.get("emulators", []))
result["discrepancy"] = ( result["discrepancy"] = (
f"file present (OK) but {emus} says {reason}" f"file present (OK) but {emus} says {reason}"
) )
@@ -608,24 +608,25 @@ def _find_best_variant(
# Pass 1: hash-based lookup from emulator expected values # Pass 1: hash-based lookup from emulator expected values
ventry = validation_index[fname] ventry = validation_index[fname]
indexes = db.get("indexes", {})
for hash_type, db_index_key in ( for hash_type, db_index_key in (
("sha1", "by_sha1"), ("sha1", None),
("md5", "by_md5"), ("md5", "by_md5"),
("crc32", "by_crc32"), ("crc32", "by_crc32"),
("sha256", "by_sha256"),
): ):
expected = ventry.get(hash_type) expected = ventry.get(hash_type)
if not expected: if not expected:
continue continue
db_index = db.get("indexes", {}).get(db_index_key, {}) if db_index_key is None:
if not db_index: # SHA1 is the primary key of files_db
# by_sha1 is the files dict itself (sha1 = primary key) for h in expected:
if hash_type == "sha1": if h in files_db:
for h in expected: result = _try_candidate(h)
if h in files_db: if result:
result = _try_candidate(h) return result
if result:
return result
continue continue
db_index = indexes.get(db_index_key, {})
for h in expected: for h in expected:
entries = db_index.get(h) entries = db_index.get(h)
if not entries: if not entries:
@@ -640,16 +641,7 @@ def _find_best_variant(
if result: if result:
return result return result
# Pass 2: SHA256 scan (no index, but emulators like ares validate by sha256) # Pass 2: name-based lookup (aliases, .variants/ with same filename)
expected_sha256 = ventry.get("sha256")
if expected_sha256:
for sha1, entry in files_db.items():
if entry.get("sha256", "").lower() in expected_sha256:
result = _try_candidate(sha1)
if result:
return result
# Pass 3: name-based lookup (aliases, .variants/ with same filename)
by_name = db.get("indexes", {}).get("by_name", {}) by_name = db.get("indexes", {}).get("by_name", {})
for sha1 in by_name.get(fname, []): for sha1 in by_name.get(fname, []):
result = _try_candidate(sha1) result = _try_candidate(sha1)
@@ -732,8 +724,11 @@ def verify_platform(
# mismatches are reported as discrepancies, not failures. # mismatches are reported as discrepancies, not failures.
if result["status"] == Status.OK and local_path and validation_index: if result["status"] == Status.OK and local_path and validation_index:
fname = file_entry.get("name", "") fname = file_entry.get("name", "")
reason = check_file_validation(local_path, fname, validation_index) check = check_file_validation(
if reason: local_path, fname, validation_index,
)
if check:
reason, emus_list = check
better = _find_best_variant( better = _find_best_variant(
file_entry, file_entry,
db, db,
@@ -741,8 +736,7 @@ def verify_platform(
validation_index, validation_index,
) )
if not better: if not better:
ventry = validation_index.get(fname, {}) emus = ", ".join(emus_list)
emus = ", ".join(ventry.get("emulators", []))
result["discrepancy"] = ( result["discrepancy"] = (
f"{platform} says OK but {emus} says {reason}" f"{platform} says OK but {emus} says {reason}"
) )
@@ -1233,8 +1227,9 @@ def verify_emulator(
result = {"name": name, "status": Status.MISSING, "required": required} result = {"name": name, "status": Status.MISSING, "required": required}
else: else:
# Apply emulator validation # Apply emulator validation
reason = check_file_validation(local_path, name, validation_index) check = check_file_validation(local_path, name, validation_index)
if reason: if check:
reason, _emus = check
result = { result = {
"name": name, "name": name,
"status": Status.UNTESTED, "status": Status.UNTESTED,

View File

@@ -1137,9 +1137,11 @@ class TestE2E(unittest.TestCase):
profiles = load_emulator_profiles(self.emulators_dir) profiles = load_emulator_profiles(self.emulators_dir)
index = _build_validation_index(profiles) index = _build_validation_index(profiles)
path = self.files["present_opt.bin"]["path"] path = self.files["present_opt.bin"]["path"]
reason = check_file_validation(path, "present_opt.bin", index) result = check_file_validation(path, "present_opt.bin", index)
self.assertIsNotNone(reason) self.assertIsNotNone(result)
reason, emus = result
self.assertIn("size mismatch", reason) self.assertIn("size mismatch", reason)
self.assertIsInstance(emus, list)
def test_73_validation_crc32_pass(self): def test_73_validation_crc32_pass(self):
"""File with correct CRC32 passes validation.""" """File with correct CRC32 passes validation."""
@@ -1154,9 +1156,11 @@ class TestE2E(unittest.TestCase):
profiles = load_emulator_profiles(self.emulators_dir) profiles = load_emulator_profiles(self.emulators_dir)
index = _build_validation_index(profiles) index = _build_validation_index(profiles)
path = self.files["no_md5.bin"]["path"] path = self.files["no_md5.bin"]["path"]
reason = check_file_validation(path, "no_md5.bin", index) result = check_file_validation(path, "no_md5.bin", index)
self.assertIsNotNone(reason) self.assertIsNotNone(result)
reason, emus = result
self.assertIn("crc32 mismatch", reason) self.assertIn("crc32 mismatch", reason)
self.assertIsInstance(emus, list)
def test_75_validation_applied_in_existence_mode(self): def test_75_validation_applied_in_existence_mode(self):
"""Existence mode reports discrepancy when validation fails, keeps OK.""" """Existence mode reports discrepancy when validation fails, keeps OK."""
@@ -1212,9 +1216,11 @@ class TestE2E(unittest.TestCase):
profiles = load_emulator_profiles(self.emulators_dir) profiles = load_emulator_profiles(self.emulators_dir)
index = _build_validation_index(profiles) index = _build_validation_index(profiles)
path = self.files["alias_target.bin"]["path"] path = self.files["alias_target.bin"]["path"]
reason = check_file_validation(path, "alias_target.bin", index) result = check_file_validation(path, "alias_target.bin", index)
self.assertIsNotNone(reason) self.assertIsNotNone(result)
reason, emus = result
self.assertIn("md5 mismatch", reason) self.assertIn("md5 mismatch", reason)
self.assertIsInstance(emus, list)
def test_81_validation_index_has_md5_sha1(self): def test_81_validation_index_has_md5_sha1(self):
"""Validation index stores md5 and sha1 when declared.""" """Validation index stores md5 and sha1 when declared."""