feat: hash-based matching for cross-reference

expand_platform_declared_names resolves platform file MD5s
through the database to recover canonical names and aliases,
eliminating false positive undeclared files when a platform
renames a file (e.g. Batocera ROM1 vs gsplus ROM).
This commit is contained in:
Abdessamad Derraz
2026-03-30 08:25:54 +02:00
parent 4db9e4350c
commit 54022e9db1
4 changed files with 83 additions and 18 deletions

View File

@@ -915,6 +915,39 @@ def filter_systems_by_target(
return filtered
def expand_platform_declared_names(config: dict, db: dict) -> set[str]:
    """Build the set of file names declared by a platform config.

    Every non-empty ``name`` found under ``config["systems"][*]["files"]``
    is included. The set is then enriched with the canonical name and the
    aliases stored in the database for each file whose MD5 can be resolved
    through ``db["indexes"]["by_md5"]``. This handles platforms that declare
    a file under a different name than the emulator profile
    (e.g. Batocera ROM1 vs gsplus ROM).

    Missing or null (``None``) containers -- ``systems``, a system entry,
    ``files``, ``indexes``, ``by_md5``, ``aliases`` -- are treated as empty
    rather than raising, since a YAML key with no value loads as ``None``.

    Args:
        config: Platform configuration mapping.
        db: Database mapping with a ``files`` table and an
            ``indexes.by_md5`` lookup whose values are keys into ``files``.

    Returns:
        Declared names plus any database names/aliases matched by MD5.
    """
    # ``dict.get(key, default)`` only falls back when the key is absent, so
    # ``or`` is used to also cover keys that are present with a null value.
    indexes = db.get("indexes") or {}
    by_md5 = indexes.get("by_md5") or {}
    files_db = db.get("files") or {}

    declared: set[str] = set()
    for system in (config.get("systems") or {}).values():
        for fe in (system or {}).get("files") or []:
            name = fe.get("name", "")
            if name:
                declared.add(name)

            md5 = fe.get("md5", "")
            if not md5:
                continue
            # Skip multi-hash and zippedFile entries (inner ROM MD5, not file MD5)
            if "," in md5 or fe.get("zippedFile"):
                continue

            # The lookup key is lowercased, so the index is presumably keyed
            # by lowercase hex digests -- confirm against the DB generator.
            sha1 = by_md5.get(md5.lower())
            if not sha1:
                continue

            entry = files_db.get(sha1) or {}
            db_name = entry.get("name", "")
            if db_name:
                declared.add(db_name)
            declared.update(entry.get("aliases") or [])
    return declared
# Validation and mode filtering - extracted to validation.py for SoC.
# Re-exported below for backward compatibility.

View File

@@ -28,7 +28,7 @@ sys.path.insert(0, os.path.dirname(__file__))
from common import (
MANUFACTURER_PREFIXES,
build_target_cores_cache, build_zip_contents_index, check_inside_zip,
compute_hashes, fetch_large_file, group_identical_platforms,
compute_hashes, expand_platform_declared_names, fetch_large_file, group_identical_platforms,
list_emulator_profiles, list_platform_system_ids, list_registered_platforms,
filter_systems_by_target, list_system_ids, load_database,
load_data_dir_registry, load_emulator_profiles, load_platform_config,
@@ -371,12 +371,8 @@ def _collect_emulator_extras(
by_path_suffix = db.get("indexes", {}).get("by_path_suffix", {})
# Build set of filenames already covered (platform baseline + first pass extras)
covered_names: set[str] = set()
for sys_id, system in config.get("systems", {}).items():
for fe in system.get("files", []):
n = fe.get("name", "")
if n:
covered_names.add(n)
# Enriched with canonical names from DB via MD5 (handles platform renaming)
covered_names = expand_platform_declared_names(config, db)
for e in extras:
covered_names.add(e["name"])

View File

@@ -31,10 +31,11 @@ from pathlib import Path
sys.path.insert(0, os.path.dirname(__file__))
from common import (
build_target_cores_cache, build_zip_contents_index, check_inside_zip,
compute_hashes, filter_systems_by_target, group_identical_platforms,
list_emulator_profiles, list_system_ids, load_data_dir_registry,
load_emulator_profiles, load_platform_config, md5sum, md5_composite,
require_yaml, resolve_local_file, resolve_platform_cores,
compute_hashes, expand_platform_declared_names, filter_systems_by_target,
group_identical_platforms, list_emulator_profiles, list_system_ids,
load_data_dir_registry, load_emulator_profiles, load_platform_config,
md5sum, md5_composite, require_yaml, resolve_local_file,
resolve_platform_cores,
)
yaml = require_yaml()
@@ -261,13 +262,9 @@ def find_undeclared_files(
data_names: set[str] | None = None,
) -> list[dict]:
"""Find files needed by cores but not declared in platform config."""
# Collect all filenames declared by this platform
declared_names: set[str] = set()
for sys_id, system in config.get("systems", {}).items():
for fe in system.get("files", []):
name = fe.get("name", "")
if name:
declared_names.add(name)
# Collect all filenames declared by this platform, enriched with
# canonical names from DB via MD5 (handles platform renaming)
declared_names = expand_platform_declared_names(config, db)
# Collect data_directory refs
declared_dd: set[str] = set()