From d274d8a65c6073d84eeb5d09d1b3fd4b2b18cae6 Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Sun, 29 Mar 2026 14:15:06 +0200
Subject: [PATCH] fix: guard MD5 lookup against cross-name contamination

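resolve_local_file step 2 (pure MD5 lookup) now verifies that the
found file's name matches the requested name or is a .variants/
derivative. Prevents serving wrong files when an unrelated file
shares the same MD5 in the index (e.g. spi.zip returned for
a7ports.zip because RetroDECK expected an MD5 we don't have).

Also adds the "apple-" and "sammy-" manufacturer prefixes, and makes
generate_platform_truth map profile system IDs to platform system IDs
(by file name first, then exact ID, then normalized ID).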
---
 scripts/common.py | 67 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 9 deletions(-)

diff --git a/scripts/common.py b/scripts/common.py
index 8c34ec1b..0bb15384 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -350,18 +350,30 @@ def resolve_local_file(
             return path, "exact"
 
     # 2. MD5 direct lookup (skip for zipped_file: md5 is inner ROM, not container)
+    # Guard: only accept if the found file's name matches the requested name
+    # (or is a .variants/ derivative). Prevents cross-contamination when an
+    # unrelated file happens to share the same MD5 in the index.
+    _name_set = set(names_to_try)
+
+    def _md5_name_ok(candidate_path: str) -> bool:
+        """Tell whether the file at candidate_path is one that was asked for."""
+        base = os.path.basename(candidate_path)
+        # Accept exact names and .variants/ copies ("neogeo.zip.fc398ab4").
+        return base in _name_set or base.startswith(
+            tuple(f"{name}." for name in _name_set))
+
     if md5_list and not zipped_file:
         for md5_candidate in md5_list:
             sha1_match = by_md5.get(md5_candidate)
             if sha1_match and sha1_match in files_db:
                 path = files_db[sha1_match]["path"]
-                if os.path.exists(path):
+                if os.path.exists(path) and _md5_name_ok(path):
                     return path, "md5_exact"
             if len(md5_candidate) < 32:
                 for db_md5, db_sha1 in by_md5.items():
                     if db_md5.startswith(md5_candidate) and db_sha1 in files_db:
                         path = files_db[db_sha1]["path"]
-                        if os.path.exists(path):
+                        if os.path.exists(path) and _md5_name_ok(path):
                             return path, "md5_exact"
 
     # 3. No MD5 = any file with that name or alias (existence check)
@@ -686,10 +698,10 @@ def resolve_platform_cores(
 
 
 MANUFACTURER_PREFIXES = (
-    "microsoft-", "nintendo-", "sony-", "sega-", "snk-", "panasonic-",
-    "nec-", "epoch-", "mattel-", "fairchild-", "hartung-", "tiger-",
-    "magnavox-", "philips-", "bandai-", "casio-", "coleco-",
-    "commodore-", "sharp-", "sinclair-", "atari-",
+    "apple-", "microsoft-", "nintendo-", "sony-", "sega-", "snk-",
+    "panasonic-", "nec-", "epoch-", "mattel-", "fairchild-", "hartung-",
+    "tiger-", "magnavox-", "philips-", "bandai-", "casio-", "coleco-",
+    "commodore-", "sharp-", "sinclair-", "atari-", "sammy-",
 )
 
 
@@ -1277,6 +1289,42 @@ def generate_platform_truth(
 
     resolved = resolve_platform_cores(config, profiles, target_cores)
 
+    # Build mapping: profile system ID -> platform system ID
+    # Three strategies, tried in order:
+    # 1. File-based: if the scraped platform already has this file, use its system
+    # 2. Exact match: profile system ID == platform system ID
+    # 3. Normalized match: strip manufacturer prefix + separators
+    platform_sys_ids = set(config.get("systems", {}).keys())
+
+    # File→platform_system reverse index from scraped config
+    file_to_plat_sys: dict[str, str] = {}
+    for psid, sys_data in config.get("systems", {}).items():
+        for fe in sys_data.get("files", []):
+            fname = fe.get("name", "").lower()
+            if fname:
+                file_to_plat_sys[fname] = psid
+            for alias in fe.get("aliases", []):
+                file_to_plat_sys[alias.lower()] = psid
+
+    # Normalized ID → platform system ID (sorted: set order is not stable)
+    norm_to_platform: dict[str, str] = {}
+    for psid in sorted(platform_sys_ids):
+        norm_to_platform[_norm_system_id(psid)] = psid
+
+    def _map_sys_id(profile_sid: str, file_name: str = "") -> str:
+        """Resolve a profile system ID to the matching platform system ID.
+
+        A platform system that already lists file_name wins; otherwise the
+        exact ID, then the normalized ID, else profile_sid unchanged.
+        """
+        by_file = file_to_plat_sys.get(file_name.lower()) if file_name else None
+        if by_file:
+            return by_file
+        # No file hit: try the ID as-is, then its normalized form.
+        if profile_sid in platform_sys_ids:
+            return profile_sid
+        return norm_to_platform.get(_norm_system_id(profile_sid), profile_sid)
+
     systems: dict[str, dict] = {}
     cores_profiled: set[str] = set()
     cores_unprofiled: set[str] = set()
@@ -1298,10 +1346,11 @@ def generate_platform_truth(
             filtered = filter_files_by_mode(raw_files, standalone=(mode == "standalone"))
 
         for fe in filtered:
-            sys_id = fe.get("system", "")
-            if not sys_id:
+            profile_sid = fe.get("system", "")
+            if not profile_sid:
                 sys_ids = profile.get("systems", [])
-                sys_id = sys_ids[0] if sys_ids else "unknown"
+                profile_sid = sys_ids[0] if sys_ids else "unknown"
+            sys_id = _map_sys_id(profile_sid, fe.get("name", ""))
             system = systems.setdefault(sys_id, {})
             _merge_file_into_system(system, fe, emu_name, db)
             # Track core contribution per system