fix: normalize system IDs in diff matching

This commit is contained in:
Abdessamad Derraz
2026-03-29 14:07:14 +02:00
parent ae71a41b32
commit 13b20c2742
2 changed files with 70 additions and 19 deletions

View File

@@ -1455,7 +1455,12 @@ def _update_summary(summary: dict, sys_div: dict) -> None:
def diff_platform_truth(truth: dict, scraped: dict) -> dict:
"""Compare truth YAML against scraped YAML, returning divergences."""
"""Compare truth YAML against scraped YAML, returning divergences.
System IDs are matched using normalized forms (via _norm_system_id) to
handle naming differences between emulator profiles and scraped platforms
(e.g. 'sega-game-gear' vs 'sega-gamegear').
"""
truth_systems = truth.get("systems", {})
scraped_systems = scraped.get("systems", {})
@@ -1474,30 +1479,46 @@ def diff_platform_truth(truth: dict, scraped: dict) -> dict:
divergences: dict[str, dict] = {}
uncovered_systems: list[str] = []
all_sys_ids = sorted(set(truth_systems) | set(scraped_systems))
# Build normalized-ID lookup for truth systems
norm_to_truth: dict[str, str] = {}
for sid in truth_systems:
norm_to_truth[_norm_system_id(sid)] = sid
for sys_id in all_sys_ids:
in_truth = sys_id in truth_systems
in_scraped = sys_id in scraped_systems
# Match scraped systems to truth via normalized IDs
matched_truth: set[str] = set()
if in_scraped and not in_truth:
uncovered_systems.append(sys_id)
for s_sid in sorted(scraped_systems):
norm = _norm_system_id(s_sid)
t_sid = norm_to_truth.get(norm)
if t_sid is None:
# Also try exact match (in case normalization is lossy)
if s_sid in truth_systems:
t_sid = s_sid
else:
uncovered_systems.append(s_sid)
summary["systems_uncovered"] += 1
continue
matched_truth.add(t_sid)
summary["systems_compared"] += 1
if in_truth and not in_scraped:
# All truth files are missing
truth_sys = truth_systems[sys_id]
sys_div = _diff_system(truth_sys, {"files": []})
else:
truth_sys = truth_systems[sys_id]
scraped_sys = scraped_systems[sys_id]
sys_div = _diff_system(truth_sys, scraped_sys)
sys_div = _diff_system(truth_systems[t_sid], scraped_systems[s_sid])
if _has_divergences(sys_div):
divergences[sys_id] = sys_div
divergences[s_sid] = sys_div
_update_summary(summary, sys_div)
summary["systems_partially_covered"] += 1
else:
summary["systems_fully_covered"] += 1
# Truth systems not matched by any scraped system — all files missing
for t_sid in sorted(truth_systems):
if t_sid in matched_truth:
continue
summary["systems_compared"] += 1
sys_div = _diff_system(truth_systems[t_sid], {"files": []})
if _has_divergences(sys_div):
divergences[t_sid] = sys_div
_update_summary(summary, sys_div)
summary["systems_partially_covered"] += 1
else:

View File

@@ -3007,6 +3007,36 @@ class TestE2E(unittest.TestCase):
self.assertEqual(hm["scraped_md5"], "scraped_hash")
def test_104_diff_truth_normalized_system_ids(self):
    """System IDs that differ only in formatting are matched by the diff.

    The truth side names the system 'sega-gamegear' while the scraped
    side names it 'sega-game-gear'. Both list the same single file with
    the same MD5, so the summary is expected to report one compared
    system, no uncovered systems and no missing files.
    """
    from common import diff_platform_truth

    shared_md5 = "a" * 32
    truth_entry = {
        "name": "bios.gg",
        "required": True,
        "md5": shared_md5,
        "_cores": ["c"],
        "_source_refs": [],
    }
    scraped_entry = {"name": "bios.gg", "required": True, "md5": shared_md5}

    truth = {
        "systems": {
            "sega-gamegear": {
                "_coverage": {"cores_profiled": ["c"], "cores_unprofiled": []},
                "files": [truth_entry],
            },
        },
    }
    scraped = {"systems": {"sega-game-gear": {"files": [scraped_entry]}}}

    summary = diff_platform_truth(truth, scraped)["summary"]

    # The differently spelled IDs must resolve to a single matched pair.
    self.assertEqual(summary["systems_uncovered"], 0)
    self.assertEqual(summary["total_missing"], 0)
    self.assertEqual(summary["systems_compared"], 1)
# ---------------------------------------------------------------
# native_id preservation
# ---------------------------------------------------------------