From 2aab7420d771b61b98bfc1727ae0f8fbdedba90a Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Sun, 29 Mar 2026 13:09:08 +0200
Subject: [PATCH] feat: add diff_platform_truth function and tests

---
 scripts/common.py | 162 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_e2e.py | 130 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 291 insertions(+), 1 deletion(-)

diff --git a/scripts/common.py b/scripts/common.py
index fce07427..7ccc95e4 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -1310,3 +1310,165 @@ def generate_platform_truth(
             "cores_unprofiled": sorted(cores_unprofiled),
         },
     }
+
+
+# -------------------------------------------------------------------
+# Platform truth diffing
+# -------------------------------------------------------------------
+
+def _diff_system(truth_sys: dict, scraped_sys: dict) -> dict:
+    """Compare the file lists of one system between truth and scraped data.
+
+    File names (and truth aliases) are matched case-insensitively.
+
+    Args:
+        truth_sys: Truth system; reads ``files`` and
+            ``_coverage["cores_unprofiled"]``.
+        scraped_sys: Scraped system; reads ``files``.
+
+    Returns:
+        Dict with only the non-empty categories among ``missing``,
+        ``hash_mismatch``, ``required_mismatch``, ``extra_phantom`` and
+        ``extra_unprofiled``; empty when nothing diverges.
+    """
+    # A YAML key with an empty value loads as None, hence ``or`` fallbacks.
+    truth_files = truth_sys.get("files") or []
+
+    # Truth index: lowercased name and each lowercased alias -> entry.
+    truth_index: dict[str, dict] = {}
+    for fe in truth_files:
+        truth_index[fe["name"].lower()] = fe
+        for alias in fe.get("aliases") or []:
+            truth_index[alias.lower()] = fe
+
+    # Scraped index: lowercased name -> entry (a later duplicate wins).
+    scraped_index: dict[str, dict] = {
+        fe["name"].lower(): fe for fe in scraped_sys.get("files") or []
+    }
+
+    # Insertion order here fixes the key order of the returned dict.
+    found: dict[str, list[dict]] = {key: [] for key in (
+        "missing", "hash_mismatch", "required_mismatch",
+        "extra_phantom", "extra_unprofiled")}
+    matched_truth_names: set[str] = set()
+
+    for s_key, s_entry in scraped_index.items():
+        t_entry = truth_index.get(s_key)
+        if t_entry is None:
+            continue
+        matched_truth_names.add(t_entry["name"].lower())
+
+        # Report only the first differing hash type; absent hashes are skipped.
+        for h in ("sha1", "md5", "crc32"):
+            t_hash = t_entry.get(h, "")
+            s_hash = s_entry.get(h, "")
+            # str(): an all-digit hash may have been loaded from YAML as int.
+            if t_hash and s_hash and str(t_hash).lower() != str(s_hash).lower():
+                found["hash_mismatch"].append({
+                    "name": s_entry["name"],
+                    "hash_type": h,
+                    f"truth_{h}": t_hash,
+                    f"scraped_{h}": s_hash,
+                    "truth_cores": list(t_entry.get("_cores") or []),
+                })
+                break
+
+        # ``required`` is compared only when both sides state it.
+        t_req = t_entry.get("required")
+        s_req = s_entry.get("required")
+        if t_req is not None and s_req is not None and t_req != s_req:
+            found["required_mismatch"].append({
+                "name": s_entry["name"],
+                "truth_required": t_req,
+                "scraped_required": s_req,
+            })
+
+    # Truth files that no scraped name matched are missing.
+    for fe in truth_files:
+        if fe["name"].lower() not in matched_truth_names:
+            found["missing"].append({
+                "name": fe["name"],
+                "cores": list(fe.get("_cores") or []),
+                "source_refs": list(fe.get("_source_refs") or []),
+            })
+
+    # Scraped files unknown to truth go to "extra_unprofiled" when coverage
+    # lists any ``cores_unprofiled``, otherwise to "extra_phantom".
+    unprofiled = bool((truth_sys.get("_coverage") or {}).get("cores_unprofiled"))
+    extra = found["extra_unprofiled" if unprofiled else "extra_phantom"]
+    for s_key, s_entry in scraped_index.items():
+        if s_key not in truth_index:
+            extra.append({"name": s_entry["name"]})
+
+    return {key: items for key, items in found.items() if items}
+
+
+def _has_divergences(sys_div: dict) -> bool:
+    """Return whether the divergence dict of one system has any entry."""
+    return True if sys_div else False
+
+
+def _update_summary(summary: dict, sys_div: dict) -> None:
+    """Add each category's entry count in *sys_div* to its ``total_*`` counter."""
+    categories = (
+        "missing", "extra_phantom", "extra_unprofiled",
+        "hash_mismatch", "required_mismatch")
+    for category in categories:
+        summary[f"total_{category}"] += len(sys_div.get(category, []))
+
+
+def diff_platform_truth(truth: dict, scraped: dict) -> dict:
+    """Compare truth YAML against scraped YAML, returning divergences.
+
+    Args:
+        truth: Parsed truth data with a ``systems`` mapping.
+        scraped: Parsed scraped data with a ``systems`` mapping.
+
+    Returns:
+        Dict with ``summary`` counters, plus ``divergences`` (by system id)
+        and ``uncovered_systems`` (scraped-only ids) when non-empty.
+    """
+    # ``systems:`` with no value loads from YAML as None; treat it as empty.
+    truth_systems = truth.get("systems") or {}
+    scraped_systems = scraped.get("systems") or {}
+
+    summary = {
+        "systems_compared": 0,
+        "systems_fully_covered": 0,
+        "systems_partially_covered": 0,
+        "systems_uncovered": 0,
+        "total_missing": 0,
+        "total_extra_phantom": 0,
+        "total_extra_unprofiled": 0,
+        "total_hash_mismatch": 0,
+        "total_required_mismatch": 0,
+    }
+    divergences: dict[str, dict] = {}
+    uncovered_systems: list[str] = []
+
+    for sys_id in sorted(set(truth_systems) | set(scraped_systems)):
+        if sys_id not in truth_systems:
+            uncovered_systems.append(sys_id)
+            summary["systems_uncovered"] += 1
+            continue
+
+        summary["systems_compared"] += 1
+        # A system absent from scraped is diffed against an empty file list.
+        sys_div = _diff_system(
+            truth_systems[sys_id] or {},
+            scraped_systems.get(sys_id) or {"files": []},
+        )
+
+        if _has_divergences(sys_div):
+            divergences[sys_id] = sys_div
+            _update_summary(summary, sys_div)
+            summary["systems_partially_covered"] += 1
+        else:
+            summary["systems_fully_covered"] += 1
+
+    result: dict = {"summary": summary}
+    if divergences:
+        result["divergences"] = divergences
+    if uncovered_systems:
+        result["uncovered_systems"] = uncovered_systems
+    return result
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 88aadc0c..6d862807 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -31,7 +31,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts"))
 import yaml
 from common import (
     _build_validation_index, build_zip_contents_index, check_file_validation,
-    check_inside_zip, compute_hashes, filter_files_by_mode,
+    check_inside_zip, compute_hashes, diff_platform_truth,
+    filter_files_by_mode,
     generate_platform_truth,
     group_identical_platforms, load_emulator_profiles, load_platform_config,
     md5_composite, md5sum, parse_md5_list, resolve_local_file,
@@ -2885,5 +2886,132 @@ class TestE2E(unittest.TestCase):
         self.assertFalse((missing_dir / "prod.keys").exists())
 
 
+    # ---------------------------------------------------------------
+    # diff_platform_truth tests
+    # ---------------------------------------------------------------
+
+    def test_98_diff_truth_missing(self):
+        """A truth file absent from scraped is reported with cores/source_refs."""
+        present = {"name": "bios_a.bin", "required": True, "md5": "aaa",
+                   "_cores": ["core_a"], "_source_refs": ["src/a.c:10"]}
+        absent = {"name": "bios_b.bin", "required": False, "md5": "bbb",
+                  "_cores": ["core_a"], "_source_refs": ["src/b.c:20"]}
+        coverage = {"cores_profiled": ["core_a"], "cores_unprofiled": []}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [present, absent],
+        }}}
+        scraped_data = {"systems": {"test-sys": {
+            "files": [{"name": "bios_a.bin", "md5": "aaa"}],
+        }}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        self.assertEqual(report["summary"]["total_missing"], 1)
+        missing = report["divergences"]["test-sys"]["missing"]
+        self.assertEqual(len(missing), 1)
+        entry = missing[0]
+        self.assertEqual(entry["name"], "bios_b.bin")
+        self.assertEqual(entry["cores"], ["core_a"])
+        self.assertEqual(entry["source_refs"], ["src/b.c:20"])
+
+    def test_99_diff_truth_extra_phantom(self):
+        """With no unprofiled core, an unknown scraped file is a phantom."""
+        known = {"name": "bios.bin", "md5": "aaa",
+                 "_cores": ["core_a"], "_source_refs": []}
+        coverage = {"cores_profiled": ["core_a"], "cores_unprofiled": []}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [known],
+        }}}
+        scraped_files = [
+            {"name": "bios.bin", "md5": "aaa"},
+            {"name": "phantom.bin", "md5": "zzz"},
+        ]
+        scraped_data = {"systems": {"test-sys": {"files": scraped_files}}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        self.assertEqual(report["summary"]["total_extra_phantom"], 1)
+        sys_div = report["divergences"]["test-sys"]
+        phantoms = sys_div["extra_phantom"]
+        self.assertEqual(len(phantoms), 1)
+        self.assertEqual(phantoms[0]["name"], "phantom.bin")
+
+    def test_100_diff_truth_extra_unprofiled(self):
+        """With an unprofiled core, an unknown scraped file is unprofiled."""
+        known = {"name": "bios.bin", "md5": "aaa",
+                 "_cores": ["core_a"], "_source_refs": []}
+        coverage = {"cores_profiled": ["core_a"],
+                    "cores_unprofiled": ["core_b"]}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [known],
+        }}}
+        scraped_files = [
+            {"name": "bios.bin", "md5": "aaa"},
+            {"name": "extra.bin", "md5": "yyy"},
+        ]
+        scraped_data = {"systems": {"test-sys": {"files": scraped_files}}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        self.assertEqual(report["summary"]["total_extra_unprofiled"], 1)
+        sys_div = report["divergences"]["test-sys"]
+        extras = sys_div["extra_unprofiled"]
+        self.assertEqual(len(extras), 1)
+        self.assertEqual(extras[0]["name"], "extra.bin")
+
+    def test_101_diff_truth_alias_matching(self):
+        """A scraped file named by a truth alias is neither extra nor missing."""
+        aliased = {"name": "bios.bin", "md5": "aaa", "aliases": ["alt.bin"],
+                   "_cores": ["core_a"], "_source_refs": []}
+        coverage = {"cores_profiled": ["core_a"], "cores_unprofiled": []}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [aliased],
+        }}}
+        scraped_files = [{"name": "alt.bin", "md5": "aaa"}]
+        scraped_data = {"systems": {"test-sys": {"files": scraped_files}}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        totals = report["summary"]
+        self.assertEqual(totals["total_missing"], 0)
+        self.assertEqual(totals["total_extra_phantom"], 0)
+        self.assertNotIn("test-sys", report.get("divergences", {}))
+
+    def test_102_diff_truth_case_insensitive(self):
+        """Names differing only in case ('BIOS.ROM' vs 'bios.rom') match."""
+        upper = {"name": "BIOS.ROM", "md5": "aaa",
+                 "_cores": ["core_a"], "_source_refs": []}
+        coverage = {"cores_profiled": ["core_a"], "cores_unprofiled": []}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [upper],
+        }}}
+        scraped_files = [{"name": "bios.rom", "md5": "aaa"}]
+        scraped_data = {"systems": {"test-sys": {"files": scraped_files}}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        self.assertEqual(report["summary"]["total_missing"], 0)
+        diverging = report.get("divergences", {})
+        self.assertNotIn("test-sys", diverging)
+
+    def test_103_diff_truth_hash_mismatch(self):
+        """Differing md5 for one file -> hash_mismatch carrying truth_cores."""
+        truth_file = {"name": "bios.bin", "md5": "truth_hash",
+                      "_cores": ["core_a", "core_b"],
+                      "_source_refs": ["src/x.c:5"]}
+        coverage = {"cores_profiled": ["core_a"], "cores_unprofiled": []}
+        truth_data = {"systems": {"test-sys": {
+            "_coverage": coverage,
+            "files": [truth_file],
+        }}}
+        scraped_files = [{"name": "bios.bin", "md5": "scraped_hash"}]
+        scraped_data = {"systems": {"test-sys": {"files": scraped_files}}}
+        report = diff_platform_truth(truth_data, scraped_data)
+        self.assertEqual(report["summary"]["total_hash_mismatch"], 1)
+        sys_div = report["divergences"]["test-sys"]
+        mismatches = sys_div["hash_mismatch"]
+        self.assertEqual(len(mismatches), 1)
+        mismatch = mismatches[0]
+        self.assertEqual(mismatch["name"], "bios.bin")
+        self.assertEqual(mismatch["truth_cores"], ["core_a", "core_b"])
+        self.assertEqual(mismatch["truth_md5"], "truth_hash")
+        self.assertEqual(mismatch["scraped_md5"], "scraped_hash")
+
+
 if __name__ == "__main__":
     unittest.main()