"""Platform truth generation and diffing. Generates ground-truth YAML from emulator profiles for gap analysis, and diffs truth against scraped platform data to find divergences. """ from __future__ import annotations import sys from common import _norm_system_id, resolve_platform_cores from validation import filter_files_by_mode def _serialize_source_ref(sr: object) -> str: """Convert a source_ref value to a clean string for serialization.""" if isinstance(sr, str): return sr if isinstance(sr, dict): parts = [f"{k}: {v}" for k, v in sr.items()] return "; ".join(parts) return str(sr) def _determine_core_mode( emu_name: str, profile: dict, cores_config: str | list | None, standalone_set: set[str] | None, ) -> str: """Determine effective mode (libretro/standalone) for a resolved core.""" if cores_config == "all_libretro": return "libretro" if standalone_set is not None: profile_names = {emu_name} | {str(c) for c in profile.get("cores", [])} if profile_names & standalone_set: return "standalone" return "libretro" ptype = profile.get("type", "libretro") if "standalone" in ptype and "libretro" in ptype: return "both" if "standalone" in ptype: return "standalone" return "libretro" def _enrich_hashes(entry: dict, db: dict) -> None: """Fill missing sibling hashes from the database, ground-truth preserving. The profile's hashes come from the emulator source code (ground truth). Any hash of a given file set of bytes is a projection of that same ground truth — sha1, md5, crc32 all identify the same bytes. If the profile has ONE ground-truth hash, the DB can supply its siblings. Lookup order (all are hash-anchored, never name-based): 1. SHA1 direct 2. MD5 -> SHA1 via indexes.by_md5 3. CRC32 -> SHA1 via indexes.by_crc32 (weaker 32-bit anchor, requires size match when profile has size) Name-based enrichment is NEVER used: a name alone has no ground-truth anchor, the file in bios/ may not match what the source code expects. Multi-hash entries (lists of accepted variants) are left untouched to preserve variant information. """ # Skip multi-hash entries — they express ground truth as "any of these N # variants", enriching with a single sibling would lose that information. for h in ("sha1", "md5", "crc32"): if isinstance(entry.get(h), list): return files_db = db.get("files", {}) indexes = db.get("indexes", {}) record = None # Anchor 1: SHA1 (strongest) sha1 = entry.get("sha1") if sha1 and isinstance(sha1, str): record = files_db.get(sha1) # Anchor 2: MD5 (strong) if record is None: md5 = entry.get("md5") if md5 and isinstance(md5, str): by_md5 = indexes.get("by_md5", {}) ref = by_md5.get(md5.lower()) if ref: ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None) if ref_sha1: record = files_db.get(ref_sha1) # Anchor 3: CRC32 (32-bit, collisions theoretically possible). # Require size match when profile has a size to guard against collisions. if record is None: crc = entry.get("crc32") if crc and isinstance(crc, str): by_crc32 = indexes.get("by_crc32", {}) ref = by_crc32.get(crc.lower()) if ref: ref_sha1 = ref if isinstance(ref, str) else (ref[0] if ref else None) if ref_sha1: candidate = files_db.get(ref_sha1) if candidate is not None: profile_size = entry.get("size") if not profile_size or candidate.get("size") == profile_size: record = candidate if record is None: return # Copy sibling hashes and size from the anchored record. # These are projections of the same ground-truth bytes. for field in ("sha1", "md5", "sha256", "crc32"): if not entry.get(field) and record.get(field): entry[field] = record[field] if not entry.get("size") and record.get("size"): entry["size"] = record["size"] def _merge_file_into_system( system: dict, file_entry: dict, emu_name: str, db: dict | None, ) -> None: """Merge a file entry into a system's file list, deduplicating by name.""" files = system.setdefault("files", []) name_lower = file_entry["name"].lower() existing = None for f in files: if f["name"].lower() == name_lower: existing = f break if existing is not None: existing["_cores"] = existing.get("_cores", set()) | {emu_name} sr = file_entry.get("source_ref") if sr is not None: sr_key = _serialize_source_ref(sr) existing["_source_refs"] = existing.get("_source_refs", set()) | {sr_key} else: existing.setdefault("_source_refs", set()) if file_entry.get("required") and not existing.get("required"): existing["required"] = True for h in ("sha1", "md5", "sha256", "crc32"): theirs = file_entry.get(h, "") ours = existing.get(h, "") # Skip empty strings if not theirs or theirs == "": continue if not ours or ours == "": existing[h] = theirs continue # Normalize to sets for multi-hash comparison t_list = theirs if isinstance(theirs, list) else [theirs] o_list = ours if isinstance(ours, list) else [ours] t_set = {str(v).lower() for v in t_list} o_set = {str(v).lower() for v in o_list} if not t_set & o_set: print( f"WARNING: hash conflict for {file_entry['name']} " f"({h}: {ours} vs {theirs}, core {emu_name})", file=sys.stderr, ) # Merge non-hash data fields if existing lacks them. # A core that creates an entry without size/path/validation may be # enriched by a sibling core that has those fields. for field in ( "size", "min_size", "max_size", "path", "validation", "description", "category", "hle_fallback", "note", "aliases", "contents", ): if file_entry.get(field) is not None and existing.get(field) is None: existing[field] = file_entry[field] return entry: dict = {"name": file_entry["name"]} if file_entry.get("required") is not None: entry["required"] = file_entry["required"] for field in ( "sha1", "md5", "sha256", "crc32", "size", "path", "description", "hle_fallback", "category", "note", "validation", "min_size", "max_size", "aliases", "contents", ): val = file_entry.get(field) if val is not None: entry[field] = val # Strip empty string hashes (profile says "" when hash is unknown) for h in ("sha1", "md5", "sha256", "crc32"): if entry.get(h) == "": del entry[h] # Normalize CRC32: strip 0x prefix, lowercase crc = entry.get("crc32") if isinstance(crc, str) and crc.startswith("0x"): entry["crc32"] = crc[2:].lower() elif isinstance(crc, str) and crc != crc.lower(): entry["crc32"] = crc.lower() entry["_cores"] = {emu_name} sr = file_entry.get("source_ref") if sr is not None: sr_key = _serialize_source_ref(sr) entry["_source_refs"] = {sr_key} else: entry["_source_refs"] = set() if db: _enrich_hashes(entry, db) files.append(entry) def _has_exploitable_data(entry: dict) -> bool: """Check if an entry has any data beyond its name that can drive verification. Applied AFTER merging all cores so entries benefit from enrichment by sibling cores before being judged empty. """ return bool( any(entry.get(h) for h in ("sha1", "md5", "sha256", "crc32")) or entry.get("path") or entry.get("size") or entry.get("min_size") or entry.get("max_size") or entry.get("validation") or entry.get("contents") ) def generate_platform_truth( platform_name: str, config: dict, registry_entry: dict, profiles: dict[str, dict], db: dict | None = None, target_cores: set[str] | None = None, ) -> dict: """Generate ground-truth system data for a platform from emulator profiles. Args: platform_name: platform identifier config: loaded platform config (via load_platform_config), has cores, systems, standalone_cores with inheritance resolved registry_entry: registry metadata for hash_type, verification_mode, etc. profiles: all loaded emulator profiles db: optional database for hash enrichment target_cores: optional hardware target core filter Returns a dict with platform metadata, systems, and per-file details including which cores reference each file. """ cores_config = config.get("cores") # Resolve standalone set for mode determination standalone_set: set[str] | None = None standalone_cores = config.get("standalone_cores") if isinstance(standalone_cores, list): standalone_set = {str(c) for c in standalone_cores} resolved = resolve_platform_cores(config, profiles, target_cores) # Build mapping: profile system ID -> platform system ID # Three strategies, tried in order: # 1. File-based: if the scraped platform already has this file, use its system # 2. Exact match: profile system ID == platform system ID # 3. Normalized match: strip manufacturer prefix + separators platform_sys_ids = set(config.get("systems", {}).keys()) # File->platform_system reverse index from scraped config file_to_plat_sys: dict[str, str] = {} for psid, sys_data in config.get("systems", {}).items(): for fe in sys_data.get("files", []): fname = fe.get("name", "").lower() if fname: file_to_plat_sys[fname] = psid for alias in fe.get("aliases", []): file_to_plat_sys[alias.lower()] = psid # Normalized ID -> platform system ID norm_to_platform: dict[str, str] = {} for psid in platform_sys_ids: norm_to_platform[_norm_system_id(psid)] = psid def _map_sys_id(profile_sid: str, file_name: str = "") -> str: """Map a profile system ID to the platform's system ID.""" # 1. File-based lookup (handles composites and name mismatches) if file_name: plat_sys = file_to_plat_sys.get(file_name.lower()) if plat_sys: return plat_sys # 2. Exact match if profile_sid in platform_sys_ids: return profile_sid # 3. Normalized match normed = _norm_system_id(profile_sid) return norm_to_platform.get(normed, profile_sid) systems: dict[str, dict] = {} cores_profiled: set[str] = set() cores_unprofiled: set[str] = set() # Track which cores contribute to each system system_cores: dict[str, dict[str, set[str]]] = {} for emu_name in sorted(resolved): profile = profiles.get(emu_name) if not profile: cores_unprofiled.add(emu_name) continue cores_profiled.add(emu_name) mode = _determine_core_mode(emu_name, profile, cores_config, standalone_set) raw_files = profile.get("files", []) if mode == "both": filtered = raw_files else: filtered = filter_files_by_mode( raw_files, standalone=(mode == "standalone") ) for fe in filtered: profile_sid = fe.get("system", "") if not profile_sid: sys_ids = profile.get("systems", []) profile_sid = sys_ids[0] if sys_ids else "unknown" sys_id = _map_sys_id(profile_sid, fe.get("name", "")) system = systems.setdefault(sys_id, {}) _merge_file_into_system(system, fe, emu_name, db) # Track core contribution per system sys_cov = system_cores.setdefault( sys_id, { "profiled": set(), "unprofiled": set(), }, ) sys_cov["profiled"].add(emu_name) # Ensure all systems of resolved cores have entries (even with 0 files). # This documents that the system is covered -the core was analyzed and # needs no external files for this system. for emu_name in cores_profiled: profile = profiles[emu_name] for prof_sid in profile.get("systems", []): sys_id = _map_sys_id(prof_sid) systems.setdefault(sys_id, {}) sys_cov = system_cores.setdefault( sys_id, { "profiled": set(), "unprofiled": set(), }, ) sys_cov["profiled"].add(emu_name) # Track unprofiled cores per system based on profile system lists for emu_name in cores_unprofiled: for sys_id in systems: sys_cov = system_cores.setdefault( sys_id, { "profiled": set(), "unprofiled": set(), }, ) sys_cov["unprofiled"].add(emu_name) # Drop files with no exploitable data AFTER all cores have contributed. # A file declared by one core without hash/size/path may be enriched by # another core that has the same entry with data — the filter must run # once at the end, not per-core at creation time. for sys_data in systems.values(): files_list = sys_data.get("files", []) if files_list: sys_data["files"] = [fe for fe in files_list if _has_exploitable_data(fe)] # Convert sets to sorted lists for serialization for sys_id, sys_data in systems.items(): for fe in sys_data.get("files", []): fe["_cores"] = sorted(fe.get("_cores", set())) fe["_source_refs"] = sorted(fe.get("_source_refs", set())) # Add per-system coverage cov = system_cores.get(sys_id, {}) sys_data["_coverage"] = { "cores_profiled": sorted(cov.get("profiled", set())), "cores_unprofiled": sorted(cov.get("unprofiled", set())), } return { "platform": platform_name, "generated": True, "systems": systems, "_coverage": { "cores_resolved": len(resolved), "cores_profiled": len(cores_profiled), "cores_unprofiled": sorted(cores_unprofiled), }, } # Platform truth diffing def _diff_system(truth_sys: dict, scraped_sys: dict) -> dict: """Compare files between truth and scraped for a single system.""" # Build truth index: name.lower() -> entry, alias.lower() -> entry truth_index: dict[str, dict] = {} for fe in truth_sys.get("files", []): truth_index[fe["name"].lower()] = fe for alias in fe.get("aliases", []): truth_index[alias.lower()] = fe # Build scraped index: name.lower() -> entry scraped_index: dict[str, dict] = {} for fe in scraped_sys.get("files", []): scraped_index[fe["name"].lower()] = fe missing: list[dict] = [] hash_mismatch: list[dict] = [] required_mismatch: list[dict] = [] extra_phantom: list[dict] = [] extra_unprofiled: list[dict] = [] matched_truth_names: set[str] = set() # Compare scraped files against truth for s_key, s_entry in scraped_index.items(): t_entry = truth_index.get(s_key) if t_entry is None: continue matched_truth_names.add(t_entry["name"].lower()) # Hash comparison for h in ("sha1", "md5", "crc32"): t_hash = t_entry.get(h, "") s_hash = s_entry.get(h, "") if not t_hash or not s_hash: continue # Normalize to list for multi-hash support t_list = t_hash if isinstance(t_hash, list) else [t_hash] s_list = s_hash if isinstance(s_hash, list) else [s_hash] t_set = {v.lower() for v in t_list} s_set = {v.lower() for v in s_list} if not t_set & s_set: hash_mismatch.append( { "name": s_entry["name"], "hash_type": h, f"truth_{h}": t_hash, f"scraped_{h}": s_hash, "truth_cores": list(t_entry.get("_cores", [])), } ) break # Required mismatch t_req = t_entry.get("required") s_req = s_entry.get("required") if t_req is not None and s_req is not None and t_req != s_req: required_mismatch.append( { "name": s_entry["name"], "truth_required": t_req, "scraped_required": s_req, } ) # Collect unmatched files from both sides unmatched_truth = [ fe for fe in truth_sys.get("files", []) if fe["name"].lower() not in matched_truth_names ] unmatched_scraped = { s_key: s_entry for s_key, s_entry in scraped_index.items() if s_key not in truth_index } # Hash-based fallback: detect platform renames (e.g. Batocera ROM → ROM1) # If an unmatched scraped file shares a hash with an unmatched truth file, # it's the same file under a different name — a platform rename, not a gap. rename_matched_truth: set[str] = set() rename_matched_scraped: set[str] = set() if unmatched_truth and unmatched_scraped: # Build hash → truth file index for unmatched truth files truth_hash_index: dict[str, dict] = {} for fe in unmatched_truth: for h in ("sha1", "md5", "crc32"): val = fe.get(h) if val and isinstance(val, str): truth_hash_index[val.lower()] = fe for s_key, s_entry in unmatched_scraped.items(): for h in ("sha1", "md5", "crc32"): s_val = s_entry.get(h) if not s_val or not isinstance(s_val, str): continue t_entry = truth_hash_index.get(s_val.lower()) if t_entry is not None: # Rename detected — count as matched rename_matched_truth.add(t_entry["name"].lower()) rename_matched_scraped.add(s_key) break # Truth files not matched (by name, alias, or hash) -> missing for fe in unmatched_truth: if fe["name"].lower() not in rename_matched_truth: missing.append( { "name": fe["name"], "cores": list(fe.get("_cores", [])), "source_refs": list(fe.get("_source_refs", [])), } ) # Scraped files not in truth -> extra coverage = truth_sys.get("_coverage", {}) has_unprofiled = bool(coverage.get("cores_unprofiled")) for s_key, s_entry in unmatched_scraped.items(): if s_key in rename_matched_scraped: continue entry = {"name": s_entry["name"]} if has_unprofiled: extra_unprofiled.append(entry) else: extra_phantom.append(entry) result: dict = {} if missing: result["missing"] = missing if hash_mismatch: result["hash_mismatch"] = hash_mismatch if required_mismatch: result["required_mismatch"] = required_mismatch if extra_phantom: result["extra_phantom"] = extra_phantom if extra_unprofiled: result["extra_unprofiled"] = extra_unprofiled return result def _has_divergences(sys_div: dict) -> bool: """Check if a system divergence dict contains any actual divergences.""" return bool(sys_div) def _update_summary(summary: dict, sys_div: dict) -> None: """Update summary counters from a system divergence dict.""" summary["total_missing"] += len(sys_div.get("missing", [])) summary["total_extra_phantom"] += len(sys_div.get("extra_phantom", [])) summary["total_extra_unprofiled"] += len(sys_div.get("extra_unprofiled", [])) summary["total_hash_mismatch"] += len(sys_div.get("hash_mismatch", [])) summary["total_required_mismatch"] += len(sys_div.get("required_mismatch", [])) def diff_platform_truth(truth: dict, scraped: dict) -> dict: """Compare truth YAML against scraped YAML, returning divergences. System IDs are matched using normalized forms (via _norm_system_id) to handle naming differences between emulator profiles and scraped platforms (e.g. 'sega-game-gear' vs 'sega-gamegear'). """ truth_systems = truth.get("systems", {}) scraped_systems = scraped.get("systems", {}) summary = { "systems_compared": 0, "systems_fully_covered": 0, "systems_partially_covered": 0, "systems_uncovered": 0, "total_missing": 0, "total_extra_phantom": 0, "total_extra_unprofiled": 0, "total_hash_mismatch": 0, "total_required_mismatch": 0, } divergences: dict[str, dict] = {} uncovered_systems: list[str] = [] # Build normalized-ID lookup for truth systems norm_to_truth: dict[str, str] = {} for sid in truth_systems: norm_to_truth[_norm_system_id(sid)] = sid # Match scraped systems to truth via normalized IDs matched_truth: set[str] = set() for s_sid in sorted(scraped_systems): norm = _norm_system_id(s_sid) t_sid = norm_to_truth.get(norm) if t_sid is None: # Also try exact match (in case normalization is lossy) if s_sid in truth_systems: t_sid = s_sid else: uncovered_systems.append(s_sid) summary["systems_uncovered"] += 1 continue matched_truth.add(t_sid) summary["systems_compared"] += 1 sys_div = _diff_system(truth_systems[t_sid], scraped_systems[s_sid]) if _has_divergences(sys_div): divergences[s_sid] = sys_div _update_summary(summary, sys_div) summary["systems_partially_covered"] += 1 else: summary["systems_fully_covered"] += 1 # Truth systems not matched by any scraped system -all files missing for t_sid in sorted(truth_systems): if t_sid in matched_truth: continue summary["systems_compared"] += 1 sys_div = _diff_system(truth_systems[t_sid], {"files": []}) if _has_divergences(sys_div): divergences[t_sid] = sys_div _update_summary(summary, sys_div) summary["systems_partially_covered"] += 1 else: summary["systems_fully_covered"] += 1 result: dict = {"summary": summary} if divergences: result["divergences"] = divergences if uncovered_systems: result["uncovered_systems"] = uncovered_systems return result