feat: unify gap analysis with verify results and source provenance

Single source of truth for gap page: verification status from
verify.py (verified/untested/missing/mismatch), file provenance
from cross_reference (bios/data/large_file/missing).

cross_reference.py: rename _find_in_repo to _resolve_source, which now
returns the source category instead of a boolean; stop skipping
storage: release/large_file entries; add a by_path_suffix lookup;
add an all_declared param for the global check.

generate_site.py: gap page now shows verification by platform,
18 hash mismatches, and core complement with provenance breakdown.
This commit is contained in:
Abdessamad Derraz
2026-04-01 22:33:37 +02:00
parent cbb86c7746
commit 91925120c9
3 changed files with 721 additions and 300 deletions

View File

@@ -20,6 +20,8 @@ theme:
icon: material/brightness-4 icon: material/brightness-4
name: Switch to auto name: Switch to auto
font: false font: false
icon:
logo: material/chip
features: features:
- navigation.tabs - navigation.tabs
- navigation.sections - navigation.sections
@@ -29,6 +31,8 @@ theme:
- search.highlight - search.highlight
- content.tabs.link - content.tabs.link
- toc.follow - toc.follow
extra_css:
- stylesheets/extra.css
markdown_extensions: markdown_extensions:
- tables - tables
- admonition - admonition

View File

@@ -103,32 +103,41 @@ def _build_supplemental_index(
return names return names
def _find_in_repo( def _resolve_source(
fname: str, fname: str,
by_name: dict[str, list], by_name: dict[str, list],
by_name_lower: dict[str, str], by_name_lower: dict[str, str],
data_names: set[str] | None = None, data_names: set[str] | None = None,
) -> bool: by_path_suffix: dict | None = None,
) -> str | None:
"""Return the source category for a file, or None if not found.
Returns ``"bios"`` (in database.json / bios/), ``"data"`` (in data/),
or ``None`` (not available anywhere).
"""
# bios/ via database.json by_name
if fname in by_name: if fname in by_name:
return True return "bios"
# For directory entries or paths, extract the meaningful basename
stripped = fname.rstrip("/") stripped = fname.rstrip("/")
basename = stripped.rsplit("/", 1)[-1] if "/" in stripped else None basename = stripped.rsplit("/", 1)[-1] if "/" in stripped else None
if basename and basename in by_name: if basename and basename in by_name:
return True return "bios"
key = fname.lower() key = fname.lower()
if key in by_name_lower: if key in by_name_lower:
return True return "bios"
if basename: if basename:
key = basename.lower() if basename.lower() in by_name_lower:
if key in by_name_lower: return "bios"
return True # bios/ via by_path_suffix (regional variants)
if by_path_suffix and fname in by_path_suffix:
return "bios"
# data/ supplemental index
if data_names: if data_names:
if fname in data_names or key in data_names: if fname in data_names or key in data_names:
return True return "data"
if basename and (basename in data_names or basename.lower() in data_names): if basename and (basename in data_names or basename.lower() in data_names):
return True return "data"
return False return None
def cross_reference( def cross_reference(
@@ -137,30 +146,44 @@ def cross_reference(
db: dict, db: dict,
platform_data_dirs: dict[str, set[str]] | None = None, platform_data_dirs: dict[str, set[str]] | None = None,
data_names: set[str] | None = None, data_names: set[str] | None = None,
all_declared: set[str] | None = None,
) -> dict: ) -> dict:
"""Compare emulator profiles against platform declarations. """Compare emulator profiles against platform declarations.
Returns a report with gaps (files emulators need but platforms don't list) Returns a report with gaps (files emulators need but platforms don't list)
and coverage stats. Files covered by matching data_directories between and coverage stats. Each gap entry carries a ``source`` field indicating
emulator profile and platform config are not reported as gaps. where the file is available: ``"bios"`` (bios/ via database.json),
Checks both bios/ (via database) and data/ (via data_names index). ``"data"`` (data/ directory), ``"large_file"`` (GitHub release asset),
or ``"missing"`` (not available anywhere).
The boolean ``in_repo`` is derived: ``source != "missing"``.
When *all_declared* is provided (flat set of every filename declared by
any platform for any system), it is used for the ``in_platform`` check
instead of the per-system lookup. This is appropriate for the global
gap analysis page where "undeclared" means "no platform declares it at all".
""" """
platform_data_dirs = platform_data_dirs or {} platform_data_dirs = platform_data_dirs or {}
by_name = db.get("indexes", {}).get("by_name", {}) by_name = db.get("indexes", {}).get("by_name", {})
by_name_lower = {k.lower(): k for k in by_name} by_name_lower = {k.lower(): k for k in by_name}
by_md5 = db.get("indexes", {}).get("by_md5", {})
by_path_suffix = db.get("indexes", {}).get("by_path_suffix", {})
db_files = db.get("files", {})
report = {} report = {}
for emu_name, profile in profiles.items(): for emu_name, profile in profiles.items():
emu_files = profile.get("files", []) emu_files = profile.get("files", [])
systems = profile.get("systems", []) systems = profile.get("systems", [])
platform_names = set() if all_declared is not None:
for sys_id in systems: platform_names = all_declared
platform_names.update(declared.get(sys_id, set())) else:
platform_names = set()
for sys_id in systems:
platform_names.update(declared.get(sys_id, set()))
gaps = [] gaps = []
covered = [] covered = []
by_md5 = db.get("indexes", {}).get("by_md5", {})
for f in emu_files: for f in emu_files:
fname = f.get("name", "") fname = f.get("name", "")
if not fname: if not fname:
@@ -174,37 +197,45 @@ def cross_reference(
if "path" in f and f["path"] is None: if "path" in f and f["path"] is None:
continue continue
# Skip release asset files (stored in GitHub releases, not bios/)
if f.get("storage") == "release":
continue
# Skip standalone-only files # Skip standalone-only files
file_mode = f.get("mode", "both") file_mode = f.get("mode", "both")
if file_mode == "standalone": if file_mode == "standalone":
continue continue
# --- resolve source provenance ---
storage = f.get("storage", "")
if storage in ("release", "large_file"):
source = "large_file"
else:
source = _resolve_source(
fname, by_name, by_name_lower, data_names, by_path_suffix
)
if source is None:
path_field = f.get("path", "")
if path_field and path_field != fname:
source = _resolve_source(
path_field, by_name, by_name_lower,
data_names, by_path_suffix,
)
# Try MD5 hash match
if source is None:
md5_raw = f.get("md5", "")
if md5_raw:
for md5_val in md5_raw.split(","):
md5_val = md5_val.strip().lower()
if md5_val and by_md5.get(md5_val):
source = "bios"
break
# Try SHA1 hash match
if source is None:
sha1 = f.get("sha1", "")
if sha1 and sha1 in db_files:
source = "bios"
if source is None:
source = "missing"
in_repo = source != "missing"
in_platform = fname in platform_names in_platform = fname in platform_names
in_repo = _find_in_repo(fname, by_name, by_name_lower, data_names)
if not in_repo:
path_field = f.get("path", "")
if path_field and path_field != fname:
in_repo = _find_in_repo(
path_field, by_name, by_name_lower, data_names
)
# Try MD5 hash match (handles files that exist under different names)
if not in_repo:
md5_raw = f.get("md5", "")
if md5_raw:
for md5_val in md5_raw.split(","):
md5_val = md5_val.strip().lower()
if md5_val and by_md5.get(md5_val):
in_repo = True
break
# Try SHA1 hash match
if not in_repo:
sha1 = f.get("sha1", "")
if sha1 and sha1 in db.get("files", {}):
in_repo = True
entry = { entry = {
"name": fname, "name": fname,
@@ -213,6 +244,7 @@ def cross_reference(
"source_ref": f.get("source_ref", ""), "source_ref": f.get("source_ref", ""),
"in_platform": in_platform, "in_platform": in_platform,
"in_repo": in_repo, "in_repo": in_repo,
"source": source,
} }
if not in_platform: if not in_platform:
@@ -227,7 +259,10 @@ def cross_reference(
"platform_covered": len(covered), "platform_covered": len(covered),
"gaps": len(gaps), "gaps": len(gaps),
"gap_in_repo": sum(1 for g in gaps if g["in_repo"]), "gap_in_repo": sum(1 for g in gaps if g["in_repo"]),
"gap_missing": sum(1 for g in gaps if not g["in_repo"]), "gap_missing": sum(1 for g in gaps if g["source"] == "missing"),
"gap_bios": sum(1 for g in gaps if g["source"] == "bios"),
"gap_data": sum(1 for g in gaps if g["source"] == "data"),
"gap_large_file": sum(1 for g in gaps if g["source"] == "large_file"),
"gap_details": gaps, "gap_details": gaps,
} }
@@ -240,15 +275,19 @@ def print_report(report: dict) -> None:
print("=" * 60) print("=" * 60)
total_gaps = 0 total_gaps = 0
total_in_repo = 0 totals: dict[str, int] = {"bios": 0, "data": 0, "large_file": 0, "missing": 0}
total_missing = 0
for emu_name, data in sorted(report.items()): for emu_name, data in sorted(report.items()):
gaps = data["gaps"] gaps = data["gaps"]
if gaps == 0: if gaps == 0:
status = "OK" continue
else:
status = f"{data['gap_in_repo']} in repo, {data['gap_missing']} missing" parts = []
for key in ("bios", "data", "large_file", "missing"):
count = data.get(f"gap_{key}", 0)
if count:
parts.append(f"{count} {key}")
status = ", ".join(parts) if parts else "OK"
print(f"\n{data['emulator']} ({', '.join(data['systems'])})") print(f"\n{data['emulator']} ({', '.join(data['systems'])})")
print( print(
@@ -256,23 +295,24 @@ def print_report(report: dict) -> None:
f"{data['platform_covered']} declared by platforms, " f"{data['platform_covered']} declared by platforms, "
f"{gaps} undeclared" f"{gaps} undeclared"
) )
print(f" Gaps: {status}")
if gaps > 0: for g in data["gap_details"]:
print(f" Gaps: {status}") req = "*" if g["required"] else " "
for g in data["gap_details"]: src = g.get("source", "missing").upper()
req = "*" if g["required"] else " " note = f" -- {g['note']}" if g["note"] else ""
loc = "repo" if g["in_repo"] else "MISSING" print(f" {req} {g['name']} [{src}]{note}")
note = f" -- {g['note']}" if g["note"] else ""
print(f" {req} {g['name']} [{loc}]{note}")
total_gaps += gaps total_gaps += gaps
total_in_repo += data["gap_in_repo"] for key in totals:
total_missing += data["gap_missing"] totals[key] += data.get(f"gap_{key}", 0)
print(f"\n{'=' * 60}") print(f"\n{'=' * 60}")
print(f"Total: {total_gaps} undeclared files across all emulators") print(f"Total: {total_gaps} undeclared files across all emulators")
print(f" {total_in_repo} already in repo (can be added to packs)") available = totals["bios"] + totals["data"] + totals["large_file"]
print(f" {total_missing} missing from repo (need to be sourced)") print(f" {available} available (bios: {totals['bios']}, data: {totals['data']}, "
f"large_file: {totals['large_file']})")
print(f" {totals['missing']} missing (need to be sourced)")
def main(): def main():

File diff suppressed because it is too large Load Diff