libretro/scripts/verify.py

#!/usr/bin/env python3
"""Platform-native BIOS verification engine.

Replicates the exact verification logic of each platform:
- RetroArch/Lakka/RetroPie: file existence only (core_info.c path_is_valid)
- Batocera: MD5 + checkInsideZip, no required distinction (batocera-systems:1062-1091)
- Recalbox: MD5 + mandatory/hashMatchMandatory, 3-color severity (Bios.cpp:109-130)
- RetroBat: same as Batocera
- EmuDeck: MD5 whitelist per system

Cross-references emulator profiles to detect undeclared files used by available cores.

Usage:
    python scripts/verify.py --all
    python scripts/verify.py --platform batocera
    python scripts/verify.py --all --include-archived
    python scripts/verify.py --all --json
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import sys
import zipfile
from pathlib import Path

try:
    import yaml
except ImportError:
    print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
    sys.exit(1)

sys.path.insert(0, os.path.dirname(__file__))
from common import (
    build_zip_contents_index, check_inside_zip, group_identical_platforms,
    load_emulator_profiles, load_platform_config, md5sum, md5_composite,
    resolve_local_file, resolve_platform_cores,
)

DEFAULT_DB = "database.json"
DEFAULT_PLATFORMS_DIR = "platforms"
DEFAULT_EMULATORS_DIR = "emulators"


# ---------------------------------------------------------------------------
# Status model — aligned with Batocera BiosStatus (batocera-systems:967-969)
# ---------------------------------------------------------------------------

class Status:
    OK = "ok"
    UNTESTED = "untested"   # file present, hash not confirmed
    MISSING = "missing"


# Severity for per-file required/optional distinction
class Severity:
    CRITICAL = "critical"   # required file missing or bad hash (Recalbox RED)
    WARNING = "warning"     # optional missing or hash mismatch (Recalbox YELLOW)
    INFO = "info"           # optional missing on existence-only platform
    OK = "ok"               # file verified

_STATUS_ORDER = {Status.OK: 0, Status.UNTESTED: 1, Status.MISSING: 2}
_SEVERITY_ORDER = {Severity.OK: 0, Severity.INFO: 1, Severity.WARNING: 2, Severity.CRITICAL: 3}


# ---------------------------------------------------------------------------
# Verification functions
# ---------------------------------------------------------------------------

def verify_entry_existence(file_entry: dict, local_path: str | None) -> dict:
    """RetroArch verification: path_is_valid() — file exists = OK."""
    name = file_entry.get("name", "")
    required = file_entry.get("required", True)
    if local_path:
        return {"name": name, "status": Status.OK, "required": required}
    return {"name": name, "status": Status.MISSING, "required": required}


def verify_entry_md5(
    file_entry: dict,
    local_path: str | None,
    resolve_status: str = "",
) -> dict:
    """MD5 verification — Batocera md5sum + Recalbox multi-hash + Md5Composite."""
    name = file_entry.get("name", "")
    expected_md5 = file_entry.get("md5", "")
    zipped_file = file_entry.get("zipped_file")
    required = file_entry.get("required", True)
    base = {"name": name, "required": required}

    if expected_md5 and "," in expected_md5:
        md5_list = [m.strip() for m in expected_md5.split(",") if m.strip()]
    else:
        md5_list = [expected_md5] if expected_md5 else []

    if not local_path:
        return {**base, "status": Status.MISSING}

    if zipped_file:
        found_in_zip = False
        had_error = False
        for md5_candidate in md5_list or [""]:
            result = check_inside_zip(local_path, zipped_file, md5_candidate)
            if result == Status.OK:
                return {**base, "status": Status.OK, "path": local_path}
            if result == "error":
                had_error = True
            elif result != "not_in_zip":
                found_in_zip = True
        if had_error and not found_in_zip:
            return {**base, "status": Status.UNTESTED, "path": local_path,
                    "reason": f"{local_path} read error"}
        if not found_in_zip:
            return {**base, "status": Status.UNTESTED, "path": local_path,
                    "reason": f"{zipped_file} not found inside ZIP"}
        return {**base, "status": Status.UNTESTED, "path": local_path,
                "reason": f"{zipped_file} MD5 mismatch inside ZIP"}

    if not md5_list:
        return {**base, "status": Status.OK, "path": local_path}

    if resolve_status == "md5_exact":
        return {**base, "status": Status.OK, "path": local_path}

    actual_md5 = md5sum(local_path)
    actual_lower = actual_md5.lower()
    for expected in md5_list:
        if actual_lower == expected.lower():
            return {**base, "status": Status.OK, "path": local_path}
        if len(expected) < 32 and actual_lower.startswith(expected.lower()):
            return {**base, "status": Status.OK, "path": local_path}

    if ".zip" in os.path.basename(local_path):
        try:
            composite = md5_composite(local_path)
            for expected in md5_list:
                if composite.lower() == expected.lower():
                    return {**base, "status": Status.OK, "path": local_path}
        except (zipfile.BadZipFile, OSError):
            pass

    return {**base, "status": Status.UNTESTED, "path": local_path,
            "reason": f"expected {md5_list[0][:12]}… got {actual_md5[:12]}…"}


# ---------------------------------------------------------------------------
# Severity mapping per platform
# ---------------------------------------------------------------------------

def compute_severity(
    status: str, required: bool, mode: str, hle_fallback: bool = False,
) -> str:
    """Map (status, required, verification_mode, hle_fallback) → severity.

    Based on native platform behavior + emulator HLE capability:
    - RetroArch (existence): required+missing = warning, optional+missing = info
    - Batocera (md5): no required distinction (batocera-systems has no mandatory field)
    - Recalbox (md5): mandatory+missing = critical, optional+missing = warning
    - hle_fallback: core works without this file via HLE → always INFO when missing
    """
    if status == Status.OK:
        return Severity.OK

    # HLE fallback: core works without this file regardless of platform requirement
    if hle_fallback and status == Status.MISSING:
        return Severity.INFO

    if mode == "existence":
        if status == Status.MISSING:
            return Severity.WARNING if required else Severity.INFO
        return Severity.OK

    # md5 mode (Batocera, Recalbox, RetroBat, EmuDeck)
    if status == Status.MISSING:
        return Severity.CRITICAL if required else Severity.WARNING
    if status == Status.UNTESTED:
        return Severity.WARNING
    return Severity.OK


# ---------------------------------------------------------------------------
# ZIP content index
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# Cross-reference: undeclared files used by cores
# ---------------------------------------------------------------------------

def find_undeclared_files(
    config: dict,
    emulators_dir: str,
    db: dict,
    emu_profiles: dict | None = None,
) -> list[dict]:
    """Find files needed by cores but not declared in platform config."""
    # Collect all filenames declared by this platform
    declared_names: set[str] = set()
    for sys_id, system in config.get("systems", {}).items():
        for fe in system.get("files", []):
            name = fe.get("name", "")
            if name:
                declared_names.add(name)

    # Collect data_directory refs
    declared_dd: set[str] = set()
    for sys_id, system in config.get("systems", {}).items():
        for dd in system.get("data_directories", []):
            ref = dd.get("ref", "")
            if ref:
                declared_dd.add(ref)

    by_name = db.get("indexes", {}).get("by_name", {})
    profiles = emu_profiles if emu_profiles is not None else load_emulator_profiles(emulators_dir)

    relevant = resolve_platform_cores(config, profiles)
    undeclared = []
    seen = set()
    for emu_name, profile in sorted(profiles.items()):
        if profile.get("type") in ("launcher", "alias"):
            continue
        if emu_name not in relevant:
            continue

        for f in profile.get("files", []):
            fname = f.get("name", "")
            if not fname or fname in seen:
                continue
            # Skip standalone-only files for libretro platforms
            if f.get("mode") == "standalone":
                continue
            if fname in declared_names:
                continue

            in_repo = fname in by_name or fname.rsplit("/", 1)[-1] in by_name
            seen.add(fname)
            undeclared.append({
                "emulator": profile.get("emulator", emu_name),
                "name": fname,
                "required": f.get("required", False),
                "hle_fallback": f.get("hle_fallback", False),
                "in_repo": in_repo,
                "note": f.get("note", ""),
            })

    return undeclared


def find_exclusion_notes(
    config: dict, emulators_dir: str, emu_profiles: dict | None = None,
) -> list[dict]:
    """Document why certain emulator files are intentionally excluded.

    Reports:
    - Launchers (BIOS managed by standalone emulator)
    - Standalone-only files (not needed in libretro mode)
    - Frozen snapshots with files: [] (code doesn't load .info firmware)
    - Files covered by data_directories
    """
    profiles = emu_profiles if emu_profiles is not None else load_emulator_profiles(emulators_dir)
    platform_systems = set()
    for sys_id in config.get("systems", {}):
        platform_systems.add(sys_id)

    relevant = resolve_platform_cores(config, profiles)
    notes = []
    for emu_name, profile in sorted(profiles.items()):
        emu_systems = set(profile.get("systems", []))
        # Match by core resolution OR system intersection (documents all potential emulators)
        if emu_name not in relevant and not (emu_systems & platform_systems):
            continue

        emu_display = profile.get("emulator", emu_name)

        # Launcher excluded entirely
        if profile.get("type") == "launcher":
            notes.append({
                "emulator": emu_display, "reason": "launcher",
                "detail": profile.get("exclusion_note", "BIOS managed by standalone emulator"),
            })
            continue

        # Profile-level exclusion note (frozen snapshots, etc.)
        exclusion_note = profile.get("exclusion_note")
        if exclusion_note:
            notes.append({
                "emulator": emu_display, "reason": "exclusion_note",
                "detail": exclusion_note,
            })
            continue

        # Count standalone-only files
        standalone_files = [f for f in profile.get("files", []) if f.get("mode") == "standalone"]
        if standalone_files:
            names = [f["name"] for f in standalone_files[:3]]
            more = f" +{len(standalone_files)-3}" if len(standalone_files) > 3 else ""
            notes.append({
                "emulator": emu_display, "reason": "standalone_only",
                "detail": f"{len(standalone_files)} files for standalone mode only ({', '.join(names)}{more})",
            })

    return notes


# ---------------------------------------------------------------------------
# Platform verification
# ---------------------------------------------------------------------------

def verify_platform(
    config: dict, db: dict,
    emulators_dir: str = DEFAULT_EMULATORS_DIR,
    emu_profiles: dict | None = None,
) -> dict:
    """Verify all BIOS files for a platform, including cross-reference gaps."""
    mode = config.get("verification_mode", "existence")
    platform = config.get("platform", "unknown")

    has_zipped = any(
        fe.get("zipped_file")
        for sys in config.get("systems", {}).values()
        for fe in sys.get("files", [])
    )
    zip_contents = build_zip_contents_index(db) if has_zipped else {}

    # Build HLE index from emulator profiles: {filename: True} if any core has HLE for it
    profiles = emu_profiles if emu_profiles is not None else load_emulator_profiles(emulators_dir)
    hle_index: dict[str, bool] = {}
    for profile in profiles.values():
        for f in profile.get("files", []):
            if f.get("hle_fallback"):
                hle_index[f.get("name", "")] = True

    # Per-entry results
    details = []
    # Per-destination aggregation
    file_status: dict[str, str] = {}
    file_required: dict[str, bool] = {}
    file_severity: dict[str, str] = {}

    for sys_id, system in config.get("systems", {}).items():
        for file_entry in system.get("files", []):
            local_path, resolve_status = resolve_local_file(
                file_entry, db, zip_contents,
            )
            if mode == "existence":
                result = verify_entry_existence(file_entry, local_path)
            else:
                result = verify_entry_md5(file_entry, local_path, resolve_status)
            result["system"] = sys_id
            result["hle_fallback"] = hle_index.get(file_entry.get("name", ""), False)
            details.append(result)

            # Aggregate by destination
            dest = file_entry.get("destination", file_entry.get("name", ""))
            if not dest:
                dest = f"{sys_id}/{file_entry.get('name', '')}"
            required = file_entry.get("required", True)
            cur = result["status"]
            prev = file_status.get(dest)
            if prev is None or _STATUS_ORDER.get(cur, 0) > _STATUS_ORDER.get(prev, 0):
                file_status[dest] = cur
                file_required[dest] = required
            hle = hle_index.get(file_entry.get("name", ""), False)
            sev = compute_severity(cur, required, mode, hle)
            prev_sev = file_severity.get(dest)
            if prev_sev is None or _SEVERITY_ORDER.get(sev, 0) > _SEVERITY_ORDER.get(prev_sev, 0):
                file_severity[dest] = sev

    # Count by severity
    counts = {Severity.OK: 0, Severity.INFO: 0, Severity.WARNING: 0, Severity.CRITICAL: 0}
    for s in file_severity.values():
        counts[s] = counts.get(s, 0) + 1

    # Count by file status (ok/untested/missing)
    status_counts: dict[str, int] = {}
    for s in file_status.values():
        status_counts[s] = status_counts.get(s, 0) + 1

    # Cross-reference undeclared files
    undeclared = find_undeclared_files(config, emulators_dir, db, emu_profiles)
    exclusions = find_exclusion_notes(config, emulators_dir, emu_profiles)

    return {
        "platform": platform,
        "verification_mode": mode,
        "total_files": len(file_status),
        "severity_counts": counts,
        "status_counts": status_counts,
        "undeclared_files": undeclared,
        "exclusion_notes": exclusions,
        "details": details,
    }


# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------

def print_platform_result(result: dict, group: list[str]) -> None:
    mode = result["verification_mode"]
    total = result["total_files"]
    c = result["severity_counts"]
    label = " / ".join(group)
    ok_count = c[Severity.OK]
    problems = total - ok_count

    # Summary line — platform-native terminology
    if mode == "existence":
        if problems:
            missing = c.get(Severity.WARNING, 0) + c.get(Severity.CRITICAL, 0)
            optional_missing = c.get(Severity.INFO, 0)
            parts = [f"{ok_count}/{total} present"]
            if missing:
                parts.append(f"{missing} missing")
            if optional_missing:
                parts.append(f"{optional_missing} optional missing")
        else:
            parts = [f"{ok_count}/{total} present"]
    else:
        sc = result.get("status_counts", {})
        untested = sc.get(Status.UNTESTED, 0)
        missing = sc.get(Status.MISSING, 0)
        parts = [f"{ok_count}/{total} OK"]
        if untested:
            parts.append(f"{untested} untested")
        if missing:
            parts.append(f"{missing} missing")
    print(f"{label}: {', '.join(parts)} [{mode}]")

    # Detail non-OK entries with required/optional
    seen_details = set()
    for d in result["details"]:
        if d["status"] == Status.UNTESTED:
            key = f"{d['system']}/{d['name']}"
            if key in seen_details:
                continue
            seen_details.add(key)
            req = "required" if d.get("required", True) else "optional"
            hle = ", HLE available" if d.get("hle_fallback") else ""
            reason = d.get("reason", "")
            print(f"  UNTESTED ({req}{hle}): {key} — {reason}")
    for d in result["details"]:
        if d["status"] == Status.MISSING:
            key = f"{d['system']}/{d['name']}"
            if key in seen_details:
                continue
            seen_details.add(key)
            req = "required" if d.get("required", True) else "optional"
            hle = ", HLE available" if d.get("hle_fallback") else ""
            print(f"  MISSING ({req}{hle}): {key}")

    # Cross-reference: undeclared files used by cores
    undeclared = result.get("undeclared_files", [])
    if undeclared:
        req_not_in_repo = [u for u in undeclared if u["required"] and not u["in_repo"] and not u.get("hle_fallback")]
        req_hle_not_in_repo = [u for u in undeclared if u["required"] and not u["in_repo"] and u.get("hle_fallback")]
        req_in_repo = [u for u in undeclared if u["required"] and u["in_repo"]]
        opt_in_repo = [u for u in undeclared if not u["required"] and u["in_repo"]]
        opt_not_in_repo = [u for u in undeclared if not u["required"] and not u["in_repo"]]

        summary_parts = []
        if req_not_in_repo:
            summary_parts.append(f"{len(req_not_in_repo)} required NOT in repo")
        if req_hle_not_in_repo:
            summary_parts.append(f"{len(req_hle_not_in_repo)} required with HLE NOT in repo")
        if req_in_repo:
            summary_parts.append(f"{len(req_in_repo)} required in repo")
        if opt_in_repo:
            summary_parts.append(f"{len(opt_in_repo)} optional in repo")
        if opt_not_in_repo:
            summary_parts.append(f"{len(opt_not_in_repo)} optional NOT in repo")
        print(f"  Core gaps: {len(undeclared)} undeclared ({', '.join(summary_parts)})")

        # Show critical gaps (required + no HLE + not in repo)
        for u in req_not_in_repo:
            print(f"    {u['emulator']} → {u['name']} (required, NOT in repo)")
        # Show required with HLE (core works but not ideal)
        for u in req_hle_not_in_repo:
            print(f"    {u['emulator']} → {u['name']} (required, HLE available, NOT in repo)")
        # Show required in repo (actionable)
        for u in req_in_repo[:10]:
            print(f"    {u['emulator']} → {u['name']} (required, in repo)")
        if len(req_in_repo) > 10:
            print(f"    ... and {len(req_in_repo) - 10} more required in repo")

    # Intentional exclusions (explain why certain emulator files are NOT included)
    exclusions = result.get("exclusion_notes", [])
    if exclusions:
        print(f"  Intentional exclusions ({len(exclusions)}):")
        for ex in exclusions:
            print(f"    {ex['emulator']} — {ex['detail']} [{ex['reason']}]")


def main():
    parser = argparse.ArgumentParser(description="Platform-native BIOS verification")
    parser.add_argument("--platform", "-p", help="Platform name")
    parser.add_argument("--all", action="store_true", help="Verify all active platforms")
    parser.add_argument("--include-archived", action="store_true")
    parser.add_argument("--db", default=DEFAULT_DB)
    parser.add_argument("--platforms-dir", default=DEFAULT_PLATFORMS_DIR)
    parser.add_argument("--emulators-dir", default=DEFAULT_EMULATORS_DIR)
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    with open(args.db) as f:
        db = json.load(f)

    if args.all:
        from list_platforms import list_platforms as _list_platforms
        platforms = _list_platforms(include_archived=args.include_archived)
    elif args.platform:
        platforms = [args.platform]
    else:
        parser.error("Specify --platform or --all")
        return

    # Load emulator profiles once for cross-reference (not per-platform)
    emu_profiles = load_emulator_profiles(args.emulators_dir)

    # Group identical platforms (same function as generate_pack)
    groups = group_identical_platforms(platforms, args.platforms_dir)
    all_results = {}
    group_results: list[tuple[dict, list[str]]] = []
    for group_platforms, representative in groups:
        config = load_platform_config(representative, args.platforms_dir)
        result = verify_platform(config, db, args.emulators_dir, emu_profiles)
        names = [load_platform_config(p, args.platforms_dir).get("platform", p) for p in group_platforms]
        group_results.append((result, names))
        for p in group_platforms:
            all_results[p] = result

    if not args.json:
        for result, group in group_results:
            print_platform_result(result, group)
            print()

    if args.json:
        for r in all_results.values():
            r["details"] = [d for d in r["details"] if d["status"] != Status.OK]
        print(json.dumps(all_results, indent=2))


if __name__ == "__main__":
    main()