Files
libretro/scripts/scraper/retrodeck_scraper.py
Abdessamad Derraz 2e21d64a08 refactor: harden codebase and remove unicode artifacts
- fix urllib.parse.quote import (was urllib.request.quote)
- add operator precedence parens in generate_pack dedup check
- narrow bare except to specific types in batocera target scraper
- cache load_platform_config and build_zip_contents_index results
- add selective algorithm support to compute_hashes
- atomic write for fetch_large_file (tmp + rename)
- add response size limit to base scraper fetch
- extract build_target_cores_cache to common.py (dedup verify/pack)
- hoist _build_supplemental_index out of per-platform loop
- migrate function-attribute caches to module-level dicts
- add @abstractmethod to BaseTargetScraper.fetch_targets
- remove backward-compat re-exports from common.py
- replace em-dashes and unicode arrows with ASCII equivalents
- remove decorative section dividers and obvious comments
2026-03-29 23:15:20 +02:00

431 lines
15 KiB
Python

#!/usr/bin/env python3
"""Scraper for RetroDECK BIOS requirements.
Source: https://github.com/RetroDECK/components
Format: component_manifest.json per component directory
Hash: MD5 (primary), SHA256 for some entries (melonDS DSi)
RetroDECK stores BIOS requirements in component_manifest.json files,
one per emulator component. BIOS entries can appear in three locations:
- top-level 'bios' key
- preset_actions.bios (duckstation, dolphin, pcsx2)
- cores.bios (retroarch)
Path tokens: $bios_path, $saves_path, $roms_path map to
~/retrodeck/bios/, ~/retrodeck/saves/, ~/retrodeck/roms/ respectively.
$saves_path entries are directory placeholders (excluded).
$roms_path entries (neogeo.zip etc.) get roms/ prefix in destination.
Entries with no paths key default to bios/ (RetroDECK's default BIOS dir).
Verification logic (api_data_processing.sh:289-405):
- md5sum per file, compared against known_md5 (comma-separated list)
- envsubst resolves path tokens at runtime
- Multi-threaded on system_cpu_max_threads
"""
from __future__ import annotations
import json
import os
import re
import sys
import urllib.request
import urllib.error
from pathlib import Path
try:
from .base_scraper import BaseScraper, BiosRequirement
except ImportError:
sys.path.insert(0, str(Path(__file__).parent.parent))
from scraper.base_scraper import BaseScraper, BiosRequirement
# Platform identifier used for this scraper.
PLATFORM_NAME = "retrodeck"

# GitHub repository and branch that hold the per-component manifests.
COMPONENTS_REPO = "RetroDECK/components"
COMPONENTS_BRANCH = "main"

# Git trees API endpoint used to list the repository's top-level entries.
COMPONENTS_API_URL = (
    f"https://api.github.com/repos/{COMPONENTS_REPO}/git/trees/{COMPONENTS_BRANCH}"
)

# Base URL for fetching raw files from the same repository and branch.
RAW_BASE = f"https://raw.githubusercontent.com/{COMPONENTS_REPO}/{COMPONENTS_BRANCH}"

# Top-level directories that are never treated as components.
SKIP_DIRS = {
    "archive_later",
    "archive_old",
    "automation-tools",
    ".github",
}

# Components that are excluded from the generated list of cores.
NON_EMULATOR_COMPONENTS = {
    "framework",
    "es-de",
    "steam-rom-manager",
    "flips",
    "portmaster",
}
# RetroDECK system ID -> retrobios slug.
# None = skip (system not relevant for BIOS packs).
# Missing key = pass through as-is.
SYSTEM_SLUG_MAP: dict[str, str | None] = {
# Nintendo
"nes": "nintendo-nes",
"snes": "nintendo-snes",
"snesna": "nintendo-snes",
"n64": "nintendo-64",
"n64dd": "nintendo-64dd",
"gc": "nintendo-gamecube",
"wii": "nintendo-wii",
"wiiu": "nintendo-wii-u",
"switch": "nintendo-switch",
"gb": "nintendo-gb",
"gbc": "nintendo-gbc",
"gba": "nintendo-gba",
"nds": "nintendo-ds",
"3ds": "nintendo-3ds",
"n3ds": "nintendo-3ds",
"fds": "nintendo-fds",
"sgb": "nintendo-sgb",
"virtualboy": "nintendo-virtual-boy",
# Sony
"psx": "sony-playstation",
"ps2": "sony-playstation-2",
"ps3": "sony-playstation-3",
"psp": "sony-psp",
"psvita": "sony-psvita",
# Sega
"megadrive": "sega-mega-drive",
"genesis": "sega-mega-drive",
"megacd": "sega-mega-cd",
"megacdjp": "sega-mega-cd",
"segacd": "sega-mega-cd",
"saturn": "sega-saturn",
"saturnjp": "sega-saturn",
"dreamcast": "sega-dreamcast",
"naomi": "sega-dreamcast-arcade",
"naomi2": "sega-dreamcast-arcade",
"atomiswave": "sega-dreamcast-arcade",
"gamegear": "sega-game-gear",
"mastersystem": "sega-master-system",
"sms": "sega-master-system",
# NEC
"pcengine": "nec-pc-engine",
"pcenginecd": "nec-pc-engine",
"turbografx16": "nec-pc-engine",
"pcfx": "nec-pc-fx",
"pc98": "nec-pc-98",
"pc9800": "nec-pc-98",
"pc88": "nec-pc-88",
"pc8800": "nec-pc-88",
# Other
"3do": "3do",
"amstradcpc": "amstrad-cpc",
"arcade": "arcade",
"mame": "arcade",
"fbneo": "arcade",
"atari800": "atari-400-800",
"atari5200": "atari-5200",
"atari7800": "atari-7800",
"atarijaguar": "atari-jaguar",
"atarilynx": "atari-lynx",
"atarist": "atari-st",
"atarixe": "atari-400-800",
"c64": "commodore-c64",
"amiga": "commodore-amiga",
"cdimono1": "philips-cdi",
"channelf": "fairchild-channel-f",
"colecovision": "coleco-colecovision",
"intellivision": "mattel-intellivision",
"msx": "microsoft-msx",
"xbox": "microsoft-xbox",
"doom": "doom",
"j2me": "j2me",
"mac2": "apple-macintosh-ii",
"macintosh": "apple-macintosh-ii",
"apple2": "apple-ii",
"apple2gs": "apple-iigs",
"enterprise": "enterprise-64-128",
"gamecom": "tiger-game-com",
"gmaster": "hartung-game-master",
"pokemini": "nintendo-pokemon-mini",
"scv": "epoch-scv",
"supervision": "watara-supervision",
"wonderswan": "bandai-wonderswan",
"neogeocd": "snk-neogeo-cd",
"neogeocdjp": "snk-neogeo-cd",
"coco": "tandy-coco",
"trs80": "tandy-trs-80",
"dragon": "dragon-32-64",
"tanodragon": "dragon-32-64",
"pico8": "pico8",
"wolfenstein": "wolfenstein-3d",
"zxspectrum": "sinclair-zx-spectrum",
}
def _sanitize_path(p: str) -> str:
"""Fix upstream typos in path tokens."""
return re.sub(r"\$saves_\w+", "$saves_path", p)
def _resolve_path(p: str) -> str:
    """Turn a RetroDECK path expression into a pack-relative path.

    The saves token is sanitized first, then ``$bios_path``, ``$saves_path``
    and ``$roms_path`` are replaced by ``bios``, ``saves`` and ``roms``.
    Leading and trailing slashes are removed from the result.
    """
    token_dirs = (
        ("$bios_path", "bios"),
        ("$saves_path", "saves"),
        ("$roms_path", "roms"),
    )
    resolved = _sanitize_path(p)
    for token, directory in token_dirs:
        resolved = resolved.replace(token, directory)
    return resolved.strip("/")
def _extract_bios_entries(component_val: dict) -> list[dict]:
"""Extract BIOS entries from all three possible locations in a component.
No dedup here -dedup is done in fetch_requirements() with full
(system, filename) key to avoid dropping valid same-filename entries
across different systems.
"""
entries: list[dict] = []
def collect(bios_data: list | dict) -> None:
if isinstance(bios_data, dict):
bios_data = [bios_data]
if not isinstance(bios_data, list):
return
for entry in bios_data:
if isinstance(entry, dict) and entry.get("filename", "").strip():
entries.append(entry)
if "bios" in component_val:
collect(component_val["bios"])
pa = component_val.get("preset_actions", {})
if isinstance(pa, dict) and "bios" in pa:
collect(pa["bios"])
cores = component_val.get("cores", {})
if isinstance(cores, dict) and "bios" in cores:
collect(cores["bios"])
return entries
def _map_system(raw_system: str) -> str | None:
    """Translate a RetroDECK system ID into its retrobios slug.

    IDs present in SYSTEM_SLUG_MAP yield the mapped value, which is None
    for systems the map marks as skipped. IDs that are not in the map are
    returned unchanged.
    """
    return SYSTEM_SLUG_MAP.get(raw_system, raw_system)
class Scraper(BaseScraper):
    """RetroDECK BIOS scraper from component manifests.

    Manifests are read from a local directory tree when ``manifests_dir`` is
    set, otherwise from the RetroDECK/components GitHub repository. The
    loaded manifests are cached on the instance after the first load.
    """

    platform_name = PLATFORM_NAME

    def __init__(self, manifests_dir: str = "") -> None:
        """Create a scraper.

        Args:
            manifests_dir: Directory containing one sub-directory per
                component, each with a ``component_manifest.json``. An empty
                string (the default) selects the remote GitHub source.
        """
        super().__init__()
        self.manifests_dir = manifests_dir
        # Filled lazily by _get_manifests(); None means "not loaded yet".
        self._manifests: list[tuple[str, dict]] | None = None

    def _get_manifests(self) -> list[tuple[str, dict]]:
        """Return ``(component_name, manifest)`` pairs, loading them once."""
        if self._manifests is None:
            self._manifests = (
                self._fetch_local_manifests()
                if self.manifests_dir
                else self._fetch_remote_manifests()
            )
        return self._manifests

    def _fetch_remote_manifests(self) -> list[tuple[str, dict]]:
        """Fetch component manifests via the GitHub API and raw file host.

        The top-level tree of the components repository is listed first;
        then ``component_manifest.json`` is requested for every directory
        not in SKIP_DIRS. Progress is written to stderr. A component whose
        manifest cannot be fetched or parsed is reported and left out; it
        does not abort the run.

        If the ``GITHUB_TOKEN`` environment variable is set, it is sent as
        an ``Authorization`` header on every request.

        Returns:
            ``(component_name, parsed_manifest)`` pairs sorted by name.

        Raises:
            ConnectionError: If the repository tree itself cannot be fetched.
        """
        token = os.environ.get("GITHUB_TOKEN", "")
        headers = {"User-Agent": "retrobios-scraper/1.0"}
        if token:
            headers["Authorization"] = f"token {token}"

        try:
            req = urllib.request.Request(COMPONENTS_API_URL, headers=headers)
            with urllib.request.urlopen(req, timeout=30) as resp:
                tree = json.loads(resp.read().decode())
        except OSError as e:
            # URLError/HTTPError and socket timeouts are all OSError
            # subclasses, so read timeouts are reported the same way as
            # connection failures.
            raise ConnectionError(f"Failed to fetch component tree: {e}") from e

        if tree.get("truncated"):
            print(" WARNING: GitHub tree response truncated", file=sys.stderr)

        component_dirs = [
            item["path"]
            for item in tree.get("tree", [])
            if item["type"] == "tree" and item["path"] not in SKIP_DIRS
        ]

        manifests: list[tuple[str, dict]] = []
        for comp in sorted(component_dirs):
            url = f"{RAW_BASE}/{comp}/component_manifest.json"
            print(f" {comp} ...", file=sys.stderr, end="", flush=True)
            try:
                req = urllib.request.Request(url, headers=headers)
                with urllib.request.urlopen(req, timeout=15) as resp:
                    data = json.loads(resp.read().decode())
            except urllib.error.HTTPError as e:
                if e.code == 404:
                    # Directory without a manifest: nothing to report.
                    print(" skip", file=sys.stderr)
                else:
                    # Surface rate limiting / server errors instead of
                    # making them look like a missing manifest.
                    print(f" skip (HTTP {e.code})", file=sys.stderr)
            except OSError as e:
                # Network failure or timeout for this component only.
                print(f" skip ({e})", file=sys.stderr)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f" parse error: {e}", file=sys.stderr)
            else:
                manifests.append((comp, data))
                print(" ok", file=sys.stderr)
        return manifests

    def _fetch_local_manifests(self) -> list[tuple[str, dict]]:
        """Read manifests from a local directory tree.

        Every sub-directory of ``self.manifests_dir`` that is not hidden,
        is not in SKIP_DIRS and contains ``component_manifest.json`` is
        loaded. Files that cannot be read or parsed produce a warning on
        stderr and are left out.

        Returns:
            ``(directory_name, parsed_manifest)`` pairs sorted by path.
        """
        root = Path(self.manifests_dir)
        manifests: list[tuple[str, dict]] = []
        for d in sorted(root.iterdir()):
            if not d.is_dir() or d.name in SKIP_DIRS or d.name.startswith("."):
                continue
            mf = d / "component_manifest.json"
            if not mf.exists():
                continue
            try:
                # JSON is UTF-8; do not depend on the platform's default
                # text encoding.
                with open(mf, encoding="utf-8") as f:
                    manifests.append((d.name, json.load(f)))
            except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
                print(f" WARNING: {mf}: {e}", file=sys.stderr)
        return manifests

    def validate_format(self, raw_data: str) -> bool:
        """Return True when *raw_data* parses as a JSON object."""
        try:
            return isinstance(json.loads(raw_data), dict)
        except (json.JSONDecodeError, TypeError):
            return False

    @staticmethod
    def _resolve_destination(entry: dict, filename: str) -> str | None:
        """Return the pack-relative destination for *entry*, or None to skip.

        ``paths`` may be a string or a list. For a list, the first path that
        does not resolve under ``saves`` is used. Entries that only point at
        saves locations (or resolve to nothing from a list) are skipped.
        Entries without a usable ``paths`` value default to ``bios/``.
        """
        paths_raw = entry.get("paths")
        if isinstance(paths_raw, str):
            resolved = _resolve_path(paths_raw)
        elif isinstance(paths_raw, list):
            resolved = next(
                (
                    candidate
                    for candidate in (_resolve_path(str(p)) for p in paths_raw)
                    if not candidate.startswith("saves")
                ),
                "",
            )
            if not resolved:
                return None
        else:
            resolved = ""

        if resolved.startswith("saves"):
            return None
        return f"{resolved}/{filename}" if resolved else f"bios/{filename}"

    @staticmethod
    def _normalize_md5(md5_raw: object) -> str:
        """Return the valid MD5 hashes in *md5_raw* as a comma-joined string.

        *md5_raw* may be a single string or a list of values. Each value may
        itself contain several hashes separated by commas or whitespace.
        Only 32-character hexadecimal tokens are kept (lower-cased); anything
        else is dropped. The result is empty when nothing valid remains.
        """
        if isinstance(md5_raw, list):
            raw_items = [str(m) for m in md5_raw if m]
        elif md5_raw:
            raw_items = [str(md5_raw)]
        else:
            raw_items = []

        valid = [
            token
            for item in raw_items
            for token in re.split(r"[,\s]+", item.strip().lower())
            if re.fullmatch(r"[0-9a-f]{32}", token)
        ]
        return ",".join(valid)

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Build the list of BIOS requirements from all manifests.

        For each BIOS entry the system ID is mapped to a retrobios slug
        (entries mapped to None are skipped), the destination is resolved,
        MD5 values are sanitized and the ``required`` flag is normalized.
        Entries are de-duplicated on ``(system, lower-cased filename)``; the
        first occurrence wins and a warning is printed to stderr when a
        later duplicate carries a different non-empty MD5.

        Returns:
            Requirements in first-seen order.
        """
        # Insertion-ordered index doubling as the dedup table.
        by_key: dict[tuple[str, str], BiosRequirement] = {}

        for _comp_name, manifest in self._get_manifests():
            if not isinstance(manifest, dict):
                # A manifest whose top level is not an object has no
                # components to iterate.
                continue
            for comp_key, comp_val in manifest.items():
                if not isinstance(comp_val, dict):
                    continue

                default_system = comp_val.get("system", comp_key)
                if isinstance(default_system, list):
                    default_system = default_system[0] if default_system else comp_key

                for entry in _extract_bios_entries(comp_val):
                    filename = entry["filename"].strip()

                    raw_system = entry.get("system", default_system)
                    if isinstance(raw_system, list):
                        raw_system = raw_system[0] if raw_system else default_system
                    system = _map_system(str(raw_system))
                    if system is None:
                        continue

                    destination = self._resolve_destination(entry, filename)
                    if destination is None:
                        continue

                    md5 = self._normalize_md5(entry.get("md5", ""))

                    required_raw = entry.get("required", "")
                    required = bool(required_raw) and (
                        str(required_raw).strip().lower()
                        not in ("false", "no", "optional", "")
                    )

                    key = (system, filename.lower())
                    existing = by_key.get(key)
                    if existing is not None:
                        if md5 and existing.md5 and md5 != existing.md5:
                            print(
                                f" WARNING: {filename} ({system}): MD5 conflict "
                                f"({existing.md5[:12]}... vs {md5[:12]}...)",
                                file=sys.stderr,
                            )
                        continue

                    by_key[key] = BiosRequirement(
                        name=filename,
                        system=system,
                        destination=destination,
                        md5=md5,
                        required=required,
                    )
        return list(by_key.values())

    def generate_platform_yaml(self) -> dict:
        """Build the platform description dict for RetroDECK.

        Returns:
            A dict with platform metadata, ``cores`` (sorted component names
            excluding SKIP_DIRS and NON_EMULATOR_COMPONENTS) and ``systems``
            (system slug -> ``{"files": [...]}``). A file's ``md5`` key is
            present only when a hash is known.
        """
        reqs = self.fetch_requirements()
        manifests = self._get_manifests()

        cores = sorted({
            comp_name
            for comp_name, _ in manifests
            if comp_name not in SKIP_DIRS
            and comp_name not in NON_EMULATOR_COMPONENTS
        })

        systems: dict[str, dict] = {}
        for req in reqs:
            sys_entry = systems.setdefault(req.system, {"files": []})
            file_entry: dict = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.md5:
                file_entry["md5"] = req.md5
            sys_entry["files"].append(file_entry)

        return {
            "platform": "RetroDECK",
            "version": "",
            "homepage": "https://retrodeck.net",
            "source": "https://github.com/RetroDECK/components",
            "base_destination": "",
            "hash_type": "md5",
            "verification_mode": "md5",
            "cores": cores,
            "systems": systems,
        }
def main() -> None:
    """Command-line entry point: pass the RetroDECK scraper to scraper_cli."""
    from scraper.base_scraper import scraper_cli

    description = "Scrape RetroDECK BIOS requirements"
    scraper_cli(Scraper, description)


if __name__ == "__main__":
    main()