Files
libretro/scripts/scraper/recalbox_scraper.py

273 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""Scraper for Recalbox BIOS requirements.
Source: https://gitlab.com/recalbox/recalbox/-/raw/master/board/recalbox/fsoverlay/recalbox/share_init/system/.emulationstation/es_bios.xml
Format: XML (es_bios.xml)
Hash: MD5 (multiple valid hashes per entry, comma-separated)
Recalbox verification logic:
- Checks MD5 of file on disk against list of valid hashes
- Multiple MD5s accepted per BIOS (different ROM revisions)
- Alternate file paths (pipe-separated)
- hashMatchMandatory flag: if false, wrong hash = warning (YELLOW) not error (RED)
- ZIP files get composite MD5 calculation
"""
from __future__ import annotations
import sys
import urllib.request
import urllib.error
import xml.etree.ElementTree as ET
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
# Short platform identifier for this scraper module.
PLATFORM_NAME = "recalbox"

# Raw GitLab URL of es_bios.xml on the Recalbox master branch, assembled
# from its path segments to keep each line short.
SOURCE_URL = "/".join((
    "https://gitlab.com/recalbox/recalbox/-/raw/master",
    "board/recalbox/fsoverlay/recalbox/share_init/system",
    ".emulationstation/es_bios.xml",
))
# Maps the ``platform`` attribute of a <system> element in es_bios.xml to the
# system slug stored on each scraped requirement. Platforms missing from this
# table are passed through unchanged by the scraper. Several Recalbox
# platforms intentionally share one slug (e.g. all Amiga models).
SYSTEM_SLUG_MAP = {
    "3do": "3do",
    "amiga600": "commodore-amiga",
    "amiga1200": "commodore-amiga",
    "amigacd32": "commodore-amiga",
    "amigacdtv": "commodore-amiga",
    "amstradcpc": "amstrad-cpc",
    "atari800": "atari-400-800",
    "atari5200": "atari-5200",
    "atari7800": "atari-7800",
    "atarilynx": "atari-lynx",
    "atarist": "atari-st",
    "c64": "commodore-c64",
    "channelf": "fairchild-channel-f",
    "colecovision": "coleco-colecovision",
    "dreamcast": "sega-dreamcast",
    "fds": "nintendo-fds",
    "gamecube": "nintendo-gamecube",
    "gamegear": "sega-game-gear",
    "gb": "nintendo-gb",
    "gba": "nintendo-gba",
    "gbc": "nintendo-gbc",
    "intellivision": "mattel-intellivision",
    "jaguar": "atari-jaguar",
    "mastersystem": "sega-master-system",
    "megadrive": "sega-mega-drive",
    "msx": "microsoft-msx",
    "msx1": "microsoft-msx",
    "msx2": "microsoft-msx",
    "n64": "nintendo-64",
    "naomi": "sega-dreamcast-arcade",
    "naomigd": "sega-dreamcast-arcade",
    "atomiswave": "sega-dreamcast-arcade",
    "nds": "nintendo-ds",
    "neogeo": "snk-neogeo",
    "neogeocd": "snk-neogeo-cd",
    "o2em": "magnavox-odyssey2",
    "pcengine": "nec-pc-engine",
    "pcenginecd": "nec-pc-engine",
    "pcfx": "nec-pc-fx",
    "ps2": "sony-playstation-2",
    "psx": "sony-playstation",
    "saturn": "sega-saturn",
    "scummvm": "scummvm",
    "segacd": "sega-mega-cd",
    "snes": "nintendo-snes",
    "supergrafx": "nec-pc-engine",
    "x68000": "sharp-x68000",
    "zxspectrum": "sinclair-zx-spectrum",
}
class Scraper(BaseScraper):
    """Scraper for Recalbox es_bios.xml.

    The document is a ``<biosList>`` containing ``<system>`` elements, each
    holding ``<bios>`` elements whose attributes describe one BIOS file:
    ``path`` (pipe-separated alternatives), ``md5`` (comma-separated valid
    hashes), ``core``, ``mandatory``, ``hashMatchMandatory`` and ``note``.
    """

    def __init__(self, url: str = SOURCE_URL):
        """Create a scraper that reads the BIOS list from *url*."""
        super().__init__(url=url)

    @staticmethod
    def _parse_bios_entries(raw: str) -> list[dict]:
        """Parse es_bios.xml text into one dict per usable ``<bios>`` element.

        Shared by :meth:`fetch_requirements` and
        :meth:`fetch_full_requirements` so both interpret the XML identically.
        Entries whose ``path`` attribute yields no non-empty path are skipped.

        Raises:
            xml.etree.ElementTree.ParseError: if *raw* is not well-formed XML.
        """
        entries = []
        root = ET.fromstring(raw)
        for system_elem in root.findall(".//system"):
            platform = system_elem.get("platform", "")
            system_name = system_elem.get("name", platform)
            # Unknown platforms keep their Recalbox identifier as the slug.
            system_slug = SYSTEM_SLUG_MAP.get(platform, platform)
            for bios_elem in system_elem.findall("bios"):
                raw_paths = bios_elem.get("path", "").split("|")
                paths = [p.strip() for p in raw_paths if p.strip()]
                if not paths:
                    continue
                raw_md5s = bios_elem.get("md5", "").split(",")
                entries.append({
                    # File name is the last component of the first path.
                    "name": paths[0].rsplit("/", 1)[-1],
                    "system": system_slug,
                    "system_name": system_name,
                    "paths": paths,
                    "md5_list": [m.strip() for m in raw_md5s if m.strip()],
                    "core": bios_elem.get("core", ""),
                    # Both flags default to true; only the literal "false"
                    # turns them off.
                    "mandatory": bios_elem.get("mandatory", "true") != "false",
                    "hash_match_mandatory": (
                        bios_elem.get("hashMatchMandatory", "true") != "false"
                    ),
                    "note": bios_elem.get("note", ""),
                })
        return entries

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Parse es_bios.xml and return BIOS requirements.

        Only the first alternative path of each entry is used, and an entry
        is dropped when its first path was already seen (first one wins).
        All valid MD5 hashes are stored comma-joined in ``md5``; ``md5`` is
        ``None`` when the entry lists no hash.

        Raises:
            ValueError: if the fetched document fails :meth:`validate_format`.
        """
        raw = self._fetch_raw()
        if not self.validate_format(raw):
            raise ValueError("es_bios.xml format validation failed")

        requirements = []
        seen = set()
        for entry in self._parse_bios_entries(raw):
            primary_path = entry["paths"][0]
            if primary_path in seen:
                continue
            seen.add(primary_path)
            md5_list = entry["md5_list"]
            requirements.append(BiosRequirement(
                name=entry["name"],
                system=entry["system"],
                md5=",".join(md5_list) if md5_list else None,
                destination=primary_path,
                required=entry["mandatory"],
            ))
        return requirements

    def fetch_full_requirements(self) -> list[dict]:
        """Parse es_bios.xml preserving all Recalbox-specific fields.

        Unlike :meth:`fetch_requirements`, entries are not de-duplicated and
        every alternative path and hash is kept. Each dict has the keys
        ``name``, ``system``, ``system_name``, ``paths``, ``md5_list``,
        ``core``, ``mandatory``, ``hash_match_mandatory`` and ``note``.

        Raises:
            ValueError: if the fetched document fails :meth:`validate_format`
                (same check as :meth:`fetch_requirements`).
        """
        raw = self._fetch_raw()
        if not self.validate_format(raw):
            raise ValueError("es_bios.xml format validation failed")
        return self._parse_bios_entries(raw)

    def validate_format(self, raw_data: str) -> bool:
        """Validate es_bios.xml format.

        Returns True when the text contains a ``<biosList`` tag, a
        ``<system`` tag and at least one ``<bios`` element tag.
        """
        import re

        # A plain '"<bios" in raw_data' test is already satisfied by
        # "<biosList", so require a tag delimiter right after the name.
        has_bios_entry = re.search(r"<bios[\s/>]", raw_data) is not None
        return (
            "<biosList" in raw_data
            and "<system" in raw_data
            and has_bios_entry
        )

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict from scraped data.

        Requirements are grouped per system slug under ``systems``; each file
        entry carries ``name``, ``destination``, ``required`` and, when
        hashes are known, ``md5``.
        """
        systems = {}
        for req in self.fetch_requirements():
            entry = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.md5:
                entry["md5"] = req.md5
            systems.setdefault(req.system, {"files": []})["files"].append(entry)

        # Recalbox uses GitLab - GitHub API may not resolve, so fall back to
        # a fixed version string when no tag is returned.
        version = fetch_github_latest_tag("recalbox/recalbox", prefix="") or "10.0"

        return {
            "platform": "Recalbox",
            "version": version,
            "homepage": "https://www.recalbox.com",
            "source": SOURCE_URL,
            "base_destination": "bios",
            "hash_type": "md5",
            "verification_mode": "md5",
            "systems": systems,
        }
def main():
    """CLI entry point.

    Modes, in order of precedence:
      --full     print the first five full entries as JSON plus a total
      --dry-run  print a per-system summary (up to five files per system)
      --json     print the generated platform config as JSON
      (default)  print a one-line summary

    Fetch, validation and XML parse failures are reported on stderr and the
    process exits with status 1.
    """
    import argparse
    import json
    from collections import defaultdict

    parser = argparse.ArgumentParser(description="Scrape Recalbox es_bios.xml")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--full", action="store_true", help="Show full Recalbox-specific fields")
    # NOTE(review): --output is accepted but not used by any mode below.
    parser.add_argument("--output", "-o")
    args = parser.parse_args()

    scraper = Scraper()
    try:
        if args.full:
            reqs = scraper.fetch_full_requirements()
            print(json.dumps(reqs[:5], indent=2))
            print(f"\nTotal: {len(reqs)} BIOS entries")
            return
        if args.json and not args.dry_run:
            # generate_platform_yaml() fetches the requirements itself, so
            # call it directly (one fetch instead of two) and keep it inside
            # the try so its failures get the same clean error exit.
            config = scraper.generate_platform_yaml()
            print(json.dumps(config, indent=2))
            return
        reqs = scraper.fetch_requirements()
    except (ConnectionError, ValueError, ET.ParseError) as e:
        # ET.ParseError is not a ValueError, so malformed XML must be listed
        # explicitly to avoid an uncaught traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if args.dry_run:
        by_system = defaultdict(list)
        for r in reqs:
            by_system[r.system].append(r)
        for sys_name, files in sorted(by_system.items()):
            print(f"\n{sys_name} ({len(files)} files):")
            for f in files[:5]:
                print(f" {f.name} (md5={f.md5[:12] if f.md5 else 'N/A'}...)")
            if len(files) > 5:
                print(f" ... +{len(files)-5} more")
        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
        return

    system_count = len({r.system for r in reqs})
    print(f"Scraped {len(reqs)} BIOS files across {system_count} systems")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()