#!/usr/bin/env python3 """Scraper for Batocera batocera-systems. Source: https://github.com/batocera-linux/batocera.linux/.../batocera-systems Format: Python dict with systems -> biosFiles Hash: MD5 primary """ from __future__ import annotations import ast import json import re import sys import urllib.request import urllib.error from pathlib import Path import yaml from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag PLATFORM_NAME = "batocera" SOURCE_URL = ( "https://raw.githubusercontent.com/batocera-linux/batocera.linux/" "master/package/batocera/core/batocera-scripts/scripts/batocera-systems" ) CONFIGGEN_DEFAULTS_URL = ( "https://raw.githubusercontent.com/batocera-linux/batocera.linux/" "master/package/batocera/core/batocera-configgen/configs/" "configgen-defaults.yml" ) SYSTEM_SLUG_MAP = { "atari800": "atari-400-800", "atari5200": "atari-5200", "atarist": "atari-st", "lynx": "atari-lynx", "3do": "3do", "amiga": "commodore-amiga", "amiga600": "commodore-amiga", "amiga1200": "commodore-amiga", "amigacd32": "commodore-amiga", "amigacdtv": "commodore-amiga", "c128": "commodore-c128", "colecovision": "coleco-colecovision", "dreamcast": "sega-dreamcast", "naomi": "sega-dreamcast-arcade", "naomi2": "sega-dreamcast-arcade", "atomiswave": "sega-dreamcast-arcade", "fds": "nintendo-fds", "gamecube": "nintendo-gamecube", "gb": "nintendo-gb", "gba": "nintendo-gba", "gbc": "nintendo-gbc", "nds": "nintendo-ds", "n64dd": "nintendo-64dd", "satellaview": "nintendo-satellaview", "sgb": "nintendo-sgb", "snes": "nintendo-snes", "channelf": "fairchild-channel-f", "intellivision": "mattel-intellivision", "msx": "microsoft-msx", "msx1": "microsoft-msx", "msx2": "microsoft-msx", "msxturbor": "microsoft-msx", "neogeo": "snk-neogeo", "neogeocd": "snk-neogeo-cd", "odyssey2": "magnavox-odyssey2", "pcengine": "nec-pc-engine", "pcenginecd": "nec-pc-engine", "supergrafx": "nec-pc-engine", "pc88": "nec-pc-88", "pc98": "nec-pc-98", "pcfx": "nec-pc-fx", "psx": "sony-playstation", "ps2": "sony-playstation-2", "psp": "sony-psp", "saturn": "sega-saturn", "segacd": "sega-mega-cd", "mastersystem": "sega-master-system", "megadrive": "sega-mega-drive", "gamegear": "sega-game-gear", "x1": "sharp-x1", "x68000": "sharp-x68000", "zxspectrum": "sinclair-zx-spectrum", "scummvm": "scummvm", "doom": "doom", "macintosh": "apple-macintosh-ii", "dos": "dos", "videopac": "philips-videopac", "pokemini": "nintendo-pokemon-mini", } _MD5_RE = re.compile(r'^[a-fA-F0-9]+$') def _load_md5_index() -> dict[str, str]: """Load by_md5 index from database.json for prefix resolution.""" db_path = Path(__file__).resolve().parents[2] / "database.json" if not db_path.exists(): return {} with open(db_path) as f: db = json.load(f) return db.get("indexes", {}).get("by_md5", {}) def _resolve_truncated_md5(md5: str, md5_index: dict[str, str]) -> str: """Resolve a truncated MD5 to its full 32-char version via prefix match.""" if not md5 or len(md5) == 32: return md5 if not _MD5_RE.match(md5): return md5 matches = [k for k in md5_index if k.startswith(md5)] if len(matches) == 1: print(f" fixed truncated md5: {md5} -> {matches[0]}", file=sys.stderr) return matches[0] return md5 class Scraper(BaseScraper): """Scraper for batocera-systems Python dict.""" def __init__(self, url: str = SOURCE_URL): super().__init__(url=url) def _fetch_cores(self) -> tuple[list[str], list[str]]: """Extract core names and standalone cores from configgen-defaults.yml. Returns (all_cores, standalone_cores) where standalone_cores are those with emulator != "libretro". """ try: req = urllib.request.Request( CONFIGGEN_DEFAULTS_URL, headers={"User-Agent": "retrobios-scraper/1.0"}, ) with urllib.request.urlopen(req, timeout=30) as resp: raw = resp.read().decode("utf-8") except urllib.error.URLError as e: raise ConnectionError( f"Failed to fetch {CONFIGGEN_DEFAULTS_URL}: {e}" ) from e data = yaml.safe_load(raw) cores: set[str] = set() standalone: set[str] = set() for system, cfg in data.items(): if system == "default" or not isinstance(cfg, dict): continue emulator = cfg.get("emulator", "") core = cfg.get("core", "") if core: cores.add(core) if emulator and emulator != "libretro": standalone.add(emulator) if core and core != emulator: standalone.add(core) return sorted(cores), sorted(standalone) def _extract_systems_dict(self, raw: str) -> dict: """Extract and parse the 'systems' dict from the Python source via ast.literal_eval.""" match = re.search(r'^systems\s*=\s*\{', raw, re.MULTILINE) if not match: raise ValueError("Could not find 'systems = {' in batocera-systems") start = match.start() + raw[match.start():].index("{") depth = 0 i = start in_str = False str_ch = None while i < len(raw): ch = raw[i] if in_str: if ch == '\\': i += 2 continue if ch == str_ch: in_str = False elif ch in ('"', "'"): in_str = True str_ch = ch elif ch == "{": depth += 1 elif ch == "}": depth -= 1 if depth == 0: break elif ch == "#": while i < len(raw) and raw[i] != "\n": i += 1 i += 1 dict_str = raw[start:i + 1] lines = [] for line in dict_str.split("\n"): in_string = False string_char = None clean = [] j = 0 while j < len(line): ch = line[j] if ch == '\\' and j + 1 < len(line): clean.append(ch) clean.append(line[j + 1]) j += 2 continue if ch in ('"', "'") and not in_string: in_string = True string_char = ch clean.append(ch) elif ch == string_char and in_string: in_string = False clean.append(ch) elif ch == "#" and not in_string: break else: clean.append(ch) j += 1 lines.append("".join(clean)) clean_dict_str = "\n".join(lines) # OrderedDict({...}) -> just the inner dict literal clean_dict_str = re.sub(r'OrderedDict\(\s*\{', '{', clean_dict_str) clean_dict_str = re.sub(r'\}\s*\)', '}', clean_dict_str) try: return ast.literal_eval(clean_dict_str) except (SyntaxError, ValueError) as e: raise ValueError(f"Failed to parse systems dict: {e}") from e def fetch_requirements(self) -> list[BiosRequirement]: """Parse batocera-systems and return BIOS requirements.""" raw = self._fetch_raw() if not self.validate_format(raw): raise ValueError("batocera-systems format validation failed") systems = self._extract_systems_dict(raw) requirements = [] md5_index = _load_md5_index() for sys_key, sys_data in systems.items(): system_slug = SYSTEM_SLUG_MAP.get(sys_key, sys_key) bios_files = sys_data.get("biosFiles", []) for bios in bios_files: file_path = bios.get("file", "") md5 = _resolve_truncated_md5(bios.get("md5", ""), md5_index) zipped_file = bios.get("zippedFile", "") if file_path.startswith("bios/"): file_path = file_path[5:] name = file_path.split("/")[-1] if "/" in file_path else file_path requirements.append(BiosRequirement( name=name, system=system_slug, md5=md5 or None, destination=file_path, required=True, zipped_file=zipped_file or None, )) return requirements def validate_format(self, raw_data: str) -> bool: """Validate batocera-systems format.""" has_systems = "systems" in raw_data and "biosFiles" in raw_data has_dict = re.search(r'^systems\s*=\s*\{', raw_data, re.MULTILINE) is not None has_md5 = '"md5"' in raw_data has_file = '"file"' in raw_data return has_systems and has_dict and has_md5 and has_file def generate_platform_yaml(self) -> dict: """Generate a platform YAML config dict from scraped data.""" requirements = self.fetch_requirements() systems = {} for req in requirements: if req.system not in systems: systems[req.system] = {"files": []} entry = { "name": req.name, "destination": req.destination, "required": req.required, } if req.md5: entry["md5"] = req.md5 if req.zipped_file: entry["zipped_file"] = req.zipped_file systems[req.system]["files"].append(entry) tag = fetch_github_latest_tag("batocera-linux/batocera.linux", prefix="batocera-") batocera_version = "" if tag: num = tag.removeprefix("batocera-") if num.isdigit(): batocera_version = num cores, standalone = self._fetch_cores() result = { "platform": "Batocera", "version": batocera_version or "", "homepage": "https://batocera.org", "source": SOURCE_URL, "base_destination": "bios", "hash_type": "md5", "verification_mode": "md5", "cores": cores, "systems": systems, } if standalone: result["standalone_cores"] = standalone return result def main(): from scripts.scraper.base_scraper import scraper_cli scraper_cli(Scraper, "Scrape batocera BIOS requirements") if __name__ == "__main__": main()