Files
libretro/scripts/scraper/batocera_scraper.py
2026-03-30 11:55:57 +02:00

364 lines
12 KiB
Python

#!/usr/bin/env python3
"""Scraper for Batocera batocera-systems.
Source: https://github.com/batocera-linux/batocera.linux/.../batocera-systems
Format: Python dict with systems -> biosFiles
Hash: MD5 primary
"""
from __future__ import annotations
import ast
import json
import re
import sys
import urllib.request
import urllib.error
from pathlib import Path
import yaml
from .base_scraper import BaseScraper, BiosRequirement, fetch_github_latest_tag
# Platform identifier used for this scraper.
PLATFORM_NAME = "batocera"

# Raw GitHub URL of the batocera-systems script (master branch); this is the
# default source the scraper reads BIOS requirements from.
SOURCE_URL = (
    "https://raw.githubusercontent.com"
    "/batocera-linux/batocera.linux/master"
    "/package/batocera/core/batocera-scripts/scripts/batocera-systems"
)

# Raw GitHub URL of configgen-defaults.yml (master branch); read to collect
# the emulator/core names declared per system.
CONFIGGEN_DEFAULTS_URL = (
    "https://raw.githubusercontent.com"
    "/batocera-linux/batocera.linux/master"
    "/package/batocera/core/batocera-configgen/configs/configgen-defaults.yml"
)
# Maps a Batocera system key (as used in batocera-systems) to the slug used
# for that system in the generated platform data. Several Batocera keys may
# share one slug. Keys missing from this table are used unchanged as the slug.
SYSTEM_SLUG_MAP: dict[str, str] = {
    "atari800": "atari-400-800",
    "atari5200": "atari-5200",
    "atarist": "atari-st",
    "lynx": "atari-lynx",
    "3do": "3do",
    "amiga": "commodore-amiga",
    "amiga600": "commodore-amiga",
    "amiga1200": "commodore-amiga",
    "amigacd32": "commodore-amiga",
    "amigacdtv": "commodore-amiga",
    "c128": "commodore-c128",
    "colecovision": "coleco-colecovision",
    "dreamcast": "sega-dreamcast",
    "naomi": "sega-dreamcast-arcade",
    "naomi2": "sega-dreamcast-arcade",
    "atomiswave": "sega-dreamcast-arcade",
    "fds": "nintendo-fds",
    "gamecube": "nintendo-gamecube",
    "gb": "nintendo-gb",
    "gba": "nintendo-gba",
    "gbc": "nintendo-gbc",
    "nds": "nintendo-ds",
    "n64dd": "nintendo-64dd",
    "satellaview": "nintendo-satellaview",
    "sgb": "nintendo-sgb",
    "snes": "nintendo-snes",
    "channelf": "fairchild-channel-f",
    "intellivision": "mattel-intellivision",
    "msx": "microsoft-msx",
    "msx1": "microsoft-msx",
    "msx2": "microsoft-msx",
    "msxturbor": "microsoft-msx",
    "neogeo": "snk-neogeo",
    "neogeocd": "snk-neogeo-cd",
    "odyssey2": "magnavox-odyssey2",
    "pcengine": "nec-pc-engine",
    "pcenginecd": "nec-pc-engine",
    "supergrafx": "nec-pc-engine",
    "pc88": "nec-pc-88",
    "pc98": "nec-pc-98",
    "pcfx": "nec-pc-fx",
    "psx": "sony-playstation",
    "ps2": "sony-playstation-2",
    "psp": "sony-psp",
    "saturn": "sega-saturn",
    "segacd": "sega-mega-cd",
    "mastersystem": "sega-master-system",
    "megadrive": "sega-mega-drive",
    "gamegear": "sega-game-gear",
    "x1": "sharp-x1",
    "x68000": "sharp-x68000",
    "zxspectrum": "sinclair-zx-spectrum",
    "scummvm": "scummvm",
    "doom": "doom",
    "macintosh": "apple-macintosh-ii",
    "dos": "dos",
    "videopac": "philips-videopac",
    "pokemini": "nintendo-pokemon-mini",
    "gsplus": "apple-iigs",
    "apple2": "apple-ii",
    "apple2gs": "apple-iigs",
    "ps3": "sony-playstation-3",
    "psvita": "sony-playstation-vita",
    "coco": "coco",
    "dragon32": "dragon32",
    "dragon64": "dragon64",
    "mc10": "mc10",
    "msx2+": "microsoft-msx",
    "spectravideo": "spectravideo",
    "tvc": "videoton-tvc",
    "enterprise": "enterprise-64-128",
    "vis": "tandy-vis",
    "supracan": "supracan",
    "jaguar": "atari-jaguar",
    "jaguarcd": "atari-jaguar",
    "switch": "nintendo-switch",
    "wii": "nintendo-wii",
    "xbox360": "microsoft-xbox-360",
}
# Hex digits only, either case, any length; used to decide whether a value
# looks like a (possibly truncated) MD5 before attempting prefix resolution.
_MD5_RE = re.compile(r'^[a-fA-F0-9]+$')
def _load_md5_index(db_path: str | Path | None = None) -> dict[str, str]:
    """Load the ``by_md5`` index from database.json for prefix resolution.

    Args:
        db_path: Location of the database file. When omitted, the file
            ``database.json`` under ``parents[2]`` of this module's resolved
            path is used.

    Returns:
        The mapping stored under ``indexes`` -> ``by_md5``. An empty dict is
        returned when the file does not exist, when its top level is not a
        JSON object, or when either level of the index is missing or null.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if db_path is None:
        db_path = Path(__file__).resolve().parents[2] / "database.json"
    db_path = Path(db_path)
    if not db_path.exists():
        return {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(db_path, encoding="utf-8") as f:
        db = json.load(f)
    if not isinstance(db, dict):
        return {}
    # "or {}" also covers keys that are present but explicitly null.
    indexes = db.get("indexes") or {}
    return indexes.get("by_md5") or {}
def _resolve_truncated_md5(md5: str, md5_index: dict[str, str]) -> str:
    """Resolve a truncated MD5 to its full version via a unique prefix match.

    Args:
        md5: Hash string to resolve; may be empty, complete, or truncated.
        md5_index: Mapping whose keys are the known MD5 strings.

    Returns:
        The single key of ``md5_index`` that starts with ``md5`` (compared
        case-insensitively) when exactly one such key exists. Otherwise
        ``md5`` is returned unchanged: when it is empty, already 32
        characters long, not purely hexadecimal, or matches zero or several
        keys.
    """
    if not md5 or len(md5) == 32:
        return md5
    if not _MD5_RE.match(md5):
        return md5
    # _MD5_RE accepts both upper- and lowercase hex digits, so the prefix
    # comparison has to ignore case as well; otherwise an uppercase truncated
    # hash could never match a lowercase index key.
    prefix = md5.lower()
    matches = [k for k in md5_index if k.lower().startswith(prefix)]
    if len(matches) == 1:
        print(f" fixed truncated md5: {md5} -> {matches[0]}", file=sys.stderr)
        return matches[0]
    return md5
class Scraper(BaseScraper):
    """Scraper for the batocera-systems Python dict.

    Reads the ``systems = {...}`` literal out of the batocera-systems script,
    turns each ``biosFiles`` entry into a ``BiosRequirement`` and can assemble
    the result into a platform YAML config dict.
    """

    # Locates the start of the top-level ``systems = {`` assignment.
    _SYSTEMS_RE = re.compile(r'^systems\s*=\s*\{', re.MULTILINE)

    def __init__(self, url: str = SOURCE_URL):
        """Create the scraper; ``url`` is forwarded to ``BaseScraper``."""
        super().__init__(url=url)

    def _fetch_cores(self) -> tuple[list[str], list[str]]:
        """Extract core names and standalone cores from configgen-defaults.yml.

        Returns:
            ``(all_cores, standalone_cores)``, both sorted. ``all_cores``
            holds every non-empty ``core`` value. ``standalone_cores`` holds,
            for every entry whose ``emulator`` is set and is not
            ``"libretro"``, the emulator name and also its ``core`` when that
            differs from the emulator name. The ``default`` entry and entries
            that are not mappings are skipped.

        Raises:
            ConnectionError: If the defaults file cannot be fetched.
            ValueError: If the fetched YAML is not a mapping at top level.
        """
        try:
            req = urllib.request.Request(
                CONFIGGEN_DEFAULTS_URL,
                headers={"User-Agent": "retrobios-scraper/1.0"},
            )
            with urllib.request.urlopen(req, timeout=30) as resp:
                raw = resp.read().decode("utf-8")
        except urllib.error.URLError as e:
            raise ConnectionError(
                f"Failed to fetch {CONFIGGEN_DEFAULTS_URL}: {e}"
            ) from e

        data = yaml.safe_load(raw)
        # safe_load yields None for an empty document; fail with a clear
        # message instead of an AttributeError on ``.items()``.
        if not isinstance(data, dict):
            raise ValueError(
                f"Unexpected content in {CONFIGGEN_DEFAULTS_URL}: "
                "top level is not a mapping"
            )

        cores: set[str] = set()
        standalone: set[str] = set()
        for system, cfg in data.items():
            if system == "default" or not isinstance(cfg, dict):
                continue
            emulator = cfg.get("emulator", "")
            core = cfg.get("core", "")
            if core:
                cores.add(core)
            if emulator and emulator != "libretro":
                standalone.add(emulator)
                if core and core != emulator:
                    standalone.add(core)
        return sorted(cores), sorted(standalone)

    @staticmethod
    def _find_closing_brace(raw: str, start: int) -> int:
        """Return the index of the ``}`` that closes the ``{`` at ``start``.

        Braces inside single- or double-quoted strings and inside ``#``
        comments are ignored. Triple-quoted strings get no special handling.

        Raises:
            ValueError: If the end of ``raw`` is reached before the brace
                depth returns to zero.
        """
        depth = 0
        quote = None  # quote character of the string being scanned, if any
        i = start
        end = len(raw)
        while i < end:
            ch = raw[i]
            if quote is not None:
                if ch == "\\":
                    # Skip the escaped character so an escaped quote does
                    # not terminate the string.
                    i += 2
                    continue
                if ch == quote:
                    quote = None
            elif ch in ('"', "'"):
                quote = ch
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return i
            elif ch == "#":
                # Jump to the end of the line; the comment text is ignored.
                newline = raw.find("\n", i)
                i = end if newline == -1 else newline
            i += 1
        raise ValueError(
            "Failed to parse systems dict: unbalanced braces in batocera-systems"
        )

    @staticmethod
    def _strip_line_comment(line: str) -> str:
        """Return ``line`` without a trailing ``#`` comment.

        A ``#`` inside a single- or double-quoted string is kept. Quote state
        is tracked within this one line only. A backslash together with the
        character after it is always kept as-is.
        """
        quote = None
        j = 0
        end = len(line)
        while j < end:
            ch = line[j]
            if ch == "\\" and j + 1 < end:
                j += 2
                continue
            if quote is None:
                if ch in ('"', "'"):
                    quote = ch
                elif ch == "#":
                    return line[:j]
            elif ch == quote:
                quote = None
            j += 1
        return line

    def _extract_systems_dict(self, raw: str) -> dict:
        """Extract and parse the 'systems' dict from the Python source.

        The literal is located by brace matching, ``#`` comments are removed,
        ``OrderedDict({...})`` wrappers are reduced to plain dict literals and
        the result is evaluated with ``ast.literal_eval``.

        Raises:
            ValueError: If ``systems = {`` is not found, the braces are
                unbalanced, or the cleaned literal cannot be evaluated.
        """
        match = self._SYSTEMS_RE.search(raw)
        if not match:
            raise ValueError("Could not find 'systems = {' in batocera-systems")
        # The pattern ends with the opening brace itself.
        start = match.end() - 1
        stop = self._find_closing_brace(raw, start)
        dict_str = raw[start:stop + 1]

        clean_dict_str = "\n".join(
            self._strip_line_comment(line) for line in dict_str.split("\n")
        )
        # OrderedDict({...}) -> just the inner dict literal
        clean_dict_str = re.sub(r'OrderedDict\(\s*\{', '{', clean_dict_str)
        clean_dict_str = re.sub(r'\}\s*\)', '}', clean_dict_str)
        try:
            return ast.literal_eval(clean_dict_str)
        except (SyntaxError, ValueError) as e:
            raise ValueError(f"Failed to parse systems dict: {e}") from e

    def fetch_requirements(self) -> list[BiosRequirement]:
        """Parse batocera-systems and return BIOS requirements.

        One ``BiosRequirement`` is produced per ``biosFiles`` entry of every
        system. A leading ``bios/`` is removed from the file path, the name is
        the last path component, truncated MD5 values are resolved against
        the local MD5 index, and every entry is marked ``required=True``.

        Raises:
            ValueError: If the fetched text fails ``validate_format`` or the
                systems dict cannot be parsed.
        """
        raw = self._fetch_raw()
        if not self.validate_format(raw):
            raise ValueError("batocera-systems format validation failed")
        systems = self._extract_systems_dict(raw)
        md5_index = _load_md5_index()

        requirements: list[BiosRequirement] = []
        for sys_key, sys_data in systems.items():
            # Unknown system keys fall back to the Batocera key itself.
            system_slug = SYSTEM_SLUG_MAP.get(sys_key, sys_key)
            for bios in sys_data.get("biosFiles", []):
                file_path = bios.get("file", "").removeprefix("bios/")
                md5 = _resolve_truncated_md5(bios.get("md5", ""), md5_index)
                zipped_file = bios.get("zippedFile", "")
                requirements.append(BiosRequirement(
                    name=file_path.rsplit("/", 1)[-1],
                    system=system_slug,
                    md5=md5 or None,
                    destination=file_path,
                    required=True,
                    zipped_file=zipped_file or None,
                    native_id=sys_key,
                ))
        return requirements

    def validate_format(self, raw_data: str) -> bool:
        """Validate batocera-systems format.

        Returns True only when the text contains ``systems`` and
        ``biosFiles``, a line starting with ``systems = {``, and the
        double-quoted keys ``"md5"`` and ``"file"``.
        """
        has_systems = "systems" in raw_data and "biosFiles" in raw_data
        has_dict = self._SYSTEMS_RE.search(raw_data) is not None
        has_md5 = '"md5"' in raw_data
        has_file = '"file"' in raw_data
        return has_systems and has_dict and has_md5 and has_file

    def generate_platform_yaml(self) -> dict:
        """Generate a platform YAML config dict from scraped data.

        Requirements are grouped by system slug; each group records the
        ``native_id`` of the first requirement seen for that slug. The
        version comes from the latest ``batocera-<digits>`` GitHub tag and
        falls back to the ``version`` stored in the existing
        ``platforms/batocera.yml`` when no usable tag is available.
        ``standalone_cores`` is only included when non-empty.
        """
        requirements = self.fetch_requirements()

        systems: dict[str, dict] = {}
        for req in requirements:
            sys_entry = systems.get(req.system)
            if sys_entry is None:
                sys_entry = {"files": []}
                if req.native_id:
                    sys_entry["native_id"] = req.native_id
                systems[req.system] = sys_entry
            entry = {
                "name": req.name,
                "destination": req.destination,
                "required": req.required,
            }
            if req.md5:
                entry["md5"] = req.md5
            if req.zipped_file:
                entry["zipped_file"] = req.zipped_file
            sys_entry["files"].append(entry)

        tag = fetch_github_latest_tag("batocera-linux/batocera.linux", prefix="batocera-")
        batocera_version = ""
        if tag:
            num = tag.removeprefix("batocera-")
            if num.isdigit():
                batocera_version = num
        if not batocera_version:
            # Preserve existing version when fetch fails (offline mode)
            existing = Path(__file__).resolve().parents[2] / "platforms" / "batocera.yml"
            if existing.exists():
                with open(existing, encoding="utf-8") as f:
                    old = yaml.safe_load(f) or {}
                # A missing or null version must stay empty rather than
                # become the literal string "None".
                if isinstance(old, dict) and old.get("version") is not None:
                    batocera_version = str(old["version"])

        cores, standalone = self._fetch_cores()
        result = {
            "platform": "Batocera",
            "version": batocera_version,
            "homepage": "https://batocera.org",
            "source": SOURCE_URL,
            "base_destination": "bios",
            "hash_type": "md5",
            "verification_mode": "md5",
            "cores": cores,
            "systems": systems,
        }
        if standalone:
            result["standalone_cores"] = standalone
        return result
def main():
    """Hand the Scraper class and a description to the shared ``scraper_cli``."""
    # Kept as a function-local import, matching the original layout.
    from scripts.scraper.base_scraper import scraper_cli

    description = "Scrape batocera BIOS requirements"
    scraper_cli(Scraper, description)


if __name__ == "__main__":
    main()