feat: add target scraper infra and retroarch buildbot scraper

2026-04-13 12:22:33 -05:00 · 2026-03-26 08:55:50 +01:00
parent ac66f0b73b
commit c6ab8e9c3a
2 changed files with 186 additions and 0 deletions
--- a/scripts/scraper/targets/__init__.py
+++ b/scripts/scraper/targets/__init__.py
@@ -0,0 +1,48 @@
+"""Target scraper plugin discovery module.
+
+Auto-detects *_targets_scraper.py files and exposes their scrapers.
+"""
+from __future__ import annotations
+
+import importlib
+import pkgutil
+from pathlib import Path
+
+
+class BaseTargetScraper:
+    """Base class for target scrapers.
+
+    Subclasses override :meth:`fetch_targets`; :meth:`write_output` is a
+    shared helper that serialises the fetched data to a YAML file.
+    """
+
+    def __init__(self, url: str = ""):
+        """Store the source URL the scraper reads from."""
+        self.url = url
+
+    def fetch_targets(self) -> dict:
+        """Fetch targets and their core lists. Returns dict matching target YAML format.
+
+        Raises:
+            NotImplementedError: always, in this base class.
+        """
+        raise NotImplementedError
+
+    def write_output(self, data: dict, output_path: str) -> None:
+        """Write target data to a YAML file.
+
+        Args:
+            data: Mapping to serialise; key order is preserved in the output.
+            output_path: Destination file, overwritten if it already exists.
+
+        Raises:
+            ImportError: if PyYAML is not installed.
+        """
+        # Imported lazily so this module stays importable without PyYAML.
+        try:
+            import yaml
+        except ImportError as err:
+            # Chain the cause so the original import failure stays visible.
+            raise ImportError("PyYAML required: pip install pyyaml") from err
+        # Explicit encoding: open()'s default depends on the platform locale.
+        with open(output_path, "w", encoding="utf-8") as f:
+            yaml.dump(data, f, default_flow_style=False, sort_keys=False)
+
+
+# Registry of discovered scraper classes, keyed by each module's PLATFORM_NAME.
+_scrapers: dict[str, type] = {}
+
+
+def discover_target_scrapers() -> dict[str, type]:
+    """Return the scraper registry, scanning this package on first use.
+
+    Sibling modules named ``*_targets_scraper`` are imported; each one that
+    exposes a truthy ``PLATFORM_NAME`` and ``Scraper`` attribute is registered
+    under that platform name. While the registry is non-empty, later calls
+    return it without rescanning.
+    """
+    if not _scrapers:
+        search_path = [str(Path(__file__).parent)]
+        for module_info in pkgutil.iter_modules(search_path):
+            if not module_info.name.endswith("_targets_scraper"):
+                continue
+            plugin = importlib.import_module(
+                f".{module_info.name}", package=__package__
+            )
+            platform = getattr(plugin, "PLATFORM_NAME", None)
+            scraper_cls = getattr(plugin, "Scraper", None)
+            # Modules missing either attribute are skipped.
+            if not (platform and scraper_cls):
+                continue
+            _scrapers[platform] = scraper_cls
+    return _scrapers
--- a/scripts/scraper/targets/retroarch_targets_scraper.py
+++ b/scripts/scraper/targets/retroarch_targets_scraper.py
@@ -0,0 +1,138 @@
+"""Scraper for RetroArch buildbot nightly targets.
+
+Source: https://buildbot.libretro.com/nightly/
+Fetches directory listings per target to determine available cores.
+"""
+from __future__ import annotations
+
+import argparse
+import http.client
+import re
+import sys
+import urllib.error
+import urllib.request
+from datetime import datetime, timezone
+
+import yaml
+
+from . import BaseTargetScraper
+
+# Platform key under which discover_target_scrapers() registers this
+# module's Scraper class.
+PLATFORM_NAME = "retroarch"
+
+# Root of the nightly tree; Scraper appends "<path>/latest/" for each target.
+BUILDBOT_URL = "https://buildbot.libretro.com/nightly/"
+
+# (path, target_name, architecture)
+#   path         - directory under BUILDBOT_URL whose "latest/" listing is fetched
+#   target_name  - key used for the target in the scraped output
+#   architecture - value stored as the target's "architecture" field
+TARGETS: list[tuple[str, str, str]] = [
+    ("linux/x86_64", "linux-x86_64", "x86_64"),
+    ("linux/armhf", "linux-armhf", "armhf"),
+    ("linux/armv7-neon-hf", "linux-armv7-neon-hf", "armv7"),
+    ("windows/x86_64", "windows-x86_64", "x86_64"),
+    ("windows/x86", "windows-x86", "x86"),
+    ("android/armeabi-v7a", "android-armeabi-v7a", "armv7"),
+    ("android/arm64-v8a", "android-arm64-v8a", "aarch64"),
+    ("apple/osx/x86_64", "osx-x86_64", "x86_64"),
+    ("apple/osx/arm64", "osx-arm64", "aarch64"),
+    ("apple/ios-arm64", "ios-arm64", "aarch64"),
+    ("apple/tvos-arm64", "tvos-arm64", "aarch64"),
+    ("nintendo/switch/libnx", "switch-libnx", "aarch64"),
+    ("nintendo/3ds", "3ds", "armv6"),
+    ("nintendo/ngc", "ngc", "ppc"),
+    ("nintendo/wii", "wii", "ppc"),
+    ("nintendo/wiiu", "wiiu", "ppc"),
+    ("playstation/ps2", "ps2", "mips"),
+    ("playstation/psp", "psp", "mips"),
+    ("playstation/vita", "vita", "armv7"),
+]
+
+# Captures href values that end in "_libretro" + .so/.dll/.dylib, optionally
+# followed by .zip (case-insensitive).
+# NOTE(review): a file name with anything between "_libretro" and the
+# extension, or with a different extension, is not matched - confirm that
+# every target in TARGETS actually publishes names of this shape.
+_CORE_RE = re.compile(
+    r'href="([^"]+_libretro(?:\.so|\.dll|\.dylib)(?:\.zip)?)"',
+    re.IGNORECASE,
+)
+
+
+def _strip_core_suffix(filename: str) -> str:
+    """Return the bare core name for a core file name.
+
+    A trailing ``.zip`` is dropped first, then a trailing
+    ``_libretro.so`` / ``_libretro.dll`` / ``_libretro.dylib``; both steps
+    ignore case. Names without these suffixes are returned unchanged.
+    """
+    without_archive = re.sub(r'\.zip$', '', filename, flags=re.IGNORECASE)
+    return re.sub(
+        r'_libretro(?:\.so|\.dll|\.dylib)$',
+        '',
+        without_archive,
+        flags=re.IGNORECASE,
+    )
+
+
+class Scraper(BaseTargetScraper):
+    """Fetches core lists per target from RetroArch buildbot nightly.
+
+    For every entry in ``TARGETS`` the ``<url>/<path>/latest/`` page is
+    downloaded and scanned with ``_CORE_RE`` for core file links.
+    """
+
+    def __init__(self, url: str = BUILDBOT_URL):
+        """Create a scraper rooted at *url* (the buildbot nightly root by default)."""
+        super().__init__(url=url)
+
+    def _fetch_url(self, url: str) -> str | None:
+        """Fetch *url* and return its body as text, or None on failure.
+
+        Failures are reported on stderr instead of raised, so one
+        unreachable target does not abort the whole scrape.
+        """
+        req = urllib.request.Request(
+            url, headers={"User-Agent": "retrobios-scraper/1.0"}
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                body = resp.read()
+        except (OSError, http.client.HTTPException) as e:
+            # OSError covers URLError/HTTPError plus socket timeouts and
+            # connection resets raised while reading the body;
+            # HTTPException covers truncated responses (IncompleteRead).
+            print(f"  skip {url}: {e}", file=sys.stderr)
+            return None
+        # A stray undecodable byte should not discard the whole listing.
+        return body.decode("utf-8", errors="replace")
+
+    def _fetch_cores_for_target(self, path: str) -> list[str]:
+        """Return the sorted, de-duplicated core names listed for *path*.
+
+        Returns an empty list when the listing cannot be fetched or contains
+        no matching links.
+        """
+        # Normalise the root so a custom URL without a trailing slash still
+        # joins cleanly with the target path.
+        url = f"{self.url.rstrip('/')}/{path}/latest/"
+        html = self._fetch_url(url)
+        if html is None:
+            return []
+        # Only the last path segment of each href is the file name.
+        cores = {
+            _strip_core_suffix(match.group(1).split("/")[-1])
+            for match in _CORE_RE.finditer(html)
+        }
+        cores.discard("")
+        return sorted(cores)
+
+    def fetch_targets(self) -> dict:
+        """Fetch all targets and their core lists.
+
+        Returns:
+            A dict with ``platform``, ``source``, ``scraped_at`` (UTC
+            timestamp) and ``targets``; the latter maps each target name to
+            its ``architecture`` and ``cores``. A target whose listing could
+            not be fetched is still present, with an empty core list.
+        """
+        targets: dict[str, dict] = {}
+        for path, target_name, arch in TARGETS:
+            print(f"  fetching {target_name}...", file=sys.stderr)
+            cores = self._fetch_cores_for_target(path)
+            if not cores:
+                print(f"  warning: no cores found for {target_name}", file=sys.stderr)
+            targets[target_name] = {
+                "architecture": arch,
+                "cores": cores,
+            }
+        return {
+            "platform": PLATFORM_NAME,
+            "source": self.url,
+            "scraped_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            "targets": targets,
+        }
+
+
+def main() -> None:
+    """Command-line entry point: scrape all targets and report the result.
+
+    With ``--dry-run`` a per-target core count is printed; otherwise the data
+    is written to ``--output`` when given, or dumped to stdout as YAML.
+    """
+    cli = argparse.ArgumentParser(
+        description="Scrape RetroArch buildbot nightly targets"
+    )
+    cli.add_argument("--dry-run", action="store_true", help="Show target summary")
+    cli.add_argument("--output", "-o", help="Output YAML file")
+    options = cli.parse_args()
+
+    scraper = Scraper()
+    data = scraper.fetch_targets()
+
+    if options.dry_run:
+        # The summary takes precedence over --output.
+        for target_name, details in data["targets"].items():
+            arch = details["architecture"]
+            core_count = len(details["cores"])
+            print(f"  {target_name} ({arch}): {core_count} cores")
+    elif options.output:
+        scraper.write_output(data, options.output)
+        print(f"Written to {options.output}")
+    else:
+        print(yaml.dump(data, default_flow_style=False, sort_keys=False))
+
+
+if __name__ == "__main__":
+    main()