mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-15 13:22:31 -05:00
feat: rewrite retropie scraper using scriptmodules
This commit is contained in:
@@ -1,11 +1,13 @@
|
|||||||
"""Scraper for RetroPie package availability per platform.
|
"""Scraper for RetroPie libretro core availability per platform.
|
||||||
|
|
||||||
Source: https://retropie.org.uk/stats/pkgflags/
|
Source: https://github.com/RetroPie/RetroPie-Setup/tree/master/scriptmodules/libretrocores
|
||||||
Parses the HTML table of packages × platforms.
|
Parses rp_module_id and rp_module_flags from each scriptmodule to determine
|
||||||
|
which platforms each core supports.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import urllib.error
|
import urllib.error
|
||||||
@@ -18,33 +20,48 @@ from . import BaseTargetScraper
|
|||||||
|
|
||||||
PLATFORM_NAME = "retropie"
|
PLATFORM_NAME = "retropie"
|
||||||
|
|
||||||
SOURCE_URL = "https://retropie.org.uk/stats/pkgflags/"
|
GITHUB_API_URL = (
|
||||||
|
"https://api.github.com/repos/RetroPie/RetroPie-Setup/contents"
|
||||||
|
"/scriptmodules/libretrocores"
|
||||||
|
)
|
||||||
|
RAW_BASE_URL = (
|
||||||
|
"https://raw.githubusercontent.com/RetroPie/RetroPie-Setup/master"
|
||||||
|
"/scriptmodules/libretrocores/"
|
||||||
|
)
|
||||||
|
|
||||||
# Maps table column header to (target_name, architecture)
|
# Platform flag sets: flags that the platform possesses
|
||||||
_COLUMN_MAP: dict[str, tuple[str, str]] = {
|
PLATFORM_FLAGS: dict[str, set[str]] = {
|
||||||
"rpi1": ("rpi1", "armv6"),
|
"rpi1": {"arm", "armv6", "rpi", "gles"},
|
||||||
"rpi2": ("rpi2", "armv7"),
|
"rpi2": {"arm", "armv7", "neon", "rpi", "gles"},
|
||||||
"rpi3": ("rpi3", "armv7"),
|
"rpi3": {"arm", "armv8", "neon", "rpi", "gles"},
|
||||||
"rpi4": ("rpi4", "aarch64"),
|
"rpi4": {"arm", "armv8", "neon", "rpi", "gles", "gles3", "gles31"},
|
||||||
"rpi5": ("rpi5", "aarch64"),
|
"rpi5": {"arm", "armv8", "neon", "rpi", "gles", "gles3", "gles31"},
|
||||||
"x86": ("x86", "x86"),
|
"x86": {"x86"},
|
||||||
"x86_64": ("x86_64", "x86_64"),
|
"x86_64": {"x86"},
|
||||||
}
|
}
|
||||||
|
|
||||||
_TH_RE = re.compile(r'<th[^>]*>(.*?)</th>', re.IGNORECASE | re.DOTALL)
|
ARCH_MAP: dict[str, str] = {
|
||||||
_TR_RE = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
|
"rpi1": "armv6",
|
||||||
_TD_RE = re.compile(r'<td[^>]*>(.*?)</td>', re.IGNORECASE | re.DOTALL)
|
"rpi2": "armv7",
|
||||||
_TAG_RE = re.compile(r'<[^>]+>')
|
"rpi3": "armv7",
|
||||||
|
"rpi4": "aarch64",
|
||||||
|
"rpi5": "aarch64",
|
||||||
|
"x86": "x86",
|
||||||
|
"x86_64": "x86_64",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Flags that are build directives, not platform restrictions
|
||||||
|
_BUILD_FLAGS = {"nodistcc"}
|
||||||
|
|
||||||
|
_MODULE_ID_RE = re.compile(r'rp_module_id\s*=\s*["\']([^"\']+)["\']')
|
||||||
|
_MODULE_FLAGS_RE = re.compile(r'rp_module_flags\s*=\s*["\']([^"\']*)["\']')
|
||||||
|
|
||||||
|
|
||||||
def _strip_tags(text: str) -> str:
|
def _fetch(url: str, accept: str = "text/plain") -> str | None:
|
||||||
return _TAG_RE.sub("", text).strip()
|
|
||||||
|
|
||||||
|
|
||||||
def _fetch(url: str) -> str | None:
|
|
||||||
try:
|
try:
|
||||||
req = urllib.request.Request(
|
req = urllib.request.Request(
|
||||||
url, headers={"User-Agent": "retrobios-scraper/1.0"}
|
url,
|
||||||
|
headers={"User-Agent": "retrobios-scraper/1.0", "Accept": accept},
|
||||||
)
|
)
|
||||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||||
return resp.read().decode("utf-8")
|
return resp.read().decode("utf-8")
|
||||||
@@ -53,75 +70,86 @@ def _fetch(url: str) -> str | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _parse_table(html: str) -> dict[str, list[str]]:
|
def _is_available(flags_str: str, platform: str) -> bool:
|
||||||
"""Parse the pkgflags HTML table into {target: [packages]}."""
|
"""Return True if the core is available on the given platform."""
|
||||||
# Extract header row to find column indices
|
platform_has = PLATFORM_FLAGS.get(platform, set())
|
||||||
header_match = re.search(
|
tokens = flags_str.split() if flags_str.strip() else []
|
||||||
r'<thead[^>]*>(.*?)</thead>', html, re.IGNORECASE | re.DOTALL
|
|
||||||
)
|
|
||||||
if not header_match:
|
|
||||||
# Fallback: find first tr
|
|
||||||
header_match = re.search(
|
|
||||||
r'<tr[^>]*>(.*?)</tr>', html, re.IGNORECASE | re.DOTALL
|
|
||||||
)
|
|
||||||
if not header_match:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
headers = [_strip_tags(h).lower() for h in _TH_RE.findall(header_match.group(1))]
|
for token in tokens:
|
||||||
# Find which column index maps to which target
|
if token in _BUILD_FLAGS:
|
||||||
col_targets: dict[int, tuple[str, str]] = {}
|
|
||||||
for i, h in enumerate(headers):
|
|
||||||
if h in _COLUMN_MAP:
|
|
||||||
col_targets[i] = _COLUMN_MAP[h]
|
|
||||||
|
|
||||||
if not col_targets:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Initialize result
|
|
||||||
result: dict[str, list[str]] = {name: [] for name, _ in col_targets.values()}
|
|
||||||
|
|
||||||
# Parse body rows
|
|
||||||
tbody_match = re.search(
|
|
||||||
r'<tbody[^>]*>(.*?)</tbody>', html, re.IGNORECASE | re.DOTALL
|
|
||||||
)
|
|
||||||
body_html = tbody_match.group(1) if tbody_match else html
|
|
||||||
|
|
||||||
for tr_match in _TR_RE.finditer(body_html):
|
|
||||||
cells = [_strip_tags(td) for td in _TD_RE.findall(tr_match.group(1))]
|
|
||||||
if not cells:
|
|
||||||
continue
|
continue
|
||||||
# First cell is package name
|
if token.startswith("!"):
|
||||||
package = cells[0].strip().lower()
|
# Exclusion: if platform has this flag, core is excluded
|
||||||
if not package:
|
flag = token[1:]
|
||||||
continue
|
if flag in platform_has:
|
||||||
for col_idx, (target_name, _arch) in col_targets.items():
|
return False
|
||||||
if col_idx < len(cells):
|
else:
|
||||||
cell_val = cells[col_idx].strip().lower()
|
# Requirement: platform must have this flag
|
||||||
# Any non-empty, non-dash, non-zero value = available
|
if token not in platform_has:
|
||||||
if cell_val and cell_val not in ("", "-", "0", "n", "no", "false"):
|
return False
|
||||||
result[target_name].append(package)
|
|
||||||
|
|
||||||
return result
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_module(content: str) -> tuple[str | None, str]:
|
||||||
|
"""Return (module_id, flags_string) from a scriptmodule file."""
|
||||||
|
id_match = _MODULE_ID_RE.search(content)
|
||||||
|
flags_match = _MODULE_FLAGS_RE.search(content)
|
||||||
|
module_id = id_match.group(1) if id_match else None
|
||||||
|
flags = flags_match.group(1) if flags_match else ""
|
||||||
|
return module_id, flags
|
||||||
|
|
||||||
|
|
||||||
class Scraper(BaseTargetScraper):
|
class Scraper(BaseTargetScraper):
|
||||||
"""Fetches RetroPie package availability per platform from pkgflags page."""
|
"""Fetches RetroPie libretro core availability by parsing scriptmodules."""
|
||||||
|
|
||||||
def __init__(self, url: str = SOURCE_URL):
|
def __init__(self, url: str = GITHUB_API_URL):
|
||||||
super().__init__(url=url)
|
super().__init__(url=url)
|
||||||
|
|
||||||
|
def _list_scriptmodules(self) -> list[str]:
|
||||||
|
"""Return list of .sh filenames from the libretrocores directory."""
|
||||||
|
raw = _fetch(self.url, accept="application/vnd.github+json")
|
||||||
|
if raw is None:
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
entries = json.loads(raw)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f" JSON parse error: {e}", file=sys.stderr)
|
||||||
|
return []
|
||||||
|
return [e["name"] for e in entries if e.get("name", "").endswith(".sh")]
|
||||||
|
|
||||||
|
def _fetch_module(self, filename: str) -> str | None:
|
||||||
|
return _fetch(f"{RAW_BASE_URL}{filename}")
|
||||||
|
|
||||||
def fetch_targets(self) -> dict:
|
def fetch_targets(self) -> dict:
|
||||||
print(" fetching RetroPie pkgflags...", file=sys.stderr)
|
print(" listing RetroPie scriptmodules...", file=sys.stderr)
|
||||||
html = _fetch(self.url)
|
filenames = self._list_scriptmodules()
|
||||||
packages_per_target: dict[str, list[str]] = {}
|
if not filenames:
|
||||||
if html:
|
print(" warning: no scriptmodules found", file=sys.stderr)
|
||||||
packages_per_target = _parse_table(html)
|
|
||||||
|
# {platform: [core_id, ...]}
|
||||||
|
platform_cores: dict[str, list[str]] = {p: [] for p in PLATFORM_FLAGS}
|
||||||
|
|
||||||
|
for filename in filenames:
|
||||||
|
content = self._fetch_module(filename)
|
||||||
|
if content is None:
|
||||||
|
continue
|
||||||
|
module_id, flags = _parse_module(content)
|
||||||
|
if not module_id:
|
||||||
|
print(f" warning: no rp_module_id in {filename}", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
for platform in PLATFORM_FLAGS:
|
||||||
|
if _is_available(flags, platform):
|
||||||
|
platform_cores[platform].append(module_id)
|
||||||
|
|
||||||
|
print(f" parsed {len(filenames)} scriptmodules", file=sys.stderr)
|
||||||
|
|
||||||
targets: dict[str, dict] = {}
|
targets: dict[str, dict] = {}
|
||||||
for col_key, (target_name, arch) in _COLUMN_MAP.items():
|
for platform, arch in ARCH_MAP.items():
|
||||||
targets[target_name] = {
|
cores = sorted(platform_cores.get(platform, []))
|
||||||
|
targets[platform] = {
|
||||||
"architecture": arch,
|
"architecture": arch,
|
||||||
"cores": sorted(packages_per_target.get(target_name, [])),
|
"cores": cores,
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -134,7 +162,7 @@ class Scraper(BaseTargetScraper):
|
|||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Scrape RetroPie package targets"
|
description="Scrape RetroPie libretro core targets from scriptmodules"
|
||||||
)
|
)
|
||||||
parser.add_argument("--dry-run", action="store_true", help="Show target summary")
|
parser.add_argument("--dry-run", action="store_true", help="Show target summary")
|
||||||
parser.add_argument("--output", "-o", help="Output YAML file")
|
parser.add_argument("--output", "-o", help="Output YAML file")
|
||||||
@@ -145,7 +173,7 @@ def main() -> None:
|
|||||||
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
for name, info in data["targets"].items():
|
for name, info in data["targets"].items():
|
||||||
print(f" {name} ({info['architecture']}): {len(info['cores'])} packages")
|
print(f" {name} ({info['architecture']}): {len(info['cores'])} cores")
|
||||||
return
|
return
|
||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
|
|||||||
Reference in New Issue
Block a user