Files
libretro/scripts/scraper/targets/retropie_targets_scraper.py
2026-03-26 08:55:58 +01:00

161 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Scraper for RetroPie package availability per platform.
Source: https://retropie.org.uk/stats/pkgflags/
Parses the HTML table of packages × platforms.
"""
from __future__ import annotations
import argparse
import re
import sys
import urllib.error
import urllib.request
from datetime import datetime, timezone
import yaml
from . import BaseTargetScraper
PLATFORM_NAME = "retropie"
SOURCE_URL = "https://retropie.org.uk/stats/pkgflags/"
# Maps table column header to (target_name, architecture)
_COLUMN_MAP: dict[str, tuple[str, str]] = {
"rpi1": ("rpi1", "armv6"),
"rpi2": ("rpi2", "armv7"),
"rpi3": ("rpi3", "armv7"),
"rpi4": ("rpi4", "aarch64"),
"rpi5": ("rpi5", "aarch64"),
"x86": ("x86", "x86"),
"x86_64": ("x86_64", "x86_64"),
}
_TH_RE = re.compile(r'<th[^>]*>(.*?)</th>', re.IGNORECASE | re.DOTALL)
_TR_RE = re.compile(r'<tr[^>]*>(.*?)</tr>', re.IGNORECASE | re.DOTALL)
_TD_RE = re.compile(r'<td[^>]*>(.*?)</td>', re.IGNORECASE | re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
def _strip_tags(text: str) -> str:
return _TAG_RE.sub("", text).strip()
def _fetch(url: str) -> str | None:
try:
req = urllib.request.Request(
url, headers={"User-Agent": "retrobios-scraper/1.0"}
)
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read().decode("utf-8")
except urllib.error.URLError as e:
print(f" skip {url}: {e}", file=sys.stderr)
return None
def _parse_table(html: str) -> dict[str, list[str]]:
"""Parse the pkgflags HTML table into {target: [packages]}."""
# Extract header row to find column indices
header_match = re.search(
r'<thead[^>]*>(.*?)</thead>', html, re.IGNORECASE | re.DOTALL
)
if not header_match:
# Fallback: find first tr
header_match = re.search(
r'<tr[^>]*>(.*?)</tr>', html, re.IGNORECASE | re.DOTALL
)
if not header_match:
return {}
headers = [_strip_tags(h).lower() for h in _TH_RE.findall(header_match.group(1))]
# Find which column index maps to which target
col_targets: dict[int, tuple[str, str]] = {}
for i, h in enumerate(headers):
if h in _COLUMN_MAP:
col_targets[i] = _COLUMN_MAP[h]
if not col_targets:
return {}
# Initialize result
result: dict[str, list[str]] = {name: [] for name, _ in col_targets.values()}
# Parse body rows
tbody_match = re.search(
r'<tbody[^>]*>(.*?)</tbody>', html, re.IGNORECASE | re.DOTALL
)
body_html = tbody_match.group(1) if tbody_match else html
for tr_match in _TR_RE.finditer(body_html):
cells = [_strip_tags(td) for td in _TD_RE.findall(tr_match.group(1))]
if not cells:
continue
# First cell is package name
package = cells[0].strip().lower()
if not package:
continue
for col_idx, (target_name, _arch) in col_targets.items():
if col_idx < len(cells):
cell_val = cells[col_idx].strip().lower()
# Any non-empty, non-dash, non-zero value = available
if cell_val and cell_val not in ("", "-", "0", "n", "no", "false"):
result[target_name].append(package)
return result
class Scraper(BaseTargetScraper):
"""Fetches RetroPie package availability per platform from pkgflags page."""
def __init__(self, url: str = SOURCE_URL):
super().__init__(url=url)
def fetch_targets(self) -> dict:
print(" fetching RetroPie pkgflags...", file=sys.stderr)
html = _fetch(self.url)
packages_per_target: dict[str, list[str]] = {}
if html:
packages_per_target = _parse_table(html)
targets: dict[str, dict] = {}
for col_key, (target_name, arch) in _COLUMN_MAP.items():
targets[target_name] = {
"architecture": arch,
"cores": sorted(packages_per_target.get(target_name, [])),
}
return {
"platform": "retropie",
"source": self.url,
"scraped_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
"targets": targets,
}
def main() -> None:
parser = argparse.ArgumentParser(
description="Scrape RetroPie package targets"
)
parser.add_argument("--dry-run", action="store_true", help="Show target summary")
parser.add_argument("--output", "-o", help="Output YAML file")
args = parser.parse_args()
scraper = Scraper()
data = scraper.fetch_targets()
if args.dry_run:
for name, info in data["targets"].items():
print(f" {name} ({info['architecture']}): {len(info['cores'])} packages")
return
if args.output:
scraper.write_output(data, args.output)
print(f"Written to {args.output}")
return
print(yaml.dump(data, default_flow_style=False, sort_keys=False))
if __name__ == "__main__":
main()