From 2466fc4a97f07a3d94e315f64e3e4f3ecfc9b138 Mon Sep 17 00:00:00 2001
From: Abdessamad Derraz <3028866+Abdess@users.noreply.github.com>
Date: Wed, 18 Mar 2026 08:17:14 +0100
Subject: [PATCH] refactor: extract scraper_cli() to base_scraper.py (DRY)

Move the CLI boilerplate shared by all scrapers (argparse setup, --dry-run,
--json, and --output YAML handling) into a single scraper_cli() helper in
base_scraper.py. Each of the four scrapers (libretro, batocera, retrobat,
emudeck) shrinks from a ~58-line main() to a 3-line main() that delegates to
scraper_cli(), eliminating ~220 lines of duplicated boilerplate.

recalbox and coreinfo keep their custom main(), since they need extra flags
(--full, --compare-db).
---
 scripts/scraper/base_scraper.py     | 58 +++++++++++++++++++++++++++++
 scripts/scraper/batocera_scraper.py | 56 +---------------------------
 scripts/scraper/emudeck_scraper.py  | 54 +--------------------------
 scripts/scraper/libretro_scraper.py | 56 +---------------------------
 scripts/scraper/retrobat_scraper.py | 54 +--------------------------
 5 files changed, 66 insertions(+), 212 deletions(-)

diff --git a/scripts/scraper/base_scraper.py b/scripts/scraper/base_scraper.py
index d44ad176..efd16f27 100644
--- a/scripts/scraper/base_scraper.py
+++ b/scripts/scraper/base_scraper.py
@@ -135,6 +135,64 @@ def fetch_github_latest_version(repo: str) -> str | None:
     return None
 
 
+def scraper_cli(scraper_class: type, description: str = "Scrape BIOS requirements") -> None:
+    """Shared CLI entry point for all scrapers. Eliminates main() boilerplate."""
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("--dry-run", action="store_true", help="Show scraped data")
+    parser.add_argument("--output", "-o", help="Output YAML file")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+
+    scraper = scraper_class()
+    try:
+        reqs = scraper.fetch_requirements()
+    except (ConnectionError, ValueError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.dry_run:
+        by_system: dict[str, list] = {}
+        for req in reqs:
+            by_system.setdefault(req.system, []).append(req)
+        for system, files in sorted(by_system.items()):
+            req_count = sum(1 for f in files if f.required)
+            opt_count = len(files) - req_count
+            print(f"  {system}: {req_count} required, {opt_count} optional")
+        print(f"\nTotal: {len(reqs)} BIOS entries across {len(by_system)} systems")
+        return
+
+    if args.json:
+        data = [{"name": r.name, "system": r.system, "sha1": r.sha1, "md5": r.md5,
+                 "size": r.size, "required": r.required} for r in reqs]
+        print(json.dumps(data, indent=2))
+        return
+
+    if args.output:
+        # Generate platform YAML
+        import yaml
+        config = {"systems": {}}
+        for req in reqs:
+            sys_id = req.system
+            config["systems"].setdefault(sys_id, {"files": []})
+            entry = {"name": req.name, "destination": req.destination or req.name, "required": req.required}
+            if req.sha1:
+                entry["sha1"] = req.sha1
+            if req.md5:
+                entry["md5"] = req.md5
+            if req.zipped_file:
+                entry["zipped_file"] = req.zipped_file
+            config["systems"][sys_id]["files"].append(entry)
+        with open(args.output, "w") as f:
+            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
+        print(f"Written {len(reqs)} entries to {args.output}")
+        return
+
+    print(f"Scraped {len(reqs)} requirements. Use --dry-run, --json, or --output.")
+
+
 def fetch_github_latest_tag(repo: str, prefix: str = "") -> str | None:
     """Fetch the most recent matching tag from a GitHub repo."""
     url = f"https://api.github.com/repos/{repo}/tags?per_page=50"
diff --git a/scripts/scraper/batocera_scraper.py b/scripts/scraper/batocera_scraper.py
index aa67acc0..58cdd6c9 100644
--- a/scripts/scraper/batocera_scraper.py
+++ b/scripts/scraper/batocera_scraper.py
@@ -259,60 +259,8 @@ class Scraper(BaseScraper):
 
 
 def main():
-    """CLI entry point for testing."""
-    import argparse
-    import json
-
-    parser = argparse.ArgumentParser(description="Scrape batocera-systems")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--json", action="store_true")
-    parser.add_argument("--output", "-o")
-    args = parser.parse_args()
-
-    scraper = Scraper()
-
-    try:
-        reqs = scraper.fetch_requirements()
-    except (ConnectionError, ValueError) as e:
-        print(f"Error: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.dry_run:
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-
-        for system, files in sorted(by_system.items()):
-            print(f"\n{system} ({len(files)} files):")
-            for f in files:
-                hash_info = f.md5[:12] if f.md5 else "no-hash"
-                print(f"  {f.name} ({hash_info}...)")
-
-        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
-        return
-
-    if args.json:
-        config = scraper.generate_platform_yaml()
-        print(json.dumps(config, indent=2))
-        return
-
-    if args.output:
-        try:
-            import yaml
-        except ImportError:
-            print("Error: PyYAML required", file=sys.stderr)
-            sys.exit(1)
-
-        config = scraper.generate_platform_yaml()
-        with open(args.output, "w") as f:
-            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
-        print(f"Written to {args.output}")
-    else:
-        reqs = scraper.fetch_requirements()
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
+    from scripts.scraper.base_scraper import scraper_cli
+    scraper_cli(Scraper, "Scrape batocera BIOS requirements")
 
 
 if __name__ == "__main__":
diff --git a/scripts/scraper/emudeck_scraper.py b/scripts/scraper/emudeck_scraper.py
index a3cb3ed1..7e8a8cd7 100644
--- a/scripts/scraper/emudeck_scraper.py
+++ b/scripts/scraper/emudeck_scraper.py
@@ -403,58 +403,8 @@ class Scraper(BaseScraper):
 
 
 def main():
-    import argparse
-    import json
-
-    parser = argparse.ArgumentParser(description="Scrape EmuDeck BIOS requirements")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--json", action="store_true")
-    parser.add_argument("--output", "-o")
-    args = parser.parse_args()
-
-    scraper = Scraper()
-
-    try:
-        reqs = scraper.fetch_requirements()
-    except (ConnectionError, ValueError) as e:
-        print(f"Error: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.dry_run:
-        by_system: dict[str, list[BiosRequirement]] = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-
-        for system, files in sorted(by_system.items()):
-            print(f"\n{system} ({len(files)} files):")
-            for f in files:
-                hash_info = f.md5[:12] if f.md5 else "no-hash"
-                print(f"  {f.name} ({hash_info}...)")
-
-        print(f"\nTotal: {len(reqs)} BIOS entries across {len(by_system)} systems")
-        return
-
-    if args.json:
-        config = scraper.generate_platform_yaml()
-        print(json.dumps(config, indent=2))
-        return
-
-    if args.output:
-        try:
-            import yaml
-        except ImportError:
-            print("Error: PyYAML required", file=sys.stderr)
-            sys.exit(1)
-
-        config = scraper.generate_platform_yaml()
-        with open(args.output, "w") as f:
-            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
-        print(f"Written to {args.output}")
-    else:
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-        print(f"Scraped {len(reqs)} BIOS entries across {len(by_system)} systems")
+    from scripts.scraper.base_scraper import scraper_cli
+    scraper_cli(Scraper, "Scrape emudeck BIOS requirements")
 
 
 if __name__ == "__main__":
diff --git a/scripts/scraper/libretro_scraper.py b/scripts/scraper/libretro_scraper.py
index e487ebd0..139944a4 100644
--- a/scripts/scraper/libretro_scraper.py
+++ b/scripts/scraper/libretro_scraper.py
@@ -263,60 +263,8 @@ class Scraper(BaseScraper):
 
 
 def main():
-    """CLI entry point for testing."""
-    import argparse
-    import json
-
-    parser = argparse.ArgumentParser(description="Scrape libretro System.dat")
-    parser.add_argument("--dry-run", action="store_true", help="Just show what would be scraped")
-    parser.add_argument("--output", "-o", help="Output YAML file")
-    parser.add_argument("--json", action="store_true", help="Output as JSON")
-    args = parser.parse_args()
-
-    scraper = Scraper()
-
-    try:
-        reqs = scraper.fetch_requirements()
-    except (ConnectionError, ValueError) as e:
-        print(f"Error: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.dry_run:
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-
-        for system, files in sorted(by_system.items()):
-            print(f"\n{system} ({len(files)} files):")
-            for f in files:
-                hash_info = f.sha1[:12] if f.sha1 else f.md5[:12] if f.md5 else "no-hash"
-                print(f"  {f.name} ({f.size or '?'} bytes, {hash_info}...)")
-
-        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
-        return
-
-    if args.json:
-        config = scraper.generate_platform_yaml()
-        print(json.dumps(config, indent=2))
-        return
-
-    if args.output:
-        try:
-            import yaml
-        except ImportError:
-            print("Error: PyYAML required (pip install pyyaml)", file=sys.stderr)
-            sys.exit(1)
-
-        config = scraper.generate_platform_yaml()
-        with open(args.output, "w") as f:
-            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
-        print(f"Written to {args.output}")
-    else:
-        reqs = scraper.fetch_requirements()
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
+    from scripts.scraper.base_scraper import scraper_cli
+    scraper_cli(Scraper, "Scrape libretro BIOS requirements")
 
 
 if __name__ == "__main__":
diff --git a/scripts/scraper/retrobat_scraper.py b/scripts/scraper/retrobat_scraper.py
index 0a7d2a26..34b9eb61 100644
--- a/scripts/scraper/retrobat_scraper.py
+++ b/scripts/scraper/retrobat_scraper.py
@@ -158,58 +158,8 @@ class Scraper(BaseScraper):
 
 
 def main():
-    """CLI entry point for testing."""
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Scrape RetroBat batocera-systems.json")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--json", action="store_true")
-    parser.add_argument("--output", "-o")
-    args = parser.parse_args()
-
-    scraper = Scraper()
-
-    try:
-        reqs = scraper.fetch_requirements()
-    except (ConnectionError, ValueError) as e:
-        print(f"Error: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    if args.dry_run:
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-
-        for system, files in sorted(by_system.items()):
-            print(f"\n{system} ({len(files)} files):")
-            for f in files:
-                hash_info = f.md5[:12] if f.md5 else "no-hash"
-                print(f"  {f.name} ({hash_info}...)")
-
-        print(f"\nTotal: {len(reqs)} BIOS files across {len(by_system)} systems")
-        return
-
-    if args.json:
-        config = scraper.generate_platform_yaml()
-        print(json.dumps(config, indent=2))
-        return
-
-    if args.output:
-        try:
-            import yaml
-        except ImportError:
-            print("Error: PyYAML required", file=sys.stderr)
-            sys.exit(1)
-
-        config = scraper.generate_platform_yaml()
-        with open(args.output, "w") as f:
-            yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
-        print(f"Written to {args.output}")
-    else:
-        by_system = {}
-        for req in reqs:
-            by_system.setdefault(req.system, []).append(req)
-        print(f"Scraped {len(reqs)} BIOS files across {len(by_system)} systems")
+    from scripts.scraper.base_scraper import scraper_cli
+    scraper_cli(Scraper, "Scrape retrobat BIOS requirements")
 
 
 if __name__ == "__main__":
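
Illustrative sketch (not part of the patch): after this refactor a new scraper
only has to implement fetch_requirements(); the CLI comes along for free via
scraper_cli(). The example below is hypothetical -- it assumes BaseScraper and
BiosRequirement are both importable from scripts.scraper.base_scraper, that
BiosRequirement accepts the keyword fields scraper_cli() reads (name, system,
sha1, md5, size, required, ...), and the module name, system id, and hash are
placeholders (the MD5 shown is simply the hash of an empty file).

    # scripts/scraper/example_scraper.py -- hypothetical new scraper
    from scripts.scraper.base_scraper import BaseScraper, BiosRequirement, scraper_cli


    class Scraper(BaseScraper):
        """Minimal scraper: fetch_requirements() is the only source-specific piece."""

        def fetch_requirements(self) -> list[BiosRequirement]:
            # A real scraper would download and parse its source here; one
            # hard-coded entry keeps the sketch self-contained.
            return [
                BiosRequirement(
                    name="example_bios.bin",
                    system="examplesystem",
                    md5="d41d8cd98f00b204e9800998ecf8427e",  # placeholder hash
                    required=True,
                )
            ]


    def main():
        scraper_cli(Scraper, "Scrape example BIOS requirements")


    if __name__ == "__main__":
        main()

Run from the repository root (so the scripts.scraper package resolves),
python -m scripts.scraper.example_scraper --dry-run would then print the
per-system required/optional counts, exactly like the four converted scrapers.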