mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-13 12:22:33 -05:00
- fix urllib.parse.quote import (was urllib.request.quote) - add operator precedence parens in generate_pack dedup check - narrow bare except to specific types in batocera target scraper - cache load_platform_config and build_zip_contents_index results - add selective algorithm support to compute_hashes - atomic write for fetch_large_file (tmp + rename) - add response size limit to base scraper fetch - extract build_target_cores_cache to common.py (dedup verify/pack) - hoist _build_supplemental_index out of per-platform loop - migrate function-attribute caches to module-level dicts - add @abstractmethod to BaseTargetScraper.fetch_targets - remove backward-compat re-exports from common.py - replace em-dashes and unicode arrows with ASCII equivalents - remove decorative section dividers and obvious comments
244 lines
8.7 KiB
Python
244 lines
8.7 KiB
Python
"""Deduplicate bios/ directory -keep one canonical file per unique content.
|
|
|
|
Usage:
|
|
python scripts/dedup.py [--dry-run] [--bios-dir bios]
|
|
|
|
Two types of deduplication:
|
|
|
|
1. TRUE DUPLICATES: Same filename in different directories (e.g., naomi.zip
|
|
in both Arcade/ and Sega/Dreamcast/). Keeps one canonical copy, removes
|
|
the others. resolve_local_file finds files by hash, not path.
|
|
|
|
2. MAME DEVICE CLONES: Different filenames with identical content in the same
|
|
MAME directory (e.g., bbc_m87.zip and bbc_24bbc.zip are identical ZIPs).
|
|
These are NOT aliases - MAME loads each by its unique name. Instead of
|
|
deleting, we create a _mame_clones.json mapping so generate_pack.py can
|
|
pack all names from a single canonical file.
|
|
|
|
After dedup, run generate_db.py --force to rebuild database indexes.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from common import compute_hashes
|
|
|
|
# Default root scanned for duplicate files (relative to the working directory).
DEFAULT_BIOS_DIR = "bios"

# Directories where deduplication must NOT be applied.
# NOTE(review): presumably these trees need per-game copies even when file
# content is identical - confirm before ever extending dedup to them.
NODEDUP_DIRS = {
    "RPG Maker",
    "ScummVM",
}
|
|
|
|
|
|
def path_priority(path: str) -> tuple:
    """Return a sort key for choosing the canonical copy; lower sorts first.

    Prefers: non-variant over variant, non-MAME over MAME, non-Arcade over
    Arcade, shorter paths over longer ones, then lexicographic path order
    as the final tie-breaker (keeps the choice deterministic).
    """
    segments = Path(path).parts
    return (
        ".variants" in segments,  # variants are the worst candidates
        "MAME" in segments,       # system-specific dirs beat generic MAME
        "Arcade" in segments,
        len(segments),            # shallower paths win
        path,
    )
|
|
|
|
|
|
def _in_nodedup_dir(path: str) -> bool:
    """Return True if *path* has any component listed in NODEDUP_DIRS."""
    return not NODEDUP_DIRS.isdisjoint(Path(path).parts)
|
|
|
|
|
|
def _is_mame_dir(path: str) -> bool:
|
|
"""Check if a path is in a MAME-specific directory."""
|
|
parts = Path(path).parts
|
|
return "MAME" in parts or "Arcade" in parts
|
|
|
|
|
|
def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
    """Walk *bios_dir* and group every file path by its SHA1 digest.

    Paths inside NODEDUP_DIRS are skipped entirely. Returns a mapping of
    sha1 hex digest -> list of file paths with that content.
    """
    groups: dict[str, list[str]] = defaultdict(list)

    for dirpath, _dirnames, filenames in os.walk(bios_dir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if _in_nodedup_dir(full_path):
                continue
            digest = compute_hashes(full_path)["sha1"]
            groups[digest].append(full_path)

    return groups
|
|
|
|
|
|
def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
    """Remove true duplicates, map MAME device clones.

    True duplicates (same name, different dirs): removes copies.
    MAME clones (different names, same content, same dir): creates mapping
    written to _mame_clones.json so packing can emit every clone name from
    one canonical ZIP.

    Args:
        bios_dir: Root directory to scan and mutate.
        dry_run: When True, only print what would happen; no files are
            removed, no empty dirs are cleaned, and no clone map is written.

    Returns dict of {sha1: {"canonical": path, "removed": [paths], "aliases": [names]}}
    """
    sha1_groups = scan_duplicates(bios_dir)
    results = {}
    total_removed = 0
    total_saved = 0  # bytes reclaimed (dup content size == canonical size)
    mame_clones: dict[str, dict] = {}  # canonical_name -> {sha1, clones: [names]}

    # Sorted iteration keeps output and tie-breaking deterministic run-to-run.
    for sha1, paths in sorted(sha1_groups.items()):
        if len(paths) <= 1:
            continue

        # Separate by filename - same name = true duplicate, different name = clone
        by_name: dict[str, list[str]] = defaultdict(list)
        for p in paths:
            by_name[os.path.basename(p)].append(p)

        # True duplicates: same filename in multiple directories.
        # Keep the best-priority path, mark the rest for removal.
        true_dupes_to_remove = []
        for name, name_paths in by_name.items():
            if len(name_paths) > 1:
                name_paths.sort(key=path_priority)
                true_dupes_to_remove.extend(name_paths[1:])

        # Different filenames, same content - need special handling
        unique_names = sorted(by_name.keys())
        if len(unique_names) > 1:
            # Only treat as MAME device clones when every copy of every name
            # sits in a MAME/Arcade dir AND every name is a .zip.
            all_mame_zip = (
                all(
                    any(_is_mame_dir(p) for p in name_paths)
                    for name_paths in by_name.values()
                )
                and all(n.endswith(".zip") for n in unique_names)
            )
            if all_mame_zip:
                # MAME device clones: different ZIP names, same ROM content.
                # Keep one canonical (shortest name), remove clones, record
                # the mapping so generate_pack.py can re-emit every name.
                canonical_name = min(unique_names, key=len)
                clone_names = sorted(n for n in unique_names if n != canonical_name)
                if clone_names:
                    mame_clones[canonical_name] = {
                        "sha1": sha1,
                        "clones": clone_names,
                        "total_copies": sum(len(by_name[n]) for n in clone_names),
                    }
                    for clone_name in clone_names:
                        for p in by_name[clone_name]:
                            true_dupes_to_remove.append(p)
            else:
                # Non-MAME different names (e.g., 64DD_IPL_US.n64 vs IPL_USA.n64):
                # keep ALL - each name may be needed by a different emulator.
                # Only same-name copies (handled above) are removed.
                pass

        if not true_dupes_to_remove:
            continue

        # Find the best canonical across all paths
        all_paths = [p for p in paths if p not in true_dupes_to_remove]
        if not all_paths:
            # All copies were marked for removal - keep the best one and
            # rebuild the removal list around it.
            all_paths_sorted = sorted(paths, key=path_priority)
            all_paths = [all_paths_sorted[0]]
            true_dupes_to_remove = [p for p in paths if p != all_paths[0]]

        canonical = sorted(all_paths, key=path_priority)[0]
        canonical_name = os.path.basename(canonical)

        # "aliases" = every other basename sharing this content, MAME or not.
        all_names = set(os.path.basename(p) for p in paths)
        alias_names = sorted(all_names - {canonical_name})

        # Same sha1 => same content => same size as the canonical file.
        size = os.path.getsize(canonical)

        results[sha1] = {
            "canonical": canonical,
            "removed": [],
            "aliases": alias_names,
        }

        for dup in true_dupes_to_remove:
            if dup == canonical:
                continue
            if not os.path.exists(dup):
                continue
            if dry_run:
                print(f" WOULD REMOVE: {dup}")
            else:
                try:
                    os.remove(dup)
                except OSError as e:
                    # Best-effort: warn and skip counting this one.
                    print(f" WARNING: cannot remove {dup}: {e}")
                    continue
                # Clean up empty .variants/ directories
                parent = os.path.dirname(dup)
                if os.path.basename(parent) == ".variants" and not os.listdir(parent):
                    os.rmdir(parent)
            results[sha1]["removed"].append(dup)
            total_removed += 1
            total_saved += size

        if alias_names or true_dupes_to_remove:
            action = "Would remove" if dry_run else "Removed"
            dn = os.path.basename(canonical)
            print(f" {dn} (keep: {canonical})")
            if true_dupes_to_remove:
                print(f" {action} {len(true_dupes_to_remove)} copies")
            if alias_names:
                # NOTE(review): alias_names can include non-MAME alternate
                # names kept on disk, not only MAME clones - label is loose.
                print(f" MAME clones: {alias_names}")

    # Clean up empty directories (bottom-up walk so children go first).
    if not dry_run:
        empty_cleaned = 0
        for root, dirs, files in os.walk(bios_dir, topdown=False):
            if not files and not dirs and root != bios_dir:
                os.rmdir(root)
                empty_cleaned += 1

    prefix = "Would remove" if dry_run else "Removed"
    print(f"\n{prefix}: {total_removed} files")
    print(f"Space {'to save' if dry_run else 'saved'}: {total_saved / 1024 / 1024:.1f} MB")
    # Safe when dry_run: short-circuit keeps empty_cleaned from being read.
    if not dry_run and empty_cleaned:
        print(f"Cleaned {empty_cleaned} empty directories")

    # Write MAME clone mapping (skipped entirely on dry runs).
    if mame_clones:
        clone_path = "_mame_clones.json"
        if dry_run:
            print(f"\nWould write MAME clone map: {clone_path}")
            print(f" {len(mame_clones)} canonical ZIPs with "
                  f"{sum(len(v['clones']) for v in mame_clones.values())} clones")
        else:
            with open(clone_path, "w") as f:
                json.dump(mame_clones, f, indent=2, sort_keys=True)
            print(f"\nWrote MAME clone map: {clone_path}")
            print(f" {len(mame_clones)} canonical ZIPs with "
                  f"{sum(len(v['clones']) for v in mame_clones.values())} clones")

    return results
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: parse arguments and run one dedup pass."""
    cli = argparse.ArgumentParser(description="Deduplicate bios/ directory")
    cli.add_argument("--dry-run", action="store_true", help="Preview without deleting")
    cli.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
    ns = cli.parse_args()

    print(f"Scanning {ns.bios_dir}/ for duplicates...")
    if ns.dry_run:
        print("(DRY RUN)\n")

    deduplicate(ns.bios_dir, dry_run=ns.dry_run)

    # Indexes go stale after real deletions; remind the user to rebuild.
    if not ns.dry_run:
        print("\nRun 'python scripts/generate_db.py --force' to rebuild database.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|