mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-13 12:22:33 -05:00
chore: deduplicate bios/ — remove 427 files, save 227 MB
True duplicates (the same file in multiple directories): removed the copies and kept the canonical one. MAME device clones (different names, identical content): removed the copies and created a _mame_clones.json mapping for pack-time assembly via a deterministic ZIP rebuild. generate_pack.py resolves clones transparently. 95 canonical ZIPs serve 392 clone names.
This commit is contained in:
172
scripts/dedup.py
172
scripts/dedup.py
@@ -1,21 +1,26 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Deduplicate bios/ directory - keep one canonical file per unique SHA1.
|
||||
"""Deduplicate bios/ directory — keep one canonical file per unique content.
|
||||
|
||||
Usage:
|
||||
python scripts/dedup.py [--dry-run] [--bios-dir bios]
|
||||
|
||||
For each group of files with the same SHA1:
|
||||
- Keeps the file with the shortest, most canonical path
|
||||
- Removes duplicates
|
||||
- Records all alternate names in database.json aliases
|
||||
Two types of deduplication:
|
||||
|
||||
After dedup, generate_pack.py resolves files by hash and writes them
|
||||
with the correct destination name - no duplicates needed on disk.
|
||||
1. TRUE DUPLICATES: Same filename in different directories (e.g., naomi.zip
|
||||
in both Arcade/ and Sega/Dreamcast/). Keeps one canonical copy, removes
|
||||
the others. resolve_local_file finds files by hash, not path.
|
||||
|
||||
2. MAME DEVICE CLONES: Different filenames with identical content in the same
|
||||
MAME directory (e.g., bbc_m87.zip and bbc_24bbc.zip are identical ZIPs).
|
||||
These are NOT aliases — MAME loads each by its unique name. Instead of
|
||||
deleting, we create a _mame_clones.json mapping so generate_pack.py can
|
||||
pack all names from a single canonical file.
|
||||
|
||||
After dedup, run generate_db.py --force to rebuild database indexes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -27,9 +32,6 @@ from common import compute_hashes
|
||||
DEFAULT_BIOS_DIR = "bios"
|
||||
|
||||
# Directories where deduplication must NOT be applied.
|
||||
# RPG Maker RTP files are referenced by exact name in game scripts -
|
||||
# removing a "duplicate" breaks games that reference that specific filename.
|
||||
# ScummVM themes/extra also have name-dependent loading.
|
||||
NODEDUP_DIRS = {
|
||||
"RPG Maker",
|
||||
"ScummVM",
|
||||
@@ -39,29 +41,29 @@ NODEDUP_DIRS = {
|
||||
def path_priority(path: str) -> tuple:
|
||||
"""Lower score = better candidate to keep as canonical.
|
||||
|
||||
Prefers:
|
||||
- Shorter paths
|
||||
- Non-.variants paths
|
||||
- Non-nested paths (fewer /)
|
||||
- Lowercase names (more standard)
|
||||
Prefers: shorter path, non-variant, non-MAME (system-specific over generic).
|
||||
"""
|
||||
parts = path.split("/")
|
||||
is_variant = ".variants" in path
|
||||
depth = len(parts)
|
||||
name = os.path.basename(path)
|
||||
# Prefer non-variant, shallow, short name
|
||||
return (is_variant, depth, len(name), path)
|
||||
parts = Path(path).parts
|
||||
is_variant = ".variants" in parts
|
||||
is_mame = "MAME" in parts
|
||||
is_arcade = "Arcade" in parts
|
||||
return (is_variant, is_mame, is_arcade, len(parts), path)
|
||||
|
||||
|
||||
def _in_nodedup_dir(path: str) -> bool:
    """Return True when any component of *path* equals a NODEDUP_DIRS entry.

    The comparison is per path component (exact match), not a substring
    search over the whole path string.
    """
    components = set(Path(path).parts)
    # Overlap between the path's components and the protected names means
    # the file lives somewhere below a directory that must not be deduped.
    return not components.isdisjoint(NODEDUP_DIRS)
|
||||
|
||||
|
||||
def _is_mame_dir(path: str) -> bool:
    """Report whether *path* has a component named exactly "MAME" or "Arcade".

    Matching is done on whole path components, so a directory such as
    "MAMEish" does not count.
    """
    components = Path(path).parts
    return any(marker in components for marker in ("MAME", "Arcade"))
|
||||
|
||||
|
||||
def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
|
||||
"""Find all files grouped by SHA1, excluding no-dedup directories."""
|
||||
sha1_to_paths = defaultdict(list)
|
||||
sha1_to_paths: dict[str, list[str]] = defaultdict(list)
|
||||
|
||||
for root, dirs, files in os.walk(bios_dir):
|
||||
for name in files:
|
||||
@@ -75,7 +77,10 @@ def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
|
||||
|
||||
|
||||
def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
|
||||
"""Remove duplicate files, keeping one canonical copy per SHA1.
|
||||
"""Remove true duplicates, map MAME device clones.
|
||||
|
||||
True duplicates (same name, different dirs): removes copies.
|
||||
MAME clones (different names, same content, same dir): creates mapping.
|
||||
|
||||
Returns dict of {sha1: {"canonical": path, "removed": [paths], "aliases": [names]}}
|
||||
"""
|
||||
@@ -83,20 +88,63 @@ def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
|
||||
results = {}
|
||||
total_removed = 0
|
||||
total_saved = 0
|
||||
mame_clones: dict[str, dict] = {} # canonical_name -> {sha1, clones: [names]}
|
||||
|
||||
for sha1, paths in sorted(sha1_groups.items()):
|
||||
if len(paths) <= 1:
|
||||
continue
|
||||
|
||||
paths.sort(key=path_priority)
|
||||
canonical = paths[0]
|
||||
duplicates = paths[1:]
|
||||
|
||||
all_names = set()
|
||||
# Separate by filename — same name = true duplicate, different name = clone
|
||||
by_name: dict[str, list[str]] = defaultdict(list)
|
||||
for p in paths:
|
||||
all_names.add(os.path.basename(p))
|
||||
by_name[os.path.basename(p)].append(p)
|
||||
|
||||
# True duplicates: same filename in multiple directories
|
||||
true_dupes_to_remove = []
|
||||
for name, name_paths in by_name.items():
|
||||
if len(name_paths) > 1:
|
||||
name_paths.sort(key=path_priority)
|
||||
true_dupes_to_remove.extend(name_paths[1:])
|
||||
|
||||
# MAME device clones: different filenames, same content, in MAME dirs
|
||||
unique_names = sorted(by_name.keys())
|
||||
if len(unique_names) > 1:
|
||||
# Check if these are all MAME ZIPs
|
||||
all_mame = all(
|
||||
any(_is_mame_dir(p) for p in name_paths)
|
||||
for name_paths in by_name.values()
|
||||
)
|
||||
if all_mame and all(n.endswith(".zip") for n in unique_names):
|
||||
# Pick canonical (shortest name) and record clones
|
||||
canonical_name = min(unique_names, key=len)
|
||||
clone_names = sorted(n for n in unique_names if n != canonical_name)
|
||||
if clone_names:
|
||||
mame_clones[canonical_name] = {
|
||||
"sha1": sha1,
|
||||
"clones": clone_names,
|
||||
"total_copies": sum(len(by_name[n]) for n in clone_names),
|
||||
}
|
||||
# Queue every on-disk copy of each clone name for removal; only the
|
||||
# canonical ZIP stays, and clone names are re-created at pack time.
|
||||
for clone_name in clone_names:
|
||||
for p in by_name[clone_name]:
|
||||
true_dupes_to_remove.append(p)
|
||||
|
||||
if not true_dupes_to_remove:
|
||||
continue
|
||||
|
||||
# Find the best canonical across all paths
|
||||
all_paths = [p for p in paths if p not in true_dupes_to_remove]
|
||||
if not all_paths:
|
||||
# All copies were marked for removal — keep the best one
|
||||
all_paths_sorted = sorted(paths, key=path_priority)
|
||||
all_paths = [all_paths_sorted[0]]
|
||||
true_dupes_to_remove = [p for p in paths if p != all_paths[0]]
|
||||
|
||||
canonical = sorted(all_paths, key=path_priority)[0]
|
||||
canonical_name = os.path.basename(canonical)
|
||||
|
||||
all_names = set(os.path.basename(p) for p in paths)
|
||||
alias_names = sorted(all_names - {canonical_name})
|
||||
|
||||
size = os.path.getsize(canonical)
|
||||
@@ -107,49 +155,67 @@ def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
|
||||
"aliases": alias_names,
|
||||
}
|
||||
|
||||
for dup in duplicates:
|
||||
for dup in true_dupes_to_remove:
|
||||
if dup == canonical:
|
||||
continue
|
||||
if not os.path.exists(dup):
|
||||
continue
|
||||
if dry_run:
|
||||
print(f" WOULD REMOVE: {dup}")
|
||||
else:
|
||||
os.remove(dup)
|
||||
# Clean up empty .variants/ directories
|
||||
parent = os.path.dirname(dup)
|
||||
if os.path.basename(parent) == ".variants" and not os.listdir(parent):
|
||||
os.rmdir(parent)
|
||||
results[sha1]["removed"].append(dup)
|
||||
total_removed += 1
|
||||
total_saved += size
|
||||
|
||||
if alias_names:
|
||||
if alias_names or true_dupes_to_remove:
|
||||
action = "Would remove" if dry_run else "Removed"
|
||||
print(f" {canonical_name} (keep: {canonical})")
|
||||
print(f" {action} {len(duplicates)} copies, aliases: {alias_names}")
|
||||
dn = os.path.basename(canonical)
|
||||
print(f" {dn} (keep: {canonical})")
|
||||
if true_dupes_to_remove:
|
||||
print(f" {action} {len(true_dupes_to_remove)} copies")
|
||||
if alias_names:
|
||||
print(f" MAME clones: {alias_names}")
|
||||
|
||||
if not dry_run:
|
||||
for root, dirs, files in os.walk(bios_dir, topdown=False):
|
||||
if not files and not dirs:
|
||||
try:
|
||||
os.rmdir(root)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
print(f"\n{'Would remove' if dry_run else 'Removed'}: {total_removed} files")
|
||||
prefix = "Would remove" if dry_run else "Removed"
|
||||
print(f"\n{prefix}: {total_removed} files")
|
||||
print(f"Space {'to save' if dry_run else 'saved'}: {total_saved / 1024 / 1024:.1f} MB")
|
||||
|
||||
# Write MAME clone mapping
|
||||
if mame_clones:
|
||||
clone_path = os.path.join(bios_dir, "_mame_clones.json")
|
||||
if dry_run:
|
||||
print(f"\nWould write MAME clone map: {clone_path}")
|
||||
print(f" {len(mame_clones)} canonical ZIPs with "
|
||||
f"{sum(len(v['clones']) for v in mame_clones.values())} clones")
|
||||
else:
|
||||
with open(clone_path, "w") as f:
|
||||
json.dump(mame_clones, f, indent=2, sort_keys=True)
|
||||
print(f"\nWrote MAME clone map: {clone_path}")
|
||||
print(f" {len(mame_clones)} canonical ZIPs with "
|
||||
f"{sum(len(v['clones']) for v in mame_clones.values())} clones")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Deduplicate BIOS files")
|
||||
parser.add_argument("--dry-run", action="store_true")
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Deduplicate bios/ directory")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Preview without deleting")
|
||||
parser.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not os.path.isdir(args.bios_dir):
|
||||
print(f"Error: {args.bios_dir} not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Scanning {args.bios_dir}/ for duplicates...")
|
||||
if args.dry_run:
|
||||
print("(DRY RUN)\n")
|
||||
|
||||
deduplicate(args.bios_dir, args.dry_run)
|
||||
deduplicate(args.bios_dir, dry_run=args.dry_run)
|
||||
|
||||
if not args.dry_run:
|
||||
print("\nRun 'python scripts/generate_db.py --force' to rebuild database.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -100,13 +100,39 @@ def _sanitize_path(raw: str) -> str:
|
||||
return "/".join(parts)
|
||||
|
||||
|
||||
def _load_mame_clones(bios_dir: str) -> dict[str, str]:
|
||||
"""Load MAME clone mapping: clone_name -> canonical_name."""
|
||||
clone_path = os.path.join(bios_dir, "_mame_clones.json")
|
||||
if not os.path.exists(clone_path):
|
||||
return {}
|
||||
with open(clone_path) as f:
|
||||
data = json.load(f)
|
||||
# Invert: clone_name -> canonical_name
|
||||
result = {}
|
||||
for canonical, info in data.items():
|
||||
for clone in info.get("clones", []):
|
||||
result[clone] = canonical
|
||||
return result
|
||||
|
||||
|
||||
_MAME_CLONE_MAP: dict[str, str] | None = None
|
||||
|
||||
|
||||
def _get_mame_clone_map(bios_dir: str) -> dict[str, str]:
|
||||
global _MAME_CLONE_MAP
|
||||
if _MAME_CLONE_MAP is None:
|
||||
_MAME_CLONE_MAP = _load_mame_clones(bios_dir)
|
||||
return _MAME_CLONE_MAP
|
||||
|
||||
|
||||
def resolve_file(file_entry: dict, db: dict, bios_dir: str,
|
||||
zip_contents: dict | None = None,
|
||||
dest_hint: str = "") -> tuple[str | None, str]:
|
||||
"""Resolve a BIOS file with storage tiers and release asset fallback.
|
||||
|
||||
Wraps common.resolve_local_file() with pack-specific logic for
|
||||
storage tiers (external/user_provided) and large file release assets.
|
||||
storage tiers (external/user_provided), large file release assets,
|
||||
and MAME clone mapping (deduped ZIPs).
|
||||
"""
|
||||
storage = file_entry.get("storage", "embedded")
|
||||
if storage == "user_provided":
|
||||
@@ -119,8 +145,17 @@ def resolve_file(file_entry: dict, db: dict, bios_dir: str,
|
||||
if path:
|
||||
return path, status
|
||||
|
||||
# Last resort: large files from GitHub release assets
|
||||
# MAME clone fallback: if the file was deduped, resolve via canonical
|
||||
name = file_entry.get("name", "")
|
||||
clone_map = _get_mame_clone_map(bios_dir)
|
||||
canonical = clone_map.get(name)
|
||||
if canonical:
|
||||
canonical_entry = {"name": canonical}
|
||||
cpath, cstatus = resolve_local_file(canonical_entry, db, zip_contents)
|
||||
if cpath:
|
||||
return cpath, "mame_clone"
|
||||
|
||||
# Last resort: large files from GitHub release assets
|
||||
sha1 = file_entry.get("sha1")
|
||||
md5_raw = file_entry.get("md5", "")
|
||||
md5_list = [m.strip().lower() for m in md5_raw.split(",") if m.strip()] if md5_raw else []
|
||||
|
||||
Reference in New Issue
Block a user