mirror of
https://github.com/Abdess/retroarch_system.git
synced 2026-04-13 12:22:33 -05:00
Critical: stream large file downloads (OOM fix), fix basename match in auto_fetch, include hashes in pack grouping fingerprint, handle not_in_zip status in verify, fix escaped quotes in batocera parser. Important: deduplicate shared group includes, catch coreinfo network errors, fix NODEDUP path component match, fix CI word splitting on spaces, replace bare except Exception in 3 files. Minor: argparse in list_platforms, specific exceptions in download.py.
157 lines
4.5 KiB
Python
157 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Deduplicate bios/ directory - keep one canonical file per unique SHA1.
|
|
|
|
Usage:
|
|
python scripts/dedup.py [--dry-run] [--bios-dir bios]
|
|
|
|
For each group of files with the same SHA1:
|
|
- Keeps the file with the shortest, most canonical path
|
|
- Removes duplicates
|
|
- Records all alternate names in database.json aliases
|
|
|
|
After dedup, generate_pack.py resolves files by hash and writes them
|
|
with the correct destination name - no duplicates needed on disk.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
from common import compute_hashes
|
|
|
|
# Default directory scanned when --bios-dir is not given on the CLI.
DEFAULT_BIOS_DIR = "bios"


# Directories where deduplication must NOT be applied.
# RPG Maker RTP files are referenced by exact name in game scripts -
# removing a "duplicate" breaks games that reference that specific filename.
# ScummVM themes/extra also have name-dependent loading.
# Matched as whole path components by _in_nodedup_dir(), not substrings.
NODEDUP_DIRS = {
    "RPG Maker",
    "ScummVM",
}
|
|
|
|
|
|
def path_priority(path: str) -> tuple:
    """Return a sort key for choosing the canonical copy of a file.

    Lower tuples sort first, so after ``paths.sort(key=path_priority)``
    the best candidate to keep is ``paths[0]``. Preference order:
    - Non-.variants paths
    - Shallower paths (fewer directory components)
    - Shorter basenames
    - Lowercase basenames (more standard for BIOS file names)
    - Lexicographic path as a final deterministic tie-break
    """
    # Path.parts is os.sep-aware; splitting on "/" under-counted depth
    # for paths produced by os.walk on Windows.
    parts = Path(path).parts
    is_variant = ".variants" in path
    depth = len(parts)
    name = os.path.basename(path)
    # The docstring always promised a lowercase preference; implement it
    # as a tie-break just before the raw path comparison.
    not_lowercase = name != name.lower()
    # Prefer non-variant, shallow, short, lowercase names.
    return (is_variant, depth, len(name), not_lowercase, path)
|
|
|
|
|
|
def _in_nodedup_dir(path: str) -> bool:
    """Return True when *path* has a component that is a no-dedup directory."""
    components = set(Path(path).parts)
    return not components.isdisjoint(NODEDUP_DIRS)
|
|
|
|
|
|
def scan_duplicates(bios_dir: str) -> dict[str, list[str]]:
    """Group every eligible file under *bios_dir* by its SHA1 hash.

    Files located inside no-dedup directories are skipped entirely.
    """
    groups: dict[str, list[str]] = defaultdict(list)

    for root, _dirs, filenames in os.walk(bios_dir):
        for filename in filenames:
            full_path = os.path.join(root, filename)
            if _in_nodedup_dir(full_path):
                continue
            digest = compute_hashes(full_path)["sha1"]
            groups[digest].append(full_path)

    return groups
|
|
|
|
|
|
def deduplicate(bios_dir: str, dry_run: bool = False) -> dict:
    """Remove duplicate files, keeping one canonical copy per SHA1.

    Args:
        bios_dir: Root directory to scan for duplicates.
        dry_run: When True, only report what would be removed.

    Returns:
        Dict of {sha1: {"canonical": path, "removed": [paths], "aliases": [names]}}
        covering every group that actually had duplicates.
    """
    sha1_groups = scan_duplicates(bios_dir)
    results = {}
    total_removed = 0
    total_saved = 0

    for sha1, paths in sorted(sha1_groups.items()):
        if len(paths) <= 1:
            continue

        paths.sort(key=path_priority)
        canonical = paths[0]
        duplicates = paths[1:]

        all_names = {os.path.basename(p) for p in paths}
        canonical_name = os.path.basename(canonical)
        alias_names = sorted(all_names - {canonical_name})

        # Every file in the group has the same SHA1, hence the same size.
        size = os.path.getsize(canonical)

        results[sha1] = {
            "canonical": canonical,
            "removed": [],
            "aliases": alias_names,
        }

        for dup in duplicates:
            if dry_run:
                print(f" WOULD REMOVE: {dup}")
            else:
                os.remove(dup)
            results[sha1]["removed"].append(dup)
            total_removed += 1
            total_saved += size

        # Always report removals. The original only logged when aliases
        # existed, silently deleting same-named copies in other dirs.
        action = "Would remove" if dry_run else "Removed"
        print(f" {canonical_name} (keep: {canonical})")
        print(f" {action} {len(duplicates)} copies, aliases: {alias_names}")

    if not dry_run:
        # Clean up directories left empty by the removals. Walk bottom-up
        # and check os.listdir() rather than the walk's own (stale) lists:
        # after a leaf dir is removed, its parent's `dirs` entry still
        # names it, so the original check never cascaded to parents.
        for root, _dirs, _files in os.walk(bios_dir, topdown=False):
            if os.path.abspath(root) == os.path.abspath(bios_dir):
                continue  # never delete the bios root itself
            try:
                if not os.listdir(root):
                    os.rmdir(root)
            except OSError:
                pass  # best-effort cleanup; permissions/races are non-fatal

    print(f"\n{'Would remove' if dry_run else 'Removed'}: {total_removed} files")
    print(f"Space {'to save' if dry_run else 'saved'}: {total_saved / 1024 / 1024:.1f} MB")

    return results
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, validate the target dir, run dedup."""
    parser = argparse.ArgumentParser(description="Deduplicate BIOS files")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--bios-dir", default=DEFAULT_BIOS_DIR)
    opts = parser.parse_args()

    # Bail out early with a clear message rather than crashing mid-scan.
    if not os.path.isdir(opts.bios_dir):
        print(f"Error: {opts.bios_dir} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning {opts.bios_dir}/ for duplicates...")
    if opts.dry_run:
        print("(DRY RUN)\n")

    deduplicate(opts.bios_dir, opts.dry_run)
|
|
|
|
|
|
# Script entry point; allows importing this module without side effects.
if __name__ == "__main__":
    main()
|