def setup_logging(log_level=logging.INFO, log_file='duplicate_cleaner.log'):
    """Configure root logging to a UTF-8 log file plus the console.

    ``logging.basicConfig`` silently does nothing when the root logger already
    has handlers. This function is called once at import time and again from
    ``main`` with the command-line settings, so without a reset the second
    call's level and log file were ignored. Existing root handlers are
    therefore removed and closed first, which makes every call take effect.

    Args:
        log_level: Threshold for the root logger (e.g. ``logging.DEBUG``).
        log_file: Path of the log file, written as UTF-8.

    Returns:
        The logger named after this module.
    """
    root_logger = logging.getLogger()
    # Iterate over a copy: removeHandler mutates root_logger.handlers.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()

    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)
def mark_file_deleted(self, file_path: str, reason: str = "duplicate"):
    """Flag a file row as deleted and record a matching "delete" operation.

    The UPDATE is committed and this method's connection is closed *before*
    ``add_operation`` is called. ``add_operation`` writes through its own
    connection and SQLite allows only one writer at a time, so calling it
    while the UPDATE transaction was still open made its INSERT wait for the
    busy timeout and then fail with "database is locked".

    Args:
        file_path: Value of ``files.file_path`` identifying the row.
        reason: Free-text reason stored with the operation record.

    Returns:
        None. Database errors are logged and no operation row is written.
    """
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()

        # Look the hash up first so it can be attached to the audit record.
        cursor.execute('SELECT file_hash FROM files WHERE file_path = ?', (file_path,))
        row = cursor.fetchone()
        file_hash = row[0] if row else None

        cursor.execute('''
            UPDATE files
            SET is_deleted = 1, last_scanned = CURRENT_TIMESTAMP
            WHERE file_path = ?
        ''', (file_path,))
        conn.commit()
    except Exception as e:
        logger.error(f"数据库错误 (标记删除): {e}")
        return
    finally:
        conn.close()

    # The write lock is released by now, so the second connection can insert.
    self.add_operation("delete", file_path, file_hash, reason)
class MovieMetadataExtractor:
    """Derive a movie title, a resolution label and a quality score from a file name.

    All methods are static and work purely on the file name (plus the file
    size for the score); no file is opened.
    """

    # Common resolution markers
    RESOLUTION_PATTERNS = [
        r'(\d{3,4}[pi])',  # 1080p, 720p, 480p, 2160p
        r'([24]k)',        # 2k, 4k
        r'(hd)',           # hd
        r'(fhd)',          # fhd
        r'(uhd)',          # uhd
    ]

    # Common video codecs
    CODEC_PATTERNS = [
        r'(x264)', r'(x265)', r'(h264)', r'(h265)', r'(hevc)',
        r'(avc)', r'(divx)', r'(xvid)'
    ]

    # Common release sources
    SOURCE_PATTERNS = [
        r'(bluray)', r'(blu-ray)', r'(webdl)', r'(web-dl)',
        r'(hdtv)', r'(dvdrip)', r'(bdrip)', r'(brrip)'
    ]

    # Common audio formats
    AUDIO_PATTERNS = [
        r'(dts)', r'(ac3)', r'(aac)', r'(flac)', r'(dd)'
    ]

    # Noise removed from a file name to obtain the title, applied in order.
    # Every tag must start right after a separator (not after a letter or
    # digit), so words such as "Buddy" or "Hidden" no longer lose their "dd".
    _NAME_NOISE_PATTERNS = (
        # Resolution goes first: otherwise the year rule strips "1080" out of
        # "1080p" and leaves a stray "p" in the title.
        re.compile(r'\s*(?<![A-Za-z0-9])\d{3,4}[pi](?![A-Za-z0-9])', re.IGNORECASE),
        re.compile(r'\s*(?<![A-Za-z0-9])[24]k(?![A-Za-z0-9])', re.IGNORECASE),
        # Stand-alone four-digit year, optionally wrapped in () or [].
        re.compile(r'\s*[\(\[]?(?<![A-Za-z0-9])\d{4}(?![A-Za-z0-9])[\)\]]?'),
        # Quality / codec / source / audio tags in ONE alternation, so "hd"
        # cannot cut "hdtv" or "uhd" in half before the longer tag is tried.
        # A following digit is allowed ("DD5.1", "AAC2.0"), a letter is not.
        re.compile(
            r'\s*(?<![A-Za-z0-9])'
            r'(?:fhd|uhd|hd'
            r'|x264|x265|h264|h265|hevc|avc|divx|xvid'
            r'|blu-?ray|web-?dl|hdtv|dvdrip|bdrip|brrip'
            r'|dts|ac3|aac|flac|dd)'
            r'(?![A-Za-z])',
            re.IGNORECASE,
        ),
        # Release group: everything after the last "-".
        re.compile(r'\s*-\s*[^-]+$'),
        # Bracketed / parenthesised remarks, including brackets emptied above.
        re.compile(r'\[[^\]]*\]'),
        re.compile(r'\([^\)]*\)'),
    )

    @staticmethod
    def extract_movie_name(filename):
        """Return the movie title contained in ``filename``.

        The extension, resolution, year, codec/source/audio tags, the release
        group after the last hyphen and bracketed remarks are removed; runs of
        dots, underscores, hyphens and whitespace then collapse to one space.

        Args:
            filename: Base name of the file, including its extension.

        Returns:
            The cleaned title; may be an empty string.
        """
        name = os.path.splitext(filename)[0]

        for pattern in MovieMetadataExtractor._NAME_NOISE_PATTERNS:
            name = pattern.sub('', name)

        # Collapse leftover separators into single spaces.
        name = re.sub(r'[\._\-\s]+', ' ', name)
        return name.strip()

    @staticmethod
    def extract_resolution(filename):
        """Return a resolution label for ``filename``.

        Markers are matched as case-insensitive substrings in a fixed order;
        the first hit wins.

        Returns:
            One of '4K', '1080p', '720p', '480p', 'HD' or 'Unknown'.
        """
        filename_lower = filename.lower()

        resolution_map = {
            '2160p': '4K', '4k': '4K',
            '1080p': '1080p',
            '720p': '720p',
            '480p': '480p',
            'hd': 'HD'
        }

        for marker, resolution in resolution_map.items():
            if marker in filename_lower:
                return resolution

        return 'Unknown'

    @staticmethod
    def extract_quality_score(filename, file_size):
        """Compute a heuristic quality score; a higher score means a better copy.

        Args:
            filename: Base name of the file.
            file_size: File size in bytes.

        Returns:
            Integer sum of the size, resolution, codec and source bonuses.
        """
        gib = 1024 * 1024 * 1024
        score = 0

        # Size bonus
        if file_size > 8 * gib:
            score += 30
        elif file_size > 4 * gib:
            score += 20
        elif file_size > 2 * gib:
            score += 10

        # Resolution bonus ('480p' has no entry and gets the default of 5)
        resolution = MovieMetadataExtractor.extract_resolution(filename)
        resolution_scores = {
            '4K': 25,
            '1080p': 20,
            '720p': 15,
            'HD': 10,
            'Unknown': 5
        }
        score += resolution_scores.get(resolution, 5)

        # Codec bonus; the two checks are independent and can both apply
        filename_lower = filename.lower()
        if 'x265' in filename_lower or 'hevc' in filename_lower:
            score += 10  # more efficient codec
        if 'x264' in filename_lower:
            score += 5

        # Source bonus
        if 'bluray' in filename_lower or 'blu-ray' in filename_lower:
            score += 15
        elif 'webdl' in filename_lower or 'web-dl' in filename_lower:
            score += 10
        elif 'hdtv' in filename_lower:
            score += 5

        return score
def get_file_sample_hash(self, file_path, sample_points=3, sample_size=4096):
    """Return an MD5 digest over evenly spaced samples of a file.

    Samples are spread from the first byte to the last ``sample_size`` bytes,
    so the end of the file is always covered. The previous version took the
    first ``sample_points`` entries of a fixed start/25%/50%/75%/end list,
    which with the default of 3 never read anything past the middle: files
    differing only in their second half got the same hash.

    Args:
        file_path: File to hash.
        sample_points: Number of samples; values below 2 sample the head only.
        sample_size: Bytes read per sample.

    Returns:
        Hex digest string, the full-file hash for files no larger than
        ``sample_size * sample_points`` bytes, or None when reading fails.
    """
    try:
        file_size = os.path.getsize(file_path)
        if file_size <= sample_size * sample_points:
            # Small file: hashing everything is as cheap as sampling.
            return self.get_file_hash_complete(file_path)

        last_offset = file_size - sample_size
        if sample_points < 2:
            offsets = [0]
        else:
            offsets = [last_offset * i // (sample_points - 1)
                       for i in range(sample_points)]

        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for offset in offsets:
                f.seek(offset)
                hash_md5.update(f.read(sample_size))

        return hash_md5.hexdigest()
    except Exception as e:
        logger.error(f"文件采样时出错 {file_path}: {e}")
        return None
def get_size_bucket(self, file_size):
    """Map a byte count to a coarse size label.

    Thresholds are strict lower bounds: a file of exactly 8 GiB is "large",
    one byte more is "xl". Anything up to and including 1 GiB is "tiny".
    """
    gib = 1024 * 1024 * 1024
    buckets = (
        (8 * gib, "xl"),
        (4 * gib, "large"),
        (2 * gib, "medium"),
        (1 * gib, "small"),
    )
    for lower_bound, label in buckets:
        if file_size > lower_bound:
            return label
    return "tiny"
def scan_files_parallel(self):
    """Walk ``self.target_dir`` and process every media file in a thread pool.

    Phase one collects the paths of files whose extension is listed in
    ``self.video_extensions`` or ``self.audio_extensions``. Phase two submits
    each path to ``self.process_single_file`` and stores the results through
    ``self.db.bulk_add_files`` in batches of 1000.

    Fixes over the previous version:
      * every result is appended to the returned list as it arrives; before,
        the list was extended with the batch *after* the batch had been
        reset, so results from flushed batches were missing;
      * the ``.similar_movie_backup`` folder that ``remove_similar_duplicates``
        creates inside the target directory is skipped, so backed-up files
        are not rescanned as candidates.

    Returns:
        List of the file-info dicts returned by ``process_single_file``
        (falsy results are dropped); empty when no media file was found.
    """
    logger.info("开始并行扫描文件...")
    self.stats['start_time'] = time.time()

    file_type_stats = {
        'video': 0,
        'audio': 0,
        'other': 0,
        'skipped': 0
    }
    skip_markers = ('temp_extract', '@eaDir', '.Trash', '.similar_movie_backup')

    all_files = []
    media_files_to_process = []

    logger.info("第一阶段:收集文件路径...")
    for root, _dirs, files in os.walk(self.target_dir):
        if any(marker in root for marker in skip_markers):
            continue

        for file in files:
            file_path = os.path.join(root, file)
            file_ext = os.path.splitext(file)[1].lower()

            if file_ext in self.video_extensions:
                media_files_to_process.append(file_path)
                file_type_stats['video'] += 1
            elif file_ext in self.audio_extensions:
                media_files_to_process.append(file_path)
                file_type_stats['audio'] += 1
            else:
                file_type_stats['other'] += 1

    logger.info("文件类型统计:")
    logger.info(f" 视频文件: {file_type_stats['video']}")
    logger.info(f" 音频文件: {file_type_stats['audio']}")
    logger.info(f" 其他文件: {file_type_stats['other']}")
    logger.info(f" 总计媒体文件: {len(media_files_to_process)}")

    if not media_files_to_process:
        logger.warning("没有找到任何媒体文件!请检查文件扩展名配置和目录路径。")
        logger.info(f"支持的视频扩展名: {self.video_extensions}")
        logger.info(f"支持的音频扩展名: {self.audio_extensions}")
        return []

    logger.info("第二阶段:并行处理文件...")
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        future_to_file = {
            executor.submit(self.process_single_file, file_path): file_path
            for file_path in media_files_to_process
        }

        batch_files = []
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                result = future.result()
                if result:
                    # Keep the result for the caller first, then queue it
                    # for the next database flush.
                    all_files.append(result)
                    batch_files.append(result)

                    if len(batch_files) >= 1000:
                        self.db.bulk_add_files(batch_files)
                        batch_files = []
            except Exception as e:
                logger.error(f"处理文件 {file_path} 时出错: {e}")
                self.stats['files_skipped'] += 1

        # Flush the final partial batch.
        if batch_files:
            self.db.bulk_add_files(batch_files)

    total_time = time.time() - self.stats['start_time']
    logger.info(f"文件扫描完成。处理了 {self.stats['files_processed']} 个文件,跳过 {self.stats['files_skipped']} 个文件")
    logger.info(f"哈希计算总时间: {self.stats['hash_time']:.2f}秒")
    logger.info(f"总扫描时间: {total_time:.2f}秒")

    return all_files
def select_best_version(self, file_group, strategy='quality'):
    """Pick the file to keep from a group of similar files.

    ``file_group`` is sorted in place, best candidate first, according to
    ``strategy``: 'quality' (quality_score), 'size', 'resolution', and any
    other value ranks by modification time, newest first. Ties keep their
    original relative order.

    Returns:
        ``(best_file, files_to_delete)``; ``(None, [])`` for an empty group.
    """
    if not file_group:
        return None, []

    resolution_rank = {'4K': 4, '1080p': 3, '720p': 2, 'HD': 1, 'Unknown': 0}

    def by_quality(info):
        return info.get('quality_score', 0)

    def by_size(info):
        return info['size']

    def by_resolution(info):
        return resolution_rank.get(info.get('resolution', 'Unknown'), 0)

    def by_mod_time(info):
        return info['mod_time']

    ranking = {
        'quality': by_quality,
        'size': by_size,
        'resolution': by_resolution,
    }
    # Unknown strategies fall back to "newest", like the explicit 'newest'.
    file_group.sort(key=ranking.get(strategy, by_mod_time), reverse=True)

    return file_group[0], file_group[1:]
def remove_empty_folders_efficient(self):
    """Delete empty directories below ``self.target_dir`` in one bottom-up pass.

    With ``topdown=False`` the ``dirs`` list that ``os.walk`` yields for a
    directory is built before its children are visited, so it still names
    children this pass has already removed. The old ``not dirs`` test
    therefore never removed a parent whose only content was empty
    sub-directories. ``os.rmdir`` itself is now the emptiness check: it only
    succeeds on a directory that is empty at that moment.

    The target directory itself, and any path containing one of the skip
    markers (including the ``.similar_movie_backup`` folder created by
    ``remove_similar_duplicates``), is left alone.

    Returns:
        List of the directory paths that were removed.
    """
    logger.info("开始清理空文件夹...")

    skip_markers = ('@eaDir', '.Trash', '.duplicate_backup',
                    '.similar_movie_backup', 'temp_extract')
    empty_folders = []

    for root, _dirs, files in os.walk(self.target_dir, topdown=False):
        if any(marker in root for marker in skip_markers):
            continue
        if files or root == self.target_dir:
            continue

        try:
            os.rmdir(root)
        except OSError:
            # Not empty (or not removable): best effort, keep going.
            continue

        empty_folders.append(root)
        self.db.add_operation("delete_folder", root, reason="empty_folder")
        logger.debug(f"删除空文件夹: {root}")

    logger.info(f"删除了 {len(empty_folders)} 个空文件夹")
    return empty_folders
dry_run=True, strategy='quality', similarity_threshold=0.8): + """运行智能清理流程""" + logger.info("开始智能电影重复文件清理流程") + start_time = time.time() + + self.db.add_operation("scan_start", self.target_dir, reason="intelligent_cleanup") + + try: + # 1. 扫描文件并提取元数据 + all_files = self.scan_files_parallel() + + if not all_files: + logger.warning("没有找到任何视频文件") + return {} + + # 2. 查找相似的电影文件 + similar_groups = self.find_similar_movies(all_files, similarity_threshold) + + if not similar_groups: + logger.info("没有找到相似的电影文件") + return {} + + # 3. 删除相似的重复文件 + kept_files, deleted_files = self.remove_similar_duplicates( + similar_groups, dry_run, strategy + ) + + # 4. 清理空文件夹 + if not dry_run: + self.remove_empty_folders_efficient() + + # 记录扫描结束 + self.db.add_operation("scan_complete", self.target_dir, reason="intelligent_cleanup_finished") + + # 计算持续时间 + duration = time.time() - start_time + + # 记录扫描历史 + scan_data = { + 'target_directory': self.target_dir, + 'total_files': len(all_files), + 'similar_groups': len(similar_groups), + 'kept_files': len(kept_files), + 'deleted_files': len(deleted_files), + 'duration_seconds': duration + } + self.db.add_scan_history(scan_data) + + # 显示统计信息 + self.show_intelligent_statistics(scan_data) + + return scan_data + + except Exception as e: + logger.error(f"智能清理过程中发生错误: {e}") + self.db.add_operation("error", "SYSTEM", reason="intelligent_cleanup_failed", details=str(e)) + raise + + def show_intelligent_statistics(self, scan_data): + """显示智能清理统计信息""" + logger.info("\n" + "="*60) + logger.info("智能清理统计信息") + logger.info("="*60) + logger.info(f"扫描目录: {scan_data['target_directory']}") + logger.info(f"总视频文件: {scan_data['total_files']} 个") + logger.info(f"相似电影组: {scan_data['similar_groups']} 组") + logger.info(f"保留文件: {scan_data['kept_files']} 个") + logger.info(f"删除文件: {scan_data['deleted_files']} 个") + logger.info(f"释放空间: 约 {scan_data['deleted_files'] * 2:.2f} GB (估算)") + logger.info(f"耗时: {scan_data['duration_seconds']:.2f} 秒") + +def main(): + parser = 
argparse.ArgumentParser(description='智能电影重复文件清理工具') + parser.add_argument('directory', help='要扫描的目录路径') + parser.add_argument('--dry-run', action='store_true', help='干运行模式,只显示不会实际删除') + parser.add_argument('--strategy', choices=['quality', 'size', 'resolution', 'newest'], + default='quality', help='选择最佳版本策略(默认: quality)') + parser.add_argument('--similarity-threshold', type=float, default=0.8, + help='相似度阈值(0.0-1.0,默认: 0.8)') + parser.add_argument('--db-path', default='file_cleaner.db', help='数据库文件路径') + parser.add_argument('--workers', type=int, default=4, help='并行工作线程数 (默认: 4)') + parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], + default='INFO', help='日志级别 (默认: INFO)') + parser.add_argument('--log-file', default='duplicate_cleaner.log', help='日志文件路径') + + args = parser.parse_args() + + log_level = getattr(logging, args.log_level) + global logger + logger = setup_logging(log_level, args.log_file) + + if not os.path.exists(args.directory): + logger.error(f"错误: 目录 {args.directory} 不存在") + return + + logger.info(f"启动智能电影重复文件清理器") + logger.info(f"目标目录: {args.directory}") + logger.info(f"选择策略: {args.strategy}") + logger.info(f"相似阈值: {args.similarity_threshold}") + + cleaner = IntelligentDuplicateCleaner(args.directory, args.db_path, args.workers) + + try: + result = cleaner.run_intelligent_cleanup( + dry_run=args.dry_run, + strategy=args.strategy, + similarity_threshold=args.similarity_threshold + ) + + if not args.dry_run and result: + logger.info(f"\n=== 清理总结 ===") + logger.info(f"相似电影组: {result.get('similar_groups', 0)} 组") + logger.info(f"保留文件: {result.get('kept_files', 0)} 个") + logger.info(f"删除文件: {result.get('deleted_files', 0)} 个") + logger.info(f"耗时: {result.get('duration_seconds', 0):.2f} 秒") + + except KeyboardInterrupt: + logger.info("\n用户中断操作") + cleaner.db.add_operation("error", "SYSTEM", reason="user_interrupt") + except Exception as e: + logger.error(f"发生错误: {e}") + cleaner.db.add_operation("error", "SYSTEM", 
def setup_logging(log_level=logging.INFO, log_file='duplicate_cleaner.log'):
    """Configure root logging to a UTF-8 log file plus the console.

    ``logging.basicConfig`` silently does nothing when the root logger already
    has handlers. This function is called once at import time with defaults,
    so a later call with other settings used to be ignored. Existing root
    handlers are therefore removed and closed first, which makes every call
    take effect.

    Args:
        log_level: Threshold for the root logger (e.g. ``logging.DEBUG``).
        log_file: Path of the log file, written as UTF-8.

    Returns:
        The logger named after this module.
    """
    root_logger = logging.getLogger()
    # Iterate over a copy: removeHandler mutates root_logger.handlers.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()

    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)
def bulk_add_files(self, file_infos: List[Dict[str, Any]]):
    """Insert or replace one ``files`` row per entry of ``file_infos``.

    Rows are written with ``executemany`` in chunks of ``self.batch_size``.
    The previous version built a single multi-row INSERT per chunk, binding
    8 parameters per row (8000 for a full chunk); SQLite builds with the
    older default limit of 999 bound variables reject such a statement.
    Everything is committed once at the end, so a failure rolls back the
    whole call.

    Args:
        file_infos: Dicts with the required keys 'path', 'hash' and
            'mod_time' and the optional keys 'size', 'type', 'is_archive'
            and 'archive_path'.

    Returns:
        None. Errors are logged and the transaction is rolled back.
    """
    if not file_infos:
        return

    insert_sql = '''
        INSERT OR REPLACE INTO files
        (file_path, file_hash, file_size, file_type, mod_time, is_archive, archive_path, is_deleted, last_scanned)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
    '''

    conn = sqlite3.connect(self.db_path)
    cursor = conn.cursor()

    try:
        for start in range(0, len(file_infos), self.batch_size):
            rows = [
                (
                    file_info['path'],
                    file_info['hash'],
                    file_info.get('size', 0),
                    file_info.get('type', 'unknown'),
                    file_info['mod_time'],
                    file_info.get('is_archive', False),
                    file_info.get('archive_path'),
                    0,  # is_deleted: a freshly scanned file exists
                )
                for file_info in file_infos[start:start + self.batch_size]
            ]
            cursor.executemany(insert_sql, rows)

        conn.commit()
        logger.debug(f"批量添加了 {len(file_infos)} 个文件记录")
    except Exception as e:
        logger.error(f"批量添加文件记录时出错: {e}")
        conn.rollback()
    finally:
        conn.close()
def get_scan_statistics(self) -> Dict[str, Any]:
    """Return row counts describing the current database content.

    Returns:
        Dict with 'total_files', 'deleted_files', 'unique_files' (distinct
        hashes among non-deleted rows) and 'total_operations'; an empty dict
        when a query fails (the error is logged).
    """
    counters = (
        ('total_files', 'SELECT COUNT(*) FROM files'),
        ('deleted_files', 'SELECT COUNT(*) FROM files WHERE is_deleted = 1'),
        ('unique_files', 'SELECT COUNT(DISTINCT file_hash) FROM files WHERE is_deleted = 0'),
        ('total_operations', 'SELECT COUNT(*) FROM operations'),
    )

    conn = sqlite3.connect(self.db_path)
    cursor = conn.cursor()

    try:
        summary = {}
        for label, query in counters:
            cursor.execute(query)
            summary[label] = cursor.fetchone()[0]
        return summary
    except Exception as e:
        logger.error(f"数据库错误 (获取统计): {e}")
        return {}
    finally:
        conn.close()
+ r'\s*uhd', + # 编码 + r'\s*x264', r'\s*x265', r'\s*h264', r'\s*h265', r'\s*hevc', + r'\s*avc', r'\s*divx', r'\s*xvid', + # 来源 + r'\s*bluray', r'\s*blu-ray', r'\s*webdl', r'\s*web-dl', + r'\s*hdtv', r'\s*dvdrip', r'\s*bdrip', r'\s*brrip', + # 音频 + r'\s*dts', r'\s*ac3', r'\s*aac', r'\s*flac', r'\s*dd', + # 发布组和其他信息 + r'\s*-\s*[^-]+$', # 最后一个 - 之后的内容 + r'\[[^\]]+\]', # 方括号内容 + r'\([^\)]+\)', # 圆括号内容 + ] + + for pattern in patterns_to_remove: + name = re.sub(pattern, '', name, flags=re.IGNORECASE) + + # 清理多余空格和分隔符 + name = re.sub(r'[\._\-\s]+', ' ', name) + name = name.strip() + + return name + + @staticmethod + def extract_resolution(filename): + """提取分辨率""" + filename_lower = filename.lower() + + resolution_map = { + '2160p': '4K', '4k': '4K', + '1080p': '1080p', + '720p': '720p', + '480p': '480p', + 'hd': 'HD' + } + + for pattern, resolution in resolution_map.items(): + if pattern in filename_lower: + return resolution + + return 'Unknown' + + @staticmethod + def extract_quality_score(filename, file_size): + """计算质量评分""" + score = 0 + + # 基于文件大小的评分 + if file_size > 8 * 1024 * 1024 * 1024: # >8GB + score += 30 + elif file_size > 4 * 1024 * 1024 * 1024: # >4GB + score += 20 + elif file_size > 2 * 1024 * 1024 * 1024: # >2GB + score += 10 + + # 基于分辨率的评分 + resolution = MovieMetadataExtractor.extract_resolution(filename) + resolution_scores = { + '4K': 25, + '1080p': 20, + '720p': 15, + 'HD': 10, + 'Unknown': 5 + } + score += resolution_scores.get(resolution, 5) + + # 基于编码的评分 + filename_lower = filename.lower() + if 'x265' in filename_lower or 'hevc' in filename_lower: + score += 10 # 更高效的编码 + if 'x264' in filename_lower: + score += 5 + + # 基于来源的评分 + if 'bluray' in filename_lower or 'blu-ray' in filename_lower: + score += 15 + elif 'webdl' in filename_lower or 'web-dl' in filename_lower: + score += 10 + elif 'hdtv' in filename_lower: + score += 5 + + return score + +class IntelligentDuplicateCleaner: + def __init__(self, target_dirs, db_path="file_cleaner.db", 
def get_file_source_folder(self, file_path):
    """Return the entry of ``self.target_dirs`` that contains *file_path*.

    Paths are compared after ``abspath``/``normcase`` normalisation and only
    on whole path components, so ``/data/movies2/a.mkv`` is not attributed
    to ``/data/movies``.  The entry is returned exactly as it was
    configured, so callers can compare it with their own folder lists.
    Returns ``None`` when no target directory contains the file.
    """
    def normalise(path):
        return os.path.normcase(os.path.abspath(path))

    wanted = normalise(file_path)
    for target_dir in self.target_dirs:
        root = normalise(target_dir)
        # rstrip keeps a filesystem root ("/") from turning into "//"
        if wanted == root or wanted.startswith(root.rstrip(os.sep) + os.sep):
            return target_dir
    return None
def get_size_bucket(self, file_size):
    """Map a size in bytes to a coarse bucket label.

    Thresholds are strict: a file must be larger than 8/4/2/1 GiB to land
    in "xl"/"large"/"medium"/"small"; everything else is "tiny".
    """
    one_gib = 1024 * 1024 * 1024
    thresholds = (
        (8, "xl"),
        (4, "large"),
        (2, "medium"),
        (1, "small"),
    )
    for gib_count, label in thresholds:
        if file_size > gib_count * one_gib:
            return label
    return "tiny"
def scan_files_parallel(self):
    """Scan every target directory and return the metadata of its media files.

    Phase one walks the directories and collects the paths whose extension
    is a known video or audio extension.  Phase two hands each path to
    ``self.process_single_file`` on a thread pool, tags every result with
    its source folder and stores the results in the database in batches of
    1000.

    Returns:
        A list with exactly one entry per successfully processed file, or
        an empty list when no media file was found.
    """
    logger.info(f"开始并行扫描 {len(self.target_dirs)} 个目录...")
    # Extra log line for the single-directory case
    if len(self.target_dirs) == 1:
        logger.info(f"单文件夹模式: {self.target_dirs[0]}")
    self.stats['start_time'] = time.time()

    file_type_stats = {
        'video': 0,
        'audio': 0,
        'other': 0,
        'skipped': 0
    }

    all_files = []
    media_files_to_process = []

    logger.info("第一阶段:收集所有目录的文件路径...")
    for target_dir in self.target_dirs:
        logger.info(f"扫描目录: {target_dir}")
        for root, dirs, files in os.walk(target_dir):
            if any(skip_dir in root for skip_dir in ['temp_extract', '@eaDir', '.Trash']):
                continue

            for file in files:
                file_path = os.path.join(root, file)
                file_ext = os.path.splitext(file)[1].lower()

                if file_ext in self.video_extensions:
                    media_files_to_process.append(file_path)
                    file_type_stats['video'] += 1
                elif file_ext in self.audio_extensions:
                    media_files_to_process.append(file_path)
                    file_type_stats['audio'] += 1
                else:
                    file_type_stats['other'] += 1

    logger.info("文件类型统计:")
    logger.info(f" 视频文件: {file_type_stats['video']}")
    logger.info(f" 音频文件: {file_type_stats['audio']}")
    logger.info(f" 其他文件: {file_type_stats['other']}")
    logger.info(f" 总计媒体文件: {len(media_files_to_process)}")

    if len(media_files_to_process) == 0:
        logger.warning("没有找到任何媒体文件!请检查文件扩展名配置和目录路径。")
        logger.info(f"支持的视频扩展名: {self.video_extensions}")
        logger.info(f"支持的音频扩展名: {self.audio_extensions}")
        return []

    logger.info("第二阶段:并行处理文件...")
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        future_to_file = {
            executor.submit(self.process_single_file, file_path): file_path
            for file_path in media_files_to_process
        }

        batch_files = []
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                result = future.result()
                if result:
                    # Remember which target directory the file came from
                    result['source_folder'] = self.get_file_source_folder(file_path)
                    batch_files.append(result)

                    if len(batch_files) >= 1000:
                        self.db.bulk_add_files(batch_files)
                        # Keep the flushed batch in the result BEFORE
                        # starting a new one; each file is recorded once.
                        all_files.extend(batch_files)
                        batch_files = []

            except Exception as e:
                logger.error(f"处理文件 {file_path} 时出错: {e}")
                self.stats['files_skipped'] += 1

        # Flush whatever is left of the last, partial batch
        if batch_files:
            self.db.bulk_add_files(batch_files)
            all_files.extend(batch_files)

    total_time = time.time() - self.stats['start_time']
    logger.info(f"文件扫描完成。处理了 {self.stats['files_processed']} 个文件,跳过 {self.stats['files_skipped']} 个文件")
    logger.info(f"哈希计算总时间: {self.stats['hash_time']:.2f}秒")
    logger.info(f"总扫描时间: {total_time:.2f}秒")

    return all_files
+ for file_info in files: + movie_name = file_info.get('movie_name', '') + if movie_name and len(movie_name) > 3: # 忽略太短的名称 + if movie_name not in movie_groups: + movie_groups[movie_name] = [] + movie_groups[movie_name].append(file_info) + + # 查找每个电影组内的重复/相似文件 + similar_groups = {} + + for movie_name, file_group in movie_groups.items(): + if len(file_group) <= 1: + continue + + logger.info(f"分析电影: {movie_name} (共{len(file_group)}个版本)") + + # 按内容特征进一步分组 + signature_groups = {} + for file_info in file_group: + signature = file_info.get('content_signature', 'unknown') + if signature not in signature_groups: + signature_groups[signature] = [] + signature_groups[signature].append(file_info) + + # 对于每个签名组,选择最佳版本 + for signature, signature_group in signature_groups.items(): + if len(signature_group) > 1: + # 按质量评分排序 + signature_group.sort(key=lambda x: x.get('quality_score', 0), reverse=True) + + similar_groups[f"{movie_name}_{signature}"] = signature_group + + logger.info(f" 发现 {len(signature_group)} 个相似文件:") + for i, file_info in enumerate(signature_group): + logger.info(f" {i+1}. 
def select_best_version(self, file_group, strategy='quality'):
    """Pick the file to keep from a group of equivalent files.

    Args:
        file_group: list of file-info dicts.  Neither the list nor the
            dicts are modified.
        strategy: 'quality' (quality score, with files from
            ``self.prefer_folders`` boosted ahead of all others), 'size',
            'resolution'; any other value selects the newest ``mod_time``.

    Returns:
        ``(best_file, files_to_delete)``; ``(None, [])`` for an empty group.
        Ties keep their original relative order.
    """
    if not file_group:
        return None, []

    prefer_folders = self.prefer_folders or []
    # Earlier folders rank higher.  Steps of 100 exceed any quality score
    # difference; the base grows with the list so that every preferred
    # folder, however many there are, keeps a positive boost.
    boost_base = max(1000, 100 * len(prefer_folders))
    folder_rank = {}
    for position, folder in enumerate(prefer_folders):
        folder_rank.setdefault(folder, position)

    def folder_boost(file_info):
        if not folder_rank:
            return 0
        source_folder = self.get_file_source_folder(file_info['path'])
        if source_folder in folder_rank:
            return boost_base - folder_rank[source_folder] * 100
        return 0

    resolution_order = {'4K': 4, '1080p': 3, '720p': 2, 'HD': 1, 'Unknown': 0}
    sort_keys = {
        'quality': lambda x: x.get('quality_score', 0) + folder_boost(x),
        'size': lambda x: x['size'],
        'resolution': lambda x: resolution_order.get(x.get('resolution', 'Unknown'), 0),
    }
    # Unknown strategy names fall back to 'newest'
    sort_key = sort_keys.get(strategy, lambda x: x['mod_time'])

    # sorted() builds a new list, and the boost is computed on the fly, so
    # the caller's data is left exactly as it was.
    ranked = sorted(file_group, key=sort_key, reverse=True)

    best_file = ranked[0]
    files_to_delete = ranked[1:]

    best_source = self.get_file_source_folder(best_file['path'])
    logger.debug(f"选择最佳文件: {best_file['filename']} (来源: {best_source}, 质量分: {best_file.get('quality_score', 0)})")

    return best_file, files_to_delete
def remove_empty_folders_efficient(self, target_dir=None):
    """Delete empty folders below the scanned directories.

    Args:
        target_dir: directory to clean.  When omitted, every directory in
            ``self.target_dirs`` is cleaned.

    The tree is walked bottom-up and emptiness is checked on disk at visit
    time, so a folder that only contained empty folders is removed as well.
    The top-level directory itself is never removed.  Folders inside
    backup, trash and temporary locations are left alone.

    Returns:
        The list of removed folder paths.
    """
    logger.info("开始清理空文件夹...")

    skip_markers = ['@eaDir', '.Trash', '.duplicate_backup',
                    '.similar_movie_backup', 'temp_extract']
    roots = [target_dir] if target_dir is not None else list(self.target_dirs)
    empty_folders = []

    for base_dir in roots:
        for root, dirs, files in os.walk(base_dir, topdown=False):
            if root == base_dir:
                continue
            if any(skip_dir in root for skip_dir in skip_markers):
                continue

            try:
                # The dirs/files lists from os.walk were taken before the
                # children were processed; ask the filesystem again.
                if os.listdir(root):
                    continue
                os.rmdir(root)
            except OSError as e:
                # Best effort: a folder we cannot inspect or remove stays.
                logger.debug(f"无法删除文件夹 {root}: {e}")
                continue

            empty_folders.append(root)
            self.db.add_operation("delete_folder", root, reason="empty_folder")
            logger.debug(f"删除空文件夹: {root}")

    logger.info(f"删除了 {len(empty_folders)} 个空文件夹")
    return empty_folders
def main():
    """Command-line entry point: parse arguments and run one cleanup pass.

    Exits early (returning ``None``) when a directory argument is not an
    existing directory or ``--skip-start`` is outside 0.0-0.5.
    """
    # Must precede every use of ``logger`` in this function; declaring it
    # after the first use is a SyntaxError.
    global logger

    parser = argparse.ArgumentParser(description='智能电影重复文件清理工具 - 增强版')
    # nargs='*' also allows zero directories, in which case the current
    # working directory is used.
    parser.add_argument('directories', nargs='*', help='要扫描的目录路径(支持多个目录)')
    parser.add_argument('--dry-run', action='store_true', help='干运行模式,只显示不会实际删除')
    parser.add_argument('--strategy', choices=['quality', 'size', 'resolution', 'newest'],
                        default='quality', help='选择最佳版本策略(默认: quality)')
    parser.add_argument('--similarity-threshold', type=float, default=0.8,
                        help='相似度阈值(0.0-1.0,默认: 0.8)')
    parser.add_argument('--skip-start', type=float, default=0.1,
                        help='跳过文件开头的比例(0.0-0.5,默认: 0.1)')
    parser.add_argument('--db-path', default='file_cleaner.db', help='数据库文件路径')
    parser.add_argument('--workers', type=int, default=4, help='并行工作线程数 (默认: 4)')
    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        default='INFO', help='日志级别 (默认: INFO)')
    parser.add_argument('--log-file', default='duplicate_cleaner.log', help='日志文件路径')
    parser.add_argument('--prefer-folder', nargs='+', help='优先保留的文件夹(当文件质量相同时)')
    parser.add_argument('--content-analysis', action='store_true',
                        help='启用基于内容的分析(更准确但更慢)')
    parser.add_argument('--no-content-analysis', action='store_true',
                        help='禁用基于内容的分析(更快但准确性较低)')

    args = parser.parse_args()

    # No directory given: fall back to the current working directory
    if not args.directories:
        args.directories = [os.getcwd()]
        logger.info(f"未指定目录,使用当前目录: {args.directories[0]}")

    # A regular file would pass an existence check but cannot be scanned
    for directory in args.directories:
        if not os.path.isdir(directory):
            logger.error(f"错误: 目录 {directory} 不存在")
            return

    if args.skip_start < 0 or args.skip_start > 0.5:
        logger.error("错误: --skip-start 参数必须在 0.0 到 0.5 之间")
        return

    log_level = getattr(logging, args.log_level)
    logger = setup_logging(log_level, args.log_file)
    # logging.basicConfig() does nothing once the root logger has handlers,
    # which is the case after the module-level setup_logging() call, so the
    # requested level has to be applied explicitly.
    # NOTE(review): --log-file is still ignored for the same reason; fixing
    # that requires a change in setup_logging itself.
    logging.getLogger().setLevel(log_level)

    # Content analysis is on unless explicitly disabled
    use_content_analysis = not args.no_content_analysis

    logger.info("启动智能电影重复文件清理器")
    logger.info(f"目标目录: {args.directories}")
    logger.info(f"选择策略: {args.strategy}")
    logger.info(f"相似阈值: {args.similarity_threshold}")
    if args.prefer_folder:
        logger.info(f"优先文件夹: {args.prefer_folder}")

    cleaner = IntelligentDuplicateCleaner(
        args.directories,
        args.db_path,
        args.workers,
        args.prefer_folder
    )

    try:
        # Not every version of the cleaner implements the content-based
        # pipeline; use it only when it is really there.
        advanced_cleanup = getattr(cleaner, 'run_advanced_cleanup', None)

        if use_content_analysis and callable(advanced_cleanup):
            logger.info("使用基于内容的高级分析模式")
            result = advanced_cleanup(
                dry_run=args.dry_run,
                strategy=args.strategy,
                similarity_threshold=args.similarity_threshold,
                use_content_analysis=use_content_analysis
            )
        else:
            if use_content_analysis:
                logger.warning("当前版本不支持基于内容的高级分析,改用基于元数据的快速分析模式")
            logger.info("使用基于元数据的快速分析模式")
            # run_intelligent_cleanup takes no skip-start argument, so
            # --skip-start is validated above but not forwarded.
            result = cleaner.run_intelligent_cleanup(
                dry_run=args.dry_run,
                strategy=args.strategy,
                similarity_threshold=args.similarity_threshold
            )

        if not args.dry_run and result:
            logger.info("\n=== 清理总结 ===")
            logger.info(f"相似电影组: {result.get('similar_groups', 0)} 组")
            logger.info(f"保留文件: {result.get('kept_files', 0)} 个")
            logger.info(f"删除文件: {result.get('deleted_files', 0)} 个")
            logger.info(f"耗时: {result.get('duration_seconds', 0):.2f} 秒")

    except KeyboardInterrupt:
        logger.info("\n用户中断操作")
        cleaner.db.add_operation("error", "SYSTEM", reason="user_interrupt")
    except Exception as e:
        logger.error(f"发生错误: {e}")
        cleaner.db.add_operation("error", "SYSTEM", reason="exception", details=str(e))
mod_time DATETIME, + is_archive BOOLEAN DEFAULT 0, + archive_path TEXT, + is_deleted BOOLEAN DEFAULT 0, + created_time DATETIME DEFAULT CURRENT_TIMESTAMP, + last_scanned DATETIME DEFAULT CURRENT_TIMESTAMP + ) + ''') + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS operations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + operation_type TEXT, + file_path TEXT, + file_hash TEXT, + reason TEXT, + details TEXT, + operation_time DATETIME DEFAULT CURRENT_TIMESTAMP + ) + ''') + + cursor.execute(''' + CREATE TABLE IF NOT EXISTS scan_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + scan_time DATETIME DEFAULT CURRENT_TIMESTAMP, + target_directory TEXT, + total_files INTEGER, + duplicate_groups INTEGER, + deleted_files INTEGER, + deleted_archives INTEGER, + duration_seconds REAL + ) + ''') + + cursor.execute('CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_files_path ON files(file_path)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_files_deleted ON files(is_deleted)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_files_size ON files(file_size)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_operations_time ON operations(operation_time)') + cursor.execute('CREATE INDEX IF NOT EXISTS idx_operations_type ON operations(operation_type)') + + conn.commit() + conn.close() + logger.info("数据库初始化完成") + + def bulk_add_files(self, file_infos: List[Dict[str, Any]]): + """批量添加文件记录""" + if not file_infos: + return + + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + + try: + for i in range(0, len(file_infos), self.batch_size): + batch = file_infos[i:i + self.batch_size] + placeholders = [] + values = [] + + for file_info in batch: + placeholders.append('(?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)') + values.extend([ + file_info['path'], + file_info['hash'], + file_info.get('size', 0), + file_info.get('type', 'unknown'), + file_info['mod_time'], + file_info.get('is_archive', False), + 
def mark_file_deleted(self, file_path: str, reason: str = "duplicate"):
    """Flag *file_path* as deleted and log a "delete" operation for it.

    Both writes go through one connection and one transaction.  Logging the
    operation through a second connection while this one still holds an
    uncommitted UPDATE would make the second writer wait for the lock until
    its busy timeout expires and then fail.

    Database errors are logged and swallowed; nothing is written in that
    case.
    """
    conn = sqlite3.connect(self.db_path)

    try:
        cursor = conn.cursor()

        # The hash is stored with the operation; None for unknown paths
        cursor.execute('SELECT file_hash FROM files WHERE file_path = ?', (file_path,))
        row = cursor.fetchone()
        file_hash = row[0] if row else None

        cursor.execute('''
            UPDATE files
            SET is_deleted = 1, last_scanned = CURRENT_TIMESTAMP
            WHERE file_path = ?
        ''', (file_path,))

        cursor.execute('''
            INSERT INTO operations (operation_type, file_path, file_hash, reason, details)
            VALUES (?, ?, ?, ?, ?)
        ''', ("delete", file_path, file_hash, reason, ""))

        conn.commit()
    except sqlite3.Error as e:
        logger.error(f"数据库错误 (标记删除): {e}")
        conn.rollback()
    finally:
        conn.close()
def get_scan_statistics(self) -> Dict[str, Any]:
    """Return row counts describing the database content.

    Keys: ``total_files``, ``deleted_files``, ``unique_files`` (distinct
    hashes among non-deleted files) and ``total_operations``.  An empty
    dict is returned when a query fails; the error is logged.
    """
    counters = (
        ('total_files', 'SELECT COUNT(*) FROM files'),
        ('deleted_files', 'SELECT COUNT(*) FROM files WHERE is_deleted = 1'),
        ('unique_files', 'SELECT COUNT(DISTINCT file_hash) FROM files WHERE is_deleted = 0'),
        ('total_operations', 'SELECT COUNT(*) FROM operations'),
    )

    conn = sqlite3.connect(self.db_path)
    cursor = conn.cursor()

    try:
        statistics = {}
        for label, query in counters:
            cursor.execute(query)
            statistics[label] = cursor.fetchone()[0]
        return statistics
    except Exception as e:
        logger.error(f"数据库错误 (获取统计): {e}")
        return {}
    finally:
        conn.close()
@staticmethod
def extract_core_movie_name(filename):
    """Return the movie title with generic leading labels stripped.

    Starts from ``extract_movie_name_enhanced`` and then removes leading
    labels such as "电影" or "高清".  Stripping repeats until no label is
    left, so stacked labels ("高清电影 ...") are removed regardless of
    their order.  A label is kept when removing it would leave nothing,
    because an empty title is useless for grouping.
    """
    name = MovieMetadataExtractor.extract_movie_name_enhanced(filename)

    # Generic labels that are not part of a title
    prefixes_to_remove = [
        '电影', '高清', '最新', '完整版', '未删减版', '国语', '英语',
        '中字', '中文字幕', '双语字幕', '特效字幕'
    ]

    stripped_something = True
    while stripped_something:
        stripped_something = False
        lowered = name.lower()
        for prefix in prefixes_to_remove:
            if lowered.startswith(prefix.lower()):
                remainder = name[len(prefix):].strip()
                if not remainder:
                    # The label is all that is left: treat it as the title
                    return name
                name = remainder
                stripped_something = True
                break

    return name
@staticmethod
def extract_resolution(filename):
    """Return '4K', '1080p', '720p', '480p', 'HD' or 'Unknown' for *filename*.

    Rules are checked from the highest resolution down.  Alphabetic tags
    ("4k", "uhd", "fhd", "hd", "hdtv") must stand alone, i.e. not be glued
    to other ASCII letters or digits, so a word like "birthday" is not
    mistaken for "hd".  "UHD" is reported as '4K' and "FHD" as '1080p'.
    """
    filename_lower = filename.lower()

    resolution_rules = (
        (r'(?<!\d)2160p', '4K'),
        (r'(?<![a-z0-9])(?:4k|uhd)(?![a-z0-9])', '4K'),
        (r'(?<!\d)1080p', '1080p'),
        (r'(?<![a-z0-9])fhd(?![a-z0-9])', '1080p'),
        (r'(?<!\d)720p', '720p'),
        (r'(?<!\d)480p', '480p'),
        (r'(?<![a-z0-9])(?:hd|hdtv)(?![a-z0-9])', 'HD'),
    )

    for pattern, resolution in resolution_rules:
        if re.search(pattern, filename_lower):
            return resolution

    return 'Unknown'
@staticmethod
def extract_movie_title_advanced(filename):
    """Reduce a release-style file name to its bare title words.

    The extension is dropped, each noise pattern below is replaced by a
    space (case-insensitively, in the listed order), and finally a fixed set
    of filler words is filtered out.

    Args:
        filename: File name to clean.

    Returns:
        The remaining words joined by single spaces (possibly empty).
    """
    stem, _extension = os.path.splitext(filename)

    # Order matters: later patterns operate on the output of earlier ones.
    noise_patterns = (
        r'[\(\[]?\s*(19|20)\d{2}\s*[\)\]]?',                            # year
        r'\b(4k|2160p|1080p|720p|480p|hd|fhd|uhd)\b',                   # quality
        r'\b(x264|x265|h264|h265|hevc|avc|divx|xvid)\b',                # codec
        r'\b(bluray|blu-ray|webdl|web-dl|hdtv|dvdrip|bdrip|brrip)\b',   # source
        r'\b(dts|ac3|aac|flac|dd|dts-hd|truehd)\b',                     # audio
        r'\[[^\]]+\]',                                                  # [group]
        r'\s*-\s*[^-]+$',                # everything after the last hyphen
        r'[\(\{\[].*?[\)\}\]]',                                         # any brackets
        r'\b(cd\d|disc\d|part\d)\b',                                    # disc numbering
        r'[\._\-]',                                                     # separators
    )

    title = stem
    for noise in noise_patterns:
        title = re.compile(noise, re.IGNORECASE).sub(' ', title)

    filler_words = frozenset((
        'full', 'movie', 'film', 'video', 'hd', 'fhd', 'uhd',
        'english', 'chinese', 'sub', 'subtitle', 'dubbed',
        'extended', 'director', 'cut', 'theatrical', 'unrated',
    ))

    # str.split() already collapses runs of whitespace and trims both ends.
    kept_words = []
    for word in title.split():
        if word.lower() in filler_words:
            continue
        kept_words.append(word)

    return ' '.join(kept_words)
def extract_key_frames(self, video_path, num_frames=10, skip_start=0.1):
    """Sample frames from a video and return their average-hash strings.

    Frames are taken at a regular interval starting after the first
    ``skip_start`` fraction of the video.  Each sampled frame is converted to
    grayscale, resized to 64x64 and hashed with ``imagehash.average_hash``.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Maximum number of frames to sample.
        skip_start: Fraction (0..1) of the frame count to skip at the start.

    Returns:
        A list of hash strings, one per successfully decoded frame.  The list
        is empty when video processing is unavailable, the file cannot be
        opened, no frames are available, or any error occurs.
    """
    if not VIDEO_PROCESSING_AVAILABLE:
        logger.warning("视频处理功能不可用,跳过关键帧提取")
        return []

    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            logger.warning(f"无法打开视频文件: {video_path}")
            return []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        start_frame = int(total_frames * skip_start)
        available_frames = total_frames - start_frame

        frames_to_extract = min(num_frames, available_frames)
        if frames_to_extract <= 0:
            # Guards the integer division below against a zero divisor.
            logger.warning(f"无可用帧可提取: {video_path}")
            return []

        frame_interval = max(1, available_frames // frames_to_extract)

        frame_hashes = []
        for i in range(frames_to_extract):
            # Seek explicitly for every sample; the first seek lands on start_frame.
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame + i * frame_interval)
            ret, frame = cap.read()
            if not ret or frame is None:
                continue

            # Grayscale + small size keeps hashing cheap.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (64, 64))
            frame_hash = imagehash.average_hash(Image.fromarray(resized))
            frame_hashes.append(str(frame_hash))

        return frame_hashes

    except Exception as e:
        logger.error(f"提取关键帧时出错 {video_path}: {e}")
        return []
    finally:
        # Release the capture on every exit path, including errors.
        if cap is not None:
            cap.release()
def calculate_video_similarity(self, fingerprint1, fingerprint2):
    """Return the Jaccard similarity of the frame hashes in two fingerprints.

    Fingerprints are expected in the form produced by
    ``extract_video_fingerprint``:
    ``video_<hash>_<hash>..._audio_<size>_<duration>`` or
    ``video_<hash>..._audio_unknown``.  Only the tokens between the leading
    ``video`` marker and the ``audio`` marker are compared.

    Args:
        fingerprint1: First fingerprint string (falsy values yield 0).
        fingerprint2: Second fingerprint string (falsy values yield 0).

    Returns:
        ``1.0`` for identical fingerprints, otherwise a value in ``[0, 1]``;
        ``0`` when either fingerprint is missing, has no frame hashes, or
        cannot be parsed.
    """
    if not fingerprint1 or not fingerprint2:
        return 0

    if fingerprint1 == fingerprint2:
        return 1.0

    def frame_hash_set(fingerprint):
        """Extract the set of frame-hash tokens from one fingerprint."""
        tokens = fingerprint.split('_')
        if tokens and tokens[0] == 'video':
            tokens = tokens[1:]
        # The audio part has a variable number of tokens ("audio_unknown" vs
        # "audio_<size>_<duration>"), so cut at the marker rather than at a
        # fixed offset from the end.
        if 'audio' in tokens:
            tokens = tokens[:tokens.index('audio')]
        return set(tokens)

    try:
        frames1 = frame_hash_set(fingerprint1)
        frames2 = frame_hash_set(fingerprint2)

        if not frames1 or not frames2:
            return 0

        intersection = len(frames1 & frames2)
        union = len(frames1 | frames2)
        return intersection / union if union > 0 else 0

    except Exception as e:
        logger.error(f"计算视频相似度时出错: {e}")
        return 0
def merge_groups(self, content_groups, metadata_groups):
    """Combine content-based and metadata-based groups into one grouping.

    Groups are lists of file-info dicts identified by their ``'path'`` key.
    A metadata group that shares at least one path with existing groups is
    folded into them (all overlapping groups are fused into one, without
    duplicating files); a metadata group with no overlap is added as a new
    group.  The input lists are not modified.

    Args:
        content_groups: List of groups found by content analysis.
        metadata_groups: List of groups found by metadata matching.

    Returns:
        A new list of new group lists.
    """
    # Copy the inner lists too: merging must not alter the caller's groups.
    merged = [list(group) for group in content_groups]

    for metadata_group in metadata_groups:
        metadata_paths = {f['path'] for f in metadata_group}
        overlapping = [
            index for index, group in enumerate(merged)
            if any(f['path'] in metadata_paths for f in group)
        ]

        if not overlapping:
            merged.append(list(metadata_group))
            continue

        # Fuse every overlapping group plus the metadata group into the first
        # one, so no file ends up in two different groups.
        target = merged[overlapping[0]]
        seen_paths = {f['path'] for f in target}
        extra_groups = [merged[index] for index in overlapping[1:]]
        extra_groups.append(metadata_group)
        for extra_group in extra_groups:
            for file_info in extra_group:
                if file_info['path'] not in seen_paths:
                    seen_paths.add(file_info['path'])
                    target.append(file_info)

        # Delete from the back so earlier indices stay valid.
        for index in reversed(overlapping[1:]):
            del merged[index]

    return merged
def remove_similar_duplicates(self, similar_groups, dry_run=True, strategy='quality', no_backup=False):
    """Keep the best file of every similar-movie group and remove the rest.

    For each group with more than one file, ``self.select_best_version``
    picks the file to keep.  Every other file is, depending on the flags,
    only reported (``dry_run``), deleted outright (``no_backup``), or moved
    into a ``.similar_movie_backup`` folder next to the file.  Removed files
    are recorded through ``self.db.mark_file_deleted``; failures are logged
    and recorded through ``self.db.add_operation`` without aborting the run.

    Args:
        similar_groups: Mapping of group name -> list of file-info dicts
            (each dict provides at least ``'path'`` and ``'filename'``).
        dry_run: When true, nothing on disk is changed.
        strategy: Selection strategy forwarded to ``select_best_version``.
        no_backup: When true, files are deleted instead of backed up.

    Returns:
        Tuple ``(kept_files, deleted_files)``: the kept file-info dicts and
        the paths of the files that were removed.
    """
    # Imported here because this block needs them and the surrounding file
    # does not import them at module level.
    import errno
    import shutil

    logger.info("开始处理相似电影文件...")

    kept_files = []
    deleted_files = []
    delete_errors = []

    for group_name, file_group in similar_groups.items():
        if len(file_group) <= 1:
            continue

        best_file, files_to_delete = self.select_best_version(file_group, strategy)

        logger.info(f"\n电影组: {group_name}")
        logger.info(f" 保留: {best_file['filename']} "
                    f"(质量分: {best_file.get('quality_score', 0)})")

        kept_files.append(best_file)

        for file_info in files_to_delete:
            file_path = file_info['path']

            if dry_run:
                logger.info(f" [干运行] 将删除: {file_info['filename']} "
                            f"(质量分: {file_info.get('quality_score', 0)})")
                continue

            try:
                if not os.path.exists(file_path):
                    logger.warning(f" 文件不存在,跳过删除: {file_info['filename']}")
                    continue

                if no_backup:
                    os.remove(file_path)
                    logger.info(f" 🗑️ 已直接删除: {file_info['filename']}")
                else:
                    # Back up next to the source file so the move normally
                    # stays on the same device.
                    backup_dir = os.path.join(os.path.dirname(file_path), ".similar_movie_backup")
                    os.makedirs(backup_dir, exist_ok=True)

                    # Never overwrite an earlier backup with the same name.
                    base_name = os.path.basename(file_path)
                    stem, ext = os.path.splitext(base_name)
                    backup_path = os.path.join(backup_dir, base_name)
                    counter = 1
                    while os.path.exists(backup_path):
                        backup_path = os.path.join(backup_dir, f"{stem}_{counter}{ext}")
                        counter += 1

                    try:
                        os.rename(file_path, backup_path)
                        logger.info(f" 已移动相似电影到同设备备份: {file_info['filename']}")
                    except OSError as e:
                        if e.errno != errno.EXDEV:
                            raise
                        # rename() cannot cross devices: copy (with
                        # metadata), then delete the original.
                        logger.info(f" 跨设备移动文件,使用复制方式: {file_info['filename']}")
                        shutil.copy2(file_path, backup_path)
                        os.remove(file_path)
                        logger.info(f" 已复制并删除相似电影到跨设备备份: {file_info['filename']}")

                deleted_files.append(file_path)
                self.db.mark_file_deleted(file_path, "similar_movie")

            except Exception as e:
                # Best effort: record the failure and continue with the next file.
                error_msg = f"删除文件时出错 {file_path}: {e}"
                logger.error(error_msg)
                delete_errors.append(error_msg)
                self.db.add_operation("error", file_path, reason="delete_failed", details=str(e))

    if delete_errors:
        logger.error(f"删除过程中遇到 {len(delete_errors)} 个错误")

    logger.info(f"保留了 {len(kept_files)} 个最佳版本文件")
    logger.info(f"删除了 {len(deleted_files)} 个相似电影文件")

    return kept_files, deleted_files
def show_backup_locations(self):
    """Log every ``.similar_movie_backup`` folder found under the target dirs.

    Each directory in ``self.target_dirs`` is walked; for every backup folder
    found, the number of regular files directly inside it and their combined
    size in GB are logged.  Folders that cannot be read are reported with a
    warning.
    """
    logger.info("\n备份文件位置:")

    backup_folder_name = '.similar_movie_backup'
    backup_dirs_found = set()
    for target_dir in self.target_dirs:
        for root, dirs, _files in os.walk(target_dir):
            if backup_folder_name in dirs:
                backup_dirs_found.add(os.path.join(root, backup_folder_name))

    if not backup_dirs_found:
        logger.info(" 未找到备份目录")
        return

    bytes_per_gb = 1024 * 1024 * 1024
    for backup_dir in backup_dirs_found:
        try:
            # Only regular files directly inside the folder are counted.
            file_paths = []
            for entry in os.listdir(backup_dir):
                candidate = os.path.join(backup_dir, entry)
                if os.path.isfile(candidate):
                    file_paths.append(candidate)

            size_gb = sum(map(os.path.getsize, file_paths)) / bytes_per_gb
            logger.info(f" {backup_dir}: {len(file_paths)} 个文件, 总大小: {size_gb:.2f} GB")
        except OSError as e:
            logger.warning(f" 无法访问备份目录 {backup_dir}: {e}")
重新配置日志(根据命令行参数) + log_level = getattr(logging, args.log_level) + logger = setup_logging(log_level, args.log_file) + + # 现在可以使用 logger 了 + if len(args.directories) == 1 and args.directories[0] == os.getcwd(): + logger.info(f"未指定目录,使用当前目录: {args.directories[0]}") + + # 确定是否使用内容分析 + use_content_analysis = True + if args.no_content_analysis: + use_content_analysis = False + elif args.content_analysis: + use_content_analysis = True + + # 如果视频处理库不可用,强制禁用内容分析 + if use_content_analysis and not VIDEO_PROCESSING_AVAILABLE: + logger.warning("视频处理库不可用,自动禁用内容分析") + use_content_analysis = False + + logger.info(f"启动智能电影重复文件清理器") + logger.info(f"目标目录: {args.directories}") + logger.info(f"选择策略: {args.strategy}") + logger.info(f"相似阈值: {args.similarity_threshold}") + if args.prefer_folder: + logger.info(f"优先文件夹: {args.prefer_folder}") + if args.backup_dir: + logger.info(f"指定备份目录: {args.backup_dir}") + if args.no_backup: + logger.warning("警告: 已启用直接删除模式,不会创建备份!") + + cleaner = IntelligentDuplicateCleaner( + args.directories, + args.db_path, + args.workers, + args.prefer_folder + ) + + try: + if use_content_analysis: + logger.info("使用基于内容的高级分析模式") + result = cleaner.run_advanced_cleanup( + dry_run=args.dry_run, + strategy=args.strategy, + similarity_threshold=args.similarity_threshold, + use_content_analysis=use_content_analysis + ) + else: + # 使用新的直接删除模式 + result = cleaner.run_intelligent_cleanup( + dry_run=args.dry_run, + strategy=args.strategy, + similarity_threshold=args.similarity_threshold, + skip_start_percent=args.skip_start, + no_backup=args.no_backup # 传递这个参数 + ) + + if not args.dry_run and result: + logger.info(f"\n=== 清理总结 ===") + logger.info(f"相似电影组: {result.get('similar_groups', 0)} 组") + logger.info(f"保留文件: {result.get('kept_files', 0)} 个") + logger.info(f"删除文件: {result.get('deleted_files', 0)} 个") + logger.info(f"耗时: {result.get('duration_seconds', 0):.2f} 秒") + + # 显示备份信息 + if not args.no_backup: + cleaner.show_backup_locations() + + except KeyboardInterrupt: + 
def setup_logging(log_level=logging.INFO, log_file="duplicate_cleaner.log"):
    """Configure root logging to a UTF-8 log file plus the console.

    The function may be called more than once: handlers installed by an
    earlier call are removed and closed before the new ones are attached, so
    a later call really switches the level and the log file.
    (``logging.basicConfig`` does nothing when the root logger already has
    handlers, which made repeated configuration ineffective.)

    Args:
        log_level: Level applied to the root logger.
        log_file: Path of the log file to write to.

    Returns:
        The logger named after this module.
    """
    root_logger = logging.getLogger()

    # Drop only the handlers this function installed previously; handlers
    # added by other code are left untouched.
    for handler in list(root_logger.handlers):
        if getattr(handler, "_installed_by_setup_logging", False):
            root_logger.removeHandler(handler)
            handler.close()

    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    new_handlers = (
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler(),
    )
    for handler in new_handlers:
        handler.setFormatter(formatter)
        handler._installed_by_setup_logging = True
        root_logger.addHandler(handler)

    root_logger.setLevel(log_level)
    return logging.getLogger(__name__)
DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS operations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + operation_type TEXT, + file_path TEXT, + file_hash TEXT, + reason TEXT, + details TEXT, + operation_time DATETIME DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS scan_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + scan_time DATETIME DEFAULT CURRENT_TIMESTAMP, + target_directory TEXT, + total_files INTEGER, + duplicate_groups INTEGER, + deleted_files INTEGER, + deleted_archives INTEGER, + duration_seconds REAL + ) + """ + ) + + cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(file_path)") + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_files_deleted ON files(is_deleted)" + ) + cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_size ON files(file_size)") + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_operations_time ON operations(operation_time)" + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS idx_operations_type ON operations(operation_type)" + ) + + conn.commit() + conn.close() + logger.info("数据库初始化完成") + + def bulk_add_files(self, file_infos: List[Dict[str, Any]]): + """批量添加文件记录""" + if not file_infos: + return + + conn = sqlite3.connect(self.db_path) + cursor = conn.cursor() + + try: + for i in range(0, len(file_infos), self.batch_size): + batch = file_infos[i : i + self.batch_size] + placeholders = [] + values = [] + + for file_info in batch: + placeholders.append("(?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)") + values.extend( + [ + file_info["path"], + file_info["hash"], + file_info.get("size", 0), + file_info.get("type", "unknown"), + file_info["mod_time"], + file_info.get("is_archive", False), + file_info.get("archive_path"), + 0, + ] + ) + + sql = f""" + INSERT OR REPLACE INTO files + (file_path, file_hash, file_size, file_type, mod_time, 
def mark_file_deleted(self, file_path: str, reason: str = "duplicate"):
    """Flag a file as deleted and log the deletion in one transaction.

    The ``files`` row for ``file_path`` gets ``is_deleted = 1`` and a fresh
    ``last_scanned`` timestamp, and a ``delete`` row carrying the file's
    stored hash (``None`` when the file is unknown) is inserted into
    ``operations``.  Both writes use the same connection: writing the
    operation through a second connection while this one still holds an
    uncommitted write makes the second writer wait for the lock and fail.

    Args:
        file_path: Path of the file as stored in ``files.file_path``.
        reason: Free-text reason stored with the operation record.
    """
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()

        cursor.execute(
            "SELECT file_hash FROM files WHERE file_path = ?", (file_path,)
        )
        row = cursor.fetchone()
        file_hash = row[0] if row else None

        cursor.execute(
            """
            UPDATE files
            SET is_deleted = 1, last_scanned = CURRENT_TIMESTAMP
            WHERE file_path = ?
            """,
            (file_path,),
        )

        cursor.execute(
            """
            INSERT INTO operations (operation_type, file_path, file_hash, reason, details)
            VALUES (?, ?, ?, ?, ?)
            """,
            ("delete", file_path, file_hash, reason, ""),
        )

        conn.commit()
    except sqlite3.Error as e:
        logger.error(f"数据库错误 (标记删除): {e}")
        conn.rollback()
    finally:
        conn.close()
def get_scan_statistics(self) -> Dict[str, Any]:
    """Collect summary counters from the database.

    Returns:
        A dict with ``total_files``, ``deleted_files``, ``unique_files``
        (distinct hashes among files not marked deleted) and
        ``total_operations``; an empty dict when any query fails.
    """
    # (result key, counting query) pairs, run in this order.
    counters = (
        ("total_files", "SELECT COUNT(*) FROM files"),
        ("deleted_files", "SELECT COUNT(*) FROM files WHERE is_deleted = 1"),
        ("unique_files", "SELECT COUNT(DISTINCT file_hash) FROM files WHERE is_deleted = 0"),
        ("total_operations", "SELECT COUNT(*) FROM operations"),
    )

    connection = sqlite3.connect(self.db_path)
    db_cursor = connection.cursor()

    try:
        statistics = {}
        for key, query in counters:
            db_cursor.execute(query)
            statistics[key] = db_cursor.fetchone()[0]
        return statistics
    except Exception as e:
        logger.error(f"数据库错误 (获取统计): {e}")
        return {}
    finally:
        connection.close()
@staticmethod
def extract_movie_name(filename):
    """Strip release metadata from a file name and return the movie name.

    Removed, case-insensitively and in this order: a four-digit year
    (optionally bracketed), technical tags (resolution, codec, source,
    audio), everything after the last hyphen, and bracketed or parenthesised
    text.  Remaining separators (``.``, ``_``, ``-``, whitespace) are
    collapsed to single spaces.

    Technical tags are removed only when delimited by non-alphanumeric
    characters, so words such as "Birthday" or "Buddy" are left intact, and
    the year rule does not consume the digits of a resolution such as
    "1080p".

    Args:
        filename: File name (with or without extension).

    Returns:
        The cleaned movie name (possibly empty).
    """
    name = os.path.splitext(filename)[0]

    technical_tags = "|".join(
        (
            # resolution
            r"\d{3,4}[pi]", r"[24]k", "uhd", "fhd", "hd",
            # codec
            "x264", "x265", "h264", "h265", "hevc", "avc", "divx", "xvid",
            # source
            "blu-ray", "bluray", "web-dl", "webdl", "hdtv", "dvdrip", "bdrip", "brrip",
            # audio
            "dts", "ac3", "aac", "flac", "dd",
        )
    )

    patterns_to_remove = [
        # Year: exactly four digits, not part of a longer number and not
        # directly followed by a letter (so "1080p" is not a year).
        r"\s*[\(\[]?(?<!\d)\d{4}(?![a-z0-9])[\)\]]?",
        # Technical tags as whole tokens only.
        rf"\s*(?<![a-z0-9])(?:{technical_tags})(?![a-z0-9])",
        # Release group and other trailing information.
        r"\s*-\s*[^-]+$",  # everything after the last hyphen
        r"\[[^\]]+\]",  # square-bracket content
        r"\([^\)]+\)",  # parenthesised content
    ]

    for pattern in patterns_to_remove:
        name = re.sub(pattern, "", name, flags=re.IGNORECASE)

    # Collapse leftover separators into single spaces.
    name = re.sub(r"[\._\-\s]+", " ", name)
    return name.strip()
"2160p", "1080p", "720p", "480p", "hd", "fhd", "uhd"] + for term in quality_terms: + if term in filename.lower(): + metadata["quality"] = term.upper() + break + + # 提取编码信息 + codec_terms = ["x264", "x265", "h264", "h265", "hevc", "avc"] + for term in codec_terms: + if term in filename.lower(): + metadata["codec"] = term.upper() + break + + # 提取来源信息 + source_terms = ["bluray", "blu-ray", "webdl", "web-dl", "hdtv", "dvdrip"] + for term in source_terms: + if term in filename.lower(): + metadata["source"] = term.upper() + break + + # 尝试从文件名中提取电影标题(更智能的方法) + metadata["title"] = AdvancedMovieMetadataExtractor.extract_movie_title_advanced( + filename + ) + + return metadata + + @staticmethod + def extract_movie_title_advanced(filename): + """高级电影标题提取""" + # 移除扩展名 + name = os.path.splitext(filename)[0] + + # 常见的需要移除的模式(更全面的列表) + patterns_to_remove = [ + # 年份模式 + r"[\(\[]?\s*(19|20)\d{2}\s*[\)\]]?", + # 质量模式 + r"\b(4k|2160p|1080p|720p|480p|hd|fhd|uhd)\b", + # 编码模式 + r"\b(x264|x265|h264|h265|hevc|avc|divx|xvid)\b", + # 来源模式 + r"\b(bluray|blu-ray|webdl|web-dl|hdtv|dvdrip|bdrip|brrip)\b", + # 音频模式 + r"\b(dts|ac3|aac|flac|dd|dts-hd|truehd)\b", + # 发布组模式 + r"\[[^\]]+\]", + r"\s*-\s*[^-]+$", + # 特殊字符和序列号 + r"[\(\{\[].*?[\)\}\]]", + r"\b(cd\d|disc\d|part\d)\b", + r"[\._\-]", + ] + + for pattern in patterns_to_remove: + name = re.sub(pattern, " ", name, flags=re.IGNORECASE) + + # 清理多余空格 + name = re.sub(r"\s+", " ", name).strip() + + # 移除常见的无关词汇 + common_words = [ + "full", + "movie", + "film", + "video", + "hd", + "fhd", + "uhd", + "english", + "chinese", + "sub", + "subtitle", + "dubbed", + "extended", + "director", + "cut", + "theatrical", + "unrated", + ] + + words = name.split() + filtered_words = [word for word in words if word.lower() not in common_words] + + return " ".join(filtered_words) + + +# 尝试导入视频处理相关的库,如果失败则提供回退方案 +try: + import cv2 + import imagehash + from PIL import Image + import numpy as np + from skimage.metrics import structural_similarity as ssim + + 
def extract_key_frames(self, video_path, num_frames=10, skip_start=0.1):
    """Return perceptual hashes of frames sampled at a fixed stride.

    The first ``skip_start`` fraction of the frames is skipped, then up to
    ``num_frames`` frames are read at a constant interval. Each frame is
    converted to grayscale, resized to 64x64 and hashed with
    ``imagehash.average_hash``. Only the hash strings are kept; decoded
    frames are discarded immediately so memory use stays flat.

    Args:
        video_path: Path of the video file to open with OpenCV.
        num_frames: Maximum number of frames to hash.
        skip_start: Fraction of the total frame count to skip at the start.

    Returns:
        A list of hash strings in frame order. The list is empty when video
        processing is unavailable, the file cannot be opened, its frame
        count or FPS is invalid, or an unexpected error occurs.
    """
    if not VIDEO_PROCESSING_AVAILABLE:
        logger.warning("视频处理功能不可用,跳过关键帧提取")
        return []

    cap = None
    try:
        # Intended to silence FFmpeg warnings emitted through OpenCV.
        os.environ["OPENCV_FFMPEG_LOGLEVEL"] = "0"

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            logger.warning(f"无法打开视频文件: {video_path}")
            return []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Reject broken containers up front; both values are used as
        # divisors or bounds below.
        if fps <= 0:
            logger.warning(f"视频FPS无效: {video_path} (fps: {fps})")
            return []
        if total_frames <= 0:
            logger.warning(f"视频总帧数无效: {video_path} (总帧数: {total_frames})")
            return []

        # Skip the opening section, clamped to the last valid frame.
        start_frame = int(total_frames * skip_start)
        if start_frame >= total_frames:
            start_frame = max(0, total_frames - 1)

        available_frames = total_frames - start_frame
        if available_frames <= 0:
            logger.warning(f"跳过开头后无可用帧: {video_path}")
            return []

        frames_to_extract = min(num_frames, available_frames)
        if frames_to_extract <= 0:
            logger.warning(f"无可用帧可提取: {video_path}")
            return []

        frame_interval = max(1, available_frames // frames_to_extract)
        frame_hashes = []

        for i in range(frames_to_extract):
            frame_pos = start_frame + i * frame_interval
            if frame_pos >= total_frames:
                break

            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_pos)
            ret, frame = cap.read()
            if not ret or frame is None:
                continue

            try:
                # Grayscale + small size keeps hashing cheap and stable.
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                resized = cv2.resize(gray, (64, 64))
                frame_hash = imagehash.average_hash(Image.fromarray(resized))
                frame_hashes.append(str(frame_hash))
            except Exception as frame_error:
                # A single undecodable frame must not abort the whole file.
                logger.debug(f"处理帧时出错 {video_path} 帧 {i}: {frame_error}")

        return frame_hashes

    except Exception as e:
        logger.error(f"提取关键帧时出错 {video_path}: {e}")
        return []
    finally:
        # Always release the capture handle, even on early return.
        if cap is not None:
            cap.release()
def calculate_video_similarity(self, fingerprint1, fingerprint2):
    """Return the Jaccard similarity of the frame hashes in two fingerprints.

    Fingerprints have the form ``video_<h1>_<h2>..._audio_<...>``. The
    frame hashes are the tokens between the leading ``video`` marker and
    the ``audio`` marker; the audio part may contain one token
    (``audio_unknown``) or two (``audio_<size>_<duration>``), so it is
    located by its marker rather than by a fixed offset from the end.

    Args:
        fingerprint1: First fingerprint string (may be empty or None).
        fingerprint2: Second fingerprint string (may be empty or None).

    Returns:
        ``1.0`` for identical fingerprints, ``0`` when either fingerprint
        is missing, malformed or has no frame hashes, otherwise
        ``|A & B| / |A | B|`` over the two frame-hash sets.
    """
    if not fingerprint1 or not fingerprint2:
        return 0

    if fingerprint1 == fingerprint2:
        return 1.0

    def frame_hashes(fingerprint):
        """Return the set of frame-hash tokens of one fingerprint."""
        parts = fingerprint.split("_")
        if len(parts) < 3:
            return set()
        try:
            audio_at = parts.index("audio", 1)
        except ValueError:
            # No audio marker: assume a two-token trailer was appended.
            audio_at = len(parts) - 2
        return set(parts[1:audio_at])

    try:
        frames1 = frame_hashes(fingerprint1)
        frames2 = frame_hashes(fingerprint2)
    except (AttributeError, TypeError) as e:
        # Non-string input: report it and treat the pair as dissimilar.
        logger.error(f"计算视频相似度时出错: {e}")
        return 0

    if not frames1 or not frames2:
        return 0

    return len(frames1 & frames2) / len(frames1 | frames2)
def merge_groups(self, content_groups, metadata_groups):
    """Merge metadata-based groups into content-based groups.

    Each metadata group is folded into the first content group that shares
    at least one file path with it; only files whose path is not already in
    that content group are added. Metadata groups that overlap no content
    group are appended as groups of their own.

    The input lists are never modified: the result is built from copies.

    Args:
        content_groups: List of groups; each group is a list of file-info
            dicts that have a ``"path"`` key.
        metadata_groups: List of groups in the same shape.

    Returns:
        A new list of groups (new list objects holding the original dicts).
    """
    merged = [list(group) for group in content_groups]
    # One path set per content group, kept in sync as groups grow, so the
    # overlap test does not rebuild a set for every candidate file.
    merged_paths = [{f["path"] for f in group} for group in merged]

    for metadata_group in metadata_groups:
        metadata_paths = {f["path"] for f in metadata_group}

        # zip() stops at merged_paths, so only content groups are searched,
        # never metadata groups appended earlier in this loop.
        for group, paths in zip(merged, merged_paths):
            if paths.isdisjoint(metadata_paths):
                continue
            group.extend(f for f in metadata_group if f["path"] not in paths)
            paths |= metadata_paths
            break
        else:
            merged.append(list(metadata_group))

    return merged
def get_file_sample_hash(self, file_path, sample_points=3, sample_size=4096):
    """Return an MD5 digest of samples spread evenly across a file.

    Files no larger than ``sample_size * sample_points`` bytes are hashed
    in full. Larger files are sampled at ``sample_points`` offsets spaced
    evenly from the first byte to the last ``sample_size`` bytes, so the
    end of the file always contributes to the digest.

    Args:
        file_path: Path of the file to hash.
        sample_points: Number of samples to read.
        sample_size: Number of bytes per sample.

    Returns:
        The hexadecimal MD5 digest, or ``None`` if the file cannot be read.
    """
    try:
        file_size = os.path.getsize(file_path)
        if file_size <= sample_size * sample_points:
            # Small file: a full hash is cheap and exact.
            return self.get_file_hash_complete(file_path)

        last_offset = file_size - sample_size
        if sample_points == 1:
            offsets = [0]
        else:
            # Evenly spaced offsets from 0 to last_offset inclusive.
            offsets = [
                last_offset * i // (sample_points - 1)
                for i in range(sample_points)
            ]

        digest = hashlib.md5()
        with open(file_path, "rb") as f:
            for offset in offsets:
                f.seek(offset)
                digest.update(f.read(sample_size))

        return digest.hexdigest()
    except OSError as e:
        logger.error(f"文件采样时出错 {file_path}: {e}")
        return None
def get_size_bucket(self, file_size):
    """Map a file size in bytes to a coarse size label.

    Thresholds are strict: a file must be larger than the limit to fall
    into the bucket.

    Args:
        file_size: File size in bytes.

    Returns:
        ``"xl"`` (>8 GiB), ``"large"`` (>4 GiB), ``"medium"`` (>2 GiB),
        ``"small"`` (>1 GiB) or ``"tiny"`` otherwise.
    """
    gib = 1024 * 1024 * 1024

    # Checked from the largest threshold down; the first hit wins.
    buckets = ((8, "xl"), (4, "large"), (2, "medium"), (1, "small"))
    for threshold_gib, label in buckets:
        if file_size > threshold_gib * gib:
            return label

    return "tiny"
def scan_files_parallel(self):
    """Scan every target directory and process media files in parallel.

    Phase one walks the target directories and collects paths whose
    extension is in ``self.video_extensions`` or ``self.audio_extensions``.
    Phase two submits each path to ``self.process_single_file`` on a thread
    pool, tags every result with its source folder and writes results to
    the database in batches of 1000.

    Every processed result is added to the returned list, including those
    already flushed to the database in a full batch.

    Returns:
        A list of file-info dicts for all successfully processed files;
        empty when no media files were found.
    """
    logger.info(f"开始并行扫描 {len(self.target_dirs)} 个目录...")
    self.stats["start_time"] = time.time()

    file_type_stats = {"video": 0, "audio": 0, "other": 0}
    # Directories whose path contains any of these markers are ignored.
    skip_markers = ("temp_extract", "@eaDir", ".Trash")
    media_files_to_process = []

    logger.info("第一阶段:收集所有目录的文件路径...")
    for target_dir in self.target_dirs:
        logger.info(f"扫描目录: {target_dir}")
        for root, _dirs, files in os.walk(target_dir):
            if any(marker in root for marker in skip_markers):
                continue

            for file in files:
                file_ext = os.path.splitext(file)[1].lower()
                if file_ext in self.video_extensions:
                    kind = "video"
                elif file_ext in self.audio_extensions:
                    kind = "audio"
                else:
                    file_type_stats["other"] += 1
                    continue

                media_files_to_process.append(os.path.join(root, file))
                file_type_stats[kind] += 1

    logger.info("文件类型统计:")
    logger.info(f"  视频文件: {file_type_stats['video']}")
    logger.info(f"  音频文件: {file_type_stats['audio']}")
    logger.info(f"  其他文件: {file_type_stats['other']}")
    logger.info(f"  总计媒体文件: {len(media_files_to_process)}")

    if not media_files_to_process:
        logger.warning("没有找到任何媒体文件!请检查文件扩展名配置和目录路径。")
        return []

    all_files = []
    pending_batch = []

    logger.info("第二阶段:并行处理文件...")
    with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
        future_to_file = {
            executor.submit(self.process_single_file, file_path): file_path
            for file_path in media_files_to_process
        }

        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                result = future.result()
                if not result:
                    continue

                result["source_folder"] = self.get_file_source_folder(file_path)
                # Record the result before any flush so it is never lost
                # from the returned list.
                all_files.append(result)
                pending_batch.append(result)

                if len(pending_batch) >= 1000:
                    self.db.bulk_add_files(pending_batch)
                    pending_batch = []

            except Exception as e:
                # Best effort: one failing file must not stop the scan.
                logger.error(f"处理文件 {file_path} 时出错: {e}")
                self.stats["files_skipped"] += 1

    if pending_batch:
        self.db.bulk_add_files(pending_batch)

    total_time = time.time() - self.stats["start_time"]
    logger.info(
        f"文件扫描完成。处理了 {self.stats['files_processed']} 个文件,跳过 {self.stats['files_skipped']} 个文件"
    )
    logger.info(f"哈希计算总时间: {self.stats['hash_time']:.2f}秒")
    logger.info(f"总扫描时间: {total_time:.2f}秒")

    return all_files
file_group: + # 重新计算内容特征,跳过开头 + file_path = file_info["path"] + enhanced_signature = self.extract_content_signature( + file_path, skip_start_percent + ) + + if enhanced_signature: + signature = enhanced_signature + else: + signature = file_info.get("content_signature", "unknown") + + if signature not in signature_groups: + signature_groups[signature] = [] + signature_groups[signature].append(file_info) + + # 对于每个签名组,选择最佳版本 + for signature, signature_group in signature_groups.items(): + if len(signature_group) > 1: + # 按质量评分排序 + signature_group.sort( + key=lambda x: x.get("quality_score", 0), reverse=True + ) + + group_key = f"{movie_name}_{signature}" + similar_groups[group_key] = signature_group + + logger.info( + f" 发现 {len(signature_group)} 个相似文件 (特征: {signature}):" + ) + for i, file_info in enumerate(signature_group): + logger.info( + f" {i+1}. {file_info['filename']} " + f"(质量分: {file_info.get('quality_score', 0)})" + ) + + logger.info(f"找到 {len(similar_groups)} 组相似电影文件") + return similar_groups + + def select_best_version(self, file_group, strategy="quality"): + """选择最佳版本的文件(增强版,支持文件夹优先级)""" + if not file_group: + return None, [] + + # 创建文件组的副本以避免修改原始数据 + sorted_group = file_group.copy() + + # 第一步:如果设置了文件夹优先级,优先考虑 + if self.prefer_folders: + # 为每个文件计算优先级分数 + for file_info in sorted_group: + source_folder = self.get_file_source_folder(file_info["path"]) + if source_folder in self.prefer_folders: + # 在质量分基础上增加优先级分数 + file_info["priority_boost"] = ( + 1000 - self.prefer_folders.index(source_folder) * 100 + ) + else: + file_info["priority_boost"] = 0 + + # 第二步:按策略排序 + if strategy == "quality": + # 如果有优先级提升,则结合质量分和优先级 + if self.prefer_folders: + sorted_group.sort( + key=lambda x: x.get("quality_score", 0) + + x.get("priority_boost", 0), + reverse=True, + ) + else: + sorted_group.sort(key=lambda x: x.get("quality_score", 0), reverse=True) + elif strategy == "size": + sorted_group.sort(key=lambda x: x["size"], reverse=True) + elif strategy == "resolution": + 
def remove_similar_duplicates(
    self, similar_groups, dry_run=True, strategy="quality", no_backup=False
):
    """Keep the best file of each similar group and remove the others.

    For every group with more than one file, ``self.select_best_version``
    picks the file to keep. Each remaining file is either only reported
    (``dry_run``), deleted permanently (``no_backup``), or moved into a
    ``.similar_movie_backup`` directory next to it. Moves use
    ``shutil.move``, which also works across file systems. Every removed
    file is recorded through ``self.db.mark_file_deleted``; failures are
    logged and recorded through ``self.db.add_operation``.

    Args:
        similar_groups: Mapping of group name to a list of file-info dicts
            (each with ``"path"`` and ``"filename"`` keys).
        dry_run: When true, nothing on disk is changed.
        strategy: Selection strategy forwarded to ``select_best_version``.
        no_backup: When true, files are deleted instead of backed up.

    Returns:
        A tuple ``(kept_files, deleted_files)``: the kept file-info dicts
        and the original paths of the files that were removed.
    """
    # Local import: the module does not import shutil at file level.
    import shutil

    logger.info("开始处理相似电影文件...")

    kept_files = []
    deleted_files = []
    delete_errors = []

    for group_name, file_group in similar_groups.items():
        if len(file_group) <= 1:
            continue

        best_file, files_to_delete = self.select_best_version(file_group, strategy)

        logger.info(f"\n电影组: {group_name}")
        logger.info(
            f"  保留: {best_file['filename']} "
            f"(质量分: {best_file.get('quality_score', 0)})"
        )
        kept_files.append(best_file)

        for file_info in files_to_delete:
            file_path = file_info["path"]

            if dry_run:
                logger.info(
                    f"  [干运行] 将删除: {file_info['filename']} "
                    f"(质量分: {file_info.get('quality_score', 0)})"
                )
                continue

            try:
                if not os.path.exists(file_path):
                    logger.warning(f"  文件不存在,跳过删除: {file_info['filename']}")
                    continue

                if no_backup:
                    os.remove(file_path)
                    logger.info(f"  🗑️ 已直接删除: {file_info['filename']}")
                else:
                    backup_dir = os.path.join(
                        os.path.dirname(file_path), ".similar_movie_backup"
                    )
                    os.makedirs(backup_dir, exist_ok=True)

                    # Never overwrite an earlier backup: add a numeric
                    # suffix until the name is free.
                    base_name = os.path.basename(file_path)
                    stem, ext = os.path.splitext(base_name)
                    backup_path = os.path.join(backup_dir, base_name)
                    counter = 1
                    while os.path.exists(backup_path):
                        backup_path = os.path.join(
                            backup_dir, f"{stem}_{counter}{ext}"
                        )
                        counter += 1

                    # shutil.move renames when possible and falls back to
                    # copy + delete across devices.
                    shutil.move(file_path, backup_path)
                    logger.info(f"  已移动相似电影到备份: {file_info['filename']}")

                deleted_files.append(file_path)
                self.db.mark_file_deleted(file_path, "similar_movie")

            except Exception as e:
                # Best effort: record the failure and go on to the next file.
                error_msg = f"删除文件时出错 {file_path}: {e}"
                logger.error(error_msg)
                delete_errors.append(error_msg)
                self.db.add_operation(
                    "error", file_path, reason="delete_failed", details=str(e)
                )

    if delete_errors:
        logger.error(f"删除过程中遇到 {len(delete_errors)} 个错误")

    logger.info(f"保留了 {len(kept_files)} 个最佳版本文件")
    logger.info(f"处理了 {len(deleted_files)} 个相似电影文件")

    return kept_files, deleted_files
def find_similar_movies_advanced(
    self, files, similarity_threshold=0.7, use_content_analysis=True
):
    """Group similar movies using content fingerprints or metadata.

    When content analysis is requested and the video libraries are
    available, files are grouped by ``self.content_detector`` using
    ``similarity_threshold`` and the groups are then extended with
    metadata matches. Otherwise the work is delegated to
    ``self.find_similar_movies_enhanced``.

    Args:
        files: List of file-info dicts produced by the scan.
        similarity_threshold: Minimum similarity for two files to be put
            in the same group.
        use_content_analysis: Whether to attempt content fingerprinting.

    Returns:
        A dict mapping a group key to a list of file-info dicts.
    """
    logger.info("开始高级相似电影查找...")

    if not (use_content_analysis and VIDEO_PROCESSING_AVAILABLE):
        # Fall back to file-name / signature based matching.
        logger.info("使用元数据相似性分析")
        return self.find_similar_movies_enhanced(files, similarity_threshold)

    logger.info("使用基于内容的视频指纹分析")
    detector = self.content_detector
    # Honor the caller's threshold instead of the detector's built-in one.
    detector.similarity_threshold = similarity_threshold

    content_groups = detector.group_similar_movies_by_content(files)
    enhanced_groups = detector.enhance_with_metadata_matching(files, content_groups)

    # Callers expect a dict keyed by group name.
    similar_groups = {
        f"content_group_{i}": group for i, group in enumerate(enhanced_groups)
    }

    logger.info(f"基于内容分析找到 {len(similar_groups)} 组相似电影")
    return similar_groups
使用增强版算法查找相似的电影文件 + similar_groups = self.find_similar_movies_enhanced( + all_files, similarity_threshold, skip_start_percent + ) + + if not similar_groups: + logger.info("没有找到相似的电影文件") + return {} + + # 3. 删除相似的重复文件(传递 no_backup 参数) + kept_files, deleted_files = self.remove_similar_duplicates( + similar_groups, dry_run, strategy, no_backup + ) + + # 4. 清理所有目录的空文件夹 + if not dry_run: + for target_dir in self.target_dirs: + self.remove_empty_folders_efficient(target_dir) + + # 记录扫描结束 + self.db.add_operation( + "scan_complete", + str(self.target_dirs), + reason="intelligent_cleanup_finished", + ) + + # 计算持续时间 + duration = time.time() - start_time + + # 记录扫描历史 + scan_data = { + "target_directory": str(self.target_dirs), + "total_files": len(all_files), + "similar_groups": len(similar_groups), + "kept_files": len(kept_files), + "deleted_files": len(deleted_files), + "deleted_file_details": deleted_files, + "duration_seconds": duration, + "no_backup_mode": no_backup, + } + self.db.add_scan_history(scan_data) + + # 显示统计信息 + self.show_intelligent_statistics(scan_data) + + # 只有在备份模式下才显示备份位置 + if not dry_run and deleted_files and not no_backup: + self.show_backup_locations() + + return scan_data + + except Exception as e: + logger.error(f"智能清理过程中发生错误: {e}") + self.db.add_operation( + "error", "SYSTEM", reason="intelligent_cleanup_failed", details=str(e) + ) + raise + + def show_intelligent_statistics(self, scan_data): + """显示智能清理统计信息""" + logger.info("\n" + "=" * 60) + logger.info("智能清理统计信息") + logger.info("=" * 60) + logger.info(f"扫描目录: {', '.join(self.target_dirs)}") + logger.info(f"总视频文件: {scan_data['total_files']} 个") + logger.info(f"相似电影组: {scan_data['similar_groups']} 组") + logger.info(f"保留文件: {scan_data['kept_files']} 个") + logger.info(f"删除文件: {scan_data['deleted_files']} 个") + + # 计算节省的空间(估算) + estimated_saved_gb = scan_data["deleted_files"] * 2 # 假设平均每个文件2GB + logger.info(f"释放空间: 约 {estimated_saved_gb:.2f} GB (估算)") + logger.info(f"耗时: 
{scan_data['duration_seconds']:.2f} 秒") + + def show_backup_locations(self): + """显示备份文件位置信息""" + logger.info("\n备份文件位置:") + backup_dirs_found = set() + + for target_dir in self.target_dirs: + for root, dirs, files in os.walk(target_dir): + if ".similar_movie_backup" in dirs: + backup_dir = os.path.join(root, ".similar_movie_backup") + backup_dirs_found.add(backup_dir) + + if backup_dirs_found: + for backup_dir in backup_dirs_found: + # 计算备份目录中的文件数量 + try: + backup_files = [ + f + for f in os.listdir(backup_dir) + if os.path.isfile(os.path.join(backup_dir, f)) + ] + total_size = sum( + os.path.getsize(os.path.join(backup_dir, f)) + for f in backup_files + ) / ( + 1024 * 1024 * 1024 + ) # GB + + logger.info( + f" {backup_dir}: {len(backup_files)} 个文件, 总大小: {total_size:.2f} GB" + ) + except OSError as e: + logger.warning(f" 无法访问备份目录 {backup_dir}: {e}") + else: + logger.info(" 未找到备份目录") + + +# 在 main() 函数中添加备份策略选项 +def main(): + # 首先声明全局变量 + global logger + + parser = argparse.ArgumentParser(description="智能电影重复文件清理工具 - 增强版") + parser.add_argument( + "directories", nargs="*", help="要扫描的目录路径(支持多个目录)" + ) + parser.add_argument( + "--dry-run", action="store_true", help="干运行模式,只显示不会实际删除" + ) + parser.add_argument( + "--strategy", + choices=["quality", "size", "resolution", "newest"], + default="quality", + help="选择最佳版本策略(默认: quality)", + ) + parser.add_argument( + "--similarity-threshold", + type=float, + default=0.8, + help="相似度阈值(0.0-1.0,默认: 0.8)", + ) + parser.add_argument( + "--skip-start", + type=float, + default=0.1, + help="跳过文件开头的比例(0.0-0.5,默认: 0.1)", + ) + parser.add_argument("--db-path", default="file_cleaner.db", help="数据库文件路径") + parser.add_argument( + "--workers", type=int, default=4, help="并行工作线程数 (默认: 4)" + ) + parser.add_argument( + "--log-level", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + default="INFO", + help="日志级别 (默认: INFO)", + ) + parser.add_argument( + "--log-file", default="duplicate_cleaner.log", help="日志文件路径" + ) + parser.add_argument( + 
def main():
    """Command-line entry point: parse options, validate them, run a cleanup.

    Reads the options from ``sys.argv``, validates them, reconfigures logging
    from ``--log-level`` / ``--log-file``, builds an
    ``IntelligentDuplicateCleaner`` and runs either the content-based cleanup
    (``run_advanced_cleanup``) or the name/metadata based one
    (``run_intelligent_cleanup``).

    Invalid arguments are reported with ``print`` (logging is not yet
    configured from the command line at that point) and the function returns
    early.

    Returns:
        None.
    """
    # main() rebinds the module-level logger once the CLI log options are known.
    global logger

    parser = argparse.ArgumentParser(description="智能电影重复文件清理工具 - 增强版")
    parser.add_argument(
        "directories", nargs="*", help="要扫描的目录路径(支持多个目录)"
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="干运行模式,只显示不会实际删除"
    )
    parser.add_argument(
        "--strategy",
        choices=["quality", "size", "resolution", "newest"],
        default="quality",
        help="选择最佳版本策略(默认: quality)",
    )
    parser.add_argument(
        "--similarity-threshold",
        type=float,
        default=0.8,
        help="相似度阈值(0.0-1.0,默认: 0.8)",
    )
    parser.add_argument(
        "--skip-start",
        type=float,
        default=0.1,
        help="跳过文件开头的比例(0.0-0.5,默认: 0.1)",
    )
    parser.add_argument("--db-path", default="file_cleaner.db", help="数据库文件路径")
    parser.add_argument(
        "--workers", type=int, default=4, help="并行工作线程数 (默认: 4)"
    )
    parser.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="日志级别 (默认: INFO)",
    )
    parser.add_argument(
        "--log-file", default="duplicate_cleaner.log", help="日志文件路径"
    )
    parser.add_argument(
        "--prefer-folder", nargs="+", help="优先保留的文件夹(当文件质量相同时)"
    )
    parser.add_argument(
        "--content-analysis",
        action="store_true",
        help="启用基于内容的分析(更准确但更慢)",
    )
    parser.add_argument(
        "--no-content-analysis",
        action="store_true",
        help="禁用基于内容的分析(更快但准确性较低)",
    )
    # NOTE(review): --backup-dir is parsed and logged below but is not passed
    # to the cleaner in this function — confirm where it is meant to be used.
    parser.add_argument("--backup-dir", help="指定备份目录路径(避免跨设备问题)")
    parser.add_argument(
        "--no-backup", action="store_true", help="不创建备份(直接删除文件)"
    )

    args = parser.parse_args()

    # Remember whether we fell back to the current directory, so the notice
    # below is not shown when the user explicitly passed the cwd.
    using_default_dir = not args.directories
    if using_default_dir:
        args.directories = [os.getcwd()]

    # Validate with print(): logging has not been configured from the CLI yet.
    for directory in args.directories:
        if os.path.isdir(directory):
            continue
        if os.path.exists(directory):
            # A regular file is not a valid scan root.
            print(f"错误: {directory} 不是目录")
        else:
            print(f"错误: 目录 {directory} 不存在")
        return

    if not 0.0 <= args.skip_start <= 0.5:
        print("错误: --skip-start 参数必须在 0.0 到 0.5 之间")
        return

    if not 0.0 <= args.similarity_threshold <= 1.0:
        print("错误: --similarity-threshold 参数必须在 0.0 到 1.0 之间")
        return

    if args.workers < 1:
        print("错误: --workers 参数必须大于等于 1")
        return

    # Reconfigure logging from the command line. logging.basicConfig() does
    # nothing when the root logger already has handlers, so handlers left by
    # an earlier setup are dropped first; otherwise --log-level/--log-file
    # would be ignored.
    # NOTE(review): assumes setup_logging() relies on basicConfig — confirm.
    log_level = getattr(logging, args.log_level)
    root_logger = logging.getLogger()
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()
    logger = setup_logging(log_level, args.log_file)

    if using_default_dir:
        logger.info(f"未指定目录,使用当前目录: {args.directories[0]}")

    # Content analysis is on by default; --no-content-analysis wins over
    # --content-analysis when both are given.
    use_content_analysis = not args.no_content_analysis

    # Without the video-processing libraries content analysis cannot run.
    if use_content_analysis and not VIDEO_PROCESSING_AVAILABLE:
        logger.warning("视频处理库不可用,自动禁用内容分析")
        use_content_analysis = False

    logger.info("启动智能电影重复文件清理器")
    logger.info(f"目标目录: {args.directories}")
    logger.info(f"选择策略: {args.strategy}")
    logger.info(f"相似阈值: {args.similarity_threshold}")
    if args.prefer_folder:
        logger.info(f"优先文件夹: {args.prefer_folder}")
    if args.backup_dir:
        logger.info(f"指定备份目录: {args.backup_dir}")
    if args.no_backup:
        logger.warning("警告: 已启用直接删除模式,不会创建备份!")

    cleaner = IntelligentDuplicateCleaner(
        args.directories, args.db_path, args.workers, args.prefer_folder
    )

    try:
        if use_content_analysis:
            logger.info("使用基于内容的高级分析模式")
            result = cleaner.run_advanced_cleanup(
                dry_run=args.dry_run,
                strategy=args.strategy,
                similarity_threshold=args.similarity_threshold,
                use_content_analysis=use_content_analysis,
                no_backup=args.no_backup,
            )
        else:
            result = cleaner.run_intelligent_cleanup(
                dry_run=args.dry_run,
                strategy=args.strategy,
                similarity_threshold=args.similarity_threshold,
                skip_start_percent=args.skip_start,
                no_backup=args.no_backup,
            )

        if not args.dry_run and result:
            logger.info("\n=== 清理总结 ===")
            logger.info(f"相似电影组: {result.get('similar_groups', 0)} 组")
            logger.info(f"保留文件: {result.get('kept_files', 0)} 个")
            logger.info(f"删除文件: {result.get('deleted_files', 0)} 个")
            logger.info(f"耗时: {result.get('duration_seconds', 0):.2f} 秒")

            # Backups only exist when backup mode was active.
            if not args.no_backup:
                cleaner.show_backup_locations()

    except KeyboardInterrupt:
        logger.info("\n用户中断操作")
        cleaner.db.add_operation("error", "SYSTEM", reason="user_interrupt")
    except Exception as e:
        # Top-level boundary: record the failure (with traceback) and stop.
        logger.error(f"发生错误: {e}", exc_info=True)
        cleaner.db.add_operation("error", "SYSTEM", reason="exception", details=str(e))


if __name__ == "__main__":
    main()