diff --git a/bagheeraview.py b/bagheeraview.py index 4815858..d670d0c 100755 --- a/bagheeraview.py +++ b/bagheeraview.py @@ -14,7 +14,7 @@ Classes: MainWindow: The main application window containing the thumbnail grid and docks. """ __appname__ = "BagheeraView" -__version__ = "0.9.25" +__version__ = "0.9.26" __author__ = "Ignacio Serantes" __email__ = "kde@aynoa.net" __license__ = "LGPL" @@ -1768,13 +1768,18 @@ class MainWindow(QMainWindow): UITexts.MENU_DETECT_CURRENT_SEARCH) detect_current_action.triggered.connect(self.start_duplicate_detection) - detect_all_action = duplicates_menu.addAction(UITexts.MENU_DETECT_ALL) - detect_all_action.triggered.connect(self.detect_all_duplicates) - force_full_action = duplicates_menu.addAction(UITexts.MENU_FORCE_FULL_ANALYSIS) force_full_action.triggered.connect( lambda: self.start_duplicate_detection(force_full=True)) + detect_all_action = duplicates_menu.addAction(UITexts.MENU_DETECT_ALL) + detect_all_action.triggered.connect(self.detect_all_duplicates) + + force_full_all_action = duplicates_menu.addAction( + UITexts.MENU_FORCE_FULL_ALL_ANALYSIS) + force_full_all_action.triggered.connect( + lambda: self.detect_all_duplicates(force_full=True)) + review_ignored_action = duplicates_menu.addAction(UITexts.MENU_REVIEW_IGNORED) review_ignored_action.triggered.connect(self.review_ignored_duplicates) @@ -1784,6 +1789,13 @@ class MainWindow(QMainWindow): QIcon.fromTheme("edit-clear-all"), UITexts.MENU_CLEAN_UP_HASHES) clean_hashes_action.triggered.connect(self.clean_duplicate_hashes) + repair_index_action = duplicates_menu.addAction(UITexts.MENU_REPAIR_DATABASE) + repair_index_action.triggered.connect(self.repair_duplicate_index) + + clear_exceptions_action = duplicates_menu.addAction( + UITexts.MENU_CLEAR_EXCEPTIONS) + clear_exceptions_action.triggered.connect(self.clear_ignored_duplicates) + if self.duplicate_cache: count, size_bytes = self.duplicate_cache.get_hash_stats() size_mb = size_bytes / (1024 * 1024) @@ -1828,7 +1840,7 @@ class MainWindow(QMainWindow): menu.exec(self.menu_btn.mapToGlobal(QPoint(0, self.menu_btn.height()))) - def detect_all_duplicates(self): + def detect_all_duplicates(self, force_full=False): """Gathers files from whitelist (respecting blacklist) and runs detector.""" QApplication.setOverrideCursor(Qt.WaitCursor) try: @@ -1849,7 +1861,7 @@ class MainWindow(QMainWindow): # By default, we use optimized (incremental) mode to avoid repeating # comparisons. - self.start_duplicate_detection(force_full=False, custom_paths=paths) + self.start_duplicate_detection(force_full=force_full, custom_paths=paths) def _gather_files_for_duplicates(self): """Helper to collect image paths based on whitelist and blacklist settings.""" @@ -1892,6 +1904,40 @@ class MainWindow(QMainWindow): count = self.duplicate_cache.clean_stale_hashes() self.status_lbl.setText(f"Cleaned up {count} stale hash entries.") + def repair_duplicate_index(self): + """Regenerates the BK-Tree and reverse index.""" + if not self.duplicate_cache: + return + + if self.duplicate_detector and self.duplicate_detector.isRunning(): + QMessageBox.information(self, UITexts.DUPLICATE_DETECTION_TITLE, + UITexts.DUPLICATE_ALREADY_RUNNING) + return + + self.status_lbl.setText(UITexts.REPAIRING_DATABASE) + QApplication.setOverrideCursor(Qt.WaitCursor) + try: + self.duplicate_cache.regenerate_bktree() + self.status_lbl.setText(UITexts.READY) + finally: + QApplication.restoreOverrideCursor() + + def clear_ignored_duplicates(self): + """Clears the ignored pairs database after user confirmation.""" + if not self.duplicate_cache: + return + + confirm = QMessageBox(self) + confirm.setIcon(QMessageBox.Question) + confirm.setWindowTitle(UITexts.CONFIRM_CLEAR_EXCEPTIONS_TITLE) + confirm.setText(UITexts.CONFIRM_CLEAR_EXCEPTIONS_TEXT) + confirm.setStandardButtons(QMessageBox.Yes | QMessageBox.No) + confirm.setDefaultButton(QMessageBox.No) + + if confirm.exec() == QMessageBox.Yes: + self.duplicate_cache.clear_exceptions() + self.status_lbl.setText(UITexts.READY) + def clear_duplicate_hashes(self): if not self.duplicate_cache: return diff --git a/constants.py b/constants.py index 87efae6..12d141f 100644 --- a/constants.py +++ b/constants.py @@ -29,7 +29,7 @@ if FORCE_X11: # --- CONFIGURATION --- PROG_NAME = "Bagheera Image Viewer" PROG_ID = "bagheeraview" -PROG_VERSION = "0.9.25" +PROG_VERSION = "0.9.26" PROG_AUTHOR = "Ignacio Serantes" # --- CACHE SETTINGS --- @@ -75,6 +75,8 @@ DUPLICATE_CACHE_PATH = os.path.join(APP_DATA_DIR, "duplicates") DUPLICATE_HASH_DB_NAME = b"hashes" DUPLICATE_EXCEPTIONS_DB_NAME = b"exceptions" DUPLICATE_PENDING_DB_NAME = b"pending" +DUPLICATE_BKTREE_DB_NAME = b"bktree" +DUPLICATE_HASH_TO_FILES_DB_NAME = b"hash_to_files" def save_app_config(): @@ -523,9 +525,16 @@ _UI_TEXTS = { "MENU_DUPLICATES": "Duplicates", "MENU_DETECT_CURRENT_SEARCH": "Detect in current search", "MENU_DETECT_ALL": "Detect all", + "MENU_FORCE_FULL_ALL_ANALYSIS": "Force full all analysis", "MENU_FORCE_FULL_ANALYSIS": "Force full analysis", "MENU_REVIEW_IGNORED": "Review ignored", "MENU_CLEAN_UP_HASHES": "Clean up", + "MENU_REPAIR_DATABASE": "Repair index", + "MENU_CLEAR_EXCEPTIONS": "Clear ignored pairs", + "CONFIRM_CLEAR_EXCEPTIONS_TITLE": "Confirm Clear Ignored Pairs", + "CONFIRM_CLEAR_EXCEPTIONS_TEXT": "Are you sure you want to clear all " + "ignored duplicate pairs? They will be detected again in the next scan.", + "REPAIRING_DATABASE": "Repairing duplicate index...", "MENU_CLEAR_HASHES": "Clear hashes ({} items, {:.1f} MB on disk)", "CONFIRM_CLEAR_HASHES_TITLE": "Confirm Clear Hashes", "CONFIRM_CLEAR_HASHES_TEXT": "Are you sure you want to permanently delete " @@ -786,7 +795,8 @@ _UI_TEXTS = { "RENAME_ERROR_EXISTS": "File '{}' already exists.", "FILE_RENAMED": "File renamed to {}", "ERROR_RENAME": "Could not rename file: {}", - "ERROR_JPEG_METADATA_LIMIT": "Metadata size limit exceeded for '{}'. This JPEG file has too much existing metadata (XMP) to save more.", + "ERROR_JPEG_METADATA_LIMIT": "Metadata size limit exceeded for '{}'. This " + "JPEG file has too much existing metadata (XMP) to save more.", "MAIN_DOCK_TITLE": "", "LAYOUTS_TAB": "Layouts", "LAYOUTS_TABLE_HEADER": ["Name", "Last Modified"], @@ -1067,9 +1077,16 @@ _UI_TEXTS = { "MENU_DUPLICATES": "Duplicados", "MENU_DETECT_CURRENT_SEARCH": "Detectar en búsqueda actual", "MENU_DETECT_ALL": "Detectar todos", + "MENU_FORCE_FULL_ALL_ANALYSIS": "Forzar análisis completo de todo", "MENU_FORCE_FULL_ANALYSIS": "Forzar análisis completo", "MENU_REVIEW_IGNORED": "Revisar ignorados", "MENU_CLEAN_UP_HASHES": "Limpiar", + "MENU_REPAIR_DATABASE": "Reparar índice", + "MENU_CLEAR_EXCEPTIONS": "Limpiar parejas ignoradas", + "CONFIRM_CLEAR_EXCEPTIONS_TITLE": "Confirmar Limpieza de Ignorados", + "CONFIRM_CLEAR_EXCEPTIONS_TEXT": "¿Seguro que quieres borrar todas las parejas " + "de duplicados ignoradas? Se volverán a detectar en el próximo escaneo.", + "REPAIRING_DATABASE": "Reparando índice de duplicados...", "MENU_CLEAR_HASHES": "Limpiar hashes ({} ítems, {:.1f} MB en disco)", "CONFIRM_CLEAR_HASHES_TITLE": "Confirmar Limpieza de Hashes", "CONFIRM_CLEAR_HASHES_TEXT": "¿Seguro que quieres eliminar permanentemente " @@ -1340,7 +1357,8 @@ _UI_TEXTS = { "RENAME_ERROR_EXISTS": "El archivo '{}' ya existe.", "FILE_RENAMED": "Archivo renombrado a {}", "ERROR_RENAME": "No se pudo renombrar el archivo: {}", - "ERROR_JPEG_METADATA_LIMIT": "Límite de metadatos excedido para '{}'. Este archivo JPEG ya tiene demasiados metadatos (XMP) para guardar más.", + "ERROR_JPEG_METADATA_LIMIT": "Límite de metadatos excedido para '{}'. Este " + "archivo JPEG ya tiene demasiados metadatos (XMP) para guardar más.", "MAIN_DOCK_TITLE": "Panel principal", "LAYOUTS_TAB": "Diseños", "LAYOUTS_TABLE_HEADER": ["Nombre", "Última Modificación"], @@ -1625,9 +1643,16 @@ _UI_TEXTS = { "MENU_DUPLICATES": "Duplicados", "MENU_DETECT_CURRENT_SEARCH": "Detectar na busca actual", "MENU_DETECT_ALL": "Detectar todos", + "MENU_FORCE_FULL_ALL_ANALYSIS": "Forzar análise completa de todo", "MENU_FORCE_FULL_ANALYSIS": "Forzar análise completa", "MENU_REVIEW_IGNORED": "Revisar ignorados", "MENU_CLEAN_UP_HASHES": "Limpar", + "MENU_REPAIR_DATABASE": "Reparar índice", + "MENU_CLEAR_EXCEPTIONS": "Limpar parellas ignoradas", + "CONFIRM_CLEAR_EXCEPTIONS_TITLE": "Confirmar Limpeza de Ignorados", + "CONFIRM_CLEAR_EXCEPTIONS_TEXT": "Seguro que queres borrar todas as parellas " + "de duplicados ignoradas? Volveranse detectar no vindeiro escaneo.", + "REPAIRING_DATABASE": "Reparando índice de duplicados...", "MENU_CLEAR_HASHES": "Limpar hashes ({} elementos, {:.1f} MB en disco)", "CONFIRM_CLEAR_HASHES_TITLE": "Confirmar Limpeza de Hashes", "CONFIRM_CLEAR_HASHES_TEXT": "Seguro que queres eliminar permanentemente toda " @@ -1896,7 +1921,8 @@ _UI_TEXTS = { "RENAME_ERROR_EXISTS": "O ficheiro '{}' xa existe.", "FILE_RENAMED": "Ficheiro renomeado a {}", "ERROR_RENAME": "Non se puido renomear o ficheiro: {}", - "ERROR_JPEG_METADATA_LIMIT": "Límite de metadatos excedido para '{}'. Este ficheiro JPEG xa ten demasiados metadatos (XMP) para gardar máis.", + "ERROR_JPEG_METADATA_LIMIT": "Límite de metadatos excedido para '{}'. Este " + "ficheiro JPEG xa ten demasiados metadatos (XMP) para gardar máis.", "MAIN_DOCK_TITLE": "Panel principal", "LAYOUTS_TAB": "Deseños", "LAYOUTS_TABLE_HEADER": ["Nome", "Última Modificación"], diff --git a/duplicatecache.py b/duplicatecache.py index 02fc2bf..794ba06 100644 --- a/duplicatecache.py +++ b/duplicatecache.py @@ -12,7 +12,7 @@ Classes: import os import logging import struct -import time +import time as time_module import collections import shutil import lmdb @@ -25,11 +25,12 @@ from PySide6.QtCore import ( ) import imagehash # For perceptual hashing +from constants import MAX_DHASH_DISTANCE from constants import ( DUPLICATE_CACHE_PATH, DUPLICATE_HASH_DB_NAME, DUPLICATE_EXCEPTIONS_DB_NAME, DUPLICATE_PENDING_DB_NAME, - MAX_DHASH_DISTANCE, UITexts + DUPLICATE_BKTREE_DB_NAME, DUPLICATE_HASH_TO_FILES_DB_NAME, UITexts ) logger = logging.getLogger(__name__) @@ -40,45 +41,6 @@ DuplicateResult = collections.namedtuple( ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp']) -class BKTree: - """A Burkhard-Keller tree for efficient similarity searching using Hamming - distance.""" - def __init__(self, distance_func): - self.distance_func = distance_func - self.tree = None - - def add(self, item): - if self.tree is None: - self.tree = (item, {}) - return - node = self.tree - while True: - val, children = node - dist = self.distance_func(item, val) - if dist == 0: - return - if dist in children: - node = children[dist] - else: - children[dist] = (item, {}) - break - - def query(self, item, max_dist): - if self.tree is None: - return [] - results = [] - candidates = [self.tree] - while candidates: - val, children = candidates.pop() - dist = self.distance_func(item, val) - if dist <= max_dist: - results.append((val, dist)) - for d in range(max(0, dist - max_dist), dist + max_dist + 1): - if d in children: - candidates.append(children[d]) - return results - - class HashWorker(QRunnable): """Worker to calculate image hash in a thread pool.""" def __init__(self, path, detector, result_dict, mutex, semaphore): @@ -100,7 +62,6 @@ class HashWorker(QRunnable): self.result_dict[self.path] = h except Exception as e: logger.warning(f"HashWorker failed for {self.path}: {e}") - self.semaphore.release() @@ -108,15 +69,18 @@ class DuplicateCache(QObject): """ Manages a persistent LMDB cache for perceptual hashes and duplicate relationships. Uses (device_id, inode) as primary keys for robustness against file renames/moves. - """ + """ # noqa: E501 + def __init__(self): super().__init__() self._lmdb_env = None self._hash_db = None self._exceptions_db = None self._pending_db = None + self._bktree_db = None + self._hash_to_files_db = None self._db_lock = QMutex() # Protects LMDB transactions - # In-memory cache for hashes: (dev, inode) -> (hash_value, path) + # In-memory cache for hashes: (dev, inode) -> (hash_value, mtime, path) self._hash_cache = {} self._hash_cache_lock = QReadWriteLock() @@ -130,13 +94,18 @@ class DuplicateCache(QObject): self._lmdb_env = lmdb.open( DUPLICATE_CACHE_PATH, map_size=10 * 1024 * 1024 * 1024, # 10GB default - max_dbs=3, # For hashes, exceptions and pending + # Hashes, exceptions, pending, bktree, hash_to_files + max_dbs=5, readonly=False, create=True ) self._hash_db = self._lmdb_env.open_db(DUPLICATE_HASH_DB_NAME) self._exceptions_db = self._lmdb_env.open_db(DUPLICATE_EXCEPTIONS_DB_NAME) self._pending_db = self._lmdb_env.open_db(DUPLICATE_PENDING_DB_NAME) + self._bktree_db = self._lmdb_env.open_db(DUPLICATE_BKTREE_DB_NAME) + self._hash_to_files_db = self._lmdb_env.open_db( + DUPLICATE_HASH_TO_FILES_DB_NAME, + dupsort=True) logger.info(f"Duplicate LMDB cache opened: {DUPLICATE_CACHE_PATH}") except Exception as e: logger.error(f"Failed to open duplicate LMDB cache: {e}") @@ -149,6 +118,8 @@ class DuplicateCache(QObject): self._hash_db = None self._exceptions_db = None self._pending_db = None + self._bktree_db = None + self._hash_to_files_db = None def get_hash_stats(self): """Returns (count, size_bytes) for the hash database.""" @@ -181,17 +152,39 @@ class DuplicateCache(QObject): except Exception as e: logger.error(f"Error clearing duplicate LMDB: {e}") + def clear_exceptions(self): + """Clears all entries from the exceptions database.""" + if not self._lmdb_env or self._exceptions_db is None: + return False + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=True) as txn: + # Clear the DB but keep the handle valid + txn.drop(self._exceptions_db, delete=False) + logger.info("Duplicate exceptions database cleared.") + return True + def __del__(self): self.lmdb_close() @staticmethod def _get_inode_info(path): try: + if not path: + return 0, None stat_info = os.stat(path) return stat_info.st_dev, struct.pack('Q', stat_info.st_ino) except OSError: return 0, None + @staticmethod + def _hash_str_to_bytes(hash_str): + return struct.pack('>Q', int(hash_str, 16)) + + @staticmethod + def _hamming_distance(h1_bytes, h2_bytes): + return bin(struct.unpack('>Q', h1_bytes)[0] ^ + struct.unpack('>Q', h2_bytes)[0]).count('1') + def _get_lmdb_key(self, dev_id, inode_key_bytes): return f"{dev_id}-{inode_key_bytes.hex()}".encode('utf-8') @@ -215,10 +208,13 @@ class DuplicateCache(QObject): # Handle format "hash_value_str|mtime|path_str" or old "hash|path" parts = value_bytes.decode('utf-8').split('|', 2) if len(parts) == 3: - hash_str, mtime_str, path_str = parts + hash_str = parts[0] + mtime_str = parts[1] + path_str = os.path.abspath(os.path.normpath(parts[2])) mtime = float(mtime_str) elif len(parts) == 2: - hash_str, path_str = parts + hash_str = parts[0] + path_str = os.path.abspath(os.path.normpath(parts[1])) mtime = 0.0 # Force re-hash else: return None, 0, None @@ -229,34 +225,231 @@ class DuplicateCache(QObject): return hash_str, mtime, path_str return None, 0, None - def get_hash_for_path(self, path, current_mtime, dev_id=None, inode_key_bytes=None): + def get_hash_info_for_path(self, path, current_mtime, dev_id=None, + inode_key_bytes=None): if dev_id is None or inode_key_bytes is None: dev_id, inode_key_bytes = self._get_inode_info(path) if not inode_key_bytes: - return None - hash_value, cached_mtime, _ = self.get_hash_and_path(dev_id, inode_key_bytes) + return None, 0, None + hash_value, cached_mtime, cached_path = self.get_hash_and_path( + dev_id, + inode_key_bytes) # Return hash only if mtime matches (with small float tolerance) if hash_value and abs(cached_mtime - current_mtime) < 0.001: - return hash_value - return None + return hash_value, cached_mtime, cached_path + return None, 0, None def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None): - if dev_id is None or inode_key_bytes is None: + if dev_id is None or inode_key_bytes is None: # noqa: E501 + dev_id, inode_key_bytes = self._get_inode_info(path) if not inode_key_bytes or not self._lmdb_env: return False + hash_bytes = self._hash_str_to_bytes(hash_value) + file_id_bytes = f"{dev_id}-{inode_key_bytes.hex()}".encode('utf-8') + + path = os.path.abspath(os.path.normpath(path)) value_str = f"{hash_value}|{mtime}|{path}" with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes) + + # Verificar si el archivo ya tenía un hash distinto (actualización) + # Check if the file already had a different hash (update) + old_val = txn.get(lmdb_key, db=self._hash_db) + if old_val: + old_parts = old_val.decode('utf-8').split('|') + if old_parts[0] != hash_value: # noqa: E501 + old_h_bytes = self._hash_str_to_bytes(old_parts[0]) + txn.delete(old_h_bytes, + file_id_bytes, db=self._hash_to_files_db) + txn.put(lmdb_key, value_str.encode('utf-8'), db=self._hash_db) + txn.put(hash_bytes, file_id_bytes, db=self._hash_to_files_db) + + # Actualización incremental del BK-Tree persistente + # Incremental update of the persistent BK-Tree + self._persistent_bktree_add(txn, hash_bytes) # noqa: E501 with QWriteLocker(self._hash_cache_lock): self._hash_cache[(dev_id, inode_key_bytes)] = (hash_value, mtime, path) return True + def _persistent_bktree_add(self, txn, hash_bytes): + root_hash = txn.get(b'__root__', db=self._bktree_db) + if root_hash is None: + txn.put(b'__root__', hash_bytes, db=self._bktree_db) + return + + curr_hash = root_hash + while True: + if curr_hash == hash_bytes: + return + dist = self._hamming_distance(hash_bytes, curr_hash) + children = self._decode_children(txn.get(curr_hash, db=self._bktree_db)) + + if dist in children: + curr_hash = children[dist] + else: + children[dist] = hash_bytes + txn.put(curr_hash, self._encode_children(children), db=self._bktree_db) + break + + def persistent_bktree_query(self, hash_str, max_dist): + """Busca en el árbol de disco hashes similares al proporcionado.""" + hash_bytes = self._hash_str_to_bytes(hash_str) + results = [] + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=False) as txn: + root = txn.get(b'__root__', db=self._bktree_db) + if not root: + return [] + candidates = [root] + while candidates: + curr = candidates.pop() + dist = self._hamming_distance(hash_bytes, curr) + if dist <= max_dist: + results.append((curr, dist)) + children = self._decode_children(txn.get(curr, db=self._bktree_db)) + for d, child_hash in children.items(): + if abs(dist - d) <= max_dist: + candidates.append(child_hash) + return results + + def regenerate_bktree(self, progress_callback=None): + """ + Regenerates the BK-Tree and reverse index from the hashes database. + Useful if index corruption is suspected. + """ + if not self._lmdb_env: + return False + + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=True) as txn: + # 1. Clear existing indices (keeps handles valid) + txn.drop(self._bktree_db, delete=False) + txn.drop(self._hash_to_files_db, delete=False) + + # 2. Iterate hashes and rebuild + cursor = txn.cursor(db=self._hash_db) + count = 0 + total = txn.stat(db=self._hash_db)['entries'] + hashes_added_to_tree = set() + + for file_id_bytes, value_bytes in cursor: + try: + val_str = value_bytes.decode('utf-8') + parts = val_str.split('|') + if not parts: + continue + + hash_str = parts[0] + hash_bytes = self._hash_str_to_bytes(hash_str) + + if hash_bytes not in hashes_added_to_tree: + self._persistent_bktree_add(txn, hash_bytes) + hashes_added_to_tree.add(hash_bytes) + + txn.put(hash_bytes, file_id_bytes, db=self._hash_to_files_db) + + count += 1 + if progress_callback and count % 100 == 0: + progress_callback(count, total) + except Exception as e: + logger.error(f"Error re-indexing {file_id_bytes}: {e}") + return True + + def get_files_for_hash(self, hash_bytes): + """Returns all files that share a hash using the reverse index. + """ # noqa: E501 + files = [] + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=False) as txn: + cursor = txn.cursor(db=self._hash_to_files_db) + if cursor.set_key(hash_bytes): + for file_id_bytes in cursor.iternext_dup(): + info = txn.get(file_id_bytes, db=self._hash_db) + if info: + parts = info.decode('utf-8').split('|') + if len(parts) >= 3: + fid_str = file_id_bytes.decode('utf-8') + dev = int(fid_str.split('-')[0]) + inode = bytes.fromhex(fid_str.split('-')[1]) + files.append((parts[2], dev, inode)) + return files + + def check_reverse_index_integrity(self): + """ + Performs a diagnostic of the reverse index. + Returns a dictionary with the count of total and orphaned entries. + """ + stats = {'total': 0, 'orphans': 0, 'mismatches': 0} + if not self._lmdb_env: + return stats + + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=False) as txn: + cursor = txn.cursor(db=self._hash_to_files_db) + for hash_bytes, file_id_bytes in cursor: + stats['total'] += 1 + info = txn.get(file_id_bytes, db=self._hash_db) + if not info: + stats['orphans'] += 1 + continue + + try: + stored_hash_str = info.decode('utf-8').split('|')[0] + if self._hash_str_to_bytes(stored_hash_str) != hash_bytes: + stats['mismatches'] += 1 + except Exception: + stats['orphans'] += 1 + return stats + + def prune_reverse_index_orphans(self): + """Removes orphaned records from the reverse index surgically + without rebuilding the entire tree. + """ + removed = 0 + if not self._lmdb_env: + return 0 + + with QMutexLocker(self._db_lock): + with self._lmdb_env.begin(write=True) as txn: + cursor = txn.cursor(db=self._hash_to_files_db) + for hash_bytes, file_id_bytes in cursor: + info = txn.get(file_id_bytes, db=self._hash_db) + should_delete = False + if not info: + should_delete = True + else: + stored_hash_str = info.decode('utf-8').split('|')[0] + if self._hash_str_to_bytes(stored_hash_str) != hash_bytes: + should_delete = True + + if should_delete: + cursor.delete() + removed += 1 + return removed + + @staticmethod + def _decode_children(data): + if not data: + return {} + res = {} + for i in range(0, len(data), 9): + res[data[i]] = data[i+1:i+9] + return res + + @staticmethod + def _encode_children(children_dict): + res = bytearray() + for d, h in children_dict.items(): + res.append(d) + res.extend(h) + return bytes(res) + def remove_hash_for_path(self, path, clear_relationships=True): """ Removes the hash entry for a path. @@ -273,6 +466,21 @@ class DuplicateCache(QObject): with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes) + + # Safeguard: Do not proceed if identity information is invalid + if dev_id == 0 or not inode_key_bytes: + return False + + # Limpiar el índice inverso antes de borrar la entrada principal + val_bytes = txn.get(lmdb_key, db=self._hash_db) + if val_bytes: + try: + parts = val_bytes.decode('utf-8').split('|') + h_bytes = self._hash_str_to_bytes(parts[0]) + txn.delete(h_bytes, lmdb_key, db=self._hash_to_files_db) + except Exception: + pass + txn.delete(lmdb_key, db=self._hash_db) with QWriteLocker(self._hash_cache_lock): @@ -309,12 +517,13 @@ class DuplicateCache(QObject): if not inode1 or not inode2: return False - exception_key = self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2) + exception_key = self._get_pair_lmdb_key_from_ids( + dev1, inode1, dev2, inode2) if not exception_key: return False # Store paths in value to make exception recovery independent of hash DB - ts = timestamp if timestamp is not None else int(time.time()) + ts = timestamp if timestamp is not None else int(time_module.time()) val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}" value = val_str.encode('utf-8') @@ -335,7 +544,8 @@ class DuplicateCache(QObject): if not inode1 or not inode2: return False - exception_key = self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2) + exception_key = self._get_pair_lmdb_key_from_ids( + dev1, inode1, dev2, inode2) if not exception_key: return False @@ -345,8 +555,8 @@ class DuplicateCache(QObject): def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None): - """Removes all entries involving a specific (dev, inode) pair from a pair-based - DB.""" + """Removes all entries involving a specific (dev, inode) pair from a + pair-based DB.""" if not self._lmdb_env: return @@ -355,7 +565,8 @@ class DuplicateCache(QObject): def do_remove(t): cursor = t.cursor(db=db_handle) keys_to_delete = [] - for key_bytes, _ in cursor: + for key_bytes, _ in cursor: # noqa: E501 + key_str = key_bytes.decode('utf-8') parts = key_str.split('-') if len(parts) < 4: @@ -386,7 +597,7 @@ class DuplicateCache(QObject): return False # Store paths in value to allow reconstruction without scanning - ts = timestamp if timestamp is not None else int(time.time()) + ts = timestamp if timestamp is not None else int(time_module.time()) val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}" value = val_str.encode('utf-8') @@ -408,14 +619,15 @@ class DuplicateCache(QObject): if not self._lmdb_env or self._pending_db is None or not pairs_data: return False - with QMutexLocker(self._db_lock): + with QMutexLocker(self._db_lock): # noqa: E501 + with self._lmdb_env.begin(write=True) as txn: for p1, p2, similarity, timestamp in pairs_data: key = self._get_pair_lmdb_key(p1, p2) if not key: continue - ts = timestamp if timestamp is not None else int(time.time()) + ts = timestamp if timestamp is not None else int(time_module.time()) sim_str = str(similarity) if similarity is not None else "" val_str = f"{p1}|{p2}|{sim_str}|{ts}" value = val_str.encode('utf-8') @@ -423,20 +635,24 @@ class DuplicateCache(QObject): return True def get_all_exceptions_set(self): - """Returns a set of canonical pairs (frozenset) marked as exceptions.""" + """Returns a set of canonical pairs (frozenset of inode-IDs) marked as + exceptions.""" exceptions = set() if not self._lmdb_env or self._exceptions_db is None: return exceptions with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._exceptions_db) - for _, value_bytes in cursor: - try: - parts = value_bytes.decode('utf-8').split('|') - if len(parts) >= 2: - exceptions.add(frozenset((parts[0], parts[1]))) - except Exception: - continue + for key_bytes, _ in cursor: + # Format is dev1-ino1-dev2-ino2. Since ino hex is 16 chars, + # we parse from parts knowing that IDs always end with hex. + parts = key_bytes.decode('utf-8').split('-') + # Handle potential negative dev_ids or other hyphenated parts + # Each ID is [device_part] + "-" + [16 hex chars inode] + if len(parts) >= 4: + id2 = f"{parts[-2]}-{parts[-1]}" + id1 = "-".join(parts[:-2]) + exceptions.add(frozenset((id1, id2))) return exceptions def get_all_pending_duplicates(self): @@ -445,17 +661,49 @@ class DuplicateCache(QObject): if not self._lmdb_env or self._pending_db is None: return results - keys_to_delete = [] - with QMutexLocker(self._db_lock): - with self._lmdb_env.begin(write=False) as txn: + keys_to_delete = [] # Keys to delete from pending DB + # List to mark exceptions outside the read transaction + links_to_ignore = [] # noqa: E501 + + with QMutexLocker(self._db_lock): # noqa: E501 + + with self._lmdb_env.begin(write=False) as txn: # noqa: E501 + cursor = txn.cursor(db=self._pending_db) for key, value_bytes in cursor: try: parts = value_bytes.decode('utf-8').split('|') - p1, p2 = parts[0], parts[1] + p1 = os.path.abspath(os.path.normpath(parts[0])) # noqa: E501 + p2 = os.path.abspath(os.path.normpath(parts[1])) + + # Prueba definitiva de identidad para symlinks/hardlinks + try: + if os.path.samefile(p1, p2) or \ + os.path.realpath(p1) == os.path.realpath(p2): + keys_to_delete.append(key) + # Move from pending to exception silently if now links + links_to_ignore.append((p1, p2)) # noqa: E501 + continue + except OSError: + # Si el archivo no existe, limpiar de pendientes + keys_to_delete.append(key) + pass + sim = int(parts[2]) if len(parts) > 2 and parts[2] else None - ts = int(parts[3]) if len(parts) > 3 else 0 + ts = int(parts[3]) if len(parts) > 3 else 0 # noqa: E501 if os.path.exists(p1) and os.path.exists(p2): + # Verificar si ya es una excepción conocida (por inodo) + # Check if it's already a known exception (by inode) + dev1, ino1 = self._get_inode_info(p1) + + dev2, ino2 = self._get_inode_info(p2) + if ino1 and ino2: + ex_key = self._get_pair_lmdb_key_from_ids( + dev1, ino1, dev2, ino2) + if txn.get(ex_key, db=self._exceptions_db): + keys_to_delete.append(key) + continue + results.append( DuplicateResult(p1, p2, None, False, sim, ts)) else: @@ -468,11 +716,16 @@ class DuplicateCache(QObject): try: with self._lmdb_env.begin(write=True) as txn: for k in keys_to_delete: - txn.delete(k, db=self._pending_db) + if txn.get(k, db=self._pending_db): + txn.delete(k, db=self._pending_db) logger.info(f"Cleaned up {len(keys_to_delete)} invalid " "pending duplicates (files deleted externally)") except Exception as e: - logger.error(f"Error cleaning up pending duplicates from DB: {e}") + logger.error( + f"Error cleaning up pending duplicates from DB: {e}") + + for p1, p2 in links_to_ignore: + self.mark_as_exception(p1, p2, True, similarity=100) return results @@ -489,7 +742,8 @@ class DuplicateCache(QObject): try: p1, p2 = None, None sim = None - ts = 0 + ts = 0 # noqa: E501 + val_str = value_bytes.decode('utf-8') if '|' in val_str: @@ -499,7 +753,8 @@ class DuplicateCache(QObject): p1, p2 = parts[0], parts[1] if len(parts) > 2 and parts[2]: sim = int(parts[2]) - if len(parts) > 3: + if len(parts) > 3: # noqa: E501 + ts = int(parts[3]) else: ts = int(os.path.getmtime(p1)) \ @@ -507,7 +762,8 @@ class DuplicateCache(QObject): if not p1 or not p2: # Legacy format fallback: lookup paths in hash db - key_str = key_bytes.decode('utf-8') + # Legacy format fallback: lookup paths in hash db + key_str = key_bytes.decode('utf-8') # noqa: E501 kp = key_str.split('-') if len(kp) == 4: k1, k2 = f"{kp[0]}-{kp[1]}".encode(), @@ -605,9 +861,11 @@ class DuplicateCache(QObject): # This happens if the file is renamed within the same filesystem. if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes): hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes) - if hash_value: + if hash_value: # noqa: E501 + self.add_hash_for_path(new_path, hash_value, mtime) self._update_pair_paths(old_path, new_path, self._pending_db) + self._update_pair_paths(old_path, new_path, self._exceptions_db) return True return False @@ -617,11 +875,13 @@ class DuplicateCache(QObject): # 3. Add a new entry with the new (dev, inode) and path, using the old hash. hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes) if hash_value: - # This removes the old (dev, inode) entry - self.remove_hash_for_path(old_path) + # Avoid deleting relationships when renaming (only update path) + # Avoid deleting relationships when renaming (only update path) + self.remove_hash_for_path(old_path, clear_relationships=False) # Adds new (dev, inode) entry self.add_hash_for_path(new_path, hash_value, mtime) self._update_pair_paths(old_path, new_path, self._pending_db) + self._update_pair_paths(old_path, new_path, self._exceptions_db) return True return False @@ -629,16 +889,39 @@ class DuplicateCache(QObject): """Updates stored paths in a pair-based DB value when a file is renamed.""" if not self._lmdb_env or db_handle is None: return + + # Optimization: Pre-calculate the inode ID to filter keys during iteration. + # Scanning only keys is much faster than retrieving associated values from disk. + # Scanning only keys is much faster than retrieving associated values from disk. + dev, ino = self._get_inode_info(old_path) + if not ino: + return + file_id_bytes = f"{dev}-{ino.hex()}".encode('utf-8') + with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: cursor = txn.cursor(db=db_handle) - for key, value_bytes in cursor: - val_str = value_bytes.decode('utf-8') - if old_path in val_str: - p1, p2 = val_str.split('|') - np1 = new_path if p1 == old_path else p1 - np2 = new_path if p2 == old_path else p2 - txn.put(key, f"{np1}|{np2}".encode('utf-8'), db=db_handle) + # iternext(values=False) iterates only the index keys, avoiding + # unnecessary I/O. + for key in cursor.iternext(values=False): # noqa: E501 + + if file_id_bytes in key: + value_bytes = txn.get(key, db=db_handle) + if not value_bytes: + continue + + val_str = value_bytes.decode('utf-8') + parts = val_str.split('|') # noqa: E501 + + if len(parts) >= 2: + p1, p2 = parts[0], parts[1] # noqa: E501 + if p1 == old_path or p2 == old_path: + # Actualizamos solo la ruta que ha cambiado + parts[0] = new_path if p1 == old_path else p1 + parts[1] = new_path if p2 == old_path else p2 + txn.put(key, + "|".join(parts).encode('utf-8'), + db=db_handle) class DuplicateDetector(QThread): @@ -666,23 +949,49 @@ class DuplicateDetector(QThread): self.wait() # Add this line def run(self): + # Asegurar que todas las rutas sean absolutas y normalizadas al inicio para + # total consistency + # total consistency + self.paths_to_scan = [os.path.abspath(os.path.normpath(p)) + for p in self.paths_to_scan] + + # Pre-load exceptions to avoid UnboundLocalError in Phase 1 and maintain + # consistency + exceptions_set = self.duplicate_cache.get_all_exceptions_set() + scan_paths_set = set(self.paths_to_scan) + total_files = len(self.paths_to_scan) found_duplicates = [] # To store frozenset((path1, path2)) for uniqueness - unique_duplicate_pairs = set() - last_update_time = 0 + unique_inode_pairs = set() + last_ui_update_time = 0 pool = self.pool_manager.get_pool() - # 1. Load existing pending duplicates from cache to avoid recalculation (unless - # force_full) - if not self.force_full: - pending = self.duplicate_cache.get_all_pending_duplicates() - for p in pending: - if p.path1 in self.paths_to_scan and p.path2 in self.paths_to_scan: - if p.similarity is None or p.similarity >= self.threshold: - found_duplicates.append(p) - unique_duplicate_pairs.add(frozenset((p.path1, p.path2))) + # 0. Deduplicate input paths by physical identity (symlinks to same file) + # This avoids comparing a file with its symlink or multiple symlinks to + # the same target. + unique_paths_to_scan = [] + canonical_paths = {} # Initialize canonical_paths dictionary + + for p in self.paths_to_scan: + try: + # Resolve symlinks to their real physical location + rp = os.path.realpath(p) + if rp not in canonical_paths: + canonical_paths[rp] = p + unique_paths_to_scan.append(p) + else: # noqa: E501 + + # It's a symlink or hardlink to something already in the list. + # We mark it as an exception (similarity 100) so it doesn't show up. + self.duplicate_cache.mark_as_exception( + p, canonical_paths[rp], True, similarity=100) + except OSError: + unique_paths_to_scan.append(p) + + total_unique = len(unique_paths_to_scan) + processed_initial = 0 # Convert similarity threshold (percentage) to Hamming distance distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100) @@ -693,12 +1002,10 @@ class DuplicateDetector(QThread): # 2. Phase 1: Hash Collection (Parallelized) path_to_hash = {} - dirty_hashes_objs = set() dirty_paths = set() paths_to_hash_parallel = [] - processed_initial = 0 - for i, path in enumerate(self.paths_to_scan): + for i, path in enumerate(unique_paths_to_scan): if not self._is_running: break @@ -709,22 +1016,32 @@ class DuplicateDetector(QThread): # Update UI during initial cache check (Phase 1 part A) processed_initial += 1 - cached_h = \ - self.duplicate_cache.get_hash_for_path(path, mtime, dev, inode) + cached_h, _, cached_path = \ + self.duplicate_cache.get_hash_info_for_path(path, mtime, dev, inode) - if cached_h: - path_to_hash[path] = (cached_h, dev, inode) + if cached_h: # noqa: E501 + + if cached_path == path: + path_to_hash[path] = (cached_h, dev, inode) + else: + # El archivo se movió o renombró: actualizar caché y marcar + # como sucio + self.duplicate_cache.add_hash_for_path( + path, cached_h, mtime, dev, inode) + dirty_paths.add(path) + path_to_hash[path] = (cached_h, dev, inode) else: dirty_paths.add(path) paths_to_hash_parallel.append((path, mtime, dev, inode)) - if time.perf_counter() - last_update_time > 0.05: + if time_module.perf_counter() - last_ui_update_time > 0.05: # Scale this part to 0-50% of the total bar - progress = int((processed_initial / total_files) * total_files) + # Scale this part to 0-50% of the total bar + progress = int((processed_initial / total_unique) * total_files) self.progress_update.emit( progress, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="...")) - last_update_time = time.perf_counter() + last_ui_update_time = time_module.perf_counter() except OSError: continue @@ -735,6 +1052,7 @@ class DuplicateDetector(QThread): new_hashes = {} sem = QSemaphore(0) + # Phase 1 part B: Parallel hashing for new/changed files # Phase 1 part B: Parallel hashing for new/changed files processed_hashing = total_files - len(paths_to_hash_parallel) @@ -753,141 +1071,177 @@ class DuplicateDetector(QThread): if not self._is_running: break processed_hashing += 1 - if time.perf_counter() - last_update_time > 0.03: + if time_module.perf_counter() - last_ui_update_time > 0.03: self.progress_update.emit( processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="...")) - last_update_time = time.perf_counter() + last_ui_update_time = time_module.perf_counter() for p, mtime, dev, inode in paths_to_hash_parallel: h = new_hashes.get(p) if h: path_to_hash[p] = (h, dev, inode) - dirty_hashes_objs.add(imagehash.hex_to_hash(h)) self.duplicate_cache.add_hash_for_path(p, h, mtime, dev, inode) + # Cargar duplicados pendientes existentes (a menos que sea force_full) + # Load existing pending duplicates (unless force_full) + if not self.force_full: + pending = self.duplicate_cache.get_all_pending_duplicates() + for p in pending: + # Normalize database paths to ensure match with scan set + np1, np2 = (os.path.abspath(os.path.normpath(p.path1)), + os.path.abspath(os.path.normpath(p.path2))) + + if np1 in scan_paths_set and np2 in scan_paths_set: + # If any of the files changed (is in dirty_paths), the pending + # result is invalid and must be ignored for recalculation. + if np1 in dirty_paths or np2 in dirty_paths: + continue + + # Omitir identidades físicas (enlaces) + # Skip physical identities (links) + # Skip physical identities (links) + try: + if np1 == np2 or os.path.samefile(np1, np2): + # Mover de pendiente a excepción silenciosamente si ahora + # son links + self.duplicate_cache.mark_as_exception( + np1, np2, True, similarity=100) + self.duplicate_cache.mark_as_pending(np1, np2, False) + continue + except OSError: + continue + + # Check if already marked as exception + # Check if already marked as exception + dev1, ino1 = self.duplicate_cache._get_inode_info(np1) + dev2, ino2 = self.duplicate_cache._get_inode_info(np2) + if ino1 and ino2: + id1, id2 = f"{dev1}-{ino1.hex()}", f"{dev2}-{ino2.hex()}" + if frozenset((id1, id2)) in exceptions_set: + # Si ya es una excepción, asegurar que no esté en pendientes + self.duplicate_cache.mark_as_pending(np1, np2, False) + continue + + if p.similarity is None or p.similarity >= self.threshold: # noqa: E501 + + # Usar las rutas normalizadas en el resultado + res = p._replace(path1=np1, path2=np2) + found_duplicates.append(res) + # Guardar inodos para evitar recalcular en Phase 2 + if ino1 and ino2: + unique_inode_pairs.add(frozenset((id1, id2))) + if not self._is_running: self.detection_finished.emit() return - # 3. Phase 2: Comparison (Optimized with BK-Tree) - hash_map = collections.defaultdict(list) - bk_tree = BKTree(lambda a, b: a - b) - - path_items = list(path_to_hash.items()) - total_items = len(path_items) - - for i, (p, (h_str, dev, inode)) in enumerate(path_items): - if not self._is_running: - break - - # Sub-phase: Indexing hashes into the BK-Tree for comparison - if time.perf_counter() - last_update_time > 0.05 \ - or i == 0 or i == total_items - 1: - # Scale Indexing to 50% - 75% range of the total bar - indexing_progress = int((i / total_items) * (total_files / 2)) \ - if total_items > 0 else 0 - self.progress_update.emit( - total_files + indexing_progress, total_files * 2, - UITexts.DUPLICATE_MSG_ANALYZING.format(filename="...")) - last_update_time = time.perf_counter() - - h_obj = imagehash.hex_to_hash(h_str) - if h_obj not in hash_map: - bk_tree.add(h_obj) - hash_map[h_obj].append((p, dev, inode)) - if self.force_full or p in dirty_paths: - dirty_hashes_objs.add(h_obj) - - # Optimization: Only query the tree for hashes associated with new or modified - # files. - # This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were - # handled in previous runs. - hashes_to_query = list(dirty_hashes_objs) \ - if not self.force_full else list(hash_map.keys()) - total_queries = len(hashes_to_query) - pending_db_updates = [] - - # Pre-load exceptions into memory to avoid thousands of DB lookups - self.progress_update.emit( - total_files, total_files * 2, - UITexts.DUPLICATE_MSG_ANALYZING.format(filename="...")) - exceptions_set = self.duplicate_cache.get_all_exceptions_set() - - if total_queries == 0: - # Nothing new to analyze, jump to end of detection phase + # --- KEY OPTIMIZATION: EARLY EXIT --- + # If there are no new or modified files and no full analysis was forced, + # devolvemos los resultados que ya estaban en la base de datos de pendientes. + if not dirty_paths and not self.force_full: self.progress_update.emit( total_files * 2, total_files * 2, - UITexts.DUPLICATE_MSG_ANALYZING.format(filename="... (OK)")) + UITexts.DUPLICATE_FINISHED) + self.duplicates_found.emit(found_duplicates) + self.detection_finished.emit() + return - for i, h1 in enumerate(hashes_to_query): + # 3. Phase 2: Incremental Comparison using Persistent BK-Tree + # 3. Phase 2: Incremental Comparison using Persistent BK-Tree + paths_to_query = list(dirty_paths) if not self.force_full \ + else unique_paths_to_scan + + total_queries = len(paths_to_query) + results_to_save = [] + + for i, p1 in enumerate(paths_to_query): if not self._is_running: break - items1 = hash_map[h1] + h1_data = path_to_hash.get(p1) + if not h1_data: + continue + h1_str, dev1, ino1 = h1_data - # Update progress more frequently during analysis phase - if time.perf_counter() - last_update_time > 0.05 \ - or i == 0 or i == total_queries - 1: - # Scale Comparison to 75% - 100% range - comparison_progress = int(((i + 1) / total_queries) - * (total_files / 2)) \ - if total_queries > 0 else (total_files / 2) + if time_module.perf_counter() - last_ui_update_time > 0.05: + # Scale Analysis progress to 50% - 100% range of the status bar + progress = total_files + int((i / total_queries) * total_files) self.progress_update.emit( - int(total_files * 1.5 + comparison_progress), total_files * 2, - UITexts.DUPLICATE_MSG_ANALYZING.format(filename="...")) - last_update_time = time.perf_counter() + progress, total_files * 2, + UITexts.DUPLICATE_MSG_ANALYZING.format( + filename=os.path.basename(p1))) + last_ui_update_time = time_module.perf_counter() - # Query tree for similar hashes - for h2, distance in bk_tree.query(h1, distance_threshold): - items2 = hash_map[h2] + # Query the persistent tree for similar hashes (direct from disk) + similar_hashes = self.duplicate_cache.persistent_bktree_query( + h1_str, distance_threshold) - for p1, dev1, ino1 in items1: - for p2, dev2, ino2 in items2: - if not self._is_running: - break - if (dev1, ino1) == (dev2, ino2): + for h2_bytes, distance in similar_hashes: + # Find all files sharing this similar hash (reverse index) + matches = self.duplicate_cache.get_files_for_hash(h2_bytes) + + for p2, dev2, ino2 in matches: + if not self._is_running: + break + + # Check if p2 is within the current scan scope to avoid + # results outside the folders the user is currently browsing. + if p2 not in scan_paths_set: + continue + + # 1. Check if it's exactly the same path (already normalized) + # 1. Check if it's exactly the same path (already normalized) + if p1 == p2: + continue + + # 2. Check memory caches for physical identity (Inodes) + # 2. Check memory caches for physical identity (Inodes) + id1, id2 = f"{dev1}-{ino1.hex()}", f"{dev2}-{ino2.hex()}" + inode_pair = frozenset((id1, id2)) + + if inode_pair in unique_inode_pairs or inode_pair in exceptions_set: + continue + + # 3. Absolute physical identity (pointers to the same object) + # 3. Absolute physical identity (pointers to the same object) + try: + if (dev1, ino1) == (dev2, ino2) or os.path.samefile(p1, p2): + # Silent identity (symlinks): mark and skip + self.duplicate_cache.mark_as_exception( + p1, p2, True, similarity=100) + exceptions_set.add(inode_pair) + unique_inode_pairs.add(inode_pair) continue + except OSError: + pass - # Optimization: Skip pair if BOTH were already verified - if not self.force_full \ - and p1 not in dirty_paths and p2 not in dirty_paths: - continue + # 4. Avoid duplicating pairs already processed in this session + # (A-B is the same as B-A) + # (A-B is the same as B-A) + if inode_pair in unique_inode_pairs: + continue - canonical = frozenset((p1, p2)) - if not self._is_running: - break + # 5. Calculate actual similarity + # 5. Calculate actual similarity + sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100) + if sim < self.threshold: + continue - if canonical not in unique_duplicate_pairs: - if canonical not in exceptions_set: - sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100) - ts = int(time.time()) - res = DuplicateResult(p1, p2, str(h1), False, sim, ts) - found_duplicates.append(res) - unique_duplicate_pairs.add(canonical) - - # Frequent UI heartbeat for large duplicate groups - if time.perf_counter() - last_update_time > 0.05: - comparison_progress = int(((i + 1) / total_queries) - * (total_files / 2)) - self.progress_update.emit( - int(total_files * 1.5 + comparison_progress), - total_files * 2, - UITexts.DUPLICATE_MSG_ANALYZING.format( - filename="...")) - last_update_time = time.perf_counter() - - # Collect for batch update to improve performance - pending_db_updates.append((p1, p2, sim, ts)) + ts = int(time_module.time()) + res = DuplicateResult(p1, p2, h1_str, False, sim, ts) + found_duplicates.append(res) + unique_inode_pairs.add(inode_pair) + results_to_save.append((p1, p2, sim, ts)) # Periodically flush pending updates to DB - if len(pending_db_updates) >= 50: - self.duplicate_cache.mark_as_pending_batch(pending_db_updates) - pending_db_updates = [] + if len(results_to_save) >= 50: + self.duplicate_cache.mark_as_pending_batch(results_to_save) + results_to_save = [] # Final flush of remaining updates - if pending_db_updates: - self.duplicate_cache.mark_as_pending_batch(pending_db_updates) + if results_to_save: + self.duplicate_cache.mark_as_pending_batch(results_to_save) self.duplicates_found.emit(found_duplicates) self.detection_finished.emit() diff --git a/metadatamanager.py b/metadatamanager.py index adf2277..71f93da 100644 --- a/metadatamanager.py +++ b/metadatamanager.py @@ -20,7 +20,8 @@ except ImportError: from utils import preserve_mtime -from constants import RATING_XATTR_NAME, XATTR_NAME +from constants import RATING_XATTR_NAME, XATTR_NAME, UITexts + logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 6bef09f..1ee5e09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bagheeraview" -version = "0.9.25" +version = "0.9.26" authors = [ { name = "Ignacio Serantes" } ] diff --git a/setup.py b/setup.py index 4000ca4..43e04e1 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name="bagheeraview", - version="0.9.25", + version="0.9.26", author="Ignacio Serantes", description="Bagheera Image Viewer - An image viewer for KDE with Baloo in mind", long_description="A fast image viewer built with PySide6, featuring search and "