Better status bar messages
This commit is contained in:
@@ -389,6 +389,45 @@ class DuplicateCache(QObject):
|
||||
txn.delete(key, db=self._pending_db)
|
||||
return True
|
||||
|
||||
def mark_as_pending_batch(self, pairs_data):
    """
    Store several pending-review pairs using one LMDB write transaction.

    pairs_data: list of (path1, path2, similarity, timestamp) tuples.
    A timestamp of None is replaced by the current epoch second, and a
    similarity of None is written as an empty field.

    Returns False when the LMDB environment or the pending database is
    unavailable, or when pairs_data is empty; True otherwise.
    """
    # Guard clauses, evaluated in the same order as a chained `or` would be.
    if not self._lmdb_env:
        return False
    if self._pending_db is None:
        return False
    if not pairs_data:
        return False

    # One lock acquisition and one write transaction cover the whole batch.
    with QMutexLocker(self._db_lock), \
            self._lmdb_env.begin(write=True) as txn:
        for first, second, score, stamp in pairs_data:
            pair_key = self._get_pair_lmdb_key(first, second)
            if pair_key:
                if stamp is None:
                    stamp = int(time.time())
                score_text = "" if score is None else str(score)
                # Stored record layout: path1|path2|similarity|timestamp
                record = f"{first}|{second}|{score_text}|{stamp}".encode('utf-8')
                txn.put(pair_key, record, db=self._pending_db)

    return True
|
||||
|
||||
def get_all_exceptions_set(self):
    """
    Return every pair stored in the exceptions database as a set.

    Each stored value is decoded as UTF-8 and split on '|'; the first two
    fields are used as the two members of the pair. Values that cannot be
    decoded, or that contain fewer than two fields, are skipped.

    Returns:
        A set of frozenset((field0, field1)) entries. The set is empty when
        the LMDB environment or the exceptions database is not available.
    """
    exceptions = set()
    if not self._lmdb_env or self._exceptions_db is None:
        return exceptions

    with QMutexLocker(self._db_lock):
        with self._lmdb_env.begin(write=False) as txn:
            cursor = txn.cursor(db=self._exceptions_db)
            for _, value_bytes in cursor:
                # Decoding is the only step that can fail for a bytes
                # value, so catch exactly that instead of hiding unrelated
                # errors behind a blanket `except Exception`.
                # NOTE(review): assumes the cursor yields bytes (py-lmdb
                # default for transactions opened without buffers=True).
                try:
                    text = value_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    continue

                # Only the first two fields are needed; cap the split.
                parts = text.split('|', 2)
                if len(parts) >= 2:
                    exceptions.add(frozenset((parts[0], parts[1])))
    return exceptions
|
||||
|
||||
def get_all_pending_duplicates(self):
|
||||
"""Retrieves all pending duplicate pairs from the database."""
|
||||
results = []
|
||||
@@ -646,13 +685,19 @@ class DuplicateDetector(QThread):
|
||||
dirty_hashes_objs = set()
|
||||
dirty_paths = set()
|
||||
paths_to_hash_parallel = []
|
||||
processed_initial = 0
|
||||
|
||||
for i, path in enumerate(self.paths_to_scan):
|
||||
if not self._is_running:
|
||||
break
|
||||
|
||||
for path in self.paths_to_scan:
|
||||
try:
|
||||
stat_info = os.stat(path)
|
||||
mtime = stat_info.st_mtime
|
||||
dev, inode = stat_info.st_dev, struct.pack('Q', stat_info.st_ino)
|
||||
|
||||
# Update UI during initial cache check (Phase 1 part A)
|
||||
processed_initial += 1
|
||||
cached_h = \
|
||||
self.duplicate_cache.get_hash_for_path(path, mtime, dev, inode)
|
||||
|
||||
@@ -661,18 +706,27 @@ class DuplicateDetector(QThread):
|
||||
else:
|
||||
dirty_paths.add(path)
|
||||
paths_to_hash_parallel.append((path, mtime, dev, inode))
|
||||
|
||||
if time.perf_counter() - last_update_time > 0.05:
|
||||
# Scale this part to 0-50% of the total bar
|
||||
progress = int((processed_initial / total_files) * total_files)
|
||||
self.progress_update.emit(
|
||||
progress, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_HASHING.format(filename=os.path.basename(path)))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
# Phase 1 starts with files already found in cache or skipped
|
||||
processed_hashing = total_files - len(paths_to_hash_parallel)
|
||||
|
||||
if paths_to_hash_parallel and self._is_running:
|
||||
batch_size = pool.maxThreadCount() * 2
|
||||
results_mutex = QMutex()
|
||||
new_hashes = {}
|
||||
sem = QSemaphore(0)
|
||||
|
||||
# Phase 1 part B: Parallel hashing for new/changed files
|
||||
processed_hashing = total_files - len(paths_to_hash_parallel)
|
||||
|
||||
for i in range(0, len(paths_to_hash_parallel), batch_size):
|
||||
if not self._is_running:
|
||||
break
|
||||
@@ -681,14 +735,14 @@ class DuplicateDetector(QThread):
|
||||
pool.start(HashWorker(
|
||||
p_data[0], self, new_hashes, results_mutex, sem))
|
||||
|
||||
for _ in range(len(current_batch)):
|
||||
for j in range(len(current_batch)):
|
||||
while not sem.tryAcquire(1, 100):
|
||||
if not self._is_running:
|
||||
break
|
||||
if not self._is_running:
|
||||
break
|
||||
processed_hashing += 1
|
||||
if time.perf_counter() - last_update_time > 0.05:
|
||||
if time.perf_counter() - last_update_time > 0.03:
|
||||
self.progress_update.emit(
|
||||
processed_hashing, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
|
||||
@@ -705,16 +759,26 @@ class DuplicateDetector(QThread):
|
||||
self.detection_finished.emit()
|
||||
return
|
||||
|
||||
# Signal phase transition to exactly 50%
|
||||
self.progress_update.emit(
|
||||
total_files, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
|
||||
# 3. Phase 2: Comparison (Optimized with BK-Tree)
|
||||
hash_map = collections.defaultdict(list)
|
||||
bk_tree = BKTree(lambda a, b: a - b)
|
||||
|
||||
for p, (h_str, dev, inode) in path_to_hash.items():
|
||||
path_items = list(path_to_hash.items())
|
||||
total_items = len(path_items)
|
||||
|
||||
for i, (p, (h_str, dev, inode)) in enumerate(path_items):
|
||||
if not self._is_running:
|
||||
break
|
||||
|
||||
# Sub-phase: Indexing hashes into the BK-Tree for comparison
|
||||
if time.perf_counter() - last_update_time > 0.05 or i == 0 or i == total_items - 1:
|
||||
# Scale Indexing to 50% - 75% range of the total bar
|
||||
indexing_progress = int((i / total_items) * (total_files / 2)) if total_items > 0 else 0
|
||||
self.progress_update.emit(
|
||||
total_files + indexing_progress, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
h_obj = imagehash.hex_to_hash(h_str)
|
||||
if h_obj not in hash_map:
|
||||
bk_tree.add(h_obj)
|
||||
@@ -729,6 +793,19 @@ class DuplicateDetector(QThread):
|
||||
hashes_to_query = list(dirty_hashes_objs) \
|
||||
if not self.force_full else list(hash_map.keys())
|
||||
total_queries = len(hashes_to_query)
|
||||
pending_db_updates = []
|
||||
|
||||
# Pre-load exceptions into memory to avoid thousands of DB lookups
|
||||
self.progress_update.emit(
|
||||
total_files, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
exceptions_set = self.duplicate_cache.get_all_exceptions_set()
|
||||
|
||||
if total_queries == 0:
|
||||
# Nothing new to analyze, jump to end of detection phase
|
||||
self.progress_update.emit(
|
||||
total_files * 2, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="... (OK)"))
|
||||
|
||||
for i, h1 in enumerate(hashes_to_query):
|
||||
if not self._is_running:
|
||||
@@ -736,12 +813,13 @@ class DuplicateDetector(QThread):
|
||||
|
||||
items1 = hash_map[h1]
|
||||
|
||||
if time.perf_counter() - last_update_time > 0.1:
|
||||
# Scale Phase 2 progress to the 50%-100% range
|
||||
phase2_progress = int(((i + 1) / total_queries) * total_files) \
|
||||
if total_queries > 0 else total_files
|
||||
# Update progress more frequently during analysis phase
|
||||
if time.perf_counter() - last_update_time > 0.05 or i == 0 or i == total_queries - 1:
|
||||
# Scale Comparison to 75% - 100% range
|
||||
comparison_progress = int(((i + 1) / total_queries) * (total_files / 2)) \
|
||||
if total_queries > 0 else (total_files / 2)
|
||||
self.progress_update.emit(
|
||||
total_files + phase2_progress, total_files * 2,
|
||||
int(total_files * 1.5 + comparison_progress), total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
@@ -764,15 +842,34 @@ class DuplicateDetector(QThread):
|
||||
canonical = frozenset((p1, p2))
|
||||
if not self._is_running:
|
||||
break
|
||||
|
||||
if canonical not in unique_duplicate_pairs:
|
||||
if not self.duplicate_cache.is_exception(p1, p2):
|
||||
if canonical not in exceptions_set:
|
||||
sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100)
|
||||
ts = int(time.time())
|
||||
res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
|
||||
found_duplicates.append(res)
|
||||
unique_duplicate_pairs.add(canonical)
|
||||
self.duplicate_cache.mark_as_pending(
|
||||
p1, p2, True, similarity=sim, timestamp=ts)
|
||||
|
||||
# Frequent UI heartbeat for large duplicate groups
|
||||
if time.perf_counter() - last_update_time > 0.05:
|
||||
phase2_progress = int(((i + 1) / total_queries) * total_files)
|
||||
self.progress_update.emit(
|
||||
total_files + phase2_progress, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename=os.path.basename(p1)))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
# Collect for batch update to improve performance
|
||||
pending_db_updates.append((p1, p2, sim, ts))
|
||||
|
||||
# Periodically flush pending updates to DB
|
||||
if len(pending_db_updates) >= 50:
|
||||
self.duplicate_cache.mark_as_pending_batch(pending_db_updates)
|
||||
pending_db_updates = []
|
||||
|
||||
# Final flush of remaining updates
|
||||
if pending_db_updates:
|
||||
self.duplicate_cache.mark_as_pending_batch(pending_db_updates)
|
||||
|
||||
self.duplicates_found.emit(found_duplicates)
|
||||
self.detection_finished.emit()
|
||||
|
||||
Reference in New Issue
Block a user