This commit is contained in:
Ignacio Serantes
2026-05-10 16:37:46 +02:00
parent 6207cab27a
commit af21672b1c
3 changed files with 166 additions and 72 deletions

View File

@@ -10,6 +10,7 @@ import lmdb
import os
import re
import sys
import unicodedata
from typing import Tuple
PROPERTIES_ID_MAP = {
@@ -100,6 +101,18 @@ PROPERTIES_ID_MAP = {
}
def normalize_text(text):
    """Return *text* without accents/diacritics, for string comparison.

    The input is decomposed with Unicode NFD, every nonspacing mark
    (general category ``Mn``) is dropped, and leading/trailing whitespace
    is stripped. Case is preserved; no lowercasing is applied.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if text:
        decomposed = unicodedata.normalize('NFD', text)
        # After NFD, an accented letter is a base character followed by
        # one or more combining marks; keeping only non-'Mn' characters
        # leaves the base letters.
        base_chars = [
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        ]
        return "".join(base_chars).strip()
    return ""
class BalooTools:
"""Class to interact directly with the Baloo LMDB index."""
@@ -214,11 +227,46 @@ class BalooTools:
for p in parts:
p = p.strip()
if p:
tag = p.removeprefix('TAG-').removeprefix('TA')
tags.append(tag)
""" 'TA' elements are tags normalized to lowercase
and stripped of accents/diacritics, while 'TAG'
elements are the original tags as they were added by
the user. We need to process both to ensure we can
match tags in a case-insensitive and
accent-insensitive way. But we only want to add the
original tags to the final result, not the
normalized ones, because the normalized ones do
not correctly handle tags with spaces and words with
fewer than three characters.
"""
if p.startswith('TAG-'):
tag = p.removeprefix('TAG-')
tags.append(tag)
return {'tags': tags}
# return {'tags': ",".join(tags)}
result_set = set(tags)
""" Must add individual parts of the tags to the result set
to be able to match them with queries like 'tags:callas'
or 'tags:maria' for tags "María Callas" or "Person/María
Callas". To maintain Baloo tag behaviour with spaces, it's
not possible to search for tags="María Callas"; one must
search for tags=María tags:Callas instead. Items with spaces
are not added, to avoid confusion."""
for item in tags:
parts = re.split(r'[ /\n\t]+', item)
for part in parts:
if part:
result_set.add(part)
normalize_part = normalize_text(part)
if normalize_part:
result_set.add(normalize_part)
tags = sorted(list(result_set))
if not tags:
return {}
else:
return {'tags': tags}
except lmdb.Error as e:
print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)