v1.1.0

2026-05-09 10:26:57 +02:00
parent 3fb55ee4f3
commit 6207cab27a
7 changed files with 622 additions and 177 deletions
--- a/bagheera_search_lib/bagheera_search.py
+++ b/bagheera_search_lib/bagheera_search.py
@@ -10,9 +10,136 @@ import sys
 from pathlib import Path
 from typing import Dict, Any, Iterator, Optional, Union

-from baloo_tools import get_resolution
+from baloo_tools import (get_info, get_tags)
 from bagheera_query_parser_lib import parse_date

+from pyparsing import (
+    alphanums, one_of, infix_notation,
+    Group, opAssoc, ParserElement, QuotedString, Word
+)
+
+ParserElement.enable_packrat()
+
+
+def expression_contains_property(text):
+    """Tell whether *text* compares a field other than ``tags``.
+
+    A hit is a word that is not ``tags`` (checked case-insensitively),
+    optionally followed by spaces or tabs, and then one of the operators
+    ``>=``, ``<=``, ``!=``, ``=``, ``>``, ``<`` or ``:``.
+
+    Returns:
+        True when such a ``name <operator>`` pair occurs anywhere in *text*.
+    """
+    hit = re.search(
+        r"\b(?!tags\b)\w+[ \t]*(?:>=|<=|!=|=|>|<|:)",
+        text,
+        re.IGNORECASE,
+    )
+    return hit is not None
+
+
+def expression_contains_tags(text):
+    """Tell whether *text* compares the ``tags`` field.
+
+    A hit is the whole word ``tags`` (any letter case), optionally followed
+    by spaces or tabs, and then one of the operators ``>=``, ``<=``, ``!=``,
+    ``=``, ``>``, ``<`` or ``:``.
+
+    Returns:
+        True when such a ``tags <operator>`` pair occurs anywhere in *text*.
+    """
+    hit = re.search(
+        r"\btags\b[ \t]*(?:>=|<=|!=|=|>|<|:)",
+        text,
+        re.IGNORECASE,
+    )
+    return hit is not None
+
+
+class EvaluateExpression:
+    """Compile boolean filter expressions into reusable predicates.
+
+    An expression is made of conditions combined with ``NOT``, ``AND`` and
+    ``OR``.  A condition is either a triplet ``<operand> <operator> <operand>``
+    or a single term, which is evaluated as "``path`` contains term".  The
+    compiled predicate takes a dict of file data and returns a bool.
+    """
+
+    def __init__(self):
+        # Build the grammar once so every compile() call reuses it.
+        self.grammar = self._build_grammar()
+
+    def _compare_single(self, l_val, op, r_val):
+        """
+        Compare two individual values with the given operator.
+
+        Ordering operators (``>``, ``<``, ``>=``, ``<=``) compare as floats
+        when both sides convert, otherwise as strings.  ``=`` and ``!=``
+        compare the string forms, so a numeric field value matches the
+        string literal coming from the parsed expression.  ``:`` is a
+        case-insensitive "contains" test.
+
+        Returns:
+            The boolean result, or False for an unknown operator.
+        """
+        if op == ":":
+            return str(r_val).lower() in str(l_val).lower()
+
+        if op in (">", "<", ">=", "<="):
+            try:
+                # Prefer a numeric comparison when both sides are numbers.
+                curr_l, curr_r = float(l_val), float(r_val)
+            except (ValueError, TypeError):
+                # Fall back to string ordering if conversion fails.
+                curr_l, curr_r = str(l_val), str(r_val)
+        else:
+            # Equality works on string forms: expression literals are always
+            # strings, while field values may be ints or other types.
+            curr_l, curr_r = str(l_val), str(r_val)
+
+        if op == "=":
+            return curr_l == curr_r
+        if op == "!=":
+            return curr_l != curr_r
+        if op == ">":
+            return curr_l > curr_r
+        if op == "<":
+            return curr_l < curr_r
+        if op == ">=":
+            return curr_l >= curr_r
+        if op == "<=":
+            return curr_l <= curr_r
+        return False
+
+    def _compare(self, data, left_key, op, right_val):
+        """
+        Resolve both operands against *data* and compare them.
+
+        Keys of *data* are matched case-insensitively.  An operand that is
+        not a key of *data* is used as a literal.  When the left value is a
+        list, the condition holds if ANY item satisfies it.
+        """
+        # Normalize data keys to lowercase for case-insensitive lookup.
+        normalized_data = {k.lower(): v for k, v in data.items()}
+
+        # Left operand: field value, or the operand text itself if unknown.
+        l_val = normalized_data.get(left_key.lower(), left_key)
+
+        # Right operand: another field's value, or a literal.
+        r_val = normalized_data.get(str(right_val).lower(), right_val)
+
+        if isinstance(l_val, list):
+            return any(self._compare_single(item, op, r_val) for item in l_val)
+
+        return self._compare_single(l_val, op, r_val)
+
+    def _build_grammar(self):
+        """
+        Define the pyparsing grammar for the expression engine.
+
+        Returns:
+            A parser whose single result token is a callable taking the
+            data dict and returning a bool.
+        """
+        # Local import: Keyword is not part of the module-level import list.
+        from pyparsing import Keyword
+
+        ident_chars = alphanums + "_./\\"
+        operators = one_of(">= <= != = > < :")
+        identifier = Word(ident_chars)
+        quoted_string = QuotedString("'") | QuotedString('"')
+        operand = quoted_string | identifier
+
+        # Basic condition (e.g., "width > 100" or "word").
+        condition = Group((operand + operators + operand) | operand)
+
+        # Convert each condition's tokens into an executable function.
+        condition.set_parse_action(lambda t: self._create_evaluator_func(t[0]))
+
+        # Logical operators must be whole words: a plain string would match
+        # the prefix of terms such as "NOTES" or "ANDROID" and split them.
+        op_not = Keyword("NOT", ident_chars=ident_chars)
+        op_and = Keyword("AND", ident_chars=ident_chars)
+        op_or = Keyword("OR", ident_chars=ident_chars)
+
+        return infix_notation(
+            condition,
+            [
+                (op_not, 1, opAssoc.RIGHT, lambda t: (
+                    lambda data: not t[0][1](data))),
+                # t[0] interleaves evaluators with the operator strings;
+                # only the callables are evaluated.
+                (op_and, 2, opAssoc.LEFT, lambda t: (
+                    lambda data: all(f(data) for f in t[0] if callable(f)))),
+                (op_or, 2, opAssoc.LEFT, lambda t: (
+                    lambda data: any(f(data) for f in t[0] if callable(f)))),
+            ],
+        )
+
+    def _create_evaluator_func(self, tokens):
+        """
+        Create a closure that captures tokens and waits for the data dict.
+        """
+        if len(tokens) == 1:
+            # Single term -> path CONTAINS term.
+            return lambda data: self._compare(data, 'path', ':', tokens[0])
+        # Explicit triplet (key, operator, value).
+        return lambda data: self._compare(data, tokens[0], tokens[1], tokens[2])
+
+    def compile(self, expression):
+        """
+        Parse the expression once and return a reusable function.
+
+        On any parse failure the error is printed and a predicate that
+        always returns False is returned instead of raising.
+        """
+        try:
+            return self.grammar.parse_string(expression, parse_all=True)[0]
+        except Exception as e:
+            print(f"Compilation Error: {e}")
+            # Fallback: a predicate that never matches.
+            return lambda data: False
+

 class BagheeraSearcher:
    """Class to handle Baloo searches and interact with the C wrapper."""
@@ -69,84 +196,8 @@ class BagheeraSearcher:

        return lib

-    def check_keywords(
-        self, text: str, query: str, file_path: str = "", file_id: int = 0
-    ) -> bool:
-        """
-        Evaluates if a text meets a logical query.
-        Supports: AND, OR, ( ), dimensions (width=height, etc.), and shapes.
-        """
-        if file_path:
-            try:
-                w, h = get_resolution(file_id)
-            except Exception:
-                w, h = -1, -1
-
-            def replace_dim(match: re.Match) -> str:
-                if w <= 0 or h <= 0:
-                    return "__false__"
-
-                s = match.group(0).upper()
-                if "PORTRAIT" in s:
-                    return "__true__" if w < h else "__false__"
-                if "LANDSCAPE" in s:
-                    return "__true__" if w > h else "__false__"
-                if "SQUARE" in s:
-                    return "__true__" if w == h else "__false__"
-
-                op = match.group(1)
-                ops_map = {
-                    "=": w == h,
-                    ">": w > h,
-                    "<": w < h,
-                    ">=": w >= h,
-                    "<=": w <= h,
-                    "!=": w != h,
-                }
-                return "__true__" if ops_map.get(op, False) else "__false__"
-
-            query = re.sub(
-                r"\b(PORTRAIT|LANDSCAPE|SQUARE)\b",
-                replace_dim,
-                query,
-                flags=re.IGNORECASE,
-            )
-            query = re.sub(
-                r"\bwidth\s*(<=|>=|!=|<|>|=)\s*height\b",
-                replace_dim,
-                query,
-                flags=re.IGNORECASE,
-            )
-
-        text = text.lower()
-        query = re.sub(r"(?<=\w)\s+(?=\w)", " AND ", query)
-
-        tokens = re.findall(r"\(|\)|OR|AND|[^\s()]+", query)
-        regex_parts = []
-
-        for t in tokens:
-            if t in ("(", ")"):
-                regex_parts.append(t)
-            elif t == "OR":
-                regex_parts.append("|")
-            elif t == "AND":
-                continue
-            elif t == "__true__":
-                regex_parts.append("(?=.*)")
-            elif t == "__false__":
-                regex_parts.append("(?!)")
-            else:
-                regex_parts.append(rf"(?=.*{re.escape(t)})")
-
-        final_regex = "".join(regex_parts).lower()
-
-        try:
-            return bool(re.search(f"^{final_regex}.*", text, re.DOTALL))
-        except re.error:
-            return False
-
    def get_baloo_info(self, file_path: str) -> Dict[str, str]:
-        """Retrieves properties for a specific file from Baloo."""
+        """Extract properties for a specific file directly from file."""
        result = self.baloo_lib.get_file_properties(file_path.encode("utf-8"))
        if not result:
            return {}
@@ -181,6 +232,8 @@ class BagheeraSearcher:
        options: Dict[str, Any],
        search_opts: Dict[str, Any],
        files_count: int,
+        exclude_evaluator: Any,
+        exclude_sources: Dict[str, bool]
    ) -> Iterator[Dict[str, Any]]:
        """Executes a recursive search yielded item by item."""
        options["query"] = query_text
@@ -195,15 +248,20 @@ class BagheeraSearcher:
                continue

            self.ids_processed.add(file_id)
-            rec_exclude = search_opts.get("recursive_exclude")

-            if not rec_exclude or not self.check_keywords(
-                item["path"], rec_exclude, item["path"], file_id
-            ):
+            if exclude_evaluator:
+                file_info = {'path': item["path"]}
+                if exclude_sources.get('properties'):
+                    file_info = file_info | get_info(file_id)
+                if exclude_sources.get('tags'):
+                    file_info = file_info | get_tags(file_id)
+            else:
+                file_info = None
+
+            if not file_info or not exclude_evaluator(file_info):
                if files_count >= search_opts.get("offset", 0):
                    search_opts["limit"] -= 1
                    yield item
-
                files_count += 1

    def search(
@@ -215,6 +273,30 @@ class BagheeraSearcher:
        """
        Main search generator. Yields file dictionaries.
        """
+        if search_opts['exclude']:
+            ee = EvaluateExpression()
+            exclude_evaluator = ee.compile(search_opts['exclude'])
+            exclude_sources = {}
+            if expression_contains_property(search_opts['exclude']):
+                exclude_sources['properties'] = True
+            if expression_contains_tags(search_opts['exclude']):
+                exclude_sources['tags'] = True
+        else:
+            exclude_evaluator = None
+            exclude_sources = {}
+
+        if search_opts['recursive_exclude']:
+            ee = EvaluateExpression()
+            recurse_exclude_evaluator = ee.compile(search_opts['recursive_exclude'])
+            recurse_exclude_sources = {}
+            if expression_contains_property(search_opts['recursive_exclude']):
+                recurse_exclude_sources['properties'] = True
+            if expression_contains_tags(search_opts['recursive_exclude']):
+                recurse_exclude_sources['tags'] = True
+        else:
+            recurse_exclude_evaluator = None
+            recurse_exclude_sources = {}
+
        main_options["query"] = parse_date(query_text)
        files = self._execute_query(main_options)

@@ -241,15 +323,22 @@ class BagheeraSearcher:
                continue

            self.ids_processed.add(file_id)
-            exclude_pattern = search_opts.get("exclude")

-            if not exclude_pattern or not self.check_keywords(
-                item["path"], exclude_pattern, item["path"], file_id
-            ):
+            if exclude_evaluator:
+                file_info = {'path': item["path"]}
+                if exclude_sources.get('properties'):
+                    file_info = file_info | get_info(file_id)
+                if exclude_sources.get('tags'):
+                    file_info = file_info | get_tags(file_id)
+            else:
+                file_info = None
+
+            if not file_info or not exclude_evaluator(file_info):
                if is_recursive:
                    main_options["directory"] = item["path"]
                    yield from self.search_recursive(
-                        query_text, main_options, search_opts, files_count
+                        query_text, main_options, search_opts, files_count,
+                        recurse_exclude_evaluator, recurse_exclude_sources
                    )
                else:
                    yield item