diff --git a/pyobvector/__init__.py b/pyobvector/__init__.py index d6033c7..1766590 100644 --- a/pyobvector/__init__.py +++ b/pyobvector/__init__.py @@ -38,6 +38,7 @@ * st_astext GIS function: return a Point in human-readable format * FtsParser Text Parser Type for Full Text Search * FtsIndexParam Full Text Search index parameter +* make_analyzer_properties Build parser_properties string for built-in analyzer parser * MatchAgainst Full Text Search clause """ @@ -104,5 +105,6 @@ "OceanBase", "FtsParser", "FtsIndexParam", + "make_analyzer_properties", "MatchAgainst", ] diff --git a/pyobvector/client/__init__.py b/pyobvector/client/__init__.py index 1a3a772..0e946c9 100644 --- a/pyobvector/client/__init__.py +++ b/pyobvector/client/__init__.py @@ -32,6 +32,7 @@ * ObSubKeyPartition Specify Key subpartition info * FtsParser Text Parser Type for Full Text Search * FtsIndexParam Full Text Search index parameter +* make_analyzer_properties Build parser_properties string for built-in analyzer parser """ import os @@ -44,7 +45,7 @@ from .schema_type import DataType from .collection_schema import FieldSchema, CollectionSchema from .partitions import * -from .fts_index_param import FtsParser, FtsIndexParam +from .fts_index_param import FtsParser, FtsIndexParam, make_analyzer_properties def _resolve_password(password: str) -> str: @@ -114,4 +115,5 @@ def SeekdbRemoteClient( "ObSubKeyPartition", "FtsParser", "FtsIndexParam", + "make_analyzer_properties", ] diff --git a/pyobvector/client/fts_index_param.py b/pyobvector/client/fts_index_param.py index 813cd7e..42fdf40 100644 --- a/pyobvector/client/fts_index_param.py +++ b/pyobvector/client/fts_index_param.py @@ -1,5 +1,6 @@ """A module to specify fts index parameters""" +import json from enum import Enum @@ -11,6 +12,7 @@ class FtsParser(Enum): NGRAM2 = 2 # NGRAM2 parser (supported from V4.3.5 BP2+) BASIC_ENGLISH = 3 # Basic English parser JIEBA = 4 # jieba parser + ANALYZER = 5 # analyzer parser with PARSER_PROPERTIES support 
class FtsIndexParam: @@ -19,8 +21,9 @@ class FtsIndexParam: Args: index_name: Index name field_names: List of field names to create full-text index on - parser_type: Parser type, can be FtsParser enum or string (for custom parsers) - If None, uses default Space parser + parser_properties: Content placed inside PARSER_PROPERTIES = (...) in the DDL. + When set and parser_type is not None, the clause is appended. + Required for FtsParser.ANALYZER; optional for others. """ def __init__( @@ -28,10 +31,12 @@ def __init__( index_name: str, field_names: list[str], parser_type: FtsParser | str | None = None, + parser_properties: str | None = None, ): self.index_name = index_name self.field_names = field_names self.parser_type = parser_type + self.parser_properties = parser_properties def param_str(self) -> str | None: """Convert parser type to string format for SQL.""" @@ -40,6 +45,15 @@ def param_str(self) -> str | None: if isinstance(self.parser_type, str): # Custom parser name (e.g., "thai_ftparser") + if ( + self.parser_type.lower() == "analyzer" + and self.parser_properties is None + ): + raise ValueError( + 'parser_type "analyzer" requires parser_properties ' + "(OceanBase rejects WITH PARSER analyzer without PARSER_PROPERTIES). " + 'Example value: analysis = \'{"analyzer": "standard"}\'' + ) return self.parser_type.lower() if isinstance(self.parser_type, FtsParser): @@ -53,6 +67,14 @@ def param_str(self) -> str | None: return "beng" if self.parser_type == FtsParser.JIEBA: return "jieba" + if self.parser_type == FtsParser.ANALYZER: + if self.parser_properties is None: + raise ValueError( + "FtsParser.ANALYZER requires parser_properties " + "(OceanBase rejects WITH PARSER analyzer without PARSER_PROPERTIES). 
" + 'Example value: analysis = \'{"analyzer": "standard"}\'' + ) + return "analyzer" # Raise exception for unrecognized FtsParser enum values raise ValueError(f"Unrecognized FtsParser enum value: {self.parser_type}") @@ -63,6 +85,8 @@ def __iter__(self): yield "field_names", self.field_names if self.parser_type: yield "parser_type", self.parser_type + if self.parser_properties is not None: + yield "parser_properties", self.parser_properties def __str__(self): return str(dict(self)) @@ -74,3 +98,16 @@ def __eq__(self, other: object) -> bool: if isinstance(other, dict): return dict(self) == other return False + + +def make_analyzer_properties(analyzer_type: str = "standard") -> str: + """Build the parser_properties string for a built-in analyzer parser. + + Args: + analyzer_type: Analyzer name. Defaults to "standard". + + Returns: + A string suitable for FtsIndexParam(parser_properties=...), e.g. + ``analysis = '{"analyzer": "standard"}'`` + """ + return f"analysis = '{json.dumps({'analyzer': analyzer_type})}'" diff --git a/pyobvector/client/ob_vec_client.py b/pyobvector/client/ob_vec_client.py index 138af3c..be1f192 100644 --- a/pyobvector/client/ob_vec_client.py +++ b/pyobvector/client/ob_vec_client.py @@ -144,14 +144,16 @@ def create_table_with_index_params( vidx.create(self.engine, checkfirst=True) # create fts indexes if fts_idxs is not None: - for fts_idx in fts_idxs: + for fts_idx_param in fts_idxs: idx_cols = [ - table.c[field_name] for field_name in fts_idx.field_names + table.c[field_name] + for field_name in fts_idx_param.field_names ] fts_idx = FtsIndex( - fts_idx.index_name, - fts_idx.param_str(), + fts_idx_param.index_name, + fts_idx_param.param_str(), *idx_cols, + parser_properties=fts_idx_param.parser_properties, ) fts_idx.create(self.engine, checkfirst=True) @@ -227,6 +229,7 @@ def create_fts_idx_with_fts_index_param( fts_idx_param.index_name, fts_idx_param.param_str(), *idx_cols, + parser_properties=fts_idx_param.parser_properties, ) 
def __init__(
    self,
    name,
    fts_parser: str | None,
    *column_names,
    parser_properties: str | None = None,
    **kw,
):
    """Initialise the full-text index schema object.

    Args:
        name: Index name.
        fts_parser: Parser name; string values are lower-cased before being
            stored. ``None`` is stored unchanged.
        *column_names: Columns to index, forwarded to the parent constructor.
        parser_properties: Text stored for the PARSER_PROPERTIES clause.
        **kw: Extra keyword arguments forwarded to the parent constructor.

    Raises:
        ValueError: If the normalised parser is "analyzer" and
            ``parser_properties`` is None.
    """
    # Normalise case up front so the analyzer check below is case-insensitive.
    parser_name = fts_parser.lower() if isinstance(fts_parser, str) else fts_parser

    if parser_name == "analyzer" and parser_properties is None:
        message = (
            'FtsIndex with fts_parser="analyzer" requires parser_properties '
            "(OceanBase rejects WITH PARSER analyzer without PARSER_PROPERTIES). "
            'Example value: analysis = \'{"analyzer": "standard"}\''
        )
        raise ValueError(message)

    # Set both attributes before delegating to the parent constructor.
    self.fts_parser = parser_name
    self.parser_properties = parser_properties
    super().__init__(name, *column_names, **kw)
class FtsAnalyzerCompilationTest(unittest.TestCase):
    """Unit tests for ANALYZER parser SQL compilation — no DB connection required."""

    @staticmethod
    def _build_table() -> Table:
        """Return a throwaway ``articles`` table attached to a fresh MetaData."""
        return Table(
            "articles",
            MetaData(),
            Column("id", Integer, primary_key=True),
            Column("body", TEXT),
        )

    def _build_index(self, parser_properties: str) -> FtsIndex:
        """Return an analyzer FtsIndex on ``articles.body`` with the given properties."""
        return FtsIndex(
            "ft_idx_body",
            "analyzer",
            self._build_table().c["body"],
            parser_properties=parser_properties,
        )

    def _assert_parser_properties_clause(self, sql: str, props: str) -> None:
        """Assert that *sql* ends with a PARSER_PROPERTIES clause wrapping *props*.

        The check ignores whitespace around ``=`` so that it pins the clause
        content rather than the compiler's cosmetic spacing: both
        ``PARSER_PROPERTIES=(...)`` and ``PARSER_PROPERTIES = (...)`` pass.
        """
        _, keyword, clause = sql.partition("PARSER_PROPERTIES")
        self.assertTrue(keyword, f"PARSER_PROPERTIES clause missing from: {sql}")
        clause = clause.lstrip()
        self.assertTrue(
            clause.startswith("="),
            f"expected '=' after PARSER_PROPERTIES in: {sql}",
        )
        self.assertEqual(clause[1:].lstrip(), f"({props})")

    def test_fts_analyzer_param_str(self):
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type=FtsParser.ANALYZER,
            parser_properties='analysis = \'{"analyzer": "standard"}\'',
        )
        self.assertEqual(param.param_str(), "analyzer")

    def test_fts_analyzer_with_parser_properties(self):
        props = 'analysis = \'{"analyzer": "standard"}\''
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type=FtsParser.ANALYZER,
            parser_properties=props,
        )
        self.assertEqual(param.parser_properties, props)
        param_dict = dict(param)
        self.assertEqual(param_dict["parser_properties"], props)

    def test_fts_analyzer_sql_with_properties(self):
        props = 'analysis = \'{"analyzer": "standard"}\''
        idx = self._build_index(props)
        sql = compile_create_fts_index(CreateFtsIndex(idx), _MockCompiler())
        self.assertIn("WITH PARSER analyzer", sql)
        self._assert_parser_properties_clause(sql, props)

    def test_fts_analyzer_requires_parser_properties(self):
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type=FtsParser.ANALYZER,
        )
        with self.assertRaises(ValueError):
            param.param_str()

    def test_fts_parser_properties_not_tied_to_analyzer_type(self):
        # parser_properties can be used with any parser type, not only ANALYZER
        idx = FtsIndex(
            "ft_idx_body",
            "ngram",
            self._build_table().c["body"],
            parser_properties="token_size = 2",
        )
        sql = compile_create_fts_index(CreateFtsIndex(idx), _MockCompiler())
        self.assertIn("WITH PARSER ngram", sql)
        self._assert_parser_properties_clause(sql, "token_size = 2")

    def test_make_analyzer_properties_default(self):
        result = make_analyzer_properties()
        self.assertEqual(result, 'analysis = \'{"analyzer": "standard"}\'')

    def test_make_analyzer_properties_custom(self):
        result = make_analyzer_properties("ik_smart")
        self.assertEqual(result, 'analysis = \'{"analyzer": "ik_smart"}\'')

    def test_make_analyzer_properties_integrates_with_fts_index_param(self):
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type=FtsParser.ANALYZER,
            parser_properties=make_analyzer_properties(),
        )
        self.assertEqual(param.param_str(), "analyzer")
        self.assertIn('"analyzer": "standard"', param.parser_properties)

    def test_fts_index_direct_analyzer_requires_parser_properties(self):
        table = self._build_table()
        with self.assertRaises(ValueError):
            FtsIndex("ft_idx_body", "analyzer", table.c["body"])

    def test_fts_index_param_str_string_analyzer_requires_parser_properties(self):
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type="analyzer",
        )
        with self.assertRaises(ValueError):
            param.param_str()

    def test_fts_index_param_str_string_analyzer_with_parser_properties(self):
        props = 'analysis = \'{"analyzer": "standard"}\''
        param = FtsIndexParam(
            index_name="ft_idx_body",
            field_names=["body"],
            parser_type="analyzer",
            parser_properties=props,
        )
        self.assertEqual(param.param_str(), "analyzer")