# -*- coding: utf-8 -*-"""Field type definitions for sayt2.Seven field types covering all search/store/sort use cases. Each type is apydantic ``BaseModel`` with validation, serialization, and discriminated-unionsupport for polymorphic deserialization from config files.Field types carry **no dependency on tantivy** — the mapping from fielddefinitions to tantivy schema objects lives in ``dataset.py``."""from__future__importannotationsimporthashlibimportjsonimporttypingasTfrompydanticimportBaseModelfrompydanticimportFieldfrompydanticimportmodel_validatorfrom.constantsimportFieldTypeEnumfrom.constantsimportNumericKindEnumfrom.constantsimportTokenizerEnum# --- base -------------------------------------------------------------------
class BaseField(BaseModel):
    """
    Shared pydantic base model for every field definition.

    Concrete field classes narrow ``type`` to a ``T.Literal["..."]`` value.
    That literal is what lets pydantic's discriminated union pick the right
    class when it rebuilds a field from a plain dict.

    Attributes declared here:

    - ``type``: string tag naming the kind of field (no default at this level).
    - ``name``: field name; must contain at least one character.
    - ``stored``: boolean flag, ``True`` unless overridden.
    """

    # Plain ``str`` here; each subclass re-declares it as a Literal.
    type: str
    # ``min_length=1`` rejects the empty string.
    name: str = Field(min_length=1)
    stored: bool = True
# --- text family -------------------------------------------------------------
class StoredField(BaseField):
    """
    Field whose value is only stored.

    It is not indexed, so it can be neither searched nor sorted on.
    """

    # Discriminator tag for this class: the literal string "stored".
    type: T.Literal["stored"] = FieldTypeEnum.STORED.value
class KeywordField(BaseField):
    """
    Field matched exactly, e.g. an id, a tag, or an enum value.

    Backed by the ``raw`` tokenizer, which treats the whole field value as a
    single token.
    """

    # Discriminator tag for this class: the literal string "keyword".
    type: T.Literal["keyword"] = FieldTypeEnum.KEYWORD.value
    # Must be strictly positive; 1.0 when not given.
    boost: float = Field(default=1.0, gt=0)
class TextField(BaseField):
    """
    Field for full-text search.

    The tokenizer is either ``default`` (Unicode-aware word boundaries) or
    ``en_stem`` (English stemming).
    """

    # Discriminator tag for this class: the literal string "text".
    type: T.Literal["text"] = FieldTypeEnum.TEXT.value
    # Only the two listed tokenizer names pass validation.
    tokenizer: T.Literal["default", "en_stem"] = TokenizerEnum.DEFAULT.value
    # Must be strictly positive; 1.0 when not given.
    boost: float = Field(default=1.0, gt=0)
class NgramField(BaseField):
    """
    Field for search-as-you-type.

    An ngram inverted index is built for it, so every substring whose length
    falls in ``[min_gram, max_gram]`` is a valid query token.

    Validation rules enforced by this class:

    - ``min_gram`` and ``max_gram`` must each be >= 1.
    - ``max_gram`` must not be smaller than ``min_gram``.
    - ``boost`` must be > 0.
    """

    # Discriminator tag for this class: the literal string "ngram".
    type: T.Literal["ngram"] = FieldTypeEnum.NGRAM.value
    min_gram: int = Field(default=2, ge=1)
    max_gram: int = Field(default=6, ge=1)
    prefix_only: bool = False
    lowercase: bool = True
    boost: float = Field(default=1.0, gt=0)

    @model_validator(mode="after")
    def _max_gte_min(self) -> NgramField:
        """
        Cross-field check run after the per-field validation.

        :returns: the instance itself when the gram bounds are consistent.
        :raises ValueError: when ``max_gram`` is less than ``min_gram``.
        """
        bounds_are_ordered = self.max_gram >= self.min_gram
        if bounds_are_ordered:
            return self
        raise ValueError(
            f"max_gram ({self.max_gram}) must be >= min_gram ({self.min_gram})"
        )
# --- numeric family ----------------------------------------------------------
class NumericField(BaseField):
    """
    Field holding a number.

    The defaults (``indexed=False``, ``fast=True``) make it sort-only, which
    is the typical use case for rating/year columns.
    """

    # Discriminator tag for this class: the literal string "numeric".
    type: T.Literal["numeric"] = FieldTypeEnum.NUMERIC.value
    # One of "i64", "u64" or "f64".
    kind: T.Literal["i64", "u64", "f64"] = NumericKindEnum.I64.value
    indexed: bool = False
    fast: bool = True
class DatetimeField(BaseField):
    """
    Field holding a datetime, backed by tantivy's date type.

    Both ``indexed`` and ``fast`` default to ``True``.
    """

    # Discriminator tag for this class: the literal string "datetime".
    type: T.Literal["datetime"] = FieldTypeEnum.DATETIME.value
    indexed: bool = True
    fast: bool = True
# --- union & helpers ---------------------------------------------------------
# NOTE(review): ``BooleanField`` is a member of this union but its definition
# is not visible in this part of the file — confirm it is defined above this
# statement, otherwise evaluating the union raises ``NameError`` at import.
T_Field = T.Annotated[
    T.Union[
        StoredField,
        KeywordField,
        TextField,
        NgramField,
        NumericField,
        DatetimeField,
        BooleanField,
    ],
    # The ``type`` attribute selects which member class to instantiate.
    Field(discriminator="type"),
]
"""
Discriminated union over every field type, keyed on ``type``.

Pair it with ``TypeAdapter`` to deserialize a field polymorphically::

    from pydantic import TypeAdapter

    adapter = TypeAdapter(T_Field)
    field = adapter.validate_python({"type": "ngram", "name": "title"})
"""
def fields_schema_hash(fields: list[T_Field]) -> str:  # type: ignore[type-arg]
    """
    Compute a deterministic fingerprint for a list of field definitions.

    The result is used as part of cache keys, so a schema change
    automatically invalidates stale caches.

    :param fields: field definitions; each must provide
        ``model_dump(exclude_none=True)`` returning a JSON-serializable dict.
    :returns: the first 16 hex characters of the SHA-256 digest of the
        serialized fields.
    """
    # One JSON document per field; sorted keys make the text independent of
    # attribute declaration order.
    serialized = [
        json.dumps(definition.model_dump(exclude_none=True), sort_keys=True)
        for definition in fields
    ]
    # Fields are joined in list order, so reordering them changes the hash.
    joined = "|".join(serialized)
    full_digest = hashlib.sha256(joined.encode()).hexdigest()
    return full_digest[:16]