Source code for sayt2.fields

# -*- coding: utf-8 -*-

"""
Field type definitions for sayt2.

Seven field types covering all search/store/sort use cases.  Each type is a
pydantic ``BaseModel`` with validation, serialization, and discriminated-union
support for polymorphic deserialization from config files.

Field types carry **no dependency on tantivy** — the mapping from field
definitions to tantivy schema objects lives in ``dataset.py``.
"""

from __future__ import annotations

import hashlib
import json
import typing as T

from pydantic import BaseModel
from pydantic import Field
from pydantic import model_validator

from .constants import FieldTypeEnum
from .constants import NumericKindEnum
from .constants import TokenizerEnum

# --- base -------------------------------------------------------------------


[docs] class BaseField(BaseModel): """ Common base for all field types. Every subclass must override ``type`` with a ``T.Literal["..."]`` so that pydantic's discriminated union can reconstruct the correct class from a plain dict. """ type: str # overridden by each subclass as a Literal name: str = Field(min_length=1) stored: bool = True
# --- text family -------------------------------------------------------------
[docs] class StoredField(BaseField): """Store-only field. Not indexed, not searchable, not sortable.""" type: T.Literal["stored"] = FieldTypeEnum.STORED.value
[docs] class KeywordField(BaseField): """ Exact-match field (id, tag, enum). Uses the ``raw`` tokenizer under the hood — the entire field value is treated as one token. """ type: T.Literal["keyword"] = FieldTypeEnum.KEYWORD.value boost: float = Field(default=1.0, gt=0)
[docs] class TextField(BaseField): """ Full-text search field. Uses the ``default`` (Unicode-aware word boundary) or ``en_stem`` (English stemming) tokenizer. """ type: T.Literal["text"] = FieldTypeEnum.TEXT.value tokenizer: T.Literal["default", "en_stem"] = TokenizerEnum.DEFAULT.value boost: float = Field(default=1.0, gt=0)
[docs] class NgramField(BaseField): """ Search-as-you-type field. Builds an ngram inverted index so that any substring of length ``[min_gram, max_gram]`` is a valid query token. """ type: T.Literal["ngram"] = FieldTypeEnum.NGRAM.value min_gram: int = Field(default=2, ge=1) max_gram: int = Field(default=6, ge=1) prefix_only: bool = False lowercase: bool = True boost: float = Field(default=1.0, gt=0) @model_validator(mode="after") def _max_gte_min(self) -> NgramField: if self.max_gram < self.min_gram: raise ValueError( f"max_gram ({self.max_gram}) must be >= min_gram ({self.min_gram})" ) return self
# --- numeric family ----------------------------------------------------------
[docs] class NumericField(BaseField): """ Numeric field. Defaults to sort-only (``indexed=False, fast=True``) which is the typical use case for rating/year columns. """ type: T.Literal["numeric"] = FieldTypeEnum.NUMERIC.value kind: T.Literal["i64", "u64", "f64"] = NumericKindEnum.I64.value indexed: bool = False fast: bool = True
[docs] class DatetimeField(BaseField): """Datetime field backed by tantivy's date type.""" type: T.Literal["datetime"] = FieldTypeEnum.DATETIME.value indexed: bool = True fast: bool = True
[docs] class BooleanField(BaseField): """Boolean field.""" type: T.Literal["boolean"] = FieldTypeEnum.BOOLEAN.value indexed: bool = True
# --- union & helpers --------------------------------------------------------- T_Field = T.Annotated[ T.Union[ StoredField, KeywordField, TextField, NgramField, NumericField, DatetimeField, BooleanField, ], Field(discriminator="type"), ] """Discriminated union of all field types. Use with ``TypeAdapter`` for polymorphic deserialization:: from pydantic import TypeAdapter adapter = TypeAdapter(T_Field) field = adapter.validate_python({"type": "ngram", "name": "title"}) """
[docs] def fields_schema_hash(fields: list[T_Field]) -> str: # type: ignore[type-arg] """ Deterministic hash of a list of field definitions. Used as part of cache keys so that changing the schema automatically invalidates stale caches. """ payload = "|".join( json.dumps(f.model_dump(exclude_none=True), sort_keys=True) for f in fields ) return hashlib.sha256(payload.encode()).hexdigest()[:16]