Documentation
¶
Index ¶
- Constants
- Variables
- func BoolNormalize(v any) (any, bool)
- func CIDRToRange(cidr string) (start, end float64, err error)
- func CompressionStats(terms []string) (compressed, original int, ratio float64)
- func DateToEpochMs(v any) (any, bool)
- func DurationToMs(v any) (any, bool)
- func EmailDomain(v any) (any, bool)
- func Encode(idx *GINIndex) ([]byte, error)
- func EncodeToMetadata(idx *GINIndex, cfg ParquetConfig) (key string, value string, err error)
- func EncodeWithLevel(idx *GINIndex, level CompressionLevel) ([]byte, error)
- func ExtractLiterals(pattern string) ([]string, error)
- func ExtractTrigrams(s string) []string
- func GenerateBigrams(text string) []string
- func GenerateNGrams(text string, n int, opts ...NGramOption) ([]string, error)
- func GenerateTrigrams(text string) []string
- func HasGINIndex(parquetFile string, cfg ParquetConfig) (bool, error)
- func HasGINIndexReader(parquetFile string, cfg ParquetConfig, reader io.ReaderAt, size int64) (bool, error)
- func HasSidecar(parquetFile string) bool
- func IPv4ToInt(v any) (any, bool)
- func ISODateToEpochMs(v any) (any, bool)
- func IsDirectory(path string) bool
- func IsS3Path(path string) bool
- func IsValidJSONPath(path string) bool
- func ListGINFiles(dir string) ([]string, error)
- func ListParquetFiles(dir string) ([]string, error)
- func MustValidateJSONPath(path string) string
- func NormalizePath(path string) string
- func ParseJSONPath(path string) (jp.Expr, error)
- func ParseS3Path(path string) (bucket, key string, err error)
- func RebuildWithIndex(parquetFile string, idx *GINIndex, cfg ParquetConfig) error
- func SemVerToInt(v any) (any, bool)
- func SetAdaptiveInvariantLogger(l *log.Logger)
- func SidecarPath(parquetFile string) string
- func ToLower(v any) (any, bool)
- func URLHost(v any) (any, bool)
- func ValidateJSONPath(path string) error
- func WriteCompressedTerms(w io.Writer, blocks []CompressedTermBlock) error
- func WriteSidecar(parquetFile string, idx *GINIndex) error
- type AdaptiveStringIndex
- type BloomFilter
- func (bf *BloomFilter) Add(data []byte)
- func (bf *BloomFilter) AddString(s string)
- func (bf *BloomFilter) Bits() []uint64
- func (bf *BloomFilter) MayContain(data []byte) bool
- func (bf *BloomFilter) MayContainString(s string) bool
- func (bf *BloomFilter) NumBits() uint32
- func (bf *BloomFilter) NumHashes() uint8
- type BloomFilterOption
- type BuilderOption
- type CompressedTermBlock
- type CompressionLevel
- type ConfigOption
- func WithAdaptiveBucketCount(bucketCount int) ConfigOption
- func WithAdaptiveCoverageCeiling(ceiling float64) ConfigOption
- func WithAdaptiveMinRGCoverage(minCoverage int) ConfigOption
- func WithAdaptivePromotedTermCap(cap int) ConfigOption
- func WithBoolNormalizeTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithCustomDateTransformer(path, alias, layout string, opts ...TransformerOption) ConfigOption
- func WithCustomTransformer(path, alias string, fn FieldTransformer, opts ...TransformerOption) ConfigOption
- func WithDateTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithDurationTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithEmailDomainTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithFTSPaths(paths ...string) ConfigOption
- func WithFieldTransformer(path string, fn FieldTransformer) ConfigOption
- func WithIPv4Transformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithISODateTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithNumericBucketTransformer(path, alias string, size float64, opts ...TransformerOption) ConfigOption
- func WithRegexExtractIntTransformer(path, alias, pattern string, group int, opts ...TransformerOption) ConfigOption
- func WithRegexExtractTransformer(path, alias, pattern string, group int, opts ...TransformerOption) ConfigOption
- func WithRegisteredTransformer(path, alias string, id TransformerID, params []byte, opts ...TransformerOption) ConfigOption
- func WithSemVerTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithToLowerTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- func WithURLHostTransformer(path, alias string, opts ...TransformerOption) ConfigOption
- type CustomDateParams
- type DocID
- type DocIDCodec
- type FieldTransformer
- func CustomDateToEpochMs(layout string) FieldTransformer
- func NumericBucket(size float64) FieldTransformer
- func ReconstructTransformer(id TransformerID, params json.RawMessage) (FieldTransformer, error)
- func RegexExtract(pattern string, group int) FieldTransformer
- func RegexExtractInt(pattern string, group int) FieldTransformer
- type GINBuilder
- type GINConfig
- type GINIndex
- func BuildFromParquet(parquetFile string, jsonColumn string, config GINConfig) (*GINIndex, error)
- func BuildFromParquetReader(parquetFile string, jsonColumn string, config GINConfig, reader io.ReaderAt, ...) (*GINIndex, error)
- func Decode(data []byte) (*GINIndex, error)
- func DecodeFromMetadata(value string) (*GINIndex, error)
- func LoadIndex(parquetFile string, cfg ParquetConfig) (*GINIndex, error)
- func LoadIndexReader(parquetFile string, cfg ParquetConfig, reader io.ReaderAt, size int64) (*GINIndex, error)
- func NewGINIndex() *GINIndex
- func ReadFromParquetMetadata(parquetFile string, cfg ParquetConfig) (*GINIndex, error)
- func ReadFromParquetMetadataReader(parquetFile string, cfg ParquetConfig, reader io.ReaderAt, size int64) (*GINIndex, error)
- func ReadSidecar(parquetFile string) (*GINIndex, error)
- type Header
- type HyperLogLog
- func (hll *HyperLogLog) Add(data []byte)
- func (hll *HyperLogLog) AddString(s string)
- func (hll *HyperLogLog) Clear()
- func (hll *HyperLogLog) Clone() *HyperLogLog
- func (hll *HyperLogLog) Estimate() uint64
- func (hll *HyperLogLog) Merge(other *HyperLogLog)
- func (hll *HyperLogLog) Precision() uint8
- func (hll *HyperLogLog) Registers() []uint8
- type HyperLogLogOption
- type IdentityCodec
- type JSONPathError
- type NGramConfig
- type NGramOption
- type NullIndex
- type NumericBucketParams
- type NumericIndex
- type NumericValueType
- type Operator
- type ParquetConfig
- type ParquetIndexWriter
- type PathEntry
- type PathMode
- type Predicate
- func Contains(path string, pattern string) Predicate
- func EQ(path string, value any) Predicate
- func GT(path string, value any) Predicate
- func GTE(path string, value any) Predicate
- func IN(path string, values ...any) Predicate
- func InSubnet(path, cidr string) []Predicate
- func InSubnetAs(path, alias, cidr string) []Predicate
- func IsNotNull(path string) Predicate
- func IsNull(path string) Predicate
- func LT(path string, value any) Predicate
- func LTE(path string, value any) Predicate
- func NE(path string, value any) Predicate
- func NIN(path string, values ...any) Predicate
- func Regex(path string, pattern string) Predicate
- type PrefixCompressor
- type PrefixCompressorOption
- type PrefixEntry
- type RGNumericStat
- type RGSet
- func (rs *RGSet) All() *RGSet
- func (rs *RGSet) Clear(rgID int)
- func (rs *RGSet) Clone() *RGSet
- func (rs *RGSet) Count() int
- func (rs *RGSet) Intersect(other *RGSet) *RGSet
- func (rs *RGSet) Invert() *RGSet
- func (rs *RGSet) IsEmpty() bool
- func (rs *RGSet) IsSet(rgID int) bool
- func (rs *RGSet) Roaring() *roaring.Bitmap
- func (rs *RGSet) Set(rgID int)
- func (rs *RGSet) ToSlice() []int
- func (rs *RGSet) Union(other *RGSet) *RGSet
- func (rs *RGSet) UnionWith(other *RGSet)
- type RGSetOption
- type RGStringLengthStat
- type RegexLiteralInfo
- type RegexParams
- type RepresentationInfo
- type RepresentationSpec
- type RepresentationValue
- type RowGroupCodec
- type S3Client
- func (c *S3Client) BuildFromParquet(bucket, key, jsonColumn string, ginCfg GINConfig) (*GINIndex, error)
- func (c *S3Client) Exists(bucket, key string) (bool, error)
- func (c *S3Client) GetObjectSize(bucket, key string) (int64, error)
- func (c *S3Client) HasGINIndex(bucket, key string, cfg ParquetConfig) (bool, error)
- func (c *S3Client) HasSidecar(bucket, parquetKey string) (bool, error)
- func (c *S3Client) ListGINFiles(bucket, prefix string) ([]string, error)
- func (c *S3Client) ListParquetFiles(bucket, prefix string) ([]string, error)
- func (c *S3Client) LoadIndex(bucket, parquetKey string, cfg ParquetConfig) (*GINIndex, error)
- func (c *S3Client) OpenParquet(bucket, key string) (*parquet.File, io.ReaderAt, int64, error)
- func (c *S3Client) ReadFile(bucket, key string) ([]byte, error)
- func (c *S3Client) ReadFromParquetMetadata(bucket, key string, cfg ParquetConfig) (*GINIndex, error)
- func (c *S3Client) ReadSidecar(bucket, parquetKey string) (*GINIndex, error)
- func (c *S3Client) WriteFile(bucket, key string, data []byte) error
- func (c *S3Client) WriteSidecar(bucket, parquetKey string, idx *GINIndex) error
- type S3Config
- type SerializedConfig
- type StringIndex
- type StringLengthIndex
- type TransformerFailureMode
- type TransformerID
- type TransformerOption
- type TransformerSpec
- type TrigramIndex
Constants ¶
const (
MagicBytes = "GIN\x01"
// Version is the binary format version. Decode rejects mismatches with
// ErrVersionMismatch; the only migration path is to rebuild the index
// with the target binary. Version history:
//   v8: explicit companion transformer failure modes in serialized config
//       and representation metadata (strict by default, soft-fail opt-in)
//   v7: explicit representation metadata for derived alias routing
//       (phase 09 derived representations)
//   v6: PathEntry.Mode byte + FlagTrigramIndex bit reassignment
//       (phase 08 adaptive high-cardinality indexing)
//   v5: never released; payloads are always rejected. Was an in-tree
//       iteration of the adaptive string index section before the wire
//       format was finalised in v6.
//   v4: earlier pre-OSS format
Version = uint16(8)
)
const (
TypeString uint8 = 1 << iota
TypeInt
TypeFloat
TypeBool
TypeNull
)
const DefaultMetadataKey = "gin.index"
const (
FlagHasDocIDMap uint16 = 1 << iota
)
const (
FlagTrigramIndex uint8 = 1 << iota // path has trigram index for CONTAINS queries
)
Variables ¶
var (
// ErrVersionMismatch is returned by Decode when the binary format version
// does not match the expected version (Version constant).
ErrVersionMismatch = errors.New("version mismatch")
// ErrInvalidFormat is returned by Decode when the binary data is structurally
// invalid: unrecognized magic bytes, oversized allocations, or corrupt fields.
ErrInvalidFormat = errors.New("invalid format")
)
Functions ¶
func BoolNormalize ¶
BoolNormalize normalizes various boolean-like values to actual booleans. Handles: bool, "true"/"false"/"yes"/"no"/"1"/"0"/"on"/"off", float64 (0 = false).
func CIDRToRange ¶
CIDRToRange parses a CIDR notation string and returns the start and end IP addresses as float64 values suitable for use with GTE/LTE predicates on IPv4ToInt-transformed fields. Example: CIDRToRange("192.168.1.0/24") returns (3232235776, 3232236031, nil)
func CompressionStats ¶
CompressionStats returns the compression ratio for a set of terms. Returns (compressed size, original size, ratio).
func DateToEpochMs ¶
DateToEpochMs parses "2006-01-02" format to Unix milliseconds (midnight UTC).
func DurationToMs ¶
DurationToMs parses Go duration strings (e.g., "1h30m", "500ms") to milliseconds.
func EmailDomain ¶
EmailDomain extracts and lowercases the domain from an email address.
func EncodeToMetadata ¶
func EncodeToMetadata(idx *GINIndex, cfg ParquetConfig) (key string, value string, err error)
func EncodeWithLevel ¶
func EncodeWithLevel(idx *GINIndex, level CompressionLevel) ([]byte, error)
EncodeWithLevel serializes the index with the specified compression level. Use CompressionNone (0) for no compression, or 1-19 for zstd compression levels.
func ExtractLiterals ¶
ExtractLiterals extracts literal strings from a regex pattern that can be used for trigram-based candidate selection. Returns a slice of literal alternatives. For patterns like "foo|bar", returns ["foo", "bar"]. For patterns like "(error|warn)_msg", returns ["error_msg", "warn_msg"] (combined).
func ExtractTrigrams ¶
func GenerateBigrams ¶
func GenerateNGrams ¶
func GenerateNGrams(text string, n int, opts ...NGramOption) ([]string, error)
func GenerateTrigrams ¶
func HasGINIndex ¶
func HasGINIndex(parquetFile string, cfg ParquetConfig) (bool, error)
func HasGINIndexReader ¶
func HasSidecar ¶
func ISODateToEpochMs ¶
ISODateToEpochMs parses RFC3339/ISO8601 strings to Unix milliseconds.
func IsDirectory ¶
func IsValidJSONPath ¶
func ListGINFiles ¶
func ListParquetFiles ¶
func MustValidateJSONPath ¶
func NormalizePath ¶
NormalizePath converts a JSONPath to a canonical dot-notation form without validating that the path uses only GIN-supported JSONPath features. Callers handling untrusted input should use ValidateJSONPath or canonicalizeSupportedPath first.
func ParseJSONPath ¶
ParseJSONPath parses and validates a JSONPath, returning the parsed expression.
func ParseS3Path ¶
func RebuildWithIndex ¶
func RebuildWithIndex(parquetFile string, idx *GINIndex, cfg ParquetConfig) error
func SemVerToInt ¶
SemVerToInt encodes semantic versions as integers: major*1000000 + minor*1000 + patch. Supports formats: "1.2.3", "v1.2.3", "1.2", "v1.2", "1.2.3-beta" (pre-release suffix ignored).
func SetAdaptiveInvariantLogger ¶ added in v0.2.0
SetAdaptiveInvariantLogger installs a logger that surfaces adaptive index invariant violations (e.g. a path flagged PathModeAdaptiveHybrid with no matching AdaptiveStringIndexes section). The default is nil (silent); pass log.Default() or your own *log.Logger to opt in. Safe for concurrent use.
func SidecarPath ¶
func ValidateJSONPath ¶
ValidateJSONPath validates a JSONPath expression and ensures it only uses features supported by the GIN index (dot notation, wildcards). Unsupported: array indices [0], filters [?()], recursive descent (..), and scripts.
func WriteCompressedTerms ¶
func WriteCompressedTerms(w io.Writer, blocks []CompressedTermBlock) error
func WriteSidecar ¶
Types ¶
type AdaptiveStringIndex ¶ added in v0.2.0
type AdaptiveStringIndex struct {
// Terms holds the promoted exact-match values in sorted order.
Terms []string
// RGBitmaps[i] lists the row groups that contain Terms[i].
RGBitmaps []*RGSet
// BucketRGBitmaps partitions the long-tail terms by xxhash; len must be a
// non-zero power of two. A bucket hit is a superset match (may include
// row groups that do not actually contain the queried term).
BucketRGBitmaps []*RGSet
}
AdaptiveStringIndex stores promoted exact terms plus lossy tail buckets. Terms must be sorted lexically; RGBitmaps is parallel to Terms. Values that are not promoted fall into one of len(BucketRGBitmaps) hash buckets, which may return false-positive row groups.
func NewAdaptiveStringIndex ¶ added in v0.2.0
func NewAdaptiveStringIndex(terms []string, rgBitmaps []*RGSet, bucketBitmaps []*RGSet) (*AdaptiveStringIndex, error)
NewAdaptiveStringIndex validates and constructs an adaptive string index.
type BloomFilter ¶
type BloomFilter struct {
// contains filtered or unexported fields
}
func BloomFilterFromBits ¶
func BloomFilterFromBits(bits []uint64, numBits uint32, numHashes uint8) *BloomFilter
func MustNewBloomFilter ¶
func MustNewBloomFilter(numBits uint32, numHashes uint8, opts ...BloomFilterOption) *BloomFilter
func NewBloomFilter ¶
func NewBloomFilter(numBits uint32, numHashes uint8, opts ...BloomFilterOption) (*BloomFilter, error)
func (*BloomFilter) Add ¶
func (bf *BloomFilter) Add(data []byte)
func (*BloomFilter) AddString ¶
func (bf *BloomFilter) AddString(s string)
func (*BloomFilter) Bits ¶
func (bf *BloomFilter) Bits() []uint64
func (*BloomFilter) MayContain ¶
func (bf *BloomFilter) MayContain(data []byte) bool
func (*BloomFilter) MayContainString ¶
func (bf *BloomFilter) MayContainString(s string) bool
func (*BloomFilter) NumBits ¶
func (bf *BloomFilter) NumBits() uint32
func (*BloomFilter) NumHashes ¶
func (bf *BloomFilter) NumHashes() uint8
type BloomFilterOption ¶
type BloomFilterOption func(*BloomFilter) error
type BuilderOption ¶
type BuilderOption func(*GINBuilder) error
func WithCodec ¶
func WithCodec(codec DocIDCodec) BuilderOption
type CompressedTermBlock ¶
type CompressedTermBlock struct {
FirstTerm string
Entries []PrefixEntry
}
func ReadCompressedTerms ¶
func ReadCompressedTerms(r io.Reader) ([]CompressedTermBlock, error)
type CompressionLevel ¶
type CompressionLevel int
CompressionLevel specifies the compression level for index serialization.
const (
CompressionNone CompressionLevel = 0 // No compression
CompressionFastest CompressionLevel = 1 // zstd level 1
CompressionBalanced CompressionLevel = 3 // zstd level 3
CompressionBetter CompressionLevel = 9 // zstd level 9
CompressionBest CompressionLevel = 15 // zstd level 15 (recommended)
CompressionMax CompressionLevel = 19 // zstd level 19 (slow)
)
type ConfigOption ¶
func WithAdaptiveBucketCount ¶ added in v0.2.0
func WithAdaptiveBucketCount(bucketCount int) ConfigOption
WithAdaptiveBucketCount sets the fan-out of the long-tail bucket layer. Must be a positive power of two. To disable adaptive mode, omit this option (and WithAdaptivePromotedTermCap) or build a GINConfig literal with AdaptiveBucketCount/AdaptivePromotedTermCap set to 0; this option rejects 0 to keep the builder path explicit.
func WithAdaptiveCoverageCeiling ¶ added in v0.2.0
func WithAdaptiveCoverageCeiling(ceiling float64) ConfigOption
WithAdaptiveCoverageCeiling sets the maximum fraction of row groups a term may cover and still be eligible for promotion. Terms above the ceiling are treated as too-ubiquitous and fall through to the bucket layer. Must be in the open interval (0, 1).
func WithAdaptiveMinRGCoverage ¶ added in v0.2.0
func WithAdaptiveMinRGCoverage(minCoverage int) ConfigOption
WithAdaptiveMinRGCoverage sets the minimum number of row groups a term must cover to be eligible for promotion to the exact adaptive index. Terms below this threshold fall into the bucket layer.
func WithAdaptivePromotedTermCap ¶ added in v0.2.0
func WithAdaptivePromotedTermCap(cap int) ConfigOption
WithAdaptivePromotedTermCap caps the number of terms promoted to the exact adaptive index per high-cardinality path. Zero disables adaptive mode.
func WithBoolNormalizeTransformer ¶
func WithBoolNormalizeTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithCustomDateTransformer ¶
func WithCustomDateTransformer(path, alias, layout string, opts ...TransformerOption) ConfigOption
func WithCustomTransformer ¶ added in v0.2.0
func WithCustomTransformer(path, alias string, fn FieldTransformer, opts ...TransformerOption) ConfigOption
func WithDateTransformer ¶
func WithDateTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithDurationTransformer ¶
func WithDurationTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithEmailDomainTransformer ¶
func WithEmailDomainTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithFTSPaths ¶
func WithFTSPaths(paths ...string) ConfigOption
func WithFieldTransformer ¶
func WithFieldTransformer(path string, fn FieldTransformer) ConfigOption
func WithIPv4Transformer ¶
func WithIPv4Transformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithISODateTransformer ¶
func WithISODateTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithNumericBucketTransformer ¶
func WithNumericBucketTransformer(path, alias string, size float64, opts ...TransformerOption) ConfigOption
func WithRegexExtractIntTransformer ¶
func WithRegexExtractIntTransformer(path, alias, pattern string, group int, opts ...TransformerOption) ConfigOption
func WithRegexExtractTransformer ¶
func WithRegexExtractTransformer(path, alias, pattern string, group int, opts ...TransformerOption) ConfigOption
func WithRegisteredTransformer ¶
func WithRegisteredTransformer(path, alias string, id TransformerID, params []byte, opts ...TransformerOption) ConfigOption
func WithSemVerTransformer ¶
func WithSemVerTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithToLowerTransformer ¶
func WithToLowerTransformer(path, alias string, opts ...TransformerOption) ConfigOption
func WithURLHostTransformer ¶
func WithURLHostTransformer(path, alias string, opts ...TransformerOption) ConfigOption
type CustomDateParams ¶
type CustomDateParams struct {
Layout string `json:"layout"`
}
type DocIDCodec ¶
DocIDCodec encodes/decodes composite information into a single DocID.
type FieldTransformer ¶
FieldTransformer transforms a value before indexing. Returns (transformedValue, ok). If ok=false, the companion representation follows the registration's configured failure mode. Strict is the default.
func CustomDateToEpochMs ¶
func CustomDateToEpochMs(layout string) FieldTransformer
CustomDateToEpochMs returns a transformer for custom date formats.
func NumericBucket ¶
func NumericBucket(size float64) FieldTransformer
NumericBucket returns a transformer that buckets numeric values by size. Example: NumericBucket(100) transforms 150 -> 100, 250 -> 200.
func ReconstructTransformer ¶
func ReconstructTransformer(id TransformerID, params json.RawMessage) (FieldTransformer, error)
func RegexExtract ¶
func RegexExtract(pattern string, group int) FieldTransformer
RegexExtract returns a transformer that extracts a substring via regex capture group. Pattern is compiled once at config time. Group 0 = full match, group 1+ = capture groups.
func RegexExtractInt ¶
func RegexExtractInt(pattern string, group int) FieldTransformer
RegexExtractInt extracts a substring via regex and converts it to float64.
type GINBuilder ¶
type GINBuilder struct {
// contains filtered or unexported fields
}
func NewBuilder ¶
func NewBuilder(config GINConfig, numRGs int, opts ...BuilderOption) (*GINBuilder, error)
func (*GINBuilder) AddDocument ¶
func (b *GINBuilder) AddDocument(docID DocID, jsonDoc []byte) error
func (*GINBuilder) Finalize ¶
func (b *GINBuilder) Finalize() *GINIndex
type GINConfig ¶
type GINConfig struct {
CardinalityThreshold uint32
BloomFilterSize uint32
BloomFilterHashes uint8
EnableTrigrams bool
TrigramMinLength int
HLLPrecision uint8
PrefixBlockSize int
AdaptiveMinRGCoverage int
AdaptivePromotedTermCap int
AdaptiveCoverageCeiling float64
AdaptiveBucketCount int
// contains filtered or unexported fields
}
func DefaultConfig ¶
func DefaultConfig() GINConfig
func NewConfig ¶
func NewConfig(opts ...ConfigOption) (GINConfig, error)
func (GINConfig) AdaptiveEnabled ¶ added in v0.2.0
AdaptiveEnabled reports whether adaptive high-cardinality indexing is enabled.
type GINIndex ¶
type GINIndex struct {
// GINIndex is immutable after `Finalize()` or `Decode()`; pathLookup is
// derived, non-serialized state rebuilt once and then treated as read-only.
Header Header
PathDirectory []PathEntry
GlobalBloom *BloomFilter
StringIndexes map[uint16]*StringIndex
AdaptiveStringIndexes map[uint16]*AdaptiveStringIndex
NumericIndexes map[uint16]*NumericIndex
NullIndexes map[uint16]*NullIndex
TrigramIndexes map[uint16]*TrigramIndex
StringLengthIndexes map[uint16]*StringLengthIndex
PathCardinality map[uint16]*HyperLogLog
DocIDMapping []DocID
Config *GINConfig
// contains filtered or unexported fields
}
func BuildFromParquet ¶
func BuildFromParquetReader ¶
func Decode ¶
Decode deserializes an index, validates cross-structure path references, and canonicalizes supported JSONPath spellings in PathDirectory while rebuilding derived lookup state.
func DecodeFromMetadata ¶
func LoadIndexReader ¶
func NewGINIndex ¶
func NewGINIndex() *GINIndex
func ReadFromParquetMetadata ¶
func ReadFromParquetMetadata(parquetFile string, cfg ParquetConfig) (*GINIndex, error)
func ReadSidecar ¶
func (*GINIndex) MatchingDocIDs ¶
func (*GINIndex) Representations ¶ added in v0.2.0
func (idx *GINIndex) Representations(path string) []RepresentationInfo
type HyperLogLog ¶
type HyperLogLog struct {
// contains filtered or unexported fields
}
HyperLogLog implements the HyperLogLog algorithm for cardinality estimation. It uses 2^precision registers to estimate the number of distinct elements.
func HyperLogLogFromRegisters ¶
func HyperLogLogFromRegisters(registers []uint8, precision uint8) *HyperLogLog
func MustNewHyperLogLog ¶
func MustNewHyperLogLog(precision uint8, opts ...HyperLogLogOption) *HyperLogLog
func NewHyperLogLog ¶
func NewHyperLogLog(precision uint8, opts ...HyperLogLogOption) (*HyperLogLog, error)
NewHyperLogLog creates a new HyperLogLog with the given precision. Precision must be between 4 and 16. Higher precision = more accuracy but more memory. Memory usage: 2^precision bytes. Standard error: 1.04 / sqrt(m), where m = 2^precision.
func (*HyperLogLog) Add ¶
func (hll *HyperLogLog) Add(data []byte)
func (*HyperLogLog) AddString ¶
func (hll *HyperLogLog) AddString(s string)
func (*HyperLogLog) Clear ¶
func (hll *HyperLogLog) Clear()
func (*HyperLogLog) Clone ¶
func (hll *HyperLogLog) Clone() *HyperLogLog
func (*HyperLogLog) Estimate ¶
func (hll *HyperLogLog) Estimate() uint64
func (*HyperLogLog) Merge ¶
func (hll *HyperLogLog) Merge(other *HyperLogLog)
func (*HyperLogLog) Precision ¶
func (hll *HyperLogLog) Precision() uint8
func (*HyperLogLog) Registers ¶
func (hll *HyperLogLog) Registers() []uint8
type HyperLogLogOption ¶
type HyperLogLogOption func(*HyperLogLog) error
type IdentityCodec ¶
type IdentityCodec struct{}
IdentityCodec treats the position as the DocID (1:1 mapping).
func NewIdentityCodec ¶
func NewIdentityCodec() *IdentityCodec
func (*IdentityCodec) Decode ¶
func (c *IdentityCodec) Decode(docID DocID) []int
func (*IdentityCodec) Encode ¶
func (c *IdentityCodec) Encode(indices ...int) DocID
func (*IdentityCodec) Name ¶
func (c *IdentityCodec) Name() string
type JSONPathError ¶
func (*JSONPathError) Error ¶
func (e *JSONPathError) Error() string
type NGramConfig ¶
type NGramOption ¶
type NGramOption func(*NGramConfig) error
func WithN ¶
func WithN(n int) NGramOption
func WithPadding ¶
func WithPadding(pad string) NGramOption
type NumericBucketParams ¶
type NumericBucketParams struct {
Size float64 `json:"size"`
}
type NumericIndex ¶
type NumericIndex struct {
// ValueType is the numeric storage mode: int-only or float/mixed.
ValueType NumericValueType
IntGlobalMin int64
IntGlobalMax int64
GlobalMin float64
GlobalMax float64
RGStats []RGNumericStat
}
type NumericValueType ¶ added in v0.2.0
type NumericValueType uint8
const (
NumericValueTypeIntOnly NumericValueType = iota
NumericValueTypeFloatMixed
)
type ParquetConfig ¶
type ParquetConfig struct {
MetadataKey string
}
func DefaultParquetConfig ¶
func DefaultParquetConfig() ParquetConfig
type ParquetIndexWriter ¶
type ParquetIndexWriter struct {
// contains filtered or unexported fields
}
func NewParquetIndexWriter ¶
func NewParquetIndexWriter(w io.Writer, schema *parquet.Schema, jsonColumn string, numRowGroups int, ginConfig GINConfig, pqConfig ParquetConfig) (*ParquetIndexWriter, error)
type PathEntry ¶
type PathEntry struct {
PathID uint16
PathName string
ObservedTypes uint8
Cardinality uint32
// Mode is the exclusive string-evaluation mode for this path.
Mode PathMode
Flags uint8
// AdaptivePromotedTerms and AdaptiveBucketCount are derived metadata
// populated from the adaptive section at decode time. They are not
// persisted in the path directory; encoders must not rely on them.
AdaptivePromotedTerms uint16
AdaptiveBucketCount uint16
}
type PathMode ¶ added in v0.2.0
type PathMode uint8
PathMode is the exclusive storage mode for a path entry. The zero value is the classic exact mode.
const (
// PathModeClassic keeps the full exact string index for a path.
// Its user-facing string label remains "exact" because that describes the
// query semantics more clearly than the internal mode name.
PathModeClassic PathMode = iota
// PathModeBloomOnly stores no exact term index and answers via bloom-filter fallback.
PathModeBloomOnly
// PathModeAdaptiveHybrid stores promoted exact terms plus lossy tail buckets.
PathModeAdaptiveHybrid
)
type Predicate ¶
func InSubnet ¶
InSubnet creates predicates using the conventional IPv4 companion alias "ipv4_int". Use InSubnetAs when a path is configured with a different alias.
func InSubnetAs ¶ added in v0.2.0
InSubnetAs creates predicates to check if an IP field (transformed with IPv4ToInt under the provided alias) falls within a CIDR subnet range. Example: InSubnetAs("$.client_ip", "ipv4_int", "192.168.1.0/24") returns predicates for 192.168.1.0-255. Panics if CIDR is invalid - use CIDRToRange for error handling.
type PrefixCompressor ¶
type PrefixCompressor struct {
// contains filtered or unexported fields
}
PrefixCompressor implements front-coding compression for sorted string lists. Each string is stored as: shared prefix length + suffix. This works well for sorted terms that share common prefixes.
func MustNewPrefixCompressor ¶
func MustNewPrefixCompressor(blockSize int, opts ...PrefixCompressorOption) *PrefixCompressor
func NewPrefixCompressor ¶
func NewPrefixCompressor(blockSize int, opts ...PrefixCompressorOption) (*PrefixCompressor, error)
func (*PrefixCompressor) BlockSize ¶
func (pc *PrefixCompressor) BlockSize() int
func (*PrefixCompressor) Compress ¶
func (pc *PrefixCompressor) Compress(terms []string) []CompressedTermBlock
func (*PrefixCompressor) Decompress ¶
func (pc *PrefixCompressor) Decompress(blocks []CompressedTermBlock) []string
type PrefixCompressorOption ¶
type PrefixCompressorOption func(*PrefixCompressor) error
type PrefixEntry ¶
type RGNumericStat ¶
type RGSet ¶
type RGSet struct {
NumRGs int
// contains filtered or unexported fields
}
func MustNewRGSet ¶
func MustNewRGSet(numRGs int, opts ...RGSetOption) *RGSet
type RGSetOption ¶
type RGStringLengthStat ¶
type RegexLiteralInfo ¶
type RegexLiteralInfo struct {
Literals []string // Extracted literal strings
HasWildcard bool // Pattern contains unbounded wildcards
MinLength int // Minimum length of any literal
}
RegexLiteralInfo contains extracted information from a regex pattern.
func AnalyzeRegex ¶
func AnalyzeRegex(pattern string) (*RegexLiteralInfo, error)
AnalyzeRegex extracts literals and metadata from a regex pattern.
type RegexParams ¶
type RepresentationInfo ¶ added in v0.2.0
type RepresentationSpec ¶ added in v0.2.0
type RepresentationSpec struct {
SourcePath string `json:"source_path"`
Alias string `json:"alias"`
TargetPath string `json:"target_path"`
Transformer TransformerSpec `json:"transformer"`
Serializable bool `json:"serializable"`
}
type RepresentationValue ¶ added in v0.2.0
func As ¶ added in v0.2.0
func As(alias string, value any) RepresentationValue
type RowGroupCodec ¶
type RowGroupCodec struct {
// contains filtered or unexported fields
}
RowGroupCodec encodes file index and row group index into a DocID. Layout: DocID = fileIndex * rowGroupsPerFile + rgIndex
func NewRowGroupCodec ¶
func NewRowGroupCodec(rowGroupsPerFile int) *RowGroupCodec
func (*RowGroupCodec) Decode ¶
func (c *RowGroupCodec) Decode(docID DocID) []int
func (*RowGroupCodec) Encode ¶
func (c *RowGroupCodec) Encode(indices ...int) DocID
func (*RowGroupCodec) Name ¶
func (c *RowGroupCodec) Name() string
func (*RowGroupCodec) RowGroupsPerFile ¶
func (c *RowGroupCodec) RowGroupsPerFile() int
type S3Client ¶
type S3Client struct {
// contains filtered or unexported fields
}
func NewS3Client ¶
func NewS3ClientFromEnv ¶
func (*S3Client) BuildFromParquet ¶
func (*S3Client) GetObjectSize ¶
func (*S3Client) HasGINIndex ¶
func (c *S3Client) HasGINIndex(bucket, key string, cfg ParquetConfig) (bool, error)
func (*S3Client) HasSidecar ¶
func (*S3Client) ListGINFiles ¶
func (*S3Client) ListParquetFiles ¶
func (*S3Client) LoadIndex ¶
func (c *S3Client) LoadIndex(bucket, parquetKey string, cfg ParquetConfig) (*GINIndex, error)
func (*S3Client) OpenParquet ¶
func (*S3Client) ReadFromParquetMetadata ¶
func (c *S3Client) ReadFromParquetMetadata(bucket, key string, cfg ParquetConfig) (*GINIndex, error)
func (*S3Client) ReadSidecar ¶
type S3Config ¶
type S3Config struct {
Endpoint string
Region string
AccessKey string
SecretKey string
PathStyle bool
}
func S3ConfigFromEnv ¶
func S3ConfigFromEnv() S3Config
type SerializedConfig ¶
type SerializedConfig struct {
BloomFilterSize uint32 `json:"bloom_filter_size"`
BloomFilterHashes uint8 `json:"bloom_filter_hashes"`
EnableTrigrams bool `json:"enable_trigrams"`
TrigramMinLength int `json:"trigram_min_length"`
HLLPrecision uint8 `json:"hll_precision"`
PrefixBlockSize int `json:"prefix_block_size"`
AdaptiveMinRGCoverage int `json:"adaptive_min_rg_coverage"`
AdaptivePromotedTermCap int `json:"adaptive_promoted_term_cap"`
AdaptiveCoverageCeiling float64 `json:"adaptive_coverage_ceiling"`
AdaptiveBucketCount int `json:"adaptive_bucket_count"`
FTSPaths []string `json:"fts_paths,omitempty"`
Transformers []TransformerSpec `json:"transformers,omitempty"`
}
type StringIndex ¶
type StringLengthIndex ¶
type StringLengthIndex struct {
GlobalMin uint32
GlobalMax uint32
RGStats []RGStringLengthStat
}
type TransformerFailureMode ¶ added in v0.2.0
type TransformerFailureMode string
const (
TransformerFailureStrict TransformerFailureMode = "strict"
TransformerFailureSoft TransformerFailureMode = "soft_fail"
)
type TransformerID ¶
type TransformerID uint8
const (
TransformerUnknown TransformerID = iota
TransformerISODateToEpochMs
TransformerDateToEpochMs
TransformerCustomDateToEpochMs
TransformerToLower
TransformerIPv4ToInt
TransformerSemVerToInt
TransformerRegexExtract
TransformerRegexExtractInt
TransformerDurationToMs
TransformerEmailDomain
TransformerURLHost
TransformerNumericBucket
TransformerBoolNormalize
)
type TransformerOption ¶ added in v0.2.0
type TransformerOption func(*transformerRegistrationOptions) error
func WithTransformerFailureMode ¶ added in v0.2.0
func WithTransformerFailureMode(mode TransformerFailureMode) TransformerOption
type TransformerSpec ¶
type TransformerSpec struct {
Path string `json:"path"`
Alias string `json:"alias,omitempty"`
TargetPath string `json:"target_path,omitempty"`
FailureMode TransformerFailureMode `json:"failure_mode,omitempty"`
ID TransformerID `json:"id"`
Name string `json:"name"`
Params json.RawMessage `json:"params,omitempty"`
}
func NewTransformerSpec ¶
func NewTransformerSpec(path string, id TransformerID, params json.RawMessage) TransformerSpec
type TrigramIndex ¶
type TrigramIndex struct {
Trigrams map[string]*RGSet
NumRGs int
N int
Padding string
MinLength int
}
func MustNewTrigramIndex ¶ added in v0.2.0
func MustNewTrigramIndex(numRGs int, opts ...NGramOption) *TrigramIndex
func NewTrigramIndex ¶
func NewTrigramIndex(numRGs int, opts ...NGramOption) (*TrigramIndex, error)
func (*TrigramIndex) Add ¶
func (ti *TrigramIndex) Add(value string, rgID int)
func (*TrigramIndex) Search ¶
func (ti *TrigramIndex) Search(pattern string) *RGSet
func (*TrigramIndex) TrigramCount ¶
func (ti *TrigramIndex) TrigramCount() int
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| cmd | |
| gin-index (command) | |
| examples | |
| basic (command) | Example: Basic GIN index usage with equality queries |
| full (command) | Example: Comprehensive GIN index usage demonstrating all index types and query operators |
| fulltext (command) | Example: Full-text search with trigram index (CONTAINS queries) |
| nested (command) | Example: Nested JSON objects and arrays |
| null (command) | Example: NULL handling queries |
| parquet (command) | |
| range (command) | Example: Numeric range queries with GIN index |
| regex (command) | Example: Regex pattern matching with trigram-based candidate selection |
| serialize (command) | Example: Serializing and deserializing GIN index |
| transformers (command) | Example: Field transformers for date indexing |
| transformers-advanced (command) | Example: Advanced field transformers for IP ranges, semantic versions, emails, and regex extraction |