dedup

package
v2.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 18, 2026 | License: Apache-2.0 | Imports: 20 | Imported by: 0

Documentation

Index

Constants

View Source
const EnvPrefix = "LSH"

Variables

View Source
var (
	ErrShingleResultIsEmpty = errors.New("error shingle result is empty")
	ErrEmptyInputString     = errors.New("empty input string")
)

Functions

func CalculateJaccardOptimized

func CalculateJaccardOptimized(sourceSet set.GenericDataSet[string], targetStr string, shingleSize int) float64

func EstimateJaccard

func EstimateJaccard(sig1, sig2 []uint64) float64

func Shingle

func Shingle(input string, size int) set.GenericDataSet[string]

Types

type Config

type Config struct {
	lsh.Config
	ShingleSize      int     `env:"_SHINGLE_SIZE" envDefault:"3"`
	JaccardThreshold float64 `env:"_JAC_THRESHOLD" envDefault:"0.6"`
}

func GetConfigFromEnv

func GetConfigFromEnv() (*Config, error)

func (*Config) CalculateApproximateThreshold

func (c *Config) CalculateApproximateThreshold() float64

CalculateApproximateThreshold computes s ≈ (1/B)^(1/R), where B is the number of bands and R is the number of rows per band. This is the Jaccard similarity at which the LSH collision probability transitions sharply.

func (*Config) HashVersion

func (c *Config) HashVersion(group string) (string, error)

HashVersion computes a deterministic prefix from group + all config fields.

func (*Config) Validate

func (c *Config) Validate() error

Validate checks that the LSH approximate threshold is strictly below JaccardThreshold. If violated, recall at the application threshold drops below 50%.

type Hasher

type Hasher struct {
	// contains filtered or unexported fields
}

func NewHasher

func NewHasher(bands, rows int, seed int64) *Hasher

func (*Hasher) ComputeSignature

func (h *Hasher) ComputeSignature(tokens set.GenericDataSet[string], sig []uint64)

type Record

type Record struct {
	ID        string
	Input     string
	GroupID   string
	Signature []uint64
}

type Service

type Service struct {
	// contains filtered or unexported fields
}

func NewService

func NewService(repo repositories.Storage, config *Config) (*Service, error)

func (*Service) GetNewID

func (s *Service) GetNewID(input string) string

func (*Service) Upsert

func (s *Service) Upsert(ctx context.Context, group, input string) (string, error)

func (*Service) WithMetrics

func (s *Service) WithMetrics(m *lsh.Instruments)

WithMetrics injects pre-built OpenTelemetry instruments. If not called, metrics are silently skipped.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL