Documentation
¶
Index ¶
- func ComputeCentroid(embeddings [][]float64, indices []int) []float64
- func CosineDistance(a, b []float64) float64
- func GenerateLabel(keywords []ScoredKeyword, maxTerms int) string
- func KeywordStrings(keywords []ScoredKeyword) []string
- func SortedClusterLabels(clusters map[int][]int) []int
- type Cluster
- type ClusterBirth
- type ClusterMember
- type ClusterResult
- type ClusterRunStats
- type ClusterTexts
- type ClusterUpdate
- type Clusterer
- type DBSCANConfig
- type DiffResult
- type EmbeddingRecord
- type KeywordResult
- type KnowledgeStore
- type PostgresKnowledgeStore
- func (s *PostgresKnowledgeStore) DeleteCluster(ctx context.Context, clusterID uuid.UUID) error
- func (s *PostgresKnowledgeStore) ListUsersWithEmbeddings(ctx context.Context) ([]string, error)
- func (s *PostgresKnowledgeStore) LoadClustersForUser(ctx context.Context, userID string) ([]StoredCluster, error)
- func (s *PostgresKnowledgeStore) LoadEmbeddingsForUser(ctx context.Context, userID string) ([]EmbeddingRecord, error)
- func (s *PostgresKnowledgeStore) LoadTextsForSourceIDs(ctx context.Context, sourceIDs []uuid.UUID) (map[uuid.UUID]string, error)
- func (s *PostgresKnowledgeStore) ResolveOwnersByConversationGroupIDs(ctx context.Context, groupIDs []uuid.UUID) ([]string, error)
- func (s *PostgresKnowledgeStore) SaveCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
- func (s *PostgresKnowledgeStore) UpdateCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
- type ScoredKeyword
- type StoredCluster
- type StoredClusterMember
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ComputeCentroid ¶
ComputeCentroid returns the mean vector of the given embeddings.
func CosineDistance ¶
CosineDistance computes 1 - cosine_similarity between two vectors. Returns 0.0 for identical directions, 2.0 for opposite directions.
func GenerateLabel ¶
func GenerateLabel(keywords []ScoredKeyword, maxTerms int) string
GenerateLabel creates a comma-separated label from the top keywords.
func KeywordStrings ¶
func KeywordStrings(keywords []ScoredKeyword) []string
KeywordStrings extracts just the term strings from scored keywords.
func SortedClusterLabels ¶
SortedClusterLabels returns cluster labels sorted by label number.
Types ¶
type Cluster ¶
type Cluster struct {
ID uuid.UUID
UserID string
Label string
Keywords []string
Centroid []float64
MemberCount int
Trend int // 0=growing, 1=stable, 2=decaying
SourceType int // 0=entries, 1=memories, 2=mixed
Members []ClusterMember
}
Cluster represents a persisted knowledge cluster with metadata.
type ClusterBirth ¶
ClusterBirth represents a new cluster discovered by DBSCAN.
type ClusterMember ¶
ClusterMember identifies an embedding source.
type ClusterResult ¶
type ClusterResult struct {
// Clusters maps cluster label (0-based) to the set of member indices.
Clusters map[int][]int
// Noise contains indices of points classified as noise by DBSCAN.
Noise []int
}
ClusterResult represents the output of a single clustering run.
func RunDBSCAN ¶
func RunDBSCAN(embeddings [][]float64, cfg DBSCANConfig) ClusterResult
RunDBSCAN executes the DBSCAN algorithm on the given embeddings using cosine distance. Each embedding is a float64 slice. Returns cluster assignments.
type ClusterRunStats ¶
type ClusterRunStats struct {
UsersProcessed int `json:"users_processed"`
ClustersBorn int `json:"clusters_born"`
ClustersUpdated int `json:"clusters_updated"`
ClustersDied int `json:"clusters_died"`
Failures int `json:"failures"`
}
ClusterRunStats summarizes a single clustering cycle.
type ClusterTexts ¶
ClusterTexts maps a cluster label (or index) to the combined text of its members.
type ClusterUpdate ¶
type ClusterUpdate struct {
ClusterID uuid.UUID
NewMembers []int // indices in the new embedding set
NewCentroid []float64
OverlapRatio float64 // fraction of new cluster members that were in the old cluster
}
ClusterUpdate represents an existing cluster that matched a new DBSCAN cluster.
type Clusterer ¶
type Clusterer struct {
// contains filtered or unexported fields
}
Clusterer runs DBSCAN clustering on user embeddings. It is triggered by the BackgroundIndexer after new embeddings are created, or manually via the admin trigger endpoint.
func NewClusterer ¶
func NewClusterer(store KnowledgeStore, decay time.Duration, keywordsCount int, cfg DBSCANConfig) *Clusterer
NewClusterer creates a new clusterer.
func (*Clusterer) ClusterByConversationGroups ¶
ClusterByConversationGroups resolves the owner user IDs for the given conversation group IDs, then runs DBSCAN for each affected user. Called by the BackgroundIndexer after new embeddings are created.
type DBSCANConfig ¶
type DBSCANConfig struct {
// Epsilon is the maximum cosine distance between two points to be
// considered neighbors. Range: 0.0 (identical) to 2.0 (opposite).
Epsilon float64
// MinPoints is the minimum number of points required to form a dense region.
MinPoints int
}
DBSCANConfig holds parameters for the DBSCAN algorithm.
func DefaultDBSCANConfig ¶
func DefaultDBSCANConfig() DBSCANConfig
DefaultDBSCANConfig returns sensible defaults for clustering embeddings.
type DiffResult ¶
type DiffResult struct {
// Updated clusters: existing cluster matched to new DBSCAN output.
Updated []ClusterUpdate
// Born clusters: new DBSCAN clusters that don't match any existing cluster.
Born []ClusterBirth
// Died clusters: existing clusters with no matching new DBSCAN cluster.
Died []uuid.UUID
}
DiffResult describes the changes between previous clusters and a new DBSCAN run.
func DiffClusters ¶
func DiffClusters( newResult ClusterResult, embeddings [][]float64, sourceIDs []uuid.UUID, existingClusters []Cluster, ) DiffResult
DiffClusters compares a new DBSCAN ClusterResult against existing clusters and determines which clusters are updated, born, or died.
existingClusters are the previously persisted clusters; each cluster's Members field identifies the sources that belonged to it. sourceIDs is the ordered list of source IDs corresponding to the embeddings passed to DBSCAN (i.e., sourceIDs[i] is the source of embeddings[i]). embeddings are the vectors used in the current DBSCAN run.
Matching uses majority-member overlap: a new DBSCAN cluster is matched to the existing cluster that shares the highest fraction of members, provided that fraction exceeds 0.5 (majority). Each existing cluster is matched at most once (best match wins).
type EmbeddingRecord ¶
type EmbeddingRecord struct {
SourceID uuid.UUID // entry_id or memory_id
SourceType int // 0=entry, 1=memory
UserID string // owner of the data
Embedding []float64
}
EmbeddingRecord represents a stored embedding that the clustering goroutine reads.
type KeywordResult ¶
type KeywordResult struct {
ClusterLabel int
Keywords []ScoredKeyword
}
KeywordResult holds the top keywords for a single cluster.
func ExtractKeywords ¶
func ExtractKeywords(clusterTexts ClusterTexts, topN int) []KeywordResult
ExtractKeywords computes c-TF-IDF across all clusters and returns the top-N keywords per cluster. c-TF-IDF treats each cluster's combined text as a single document and identifies terms that are distinctively frequent in one cluster compared to all others.
topN controls how many keywords are returned per cluster.
type KnowledgeStore ¶
type KnowledgeStore interface {
// ListUsersWithEmbeddings returns distinct user IDs that have embeddings.
ListUsersWithEmbeddings(ctx context.Context) ([]string, error)
// LoadEmbeddingsForUser returns all embeddings belonging to a user.
LoadEmbeddingsForUser(ctx context.Context, userID string) ([]EmbeddingRecord, error)
// LoadClustersForUser returns all stored clusters (with members) for a user.
LoadClustersForUser(ctx context.Context, userID string) ([]StoredCluster, error)
// SaveCluster creates a new cluster with its members.
SaveCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
// UpdateCluster updates an existing cluster's metadata and replaces its members.
UpdateCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
// DeleteCluster removes a cluster and its members.
DeleteCluster(ctx context.Context, clusterID uuid.UUID) error
// LoadTextsForSourceIDs returns the indexed text content for the given source IDs.
// Used for c-TF-IDF keyword extraction after clustering.
LoadTextsForSourceIDs(ctx context.Context, sourceIDs []uuid.UUID) (map[uuid.UUID]string, error)
// ResolveOwnersByConversationGroupIDs returns distinct owner user IDs for the
// given conversation group IDs. Used by the indexer to determine which users
// need re-clustering after new embeddings are created.
ResolveOwnersByConversationGroupIDs(ctx context.Context, groupIDs []uuid.UUID) ([]string, error)
}
KnowledgeStore defines the persistence interface for the clustering goroutine.
type PostgresKnowledgeStore ¶
type PostgresKnowledgeStore struct {
// contains filtered or unexported fields
}
PostgresKnowledgeStore implements KnowledgeStore using GORM + PostgreSQL.
func NewPostgresKnowledgeStore ¶
func NewPostgresKnowledgeStore(db *gorm.DB) *PostgresKnowledgeStore
NewPostgresKnowledgeStore creates a new PostgreSQL-backed knowledge store.
func OpenPostgresKnowledgeStore ¶
func OpenPostgresKnowledgeStore(dbURL string) (*PostgresKnowledgeStore, error)
OpenPostgresKnowledgeStore opens a new gorm.DB connection for the knowledge store.
func (*PostgresKnowledgeStore) DeleteCluster ¶
func (*PostgresKnowledgeStore) ListUsersWithEmbeddings ¶
func (s *PostgresKnowledgeStore) ListUsersWithEmbeddings(ctx context.Context) ([]string, error)
func (*PostgresKnowledgeStore) LoadClustersForUser ¶
func (s *PostgresKnowledgeStore) LoadClustersForUser(ctx context.Context, userID string) ([]StoredCluster, error)
func (*PostgresKnowledgeStore) LoadEmbeddingsForUser ¶
func (s *PostgresKnowledgeStore) LoadEmbeddingsForUser(ctx context.Context, userID string) ([]EmbeddingRecord, error)
func (*PostgresKnowledgeStore) LoadTextsForSourceIDs ¶
func (*PostgresKnowledgeStore) ResolveOwnersByConversationGroupIDs ¶
func (*PostgresKnowledgeStore) SaveCluster ¶
func (s *PostgresKnowledgeStore) SaveCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
func (*PostgresKnowledgeStore) UpdateCluster ¶
func (s *PostgresKnowledgeStore) UpdateCluster(ctx context.Context, cluster StoredCluster, members []StoredClusterMember) error
type ScoredKeyword ¶
ScoredKeyword is a term with its c-TF-IDF score.
type StoredCluster ¶
type StoredCluster struct {
ID uuid.UUID
UserID string
Label string
Keywords []string
Centroid []float64
MemberCount int
Trend int // 0=growing, 1=stable, 2=decaying
SourceType int // 0=entries, 1=memories, 2=mixed
CreatedAt time.Time
UpdatedAt time.Time
Members []StoredClusterMember
}
StoredCluster is a cluster as persisted in the database.