datastore

package
v0.0.0-...-c22db21 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 19, 2026 License: Apache-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrDBFileNotFound = errors.New("file not found in database")

ErrDBFileNotFound is returned when a file is not found in the database.

View Source
var IsDuplicateFuncs = map[string]IsDuplicateFunc{
	"file_metadata": DedupeByFileMetadata,
	"dummy":         DummyDedupe,
	"none":          DummyDedupe,
	"ignore":        DummyDedupe,
	"upsert":        DedupeUpsert,
}

IsDuplicateFuncs is a map of deduplication functions by name.

Functions

func DedupeByFileMetadata

func DedupeByFileMetadata(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DedupeByFileMetadata is a deduplication function that checks if the document is a duplicate based on the file metadata.

func DedupeUpsert

func DedupeUpsert(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

func DummyDedupe

func DummyDedupe(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

DummyDedupe is a dummy deduplication function that always returns false (i.e. "No Duplicate").

func GetDefaultDSNs

func GetDefaultDSNs(indexDSN, vectorDSN string) (string, string, bool, error)

GetDefaultDSNs returns the paths for the datastore and vectorstore databases. In addition, it returns a boolean indicating whether the datastore is an archive.

func LogEmbeddingFunc

func LogEmbeddingFunc(embeddingFunc vs.EmbeddingFunc) vs.EmbeddingFunc

Types

type Datastore

type Datastore struct {
	Index                  index.Index
	Vectorstore            vectorstore.VectorStore
	EmbeddingConfig        config.EmbeddingsConfig
	EmbeddingModelProvider etypes.EmbeddingModelProvider
}

func NewDatastore

func NewDatastore(ctx context.Context, indexDSN string, automigrate bool, vectorDSN string, embeddingProvider etypes.EmbeddingModelProvider) (*Datastore, error)

func (*Datastore) Close

func (s *Datastore) Close() error

func (*Datastore) CreateDataset

func (s *Datastore) CreateDataset(ctx context.Context, dataset types.Dataset, opts *types.DatasetCreateOpts) error

func (*Datastore) DeleteDataset

func (s *Datastore) DeleteDataset(ctx context.Context, datasetID string) error

func (*Datastore) DeleteDocument

func (s *Datastore) DeleteDocument(ctx context.Context, documentID, datasetID string) error

func (*Datastore) DeleteFile

func (s *Datastore) DeleteFile(ctx context.Context, datasetID, fileID string) error

func (*Datastore) ExportDatasetsToFile

func (s *Datastore) ExportDatasetsToFile(ctx context.Context, path string, datasets ...string) error

func (*Datastore) FindFile

func (s *Datastore) FindFile(ctx context.Context, searchFile types.File) (*types.File, error)

func (*Datastore) GetDataset

func (s *Datastore) GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error)

func (*Datastore) GetDatasetForDocument

func (s *Datastore) GetDatasetForDocument(ctx context.Context, documentID string) (*types.Dataset, error)

func (*Datastore) GetDocuments

func (s *Datastore) GetDocuments(ctx context.Context, datasetID string, where map[string]string, whereDocument []types.WhereDocument) ([]types.Document, error)

func (*Datastore) ImportDatasetsFromFile

func (s *Datastore) ImportDatasetsFromFile(ctx context.Context, path string, datasets ...string) error

func (*Datastore) Ingest

func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename string, content []byte, opts IngestOpts) ([]string, error)

Ingest loads a document from the given content bytes and adds it to the dataset.

func (*Datastore) ListDatasets

func (s *Datastore) ListDatasets(ctx context.Context) ([]types.Dataset, error)

func (*Datastore) PruneFiles

func (s *Datastore) PruneFiles(ctx context.Context, datasetID string, pathPrefix string, keep []string) ([]types.File, error)

func (*Datastore) Retrieve

func (s *Datastore) Retrieve(ctx context.Context, datasetIDs []string, query string, opts RetrieveOpts) (*types.RetrievalResponse, error)

func (*Datastore) SimilaritySearch

func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocuments int, datasetID string, where map[string]string, whereDocument []types2.WhereDocument) ([]types2.Document, error)

func (*Datastore) UpdateDataset

func (s *Datastore) UpdateDataset(ctx context.Context, updatedDataset types.Dataset, opts *UpdateDatasetOpts) (*types.Dataset, error)

type IngestOpts

type IngestOpts struct {
	FileMetadata        *types.FileMetadata
	IsDuplicateFuncName string
	IsDuplicateFunc     IsDuplicateFunc
	IngestionFlows      []flows.IngestionFlow
	ExtraMetadata       map[string]any
	ReuseEmbeddings     bool
	ReuseFiles          bool
}

type IsDuplicateFunc

type IsDuplicateFunc func(ctx context.Context, d *Datastore, datasetID string, content []byte, opts IngestOpts) (bool, error)

IsDuplicateFunc is a function that determines whether a document is a duplicate or should be ingested. It returns true if the document is a duplicate (and thus should not be ingested) and false otherwise.

type RetrieveOpts

type RetrieveOpts struct {
	TopK          int
	Keywords      []string
	RetrievalFlow *flows.RetrievalFlow
}

type UpdateDatasetOpts

type UpdateDatasetOpts struct {
	ReplaceMedata bool
}

Directories

Path Synopsis
lib
Package postprocessors is basically the same as package transformers, but used at a different stage of the RAG pipeline

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL