storage

package
v1.3.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 21, 2026 License: Apache-2.0 Imports: 15 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BadgerStore

type BadgerStore struct {
	// contains filtered or unexported fields
}

BadgerStore implements the VisitedStore interface using BadgerDB

func NewBadgerStore

func NewBadgerStore(ctx context.Context, stateDir, siteDomain string, resume bool, logger *logrus.Entry) (*BadgerStore, error)

NewBadgerStore initializes and returns a new BadgerStore

func (*BadgerStore) CheckImageStatus

func (s *BadgerStore) CheckImageStatus(normalizedImgURL string) (models.ImageStatus, *models.ImageDBEntry, error)

CheckImageStatus implements the VisitedStore interface

func (*BadgerStore) CheckPageStatus

func (s *BadgerStore) CheckPageStatus(normalizedPageURL string) (models.PageStatus, *models.PageDBEntry, error)

CheckPageStatus implements the VisitedStore interface

func (*BadgerStore) Close

func (s *BadgerStore) Close() error

Close implements the VisitedStore interface

func (*BadgerStore) GetPageContentHash

func (s *BadgerStore) GetPageContentHash(normalizedPageURL string) (hash string, exists bool, err error)

GetPageContentHash retrieves the content hash for a previously crawled page. It returns the hash string, whether a hash exists for the page, and any error.

func (*BadgerStore) GetVisitedCount

func (s *BadgerStore) GetVisitedCount() (int, error)

GetVisitedCount implements the VisitedStore interface. It returns the cached key count in O(1); the count is maintained by atomic increments on writes.

func (*BadgerStore) MarkPageVisited

func (s *BadgerStore) MarkPageVisited(normalizedPageURL string) (bool, error)

MarkPageVisited implements the VisitedStore interface

func (*BadgerStore) RequeueIncomplete

func (s *BadgerStore) RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (int, int, error)

RequeueIncomplete implements the VisitedStore interface

func (*BadgerStore) RunGC

func (s *BadgerStore) RunGC(ctx context.Context, interval time.Duration)

RunGC runs BadgerDB's garbage collection periodically

func (*BadgerStore) UpdateImageStatus

func (s *BadgerStore) UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error

UpdateImageStatus implements the VisitedStore interface

func (*BadgerStore) UpdatePageStatus

func (s *BadgerStore) UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error

UpdatePageStatus implements the VisitedStore interface

func (*BadgerStore) WriteVisitedLog

func (s *BadgerStore) WriteVisitedLog(filePath string) error

WriteVisitedLog implements the VisitedStore interface.

type ImageStore

type ImageStore interface {
	// CheckImageStatus retrieves the status and details recorded for the image
	// URL passed as normalizedImgURL.
	// It returns the status (ImageStatusSuccess, ImageStatusFailure,
	// ImageStatusNotFound, or ImageStatusDBError), the ImageDBEntry if one was
	// found and parsed, and any error.
	CheckImageStatus(normalizedImgURL string) (status models.ImageStatus, entry *models.ImageDBEntry, err error)

	// UpdateImageStatus updates the status and details for the image URL passed
	// as normalizedImgURL, using the supplied entry.
	UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error
}

ImageStore handles image processing state

type PageStore

type PageStore interface {
	// MarkPageVisited marks a page URL as visited (pending state).
	// It returns true if the URL was newly added and false if it already existed.
	MarkPageVisited(normalizedPageURL string) (bool, error)

	// CheckPageStatus retrieves the status and details recorded for a page URL.
	// It returns the status (PageStatusSuccess, PageStatusFailure,
	// PageStatusPending, PageStatusNotFound, or PageStatusDBError), the
	// PageDBEntry if one was found and parsed, and any error.
	CheckPageStatus(normalizedPageURL string) (status models.PageStatus, entry *models.PageDBEntry, err error)

	// UpdatePageStatus updates the status and details for a page URL, using the
	// supplied entry.
	UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error

	// GetPageContentHash retrieves the content hash for a previously crawled page.
	// It returns the hash string, whether a hash exists, and any error.
	GetPageContentHash(normalizedPageURL string) (hash string, exists bool, err error)
}

PageStore handles page visitation state

type StoreAdmin

type StoreAdmin interface {
	// GetVisitedCount returns an approximate count of all keys in the store.
	GetVisitedCount() (int, error)

	// RequeueIncomplete scans the DB and sends incomplete items (failed, pending,
	// empty) to the provided channel. It should be called only during resume.
	// NOTE(review): requeuedCount is presumably the number of items sent on
	// workChan and scanErrors the number of entries that could not be processed
	// during the scan; confirm against the implementation.
	RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (requeuedCount int, scanErrors int, err error)

	// WriteVisitedLog writes all page and image keys (URLs) to the specified
	// file path.
	WriteVisitedLog(filePath string) error

	// RunGC runs periodic garbage collection. It should be run in a goroutine.
	// NOTE(review): presumably runs once per interval until ctx is cancelled;
	// confirm against the implementation.
	RunGC(ctx context.Context, interval time.Duration)

	// Close cleanly closes the database connection.
	Close() error
}

StoreAdmin handles lifecycle and administrative operations

type VisitedStore

type VisitedStore interface {
	// PageStore contributes the page visitation state methods.
	PageStore
	// ImageStore contributes the image processing state methods.
	ImageStore
	// StoreAdmin contributes the lifecycle and administrative operations.
	StoreAdmin
}

VisitedStore combines all store interfaces for components that need full access

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL