Documentation
¶
Index ¶
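- type BadgerStore
- func NewBadgerStore(ctx context.Context, stateDir, siteDomain string, resume bool, logger *logrus.Entry) (*BadgerStore, error)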
- func (s *BadgerStore) CheckImageStatus(normalizedImgURL string) (models.ImageStatus, *models.ImageDBEntry, error)
- func (s *BadgerStore) CheckPageStatus(normalizedPageURL string) (models.PageStatus, *models.PageDBEntry, error)
- func (s *BadgerStore) Close() error
- func (s *BadgerStore) GetPageContentHash(normalizedPageURL string) (hash string, exists bool, err error)
- func (s *BadgerStore) GetVisitedCount() (int, error)
- func (s *BadgerStore) MarkPageVisited(normalizedPageURL string) (bool, error)
- func (s *BadgerStore) RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (int, int, error)
- func (s *BadgerStore) RunGC(ctx context.Context, interval time.Duration)
- func (s *BadgerStore) UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error
- func (s *BadgerStore) UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error
- func (s *BadgerStore) WriteVisitedLog(filePath string) error
- type ImageStore
- type PageStore
- type StoreAdmin
- type VisitedStore
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BadgerStore ¶
type BadgerStore struct {
// contains filtered or unexported fields
}
BadgerStore implements the VisitedStore interface using BadgerDB
func NewBadgerStore ¶
func NewBadgerStore(ctx context.Context, stateDir, siteDomain string, resume bool, logger *logrus.Entry) (*BadgerStore, error)
NewBadgerStore initializes and returns a new BadgerStore
func (*BadgerStore) CheckImageStatus ¶
func (s *BadgerStore) CheckImageStatus(normalizedImgURL string) (models.ImageStatus, *models.ImageDBEntry, error)
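CheckImageStatus implements the VisitedStore interface (via ImageStore). It retrieves the status and details of an image URL, returning the ImageDBEntry if found and parsed.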
func (*BadgerStore) CheckPageStatus ¶
func (s *BadgerStore) CheckPageStatus(normalizedPageURL string) (models.PageStatus, *models.PageDBEntry, error)
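CheckPageStatus implements the VisitedStore interface (via PageStore). It retrieves the status and details of a page URL, returning the PageDBEntry if found and parsed.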
func (*BadgerStore) Close ¶
func (s *BadgerStore) Close() error
Close implements the VisitedStore interface (via StoreAdmin). It cleanly closes the database connection.
func (*BadgerStore) GetPageContentHash ¶
func (s *BadgerStore) GetPageContentHash(normalizedPageURL string) (hash string, exists bool, err error)
GetPageContentHash retrieves the content hash for a previously crawled page. Returns the hash string, whether it exists, and any error.
func (*BadgerStore) GetVisitedCount ¶
func (s *BadgerStore) GetVisitedCount() (int, error)
GetVisitedCount implements the VisitedStore interface. Returns the cached key count (O(1)) maintained by atomic increments on writes.
func (*BadgerStore) MarkPageVisited ¶
func (s *BadgerStore) MarkPageVisited(normalizedPageURL string) (bool, error)
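MarkPageVisited implements the VisitedStore interface (via PageStore). It marks a page URL as visited (pending state) and returns true if the URL was newly added, false if it already existed.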
func (*BadgerStore) RequeueIncomplete ¶
func (s *BadgerStore) RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (int, int, error)
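RequeueIncomplete implements the VisitedStore interface (via StoreAdmin). It scans the DB and sends incomplete items (failed, pending, empty) to workChan. It should be called only during resume.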
func (*BadgerStore) RunGC ¶
func (s *BadgerStore) RunGC(ctx context.Context, interval time.Duration)
RunGC runs BadgerDB's garbage collection periodically
func (*BadgerStore) UpdateImageStatus ¶
func (s *BadgerStore) UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error
UpdateImageStatus implements the VisitedStore interface (via ImageStore). It updates the status and details for an image URL.
func (*BadgerStore) UpdatePageStatus ¶
func (s *BadgerStore) UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error
UpdatePageStatus implements the VisitedStore interface (via PageStore). It updates the status and details for a page URL.
func (*BadgerStore) WriteVisitedLog ¶
func (s *BadgerStore) WriteVisitedLog(filePath string) error
WriteVisitedLog implements the VisitedStore interface (via StoreAdmin). It writes all page and image keys (URLs) to the specified file path.
type ImageStore ¶
type ImageStore interface {
// CheckImageStatus retrieves the stored status and details of an image URL.
// The parameter name indicates the URL is expected to be normalized by the
// caller before lookup.
//
// Returns:
//   - status: one of ImageStatusSuccess, ImageStatusFailure,
//     ImageStatusNotFound, or ImageStatusDBError.
//   - entry: the ImageDBEntry if one was found and parsed.
//     NOTE(review): presumably nil otherwise; confirm against the implementation.
//   - err: any error encountered.
CheckImageStatus(normalizedImgURL string) (status models.ImageStatus, entry *models.ImageDBEntry, err error)
// UpdateImageStatus updates the status and details stored for an image URL
// using the supplied entry, and returns any error encountered.
UpdateImageStatus(normalizedImgURL string, entry *models.ImageDBEntry) error
}
ImageStore handles image processing state
type PageStore ¶
type PageStore interface {
// MarkPageVisited marks a page URL as visited, recording it in the pending state.
//
// Returns true if the URL was newly added and false if it already existed,
// along with any error encountered.
MarkPageVisited(normalizedPageURL string) (bool, error)
// CheckPageStatus retrieves the stored status and details of a page URL.
//
// Returns:
//   - status: one of PageStatusSuccess, PageStatusFailure, PageStatusPending,
//     PageStatusNotFound, or PageStatusDBError.
//   - entry: the PageDBEntry if one was found and parsed.
//     NOTE(review): presumably nil otherwise; confirm against the implementation.
//   - err: any error encountered.
CheckPageStatus(normalizedPageURL string) (status models.PageStatus, entry *models.PageDBEntry, err error)
// UpdatePageStatus updates the status and details stored for a page URL
// using the supplied entry, and returns any error encountered.
UpdatePageStatus(normalizedPageURL string, entry *models.PageDBEntry) error
// GetPageContentHash retrieves the content hash for a previously crawled page.
//
// Returns the hash string, whether it exists, and any error.
GetPageContentHash(normalizedPageURL string) (hash string, exists bool, err error)
}
PageStore handles page visitation state
type StoreAdmin ¶
type StoreAdmin interface {
// GetVisitedCount returns an approximate count of all keys in the store,
// along with any error encountered.
GetVisitedCount() (int, error)
// RequeueIncomplete scans the DB and sends incomplete items (failed, pending,
// empty) to workChan. It should be called only during resume.
//
// NOTE(review): judging by the result names, requeuedCount is the number of
// items sent and scanErrors the number of entries that could not be
// processed during the scan; confirm against the implementation.
RequeueIncomplete(ctx context.Context, workChan chan<- models.WorkItem) (requeuedCount int, scanErrors int, err error)
// WriteVisitedLog writes all page and image keys (URLs) to the file at
// filePath, and returns any error encountered.
WriteVisitedLog(filePath string) error
// RunGC runs periodic garbage collection using the given interval. It has no
// return value and should be run in its own goroutine.
// NOTE(review): presumably it stops when ctx is cancelled; confirm against
// the implementation.
RunGC(ctx context.Context, interval time.Duration)
// Close cleanly closes the database connection and returns any error encountered.
Close() error
}
StoreAdmin handles lifecycle and administrative operations
type VisitedStore ¶
type VisitedStore interface {
// PageStore contributes the page visitation state methods.
PageStore
// ImageStore contributes the image processing state methods.
ImageStore
// StoreAdmin contributes the lifecycle and administrative operations.
StoreAdmin
}
VisitedStore combines all store interfaces for components that need full access