db

package
v0.4.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 3, 2026 License: MIT Imports: 39 Imported by: 0

Documentation

Index

Constants

View Source
const (
	IteratorModeFull              = tree.IteratorModeFull
	IteratorModeKeysOnly          = tree.IteratorModeKeysOnly
	IteratorModePointerProjection = tree.IteratorModePointerProjection
)
View Source
const (
	MetaPage0ID = 0
	MetaPage1ID = 1
	KeepRecent  = 10000
)

Variables

View Source
var (
	// ErrLocked indicates the database directory is already opened by another process.
	ErrLocked = lockfile.ErrLocked
	// ErrReadOnly indicates a write was attempted on a read-only DB handle.
	ErrReadOnly = errors.New("treedb: read-only")
	// ErrClosed indicates the DB handle is closed or closing for reads.
	ErrClosed = errors.New("treedb: db is closed")
)
View Source
var ErrVacuumInProgress = errors.New("online vacuum already in progress")
View Source
var ErrVacuumUnsupported = errors.New("online vacuum unsupported on this platform")

Functions

func NormalizePublicBatchReserveHint added in v0.4.0

func NormalizePublicBatchReserveHint(size int) int

NormalizePublicBatchReserveHint keeps small public hints behaving like entry reserves, but treats larger hints as approximate byte budgets so callers do not accidentally preallocate one entry per byte. This is intentionally discontinuous at the cutover for compatibility with small entry-count hints.

For internal use; behavior may change without notice and is not part of the supported external API surface of the db package.

func ResolveInlineThresholdForKey added in v0.4.0

func ResolveInlineThresholdForKey(baseThreshold int, key []byte, domains []ValueLogDomainThreshold) int

ResolveInlineThresholdForKey chooses an inline threshold for key using longest-prefix domain overrides and a global fallback.

Callers should pass NormalizeValueLogDomainThresholds output so that the first match is the intended longest-prefix override.

func SaveFormatConfig added in v0.4.0

func SaveFormatConfig(dir string, cfg FormatConfig) error

SaveFormatConfig writes cfg to dir/format.json atomically.

func VacuumIndexOffline

func VacuumIndexOffline(opts Options) error

VacuumIndexOffline rewrites index.db into a fresh file and swaps it in.

This is intended to reclaim space (reduce `index.db` chunk count) and restore locality after long churn. It is an offline operation (requires exclusive open lock).

func ValidateFragmentationReport

func ValidateFragmentationReport(rep map[string]string) error

ValidateFragmentationReport validates basic invariants on a FragmentationReport output map. It is intended for tests and operational "health" tooling.

func ValueReaderForState

func ValueReaderForState(state *DBState) tree.SlabReader

ValueReaderForState returns a reader that resolves value-log pointers.

Types

type Batch

type Batch struct {
	// contains filtered or unexported fields
}

Batch implements the cosmos-db Batch interface.

func (*Batch) Close

func (b *Batch) Close() error

func (*Batch) Delete

func (b *Batch) Delete(key []byte) error

func (*Batch) DeleteView

func (b *Batch) DeleteView(key []byte) error

DeleteView records a Delete without copying the key bytes. Callers must treat key as immutable until the batch is written or closed.

func (*Batch) GetByteSize

func (b *Batch) GetByteSize() (int, error)

func (*Batch) Replay

func (b *Batch) Replay(fn func(batch.Entry) error) error

func (*Batch) Reserve added in v0.4.0

func (b *Batch) Reserve(n int)

Reserve forwards best-effort preallocation hints to the internal batch.

func (*Batch) Reset

func (b *Batch) Reset()

Reset clears the batch for reuse.

func (*Batch) Set

func (b *Batch) Set(key, value []byte) error

func (*Batch) SetOps

func (b *Batch) SetOps(ops []batch.Entry) error

func (*Batch) SetPointer

func (b *Batch) SetPointer(key []byte, ptr page.ValuePtr) error

SetPointer records a pointer without copying the value bytes.

func (*Batch) SetPointerView added in v0.2.0

func (b *Batch) SetPointerView(key []byte, ptr page.ValuePtr) error

SetPointerView records a pointer without copying the key bytes.

func (*Batch) SetView

func (b *Batch) SetView(key, value []byte) error

SetView records a Put without copying key/value bytes. Callers must treat key/value as immutable until the batch is written or closed.

This is intentionally not part of the public batch.Interface; it is a best-effort optimization used by higher-level layers (e.g. cached streaming).

func (*Batch) Write

func (b *Batch) Write() error

func (*Batch) WriteSync

func (b *Batch) WriteSync() error

type DB

type DB struct {
	// contains filtered or unexported fields
}

func Open

func Open(opts Options) (*DB, error)

Open opens the database.

func (*DB) AcquireSnapshot

func (db *DB) AcquireSnapshot() *Snapshot

AcquireSnapshot returns a new snapshot.

func (*DB) Close

func (db *DB) Close() error

func (*DB) Commit

func (db *DB) Commit(newRootID uint64) error

Commit persists the new root (Sync=true by default). Note: this is usually called internally by Batch.Write; it may also be called externally by callers that manage roots manually. The `Commit` signature assumes a manually managed root: an external caller might not know which pages were retired, so for manual commits the retired-page list is accepted as nil.

func (*DB) CompactIndex

func (db *DB) CompactIndex() error

CompactIndex rewrites the entire B-Tree sequentially to the end of the file. This improves Full Scan performance by restoring physical locality. Note: This operation causes file growth as old pages are not immediately reclaimed (they are leaked to the freelist but not reused during this append-only build).

func (*DB) Delete

func (db *DB) Delete(key []byte) error

Delete removes a key.

func (*DB) DeleteSync

func (db *DB) DeleteSync(key []byte) error

DeleteSync removes a key and syncs.

func (*DB) Dir added in v0.3.0

func (db *DB) Dir() string

Dir returns the on-disk directory backing the DB.

func (*DB) FragmentationReport

func (db *DB) FragmentationReport() (map[string]string, error)

FragmentationReport returns best-effort structural stats about the user index that help diagnose scan regressions after churn.

func (*DB) Get

func (db *DB) Get(key []byte) ([]byte, error)

Get returns the value for a key.

Semantics: Returns a safe copy of the value.

func (*DB) GetAppend

func (db *DB) GetAppend(key, dst []byte) ([]byte, error)

GetAppend appends the value for the key to dst and returns the new slice. If the key is not found, it returns dst and tree.ErrKeyNotFound.

func (*DB) GetMany added in v0.4.0

func (db *DB) GetMany(keys [][]byte) ([][]byte, error)

GetMany returns values for keys.

Semantics: Returns safe copies of values. Missing keys are returned as nil entries with no error.

func (*DB) GetManyParallelPlan added in v0.4.0

func (db *DB) GetManyParallelPlan(keyCount int) (workers int, parallel bool)

GetManyParallelPlan reports how this backend would schedule GetMany for the provided key count.

func (*DB) GetUnsafe

func (db *DB) GetUnsafe(key []byte) ([]byte, error)

GetUnsafe returns the value for a key.

Semantics: Despite its name, this method returns a safe copy of the value. For zero-copy views tied to a snapshot lifetime, use Snapshot.GetUnsafe.

func (*DB) Has

func (db *DB) Has(key []byte) (bool, error)

Has checks if a key exists.

func (*DB) InlineThreshold

func (db *DB) InlineThreshold() int

func (*DB) InlineThresholdForKey added in v0.4.0

func (db *DB) InlineThresholdForKey(key []byte) int

func (*DB) Iterator

func (db *DB) Iterator(start, end []byte) (iterator.UnsafeIterator, error)

Iterator returns an iterator.

func (*DB) IteratorWithOptions added in v0.4.0

func (db *DB) IteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)

IteratorWithOptions returns an iterator with explicit value materialization controls.

func (*DB) MarkValueLogZombie

func (db *DB) MarkValueLogZombie(id uint32) error

MarkValueLogZombie marks a value-log segment as zombie so it can be removed once all snapshots release it.

func (*DB) NewBatch

func (db *DB) NewBatch() batch.Interface

func (*DB) NewBatchWithSize

func (db *DB) NewBatchWithSize(size int) batch.Interface

NewBatchWithSize accepts the public cosmos-db style size hint. Small values are treated like exact entry reserves; larger values are normalized as approximate byte budgets and capped to avoid preallocating one entry per byte. The normalization is intentionally discontinuous at the cutover: `publicBatchHintExactEntryReserveMax` still means "reserve that many entries", while the next value is treated as a byte budget and normalized downward.

func (*DB) Pager

func (db *DB) Pager() *pager.Pager

Getters: Pager returns the DB's *pager.Pager.

func (*DB) Print

func (db *DB) Print() error

Print writes a simple dump of the tree for debugging.

func (*DB) Prune

func (db *DB) Prune()

Prune reclaims pages from the graveyard.

func (*DB) RefreshValueLogSet added in v0.3.0

func (db *DB) RefreshValueLogSet() error

RefreshValueLogSet publishes a new DBState with the current value-log set (excluding zombies) without creating a new commit.

func (*DB) RegisterValueLogSegment added in v0.4.0

func (db *DB) RegisterValueLogSegment(path string, fileID uint32) error

RegisterValueLogSegment registers a newly created value-log segment with the backend read manager without scanning the filesystem. Cached mode uses this when it rotates the shared value log so outer-leaf commits can publish a current ValueLogSet via CurrentSetNoRefresh.

func (*DB) ReverseIterator

func (db *DB) ReverseIterator(start, end []byte) (iterator.UnsafeIterator, error)

ReverseIterator returns a reverse iterator.

func (*DB) ReverseIteratorWithOptions added in v0.4.0

func (db *DB) ReverseIteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)

ReverseIteratorWithOptions returns a reverse iterator with explicit value materialization controls.

func (*DB) Set

func (db *DB) Set(key, value []byte) error

Set sets the value for a key.

func (*DB) SetCurrentValueLogReadBarrier added in v0.4.0

func (db *DB) SetCurrentValueLogReadBarrier(fn func(fileID uint32) error)

SetCurrentValueLogReadBarrier installs a callback that will be invoked before backend-internal reads of segments still marked currentWritable.

func (*DB) SetLeafPageLog added in v0.4.0

func (db *DB) SetLeafPageLog(log LeafPageLog)

SetLeafPageLog installs the value-log appender used for value-log-backed leaf pages. It is typically wired by the cached layer after opening the backend.

func (*DB) SetSync

func (db *DB) SetSync(key, value []byte) error

SetSync sets the value and syncs to disk.

func (*DB) SetZipperParallelMergePressureSource added in v0.4.0

func (db *DB) SetZipperParallelMergePressureSource(src zipper.ParallelMergePressureSource)

SetZipperParallelMergePressureSource installs an optional pressure signal for future zipper generations and the current live zipper.

func (*DB) State

func (db *DB) State() *DBState

func (*DB) Stats

func (db *DB) Stats() map[string]string

Stats returns database statistics.

func (*DB) VacuumIndexOnline

func (db *DB) VacuumIndexOnline(ctx context.Context) error

VacuumIndexOnline rebuilds the index into a new file and swaps it in with a short writer pause. Old snapshots remain valid by pinning the previous index generation until readers drain; disk space is reclaimed once the old mmap is closed.

func (*DB) ValueLogGC added in v0.3.0

func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogGCStats, error)

ValueLogGC deletes fully-unreferenced value-log segments.

It scans the user + system trees for value-log pointers, computes referenced segments, and removes segments that are:

  • not referenced,
  • not the currently-active segment per lane,
  • and not pinned by active snapshots.

func (*DB) ValueLogRewriteChunkPlan added in v0.4.0

func (db *DB) ValueLogRewriteChunkPlan(ctx context.Context, opts ValueLogRewriteOnlineOptions, chunkBytes int64) (ValueLogRewriteChunkPlan, error)

func (*DB) ValueLogRewriteOnline added in v0.4.0

func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnlineOptions) (stats ValueLogRewriteStats, err error)

ValueLogRewriteOnline rewrites pointer-backed values in bounded commit batches, then atomically swaps keys to rewritten pointers.

func (*DB) ValueLogRewritePlan added in v0.4.0

func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlineOptions) (ValueLogRewritePlan, error)

ValueLogRewritePlan returns the segments that would be selected for sparse online rewrite given opts. It performs the same live-byte estimation work as ValueLogRewriteOnline sparse selection, but does not modify the DB.

func (*DB) Zipper

func (db *DB) Zipper() *zipper.Zipper

type DBIterator

type DBIterator struct {
	// contains filtered or unexported fields
}

DBIterator wraps tree.Iterator and holds a Snapshot.

func (*DBIterator) Close

func (it *DBIterator) Close() error

func (*DBIterator) DebugStats

func (it *DBIterator) DebugStats() (queueLen int, sourcesUsed int)

func (*DBIterator) Domain

func (it *DBIterator) Domain() (start, end []byte)

func (*DBIterator) Error

func (it *DBIterator) Error() error

func (*DBIterator) IsDeleted

func (it *DBIterator) IsDeleted() bool

func (*DBIterator) Key

func (it *DBIterator) Key() []byte

func (*DBIterator) KeyCopy

func (it *DBIterator) KeyCopy(dst []byte) []byte

func (*DBIterator) Next

func (it *DBIterator) Next()

func (*DBIterator) Seek

func (it *DBIterator) Seek(key []byte)

UnsafeIterator methods

func (*DBIterator) UnsafeEntry

func (it *DBIterator) UnsafeEntry() ([]byte, page.ValuePtr, byte)

func (*DBIterator) UnsafeKey

func (it *DBIterator) UnsafeKey() []byte

func (*DBIterator) UnsafeValue

func (it *DBIterator) UnsafeValue() []byte

func (*DBIterator) Valid

func (it *DBIterator) Valid() bool

func (*DBIterator) Value

func (it *DBIterator) Value() []byte

func (*DBIterator) ValueCopy

func (it *DBIterator) ValueCopy(dst []byte) []byte

type DBState

type DBState struct {
	CommitSeq        uint64
	RootPageID       uint64
	SystemRootPageID uint64
	ValueLogSet      *valuelog.Set
}

type DurabilityMode added in v0.3.0

type DurabilityMode uint8

DurabilityMode configures cached-mode durability semantics.

These modes are explicit and intentionally replace the previous boolean combination of DisableWAL + RelaxedSync + AllowUnsafe.

const (
	// DurabilityDurable enables WAL (journal) and uses fsync for sync operations.
	DurabilityDurable DurabilityMode = iota
	// DurabilityWALOnRelaxed keeps WAL enabled but disables fsync (crash-consistent).
	DurabilityWALOnRelaxed
	// DurabilityWALOffRelaxed disables WAL and fsync (unsafe; recent writes
	// may be lost and sync calls may defer backend publication until a later
	// checkpoint/flush boundary).
	DurabilityWALOffRelaxed
)

type FormatConfig added in v0.4.0

type FormatConfig struct {
	Version int `json:"version"`

	IndexOuterLeavesInValueLog bool `json:"index_outer_leaves_in_vlog"`

	LeafPrefixCompression     bool `json:"leaf_prefix_compression"`
	IndexColumnarLeaves       bool `json:"index_columnar_leaves"`
	IndexPackedValuePtr       bool `json:"index_packed_valueptr"`
	IndexInternalBaseDelta    bool `json:"index_internal_base_delta"`
	IndexAdaptiveLeafEncoding bool `json:"index_adaptive_leaf_encoding"`

	ValueLogCompression string `json:"vlog_compression"`
	ValueLogBlockCodec  string `json:"vlog_block_codec"`
	ValueLogAutoPolicy  string `json:"vlog_auto_policy"`
}

FormatConfig captures the format-affecting knobs that maintenance tooling should preserve when rewriting index/value-log state.

The persisted file (dir/format.json) is best-effort and pre-alpha; callers should tolerate it being absent. Versioned files written by SaveFormatConfig are expected to be fully populated; if new fields are added in the future, the version should be bumped so older binaries do not accidentally apply zero-values.

func LoadFormatConfig added in v0.4.0

func LoadFormatConfig(dir string) (FormatConfig, bool, error)

LoadFormatConfig loads the best-effort persisted format config for dir. The returned bool reports whether the file was found.

func (FormatConfig) ApplyIndexFormatToOptions added in v0.4.0

func (cfg FormatConfig) ApplyIndexFormatToOptions(opts *Options)

ApplyIndexFormatToOptions overwrites index-format-affecting knobs in opts from cfg.

This is intentionally narrower than ApplyToOptions: it is safe for normal DB opens where callers may want to tune runtime policies (e.g. value-log compression) via env vars or flags, while still ensuring the index encoding matches on-disk state.

func (FormatConfig) ApplyToOptions added in v0.4.0

func (cfg FormatConfig) ApplyToOptions(opts *Options)

ApplyToOptions overwrites format-affecting knobs in opts from cfg.

Callers should treat cfg as best-effort (it may be absent) and may apply explicit overrides after this call.

type IntegrityMode added in v0.3.0

type IntegrityMode uint8

IntegrityMode configures value-log read integrity checks.

It intentionally replaces the previous DisableReadChecksum boolean.

const (
	// IntegrityVerify enables checksum verification on value-log reads.
	IntegrityVerify IntegrityMode = iota
	// IntegritySkipChecksums disables checksum verification on value-log reads (unsafe).
	IntegritySkipChecksums
)

type Iterator

type Iterator interface {
	Valid() bool
	Next()
	Key() []byte
	Value() []byte
	KeyCopy(dst []byte) []byte
	ValueCopy(dst []byte) []byte
	Close() error
	Error() error
	// Reset resets the iterator for reuse.
	Reset(start, end []byte)
}

Iterator is the internal interface for iteration.

type IteratorMode added in v0.4.0

type IteratorMode = tree.IteratorMode

type IteratorOptions added in v0.4.0

type IteratorOptions = tree.IteratorOptions

type LeafPageLog added in v0.4.0

type LeafPageLog interface {
	AppendLeafPage(leafPage []byte) (page.ValuePtr, error)
	Flush() error
	Sync() error
}

LeafPageLog appends and flushes B+Tree leaf pages stored in the value log.

This is used when Options.IndexOuterLeavesInValueLog is enabled. Implementations are expected to reuse the existing value-log record encoding and compression semantics (i.e. they should append normal value-log records and return ValuePtr references).

type Options

type Options struct {
	Dir string
	// IgnoreFormatConfig disables best-effort persisted format.json loading in
	// TreeDB open paths that auto-apply index-format knobs from disk (e.g.
	// treedb.Open, treedb.OpenBackend) and in offline maintenance helpers
	// (VacuumIndexOffline, ValueLogRewriteOffline, treemap vacuum/rewrite/vlog-gc).
	IgnoreFormatConfig bool
	// ReadOnly opens the database without acquiring an exclusive lock and without
	// modifying on-disk state (no recovery truncation, no WAL replay, no background
	// maintenance). Only read operations are supported.
	ReadOnly  bool
	ChunkSize int64 // Default 256KiB
	// DictDBChunkSize controls the mmap chunk size used for the `dictdb/` side
	// store when TreeDB is opened via the public `treedb.Open` wrapper.
	//
	// It is intentionally independent of ChunkSize so benchmarks and callers can
	// tune the main index pager without inflating dictdb disk usage.
	//
	// Values <= 0 use a default of 64KiB.
	DictDBChunkSize int64
	// TemplateDBChunkSize controls the mmap chunk size used for the `templatedb/`
	// side store when template compression is enabled.
	//
	// Values <= 0 use a default of 64KiB.
	TemplateDBChunkSize int64
	KeepRecent          uint64 // Default 10000
	// PagerSyncConcurrency controls how many goroutines may msync dirty chunks
	// in parallel during Sync. Values <= 0 use the default (1).
	PagerSyncConcurrency int
	// PagerMmapPopulate enables MAP_POPULATE on Linux when mmapping index.db
	// chunks. This can reduce minor-fault overhead under random access patterns
	// at the cost of increased work at map/grow time.
	PagerMmapPopulate bool
	// PagerPrefetchOnRead enables best-effort prefetch hints (madvise WILLNEED)
	// for mmapped index chunks (Linux only). When enabled, TreeDB may issue
	// prefetch requests opportunistically (e.g. before rewriting child pages
	// during checkpoint/merge). It is a no-op on unsupported platforms.
	PagerPrefetchOnRead bool

	// Durability configures cached-mode durability semantics.
	//
	// The default (zero) is DurabilityDurable.
	Durability DurabilityMode
	// DisableBackgroundPrune keeps pruning on the commit critical path (legacy
	// behavior). When false (default), a bounded background pruner frees pages
	// asynchronously to reduce commit latency under churn.
	DisableBackgroundPrune bool
	// PruneInterval controls how often the background pruner wakes up (0 uses a
	// default).
	PruneInterval time.Duration
	// PruneMaxPages bounds how many pages are freed per pruner tick (0 uses a
	// default; <0 means unlimited).
	PruneMaxPages int
	// PruneMaxDuration bounds how long a pruner tick may run (0 uses a default;
	// <0 means unlimited).
	PruneMaxDuration time.Duration

	FlushThreshold int64
	// MemtableMode selects the cached-mode memtable implementation.
	// Supported values: "skiplist", "hash_sorted", "btree", "append_only", "adaptive".
	MemtableMode string
	// MemtableShards controls the number of mutable memtable shards in cached
	// mode. Values <= 0 use a runtime-dependent default.
	MemtableShards int
	// DomainIngressWorkers enables experimental domain-local ingress workers in
	// cached mode. Values <= 0 keep the legacy direct write path.
	DomainIngressWorkers int
	// DomainIngressQueueSize configures the per-worker ingress queue length when
	// DomainIngressWorkers is enabled. Values <= 0 use a default.
	DomainIngressQueueSize int
	// PreferAppendAlloc makes the page allocator ignore the freelist and append
	// new pages instead. This can improve scan locality under churn at the cost
	// of file growth (space is reclaimed later via vacuum).
	PreferAppendAlloc bool
	// FreelistRegionPages and FreelistRegionRadius bias freelist reuse toward
	// nearby page regions to improve locality. Leave both at 0 to disable the
	// bias (default). If either is set, missing values will use defaults.
	// Set FreelistRegionRadius < 0 to force-disable the bias.
	FreelistRegionPages  uint64
	FreelistRegionRadius int

	// LeafFillTargetPPM and InternalFillTargetPPM control how full newly-written
	// B+Tree pages are allowed to become before forcing a split (soft-full).
	// Lower values reduce split churn and slow re-fragmentation under updates, at
	// the cost of higher page count (more index bytes).
	//
	// Values are in parts-per-million where 1_000_000 means "allow full pages"
	// (current behavior). Zero uses the default (1_000_000).
	LeafFillTargetPPM     uint32
	InternalFillTargetPPM uint32
	// MaintenanceOpsPerCoalesce controls the maintenance budget during zipper
	// merge. It bounds coalesce work to roughly len(ops)/K operations per batch.
	// 0 uses the default; negative disables the budget (full maintenance).
	MaintenanceOpsPerCoalesce int
	// LeafPrefixCompression enables prefix-compressed leaf nodes for new pages.
	LeafPrefixCompression bool
	// IndexColumnarLeaves enables the experimental columnar leaf encoding for new pages.
	IndexColumnarLeaves bool
	// IndexPackedValuePtr enables the experimental packed 12-byte ValuePtr encoding
	// for pointer entries in new leaf pages.
	//
	// Packed pointers store ValuePtr.Offset as u32 on disk. Callers must ensure
	// value-log segments are rotated such that offsets remain representable.
	IndexPackedValuePtr bool
	// IndexInternalBaseDelta enables the experimental internal-node base-delta encoding.
	IndexInternalBaseDelta bool
	// IndexOuterLeavesInValueLog stores B+Tree leaf pages (the pages containing
	// key/value entries) in the persistent value log instead of index.db.
	//
	// When enabled, internal nodes store encoded value-log pointers for leaf
	// children. This is pre-alpha and changes on-disk format/assumptions.
	IndexOuterLeavesInValueLog bool
	// IndexAdaptiveLeafEncoding enables per-page adaptive selection of leaf
	// encoding flags using deterministic heuristics from key/value shape.
	//
	// This option only affects newly-written leaf pages.
	IndexAdaptiveLeafEncoding bool
	// MaxQueuedMemtables controls how much immutable-memtable backlog the cached
	// layer will allow before applying backpressure (i.e. forcing flush work on
	// writers). A negative value disables backpressure entirely (higher short-term
	// ingest, but potentially unbounded flush debt). Zero uses the default.
	MaxQueuedMemtables int

	// SlowdownBacklogSeconds begins applying writer backpressure when queued flush
	// backlog exceeds this many seconds of estimated flush work (0 disables).
	SlowdownBacklogSeconds float64
	// StopBacklogSeconds blocks writers when queued flush backlog exceeds this many
	// seconds of estimated flush work (0 disables).
	StopBacklogSeconds float64
	// MaxBacklogBytes is an absolute cap on queued flush backlog bytes (0 disables).
	MaxBacklogBytes int64

	// WriterFlushMaxMemtables bounds how much queued work a writer will help flush
	// per write when backpressure is active (0 uses a default).
	WriterFlushMaxMemtables int
	// WriterFlushMaxDuration bounds how long a writer will spend helping flush per
	// write when backpressure is active (0 disables the time bound).
	WriterFlushMaxDuration time.Duration
	// FlushBuildConcurrency controls how many goroutines may be used to build a
	// combined flush batch from multiple immutable memtables in cached mode.
	// Values <= 1 disable parallelism.
	FlushBuildConcurrency int
	// FlushBuildMinEntries gates the parallel build path by total entries.
	// Values <= 0 use a default of 16k.
	FlushBuildMinEntries int
	// FlushBuildMinUnits gates the parallel build path by number of queued units.
	// Values <= 0 use a default of 2.
	FlushBuildMinUnits int
	// FlushBuildChunkCap controls the maximum entries per build chunk.
	// A value of 0 enables adaptive chunk sizing, values < 0 use the fixed default of 8192,
	// and values > 0 set an explicit cap.
	FlushBuildChunkCap int
	// FlushBuildChunkTargetBytes controls adaptive chunk sizing (bytes per chunk).
	// Values <= 0 use a default of 2MiB.
	FlushBuildChunkTargetBytes int
	// FlushBuildChunkMinBytes clamps adaptive chunk sizes (minimum bytes).
	// Values <= 0 use a default of 1MiB.
	FlushBuildChunkMinBytes int
	// FlushBuildChunkMaxBytes clamps adaptive chunk sizes (maximum bytes).
	// Values <= 0 use a default of 4MiB.
	FlushBuildChunkMaxBytes int
	// FlushBuildPrefetchUnits controls how many memtables to start building ahead
	// of the consumer. Values <= 0 use FlushBuildConcurrency.
	FlushBuildPrefetchUnits int

	// FlushBackendMaxEntries caps how many operations are buffered into a single
	// backend batch before committing it and continuing with a fresh batch.
	//
	// This increases backend commit cadence during very large flushes, which can
	// reduce index.db high-watermark growth under small KeepRecent windows by
	// making retired pages eligible for reuse sooner.
	//
	// 0 uses the internal default. Negative disables chunking (single backend
	// commit per flush).
	FlushBackendMaxEntries int
	// FlushBackendMaxBatches caps how many intermediate backend commits a single
	// flush may emit (0=default, <0=disable cap).
	FlushBackendMaxBatches int

	// JournalLanes controls the number of active commit/value log lanes (0=default).
	// Max supported lanes is 255; value-log segment sequence per lane is capped at 8,388,607.
	JournalLanes int
	// WALMaxSegmentBytes caps the size of a single WAL segment payload.
	// 0 uses the default limit.
	WALMaxSegmentBytes int64
	// JournalCompression enables best-effort zstd compression for cached-mode
	// journal/commitlog segments (metadata only).
	//
	// The redo log will only keep compressed bytes when they are smaller than the
	// raw payload, so compression never causes size amplification.
	JournalCompression bool

	// ValueLog configures value-log pointer behavior and read integrity.
	ValueLog ValueLogOptions

	// NotifyError is an optional hook for background maintenance failures.
	NotifyError func(error)

	// VerifyOnRead forces checksum verification on every index page read,
	// bypassing the verified-page cache.
	VerifyOnRead bool
	// DisableSideStores skips opening dictdb/templatedb side stores.
	// This is intended for internal side-store usage (e.g. templatedb itself).
	DisableSideStores bool

	// DisablePiggybackCompaction disables opportunistic defragmentation during writes.
	// When false (default), nodes are rewritten if their siblings are physically
	// distant, keeping the tree clustered. Set to true to maximize write speed.
	DisablePiggybackCompaction bool

	// BackgroundCheckpointInterval enables periodic durable checkpoints in cached
	// mode. A checkpoint creates a backend sync boundary and trims
	// cached-mode WAL segments to keep `wal/` growth bounded.
	//
	// Semantics:
	// - `0` uses a default.
	// - `<0` disables the periodic interval trigger.
	BackgroundCheckpointInterval time.Duration
	// BackgroundCheckpointIdleDuration triggers an opportunistic checkpoint after
	// a period of write-idleness in cached mode.
	//
	// Semantics:
	// - `0` uses a default.
	// - `<0` disables the idle trigger.
	BackgroundCheckpointIdleDuration time.Duration
	// BackgroundIndexVacuumInterval enables periodic online index vacuum passes.
	// `0` uses a default; `<0` disables.
	BackgroundIndexVacuumInterval time.Duration
	// BackgroundIndexVacuumSpanRatioPPM sets the span ratio threshold that
	// triggers a vacuum pass (0 uses a default).
	BackgroundIndexVacuumSpanRatioPPM uint32
	// MaxWALBytes triggers an immediate checkpoint in cached mode when the sum of
	// WAL segment sizes exceeds this many bytes (0 uses a default; <0 disables the
	// size trigger). This is an operational safety cap; it does not make each
	// individual write durable (use *Sync APIs for that).
	MaxWALBytes int64
}

type Snapshot

type Snapshot struct {
	// contains filtered or unexported fields
}

func (*Snapshot) Close

func (s *Snapshot) Close() error

Close releases the snapshot.

func (*Snapshot) Get

func (s *Snapshot) Get(key []byte) ([]byte, error)

Get returns value from snapshot.

func (*Snapshot) GetAppend added in v0.4.0

func (s *Snapshot) GetAppend(key, dst []byte) ([]byte, error)

GetAppend appends the value for key to dst and returns the grown slice. If key is not found, it returns dst and tree.ErrKeyNotFound.

func (*Snapshot) GetEntry

func (s *Snapshot) GetEntry(key []byte) (node.LeafEntry, error)

GetEntry returns the persisted leaf entry for key.

func (*Snapshot) GetEntryExact added in v0.4.0

func (s *Snapshot) GetEntryExact(key []byte) (node.LeafEntry, error)

GetEntryExact is an alias for GetEntry.

func (*Snapshot) GetUnsafe

func (s *Snapshot) GetUnsafe(key []byte) ([]byte, error)

GetUnsafe returns a zero-copy view of the value from the snapshot. The slice is valid until the snapshot is closed.

func (*Snapshot) Has

func (s *Snapshot) Has(key []byte) (bool, error)

func (*Snapshot) Pager

func (s *Snapshot) Pager() *pager.Pager

func (*Snapshot) State

func (s *Snapshot) State() *DBState

type SnapshotPool

type SnapshotPool struct {
	// contains filtered or unexported fields
}

SnapshotPool manages a pool of Snapshot objects to reduce allocation overhead.

func NewSnapshotPool

func NewSnapshotPool() *SnapshotPool

func (*SnapshotPool) Get

func (p *SnapshotPool) Get() *Snapshot

func (*SnapshotPool) Put

func (p *SnapshotPool) Put(s *Snapshot)

type ValueLogAutoPolicy added in v0.3.0

type ValueLogAutoPolicy uint8

ValueLogAutoPolicy controls auto-mode dict vs block selection bias.

const (
	ValueLogAutoBalanced ValueLogAutoPolicy = iota
	ValueLogAutoThroughput
	ValueLogAutoSize
)

type ValueLogBlockCodec added in v0.3.0

type ValueLogBlockCodec uint8

ValueLogBlockCodec selects the block codec used for block compression modes.

const (
	ValueLogBlockSnappy ValueLogBlockCodec = iota
	ValueLogBlockLZ4
)

type ValueLogCompressionMode added in v0.3.0

type ValueLogCompressionMode uint8

ValueLogCompressionMode selects value-log compression behavior in cached mode.

const (
	// ValueLogCompressionOff stores value-log grouped frames uncompressed.
	//
	// Zero is intentionally reserved as "unset/default".
	// db.Open normalizes zero to ValueLogCompressionAuto.
	ValueLogCompressionOff ValueLogCompressionMode = iota + 1
	// ValueLogCompressionBlock uses block compression without dictionaries.
	ValueLogCompressionBlock
	// ValueLogCompressionDict uses dictionary compression when available.
	ValueLogCompressionDict
	// ValueLogCompressionAuto adaptively chooses off/block/dict.
	ValueLogCompressionAuto
)

type ValueLogDictClassMode added in v0.4.0

type ValueLogDictClassMode uint8

ValueLogDictClassMode controls whether dictionary state is shared across all value-log payloads or split by payload class.

const (
	// ValueLogDictClassSingle keeps one shared dictionary stream for all
	// value-log payloads.
	ValueLogDictClassSingle ValueLogDictClassMode = iota
	// ValueLogDictClassSplitOuterLeaf keeps separate dictionary streams for
	// outer-leaf payloads and single-value payloads.
	ValueLogDictClassSplitOuterLeaf
)

type ValueLogDomainThreshold added in v0.4.0

type ValueLogDomainThreshold struct {
	// Prefix selects the key domain this override applies to.
	Prefix []byte
	// InlineThreshold is the maximum inline value size for keys in Prefix.
	// Values larger than this threshold are eligible for value-log pointers.
	// Zero forces all non-empty values in this domain to pointer placement.
	InlineThreshold int
}

ValueLogDomainThreshold overrides inline-vs-pointer placement policy for keys under a domain prefix.

After normalization the entries are ordered longest-prefix-first, so a key belongs to the first matching prefix (the longest matching prefix wins).

func NormalizeValueLogDomainThresholds added in v0.4.0

func NormalizeValueLogDomainThresholds(in []ValueLogDomainThreshold) []ValueLogDomainThreshold

NormalizeValueLogDomainThresholds returns a deterministic longest-prefix-first copy suitable for hot-path threshold lookups.

It filters invalid entries (empty prefix or negative thresholds) and de-duplicates identical prefixes after sorting.

type ValueLogGCOptions added in v0.3.0

type ValueLogGCOptions struct {
	DryRun bool
	// ProtectedPaths preserves legacy callers that provide a single merged set
	// of protected paths. Prefer the specific ProtectedInUsePaths and
	// ProtectedRetainedPaths fields for blocker classification.
	ProtectedPaths []string
	// ProtectedInUsePaths are paths that may still be referenced by mutable
	// in-memory state during online maintenance.
	ProtectedInUsePaths []string
	// ProtectedRetainedPaths are paths pinned by pointer lifecycle retention.
	ProtectedRetainedPaths []string
	// ObservedSourceFileIDs enables per-classification probe counters for a
	// caller-provided subset of segment IDs (for example, rewrite-selected
	// source segments). IDs not present in the current set are ignored.
	ObservedSourceFileIDs []uint32
	// ObservedSourceAssumeUnreferenced indicates ObservedSourceFileIDs are
	// already known to be unreferenced. When true, ValueLogGC skips the
	// reachability scan and only classifies (and, if !DryRun, zombifies) the
	// observed IDs; it does not attempt to reclaim other segments.
	ObservedSourceAssumeUnreferenced bool
}

ValueLogGCOptions controls value-log garbage collection.

type ValueLogGCStats added in v0.3.0

type ValueLogGCStats struct {
	SegmentsTotal                           int
	SegmentsReferenced                      int
	SegmentsActive                          int
	SegmentsProtected                       int
	SegmentsProtectedInUse                  int
	SegmentsProtectedRetained               int
	SegmentsProtectedOverlap                int
	SegmentsProtectedOther                  int
	SegmentsEligible                        int
	SegmentsDeleted                         int
	SegmentsPending                         int
	BytesTotal                              int64
	BytesReferenced                         int64
	BytesActive                             int64
	BytesProtected                          int64
	BytesProtectedInUse                     int64
	BytesProtectedRetained                  int64
	BytesProtectedOverlap                   int64
	BytesProtectedOther                     int64
	BytesEligible                           int64
	BytesDeleted                            int64
	BytesPending                            int64
	ObservedSourceSegments                  int
	ObservedSourceSegmentsReferenced        int
	ObservedSourceSegmentsActive            int
	ObservedSourceSegmentsProtected         int
	ObservedSourceSegmentsProtectedInUse    int
	ObservedSourceSegmentsProtectedRetained int
	ObservedSourceSegmentsProtectedOverlap  int
	ObservedSourceSegmentsProtectedOther    int
	ObservedSourceSegmentsEligible          int
	ObservedSourceSegmentsDeleted           int
	ObservedSourceSegmentsPending           int
	ObservedSourceBytes                     int64
	ObservedSourceBytesReferenced           int64
	ObservedSourceBytesActive               int64
	ObservedSourceBytesProtected            int64
	ObservedSourceBytesProtectedInUse       int64
	ObservedSourceBytesProtectedRetained    int64
	ObservedSourceBytesProtectedOverlap     int64
	ObservedSourceBytesProtectedOther       int64
	ObservedSourceBytesEligible             int64
	ObservedSourceBytesDeleted              int64
	ObservedSourceBytesPending              int64
}

ValueLogGCStats summarizes value-log GC work.

type ValueLogGenerationConfig added in v0.4.0

type ValueLogGenerationConfig struct {
	// Policy selects generation behavior. ValueLogGenerationOff keeps the
	// legacy single-generation behavior.
	Policy ValueLogGenerationPolicy
	// HotSegmentTargetBytes configures target segment size for hot generation.
	// 0 uses implementation default.
	HotSegmentTargetBytes int64
	// WarmSegmentTargetBytes configures target segment size for warm generation.
	// 0 uses implementation default.
	WarmSegmentTargetBytes int64
	// ColdSegmentTargetBytes configures target segment size for cold generation.
	// 0 uses implementation default.
	ColdSegmentTargetBytes int64
	// RewriteBudgetBytesPerSec bounds background incremental rewrite bandwidth.
	// 0 disables byte-budget trigger.
	RewriteBudgetBytesPerSec int64
	// RewriteBudgetRecordsPerSec bounds background incremental rewrite records/s.
	// 0 disables record-budget trigger.
	RewriteBudgetRecordsPerSec int
	// RewriteTriggerStaleRatioPPM triggers rewrite when stale/live ratio exceeds
	// threshold (parts-per-million, 0 disables).
	RewriteTriggerStaleRatioPPM uint32
	// RewriteTriggerTotalBytes triggers rewrite when total retained bytes exceeds
	// threshold (0 disables).
	RewriteTriggerTotalBytes int64
	// RewriteTriggerChurnPerSec triggers rewrite when churn rate exceeds
	// threshold (0 disables).
	RewriteTriggerChurnPerSec int64
	// RewriteMinSegmentAge gates online rewrite to source segments that are at
	// least this old.
	//
	// 0 uses the implementation default.
	RewriteMinSegmentAge time.Duration
}

ValueLogGenerationConfig configures generational value-log behavior.

type ValueLogGenerationPolicy added in v0.4.0

type ValueLogGenerationPolicy uint8

ValueLogGenerationPolicy controls generation-aware value-log placement. Note: this is initial (PR1) scaffolding. Behavior remains legacy append-only until the allocator/rewrite phases land, so this policy currently affects configuration and observability only.

const (
	// ValueLogGenerationDefault selects the library default (currently
	// hot/warm/cold in cached mode).
	//
	// This is intentionally the zero value so callers can opt into the default
	// behavior without explicitly setting a policy.
	ValueLogGenerationDefault ValueLogGenerationPolicy = iota
	// ValueLogGenerationOff keeps legacy single-generation behavior (no
	// background generation maintenance).
	ValueLogGenerationOff
	// ValueLogGenerationHotWarmCold enables hot/warm/cold generation policy.
	ValueLogGenerationHotWarmCold
)

type ValueLogOptions added in v0.3.0

type ValueLogOptions struct {
	// Compression selects value-log compression behavior.
	Compression ValueLogCompressionMode
	// BlockCodec selects the block codec for block compression.
	BlockCodec ValueLogBlockCodec
	// BlockTargetCompressedBytes guides grouped block size adaptation.
	//
	// 0 uses a default.
	BlockTargetCompressedBytes int
	// IncompressibleHoldBytes configures auto-mode suppression duration after
	// repeated incompressible probes.
	//
	// 0 uses a default.
	IncompressibleHoldBytes int
	// IncompressibleProbeIntervalBytes controls probe cadence while
	// incompressible hold is active.
	//
	// 0 uses a default.
	IncompressibleProbeIntervalBytes int
	// AutoPolicy controls auto-mode bias (throughput, balanced, size).
	AutoPolicy ValueLogAutoPolicy
	// DictClassMode controls dictionary-state partitioning:
	// 0=single (default shared dict stream), 1=split_outer_leaf.
	DictClassMode ValueLogDictClassMode

	// PointerThreshold controls when value-log pointers are used.
	// Values <= 0 use a default threshold. In cached mode, relaxed durability
	// settings may choose a smaller default to avoid large-scale update cliffs by
	// pushing moderate values into the value log.
	PointerThreshold int
	// Generational configures generation-aware value-log placement and rewrite
	// scheduling. PR1 wires config and stats only; behavior remains legacy until
	// follow-on phases land.
	Generational ValueLogGenerationConfig
	// ForcePointers stores all values out-of-line in the value log (no inline values).
	ForcePointers bool
	// DomainInlineThresholds provides optional per-domain overrides for
	// inline-vs-pointer placement. These overrides are evaluated by
	// longest-prefix match and fall back to PointerThreshold/default behavior
	// when no domain matches.
	DomainInlineThresholds []ValueLogDomainThreshold
	// RawWritevMinAvgBytes controls raw grouped-frame writev usage.
	//
	// 0 enables adaptive mode (no average-bytes floor).
	RawWritevMinAvgBytes int
	// RawWritevMinBatchRecords controls minimum grouped records before raw writev
	// is considered.
	//
	// <=0 uses the default.
	RawWritevMinBatchRecords int

	// ReadIntegrity configures checksum verification on value-log reads.
	ReadIntegrity IntegrityMode

	// MaxRetainedBytes emits a warning when retained value-log bytes exceed this
	// threshold (0 disables warnings). Cached mode only.
	MaxRetainedBytes int64
	// MaxRetainedBytesHard disables value-log pointers for new large values once
	// retained bytes exceed this threshold (0 disables the cap).
	MaxRetainedBytesHard int64

	// DictLookup provides dictionary bytes for value-log decoding.
	DictLookup valuelog.DictLookup

	// DictTrain configures background dictionary training for value-log frame
	// compression in cached mode.
	DictTrain compression.TrainConfig
	// DictAdaptiveRatio enables best-effort adaptive disable/pause of value-log
	// dictionary compression when payload compression ratios degrade (0 disables).
	DictAdaptiveRatio float64
	// DictMetricsWindowBytes controls the rolling window size for ratio tracking (0=default).
	DictMetricsWindowBytes int
	// DictMetricsMinRecords controls how many records must be observed in a window
	// before adaptive pause triggers (0=default).
	DictMetricsMinRecords int
	// DictMetricsPauseBytes controls how long to pause dict compression after a degraded
	// window is detected (0=default).
	DictMetricsPauseBytes int
	// DictIncompressibleHoldBytes enables classifier-driven hold mode for
	// high-entropy streams. While hold mode is active, dict attempts and trainer
	// collection are bypassed until hold bytes are consumed.
	//
	// 0 uses profile/default hold configuration; <0 explicitly disables hold
	// mode and opts out of profile defaults.
	DictIncompressibleHoldBytes int
	// DictProbeIntervalBytes controls periodic probe attempts while
	// incompressible hold mode is active.
	//
	// <=0 uses a default derived from hold bytes.
	DictProbeIntervalBytes int
	// DictMinPayloadSavingsRatio rejects newly trained dictionaries whose payload
	// ratio does not improve by at least this fraction (0 uses a cached-mode
	// throughput-oriented default: 0.02 normally, 0.05 with ForcePointers or
	// WAL disabled).
	DictMinPayloadSavingsRatio float64
	// DictMaxK clamps the maximum group size (K) used for value-log dict-compressed
	// frames.
	//
	// Larger K can improve compression ratio (more cross-record matches) and can
	// reduce framing overhead, but may increase CPU and tail latency due to larger
	// encode/decode units.
	//
	// Values <= 0 use the default (32). Values above the engine maximum are clamped.
	DictMaxK int
	// DictFrameEncodeLevel controls the zstd encoder level used for dict-compressed
	// value-log frames.
	//
	// Values <= 0 use the default (SpeedFastest).
	DictFrameEncodeLevel zstd.EncoderLevel
	// DictFrameEnableEntropy enables entropy coding for dict-compressed value-log
	// frames (higher ratio, lower throughput).
	//
	// Default is false (throughput-focused: no-entropy compression).
	DictFrameEnableEntropy bool

	// CompressionAutotune configures the wall-time value-log compression autotuner.
	CompressionAutotune valuelog.AutotuneOptions

	// TemplateMode controls template-based compression for value-log values.
	TemplateMode template.Mode
	// TemplateConfig controls template creation and encoding behavior.
	TemplateConfig template.Config
	// TemplateReadStrict controls strict template decode behavior.
	TemplateReadStrict bool
	// TemplateStore provides template routing/definition lookups for template
	// encoding (for example in offline rewrite prepass experiments).
	TemplateStore template.Store
	// TemplateLookup provides template definition bytes for value-log decoding.
	TemplateLookup valuelog.TemplateLookup
	// TemplateDecodeOptions controls decode caps for template payloads.
	TemplateDecodeOptions template.DecodeOptions
}

ValueLogOptions configures value-log pointer behavior and optional compression/dict tuning.

type ValueLogRewriteChunkPlan added in v0.4.0

type ValueLogRewriteChunkPlan struct {
	ChunkBytes int64

	SourceChunks []ValueLogRewritePlanChunk

	ChunksTotal    int
	ChunksSelected int

	BytesTotal int64
	BytesLive  int64
	BytesStale int64

	SelectedBytesTotal int64
	SelectedBytesLive  int64
	SelectedBytesStale int64

	AgeBlockedChunks          int
	AgeBlockedBytesTotal      int64
	AgeBlockedBytesLive       int64
	AgeBlockedBytesStale      int64
	AgeBlockedMinRemainingAge time.Duration
}

ValueLogRewriteChunkPlan mirrors ValueLogRewritePlan, but at chunk granularity. It is intended for future incremental rewrite scheduling work.

type ValueLogRewriteLocalityPolicy added in v0.4.0

type ValueLogRewriteLocalityPolicy string

ValueLogRewriteLocalityPolicy controls pointer rewrite ordering.

const (
	// ValueLogRewriteLocalityDefault preserves scan/input order.
	ValueLogRewriteLocalityDefault ValueLogRewriteLocalityPolicy = "default"
	// ValueLogRewriteLocalityGrouped orders by old segment+offset locality.
	ValueLogRewriteLocalityGrouped ValueLogRewriteLocalityPolicy = "grouped"
)

type ValueLogRewriteOnlineOptions added in v0.4.0

type ValueLogRewriteOnlineOptions struct {
	// BatchSize bounds pointer swaps per commit.
	BatchSize int
	// SyncEachBatch forces fsync durability boundaries for each rewritten batch.
	SyncEachBatch bool
	// MaxSegmentBytes bounds new value-log segment size during rewrite.
	// <=0 uses a default.
	MaxSegmentBytes int64
	// LocalityPolicy controls ordering of rewritten pointer candidates within
	// each batch.
	LocalityPolicy ValueLogRewriteLocalityPolicy
	// SourceFileIDs restricts rewrite to pointers currently referencing these
	// value-log segment IDs. Missing IDs are ignored.
	SourceFileIDs []uint32
	// SourceChunks restrict rewrite to explicit value-log chunks. When non-empty,
	// they take precedence over SourceFileIDs and sparse segment selection.
	SourceChunks []ValueLogRewritePlanChunk
	// SourceChunkBytes is the chunk width used to interpret SourceChunks.
	SourceChunkBytes int64
	// ProtectedPaths are value-log segment paths that must not be marked zombie
	// during rewrite cleanup.
	//
	// When non-empty, cleanup also avoids zombifying currently-active pre-existing
	// segments (cached-mode maintenance), since concurrent writers may still be
	// appending records whose pointers are not yet visible in the backend index.
	ProtectedPaths []string
	// MaxSourceSegments bounds the number of source segments selected by sparse
	// segment selection. Applies only when SourceFileIDs is empty.
	MaxSourceSegments int
	// MaxSourceBytes bounds estimated live bytes selected by sparse segment
	// selection. Applies only when SourceFileIDs is empty.
	MaxSourceBytes int64
	// MaxCopiedBytes bounds the selected source bytes actually rewritten in this
	// pass. <=0 disables the bound.
	MaxCopiedBytes int64
	// MinSegmentStaleRatio requires stale_bytes/segment_size to be at least this
	// value (0..1) when sparse segment selection is used.
	MinSegmentStaleRatio float64
	// MinSegmentStaleBytes requires estimated stale bytes to be at least this
	// threshold when sparse segment selection is used.
	MinSegmentStaleBytes int64
	// MinSegmentAge excludes very recent source segments from sparse selection.
	// This is useful for cached maintenance so freshly-written segments are not
	// immediately churned by rewrite during sustained ingest.
	MinSegmentAge time.Duration
	// ReserveRIDs allocates a contiguous RID range for rewrite-created records.
	// Cached-mode callers should provide the live runtime allocator here so
	// online rewrite and foreground writes share one RID namespace.
	ReserveRIDs func(count int) (start uint64, err error)
}

ValueLogRewriteOnlineOptions controls online rewrite behavior.

type ValueLogRewritePlan added in v0.4.0

type ValueLogRewritePlan struct {
	// SourceFileIDs are the selected value-log segment IDs. The slice is sorted.
	SourceFileIDs []uint32
	// SelectedSegments summarizes per-segment live/stale estimates for the
	// selected SourceFileIDs when live-byte estimation was performed.
	//
	// When present, it is ordered by FileID ascending.
	SelectedSegments []ValueLogRewritePlanSegment

	SegmentsTotal    int
	SegmentsSelected int

	BytesTotal int64
	BytesLive  int64
	BytesStale int64

	SelectedBytesTotal int64
	SelectedBytesLive  int64
	SelectedBytesStale int64

	// AgeBlocked* summarizes candidate segments excluded by MinSegmentAge while
	// evaluating sparse rewrite candidates. These counters are age-filter
	// diagnostics, not a guarantee that every counted segment would otherwise
	// satisfy stale/live rewrite thresholds.
	AgeBlockedSegments        int
	AgeBlockedBytesTotal      int64
	AgeBlockedBytesLive       int64
	AgeBlockedBytesStale      int64
	AgeBlockedMinRemainingAge time.Duration
}

type ValueLogRewritePlanChunk added in v0.4.0

type ValueLogRewritePlanChunk struct {
	FileID      uint32
	ChunkOffset int64
	BytesTotal  int64
	BytesLive   int64
	BytesStale  int64
	StaleRatio  float64
}

ValueLogRewritePlanChunk summarizes one sub-file chunk of a value-log segment. This is a planning primitive for future incremental rewrite work; it does not yet change execution.

type ValueLogRewritePlanSegment added in v0.4.0

type ValueLogRewritePlanSegment struct {
	FileID     uint32
	BytesTotal int64
	BytesLive  int64
	BytesStale int64
	StaleRatio float64
}

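ValueLogRewritePlanSegment summarizes the live/stale byte estimates for one selected value-log segment of a ValueLogRewritePlan.

ValueLogRewritePlan summarizes which segments a sparse online rewrite would target given the current value-log set and selection knobs.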

It is intended for cached-mode maintenance schedulers to decide whether a rewrite run is worth performing without forcing the rewrite implementation to do expensive live-byte estimation work twice.

type ValueLogRewriteStats added in v0.3.0

type ValueLogRewriteStats struct {
	SegmentsBefore int
	SegmentsAfter  int
	BytesBefore    int64
	BytesAfter     int64
	RecordsCopied  int
	// Value* counters track key/value-pointer payload copied by the main rewrite
	// pointer swap path.
	ValueRecordsCopied int
	ValueBytesCopied   int64
	// LeafRef* counters track outer-leaf page payload copied by the leaf-ref
	// rewrite path (indexOuterLeavesInValueLog mode).
	LeafRefRecordsCopied int
	LeafRefBytesCopied   int64
	// SourceSegmentsRequested is the number of source segments selected for this
	// rewrite run after applying selection filters.
	SourceSegmentsRequested int
	// SourceChunksRequested is the number of explicit source chunks selected for
	// this rewrite run when chunk-restricted execution is used.
	SourceChunksRequested int
	// SourceSegmentsStillReferenced is the subset of selected source segments
	// that remained referenced after rewrite pointer swaps and cleanup.
	SourceSegmentsStillReferenced int
	// SourceSegmentsUnreferenced is the subset of selected source segments that
	// became unreferenced after rewrite pointer swaps and cleanup.
	SourceSegmentsUnreferenced int
	// SourceBytesRequested is the total bytes across selected source segments.
	SourceBytesRequested int64
	// SourceBytesStillReferenced is the bytes of selected source segments that
	// remained referenced after rewrite pointer swaps and cleanup.
	SourceBytesStillReferenced int64
	// SourceBytesUnreferenced is the bytes of selected source segments that
	// became unreferenced after rewrite pointer swaps and cleanup.
	SourceBytesUnreferenced int64
	// SourceBytesProcessed is the bounded subset of selected source bytes
	// actually rewritten in this pass. When zero, the rewrite either copied
	// nothing or ran without a per-pass source-byte bound.
	SourceBytesProcessed int64
	// SourceFileIDsStillReferenced records which selected source segments
	// remained referenced after cleanup.
	SourceFileIDsStillReferenced []uint32
	// SourceFileIDsUnreferenced records which selected source segments became
	// fully unreferenced after cleanup.
	SourceFileIDsUnreferenced []uint32

	TemplateRecordsAttempted int
	TemplateRecordsKept      int
	TemplateInputBytes       int64
	TemplateOutputBytes      int64

	TemplatePointerRecordsAttempted int
	TemplatePointerRecordsKept      int
	TemplatePointerInputBytes       int64
	TemplatePointerOutputBytes      int64
	TemplatePointerReasons          map[string]uint64

	TemplateOuterLeafRecordsAttempted int
	TemplateOuterLeafRecordsKept      int
	TemplateOuterLeafInputBytes       int64
	TemplateOuterLeafOutputBytes      int64
	TemplateOuterLeafReasons          map[string]uint64
}

ValueLogRewriteStats summarizes rewrite compaction results.

func ValueLogRewriteOffline added in v0.3.0

func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error)

ValueLogRewriteOffline rewrites value-log pointers into new segments and swaps index.db to reference the new log. This is an offline operation (requires exclusive lock and a clean commitlog).

type WritePolicy

type WritePolicy struct {
	FlushThreshold  int64 // Size of memtable before flush
	InlineThreshold int   // Max size of value to store inline
}

WritePolicy defines the heuristics and thresholds for write operations.

func DefaultWritePolicy

func DefaultWritePolicy() WritePolicy

DefaultWritePolicy returns the default policy.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL