Documentation
¶
Index ¶
- Constants
- Variables
- func LeafLogDirPath(dir string) string
- func NormalizePublicBatchReserveHint(size int) int
- func ResolveInlineThresholdForKey(baseThreshold int, key []byte, domains []ValueLogDomainThreshold) int
- func SaveFormatConfig(dir string, cfg FormatConfig) error
- func VacuumIndexOffline(opts Options) error
- func ValidateFragmentationReport(rep map[string]string) error
- func ValueLogDirPath(dir string) string
- func ValueReaderForState(state *DBState) tree.SlabReader
- func WALDirPath(dir string) string
- type Batch
- func (b *Batch) Close() error
- func (b *Batch) Delete(key []byte) error
- func (b *Batch) DeleteView(key []byte) error
- func (b *Batch) GetByteSize() (int, error)
- func (b *Batch) Replay(fn func(batch.Entry) error) error
- func (b *Batch) Reserve(n int)
- func (b *Batch) Reset()
- func (b *Batch) Set(key, value []byte) error
- func (b *Batch) SetOps(ops []batch.Entry) error
- func (b *Batch) SetPointer(key []byte, ptr page.ValuePtr) error
- func (b *Batch) SetPointerView(key []byte, ptr page.ValuePtr) error
- func (b *Batch) SetView(key, value []byte) error
- func (b *Batch) Write() error
- func (b *Batch) WriteSync() error
- type DB
- func (db *DB) AcquireSnapshot() *Snapshot
- func (db *DB) Close() error
- func (db *DB) Commit(newRootID uint64) error
- func (db *DB) CompactIndex() error
- func (db *DB) Delete(key []byte) error
- func (db *DB) DeleteSync(key []byte) error
- func (db *DB) Dir() string
- func (db *DB) FragmentationReport() (map[string]string, error)
- func (db *DB) Get(key []byte) ([]byte, error)
- func (db *DB) GetAppend(key, dst []byte) ([]byte, error)
- func (db *DB) GetMany(keys [][]byte) ([][]byte, error)
- func (db *DB) GetManyParallelPlan(keyCount int) (workers int, parallel bool)
- func (db *DB) GetUnsafe(key []byte) ([]byte, error)
- func (db *DB) Has(key []byte) (bool, error)
- func (db *DB) InlineThreshold() int
- func (db *DB) InlineThresholdForKey(key []byte) int
- func (db *DB) Iterator(start, end []byte) (iterator.UnsafeIterator, error)
- func (db *DB) IteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)
- func (db *DB) LeafGenerationGC(ctx context.Context, opts LeafGenerationGCOptions) (LeafGenerationGCStats, error)
- func (db *DB) LeafGenerationPack(ctx context.Context, opts LeafGenerationPackOptions) (stats LeafGenerationPackStats, err error)
- func (db *DB) LeafGenerationPackFromPlan(ctx context.Context, opts LeafGenerationPackFromPlanOptions) (LeafGenerationPackStats, error)
- func (db *DB) LeafGenerationPackRunOnce(ctx context.Context, opts LeafGenerationPackFromPlanOptions) (LeafGenerationPackRunOnceStats, error)
- func (db *DB) LeafGenerationPlan(ctx context.Context, opts LeafGenerationPlanOptions) (LeafGenerationPlan, error)
- func (db *DB) MarkValueLogZombie(id uint32) error
- func (db *DB) NewBatch() batch.Interface
- func (db *DB) NewBatchWithSize(size int) batch.Interface
- func (db *DB) NoteLeafGenerationRecordLength(ptr page.ValuePtr)
- func (db *DB) Pager() *pager.Pager
- func (db *DB) Print() error
- func (db *DB) Prune()
- func (db *DB) RefreshValueLogSet() error
- func (db *DB) RegisterValueLogSegment(path string, fileID uint32) error
- func (db *DB) ReverseIterator(start, end []byte) (iterator.UnsafeIterator, error)
- func (db *DB) ReverseIteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)
- func (db *DB) Set(key, value []byte) error
- func (db *DB) SetCurrentValueLogReadBarrier(fn func(fileID uint32) error)
- func (db *DB) SetLeafPageLog(log LeafPageLog)
- func (db *DB) SetSync(key, value []byte) error
- func (db *DB) SetZipperParallelMergePressureSource(src zipper.ParallelMergePressureSource)
- func (db *DB) State() *DBState
- func (db *DB) Stats() map[string]string
- func (db *DB) VacuumIndexOnline(ctx context.Context) error
- func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogGCStats, error)
- func (db *DB) ValueLogRewriteChunkPlan(ctx context.Context, opts ValueLogRewriteOnlineOptions, chunkBytes int64) (ValueLogRewriteChunkPlan, error)
- func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnlineOptions) (stats ValueLogRewriteStats, err error)
- func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlineOptions) (ValueLogRewritePlan, error)
- func (db *DB) Zipper() *zipper.Zipper
- type DBIterator
- func (it *DBIterator) Close() error
- func (it *DBIterator) DebugStats() (queueLen int, sourcesUsed int)
- func (it *DBIterator) Domain() (start, end []byte)
- func (it *DBIterator) Error() error
- func (it *DBIterator) IsDeleted() bool
- func (it *DBIterator) Key() []byte
- func (it *DBIterator) KeyCopy(dst []byte) []byte
- func (it *DBIterator) Next()
- func (it *DBIterator) Seek(key []byte)
- func (it *DBIterator) UnsafeEntry() ([]byte, page.ValuePtr, byte)
- func (it *DBIterator) UnsafeKey() []byte
- func (it *DBIterator) UnsafeValue() []byte
- func (it *DBIterator) Valid() bool
- func (it *DBIterator) Value() []byte
- func (it *DBIterator) ValueCopy(dst []byte) []byte
- type DBState
- type DurabilityMode
- type FormatConfig
- type IntegrityMode
- type Iterator
- type IteratorMode
- type IteratorOptions
- type LeafGenerationGCOptions
- type LeafGenerationGCStats
- type LeafGenerationPackFromPlanOptions
- type LeafGenerationPackOptions
- type LeafGenerationPackRunOnceStats
- type LeafGenerationPackSelectOptions
- type LeafGenerationPackSelection
- type LeafGenerationPackStats
- type LeafGenerationPlan
- type LeafGenerationPlanGeneration
- type LeafGenerationPlanOptions
- type LeafPageLog
- type Options
- type Snapshot
- func (s *Snapshot) Close() error
- func (s *Snapshot) Get(key []byte) ([]byte, error)
- func (s *Snapshot) GetAppend(key, dst []byte) ([]byte, error)
- func (s *Snapshot) GetEntry(key []byte) (node.LeafEntry, error)
- func (s *Snapshot) GetEntryExact(key []byte) (node.LeafEntry, error)
- func (s *Snapshot) GetUnsafe(key []byte) ([]byte, error)
- func (s *Snapshot) Has(key []byte) (bool, error)
- func (s *Snapshot) Pager() *pager.Pager
- func (s *Snapshot) State() *DBState
- type SnapshotPool
- type ValueLogAutoPolicy
- type ValueLogBlockCodec
- type ValueLogCompressionMode
- type ValueLogDictClassMode
- type ValueLogDomainThreshold
- type ValueLogGCOptions
- type ValueLogGCStats
- type ValueLogGenerationConfig
- type ValueLogGenerationPolicy
- type ValueLogOptions
- type ValueLogRewriteChunkPlan
- type ValueLogRewriteLocalityPolicy
- type ValueLogRewriteOnlineOptions
- type ValueLogRewritePlan
- type ValueLogRewritePlanChunk
- type ValueLogRewritePlanSegment
- type ValueLogRewriteStats
- type WritePolicy
Constants ¶
const (
	IteratorModeFull              = tree.IteratorModeFull
	IteratorModeKeysOnly          = tree.IteratorModeKeysOnly
	IteratorModePointerProjection = tree.IteratorModePointerProjection
)
const (
	MetaPage0ID = 0
	MetaPage1ID = 1
	KeepRecent  = 10000
)
Variables ¶
var (
	// ErrLocked indicates the database directory is already opened by another process.
	ErrLocked = lockfile.ErrLocked
	// ErrReadOnly indicates a write was attempted on a read-only DB handle.
	ErrReadOnly = errors.New("treedb: read-only")
	// ErrClosed indicates the DB handle is closed or closing for reads.
	ErrClosed = errors.New("treedb: db is closed")
)
var ErrVacuumInProgress = errors.New("online vacuum already in progress")
var ErrVacuumUnsupported = errors.New("online vacuum unsupported on this platform")
Functions ¶
func LeafLogDirPath ¶ added in v0.5.0
func NormalizePublicBatchReserveHint ¶ added in v0.4.0
NormalizePublicBatchReserveHint keeps small public hints behaving like entry reserves, but treats larger hints as approximate byte budgets so callers do not accidentally preallocate one entry per byte. This is intentionally discontinuous at the cutover for compatibility with small entry-count hints.
For internal use; behavior may change without notice and is not part of the supported external API surface of the db package.
func ResolveInlineThresholdForKey ¶ added in v0.4.0
func ResolveInlineThresholdForKey(baseThreshold int, key []byte, domains []ValueLogDomainThreshold) int
ResolveInlineThresholdForKey chooses an inline threshold for key using longest-prefix domain overrides and a global fallback.
Callers should pass NormalizeValueLogDomainThresholds output so that the first match is the intended longest-prefix override.
func SaveFormatConfig ¶ added in v0.4.0
func SaveFormatConfig(dir string, cfg FormatConfig) error
SaveFormatConfig writes cfg to dir/format.json atomically.
func VacuumIndexOffline ¶
VacuumIndexOffline rewrites index.db into a fresh file and swaps it in.
This is intended to reclaim space (reduce `index.db` chunk count) and restore locality after long churn. It is an offline operation (requires exclusive open lock).
func ValidateFragmentationReport ¶
ValidateFragmentationReport validates basic invariants on a FragmentationReport output map. It is intended for tests and operational "health" tooling.
func ValueLogDirPath ¶ added in v0.5.0
func ValueReaderForState ¶
func ValueReaderForState(state *DBState) tree.SlabReader
ValueReaderForState returns a reader that resolves value-log pointers.
func WALDirPath ¶ added in v0.5.0
Types ¶
type Batch ¶
type Batch struct {
// contains filtered or unexported fields
}
Batch implements the cosmos-db Batch interface.
func (*Batch) DeleteView ¶
DeleteView records a Delete without copying the key bytes. Callers must treat key as immutable until the batch is written or closed.
func (*Batch) GetByteSize ¶
func (*Batch) Reserve ¶ added in v0.4.0
Reserve forwards best-effort preallocation hints to the internal batch.
func (*Batch) SetPointer ¶
SetPointer records a pointer without copying the value bytes.
func (*Batch) SetPointerView ¶ added in v0.2.0
SetPointerView records a pointer without copying the key bytes.
func (*Batch) SetView ¶
SetView records a Put without copying key/value bytes. Callers must treat key/value as immutable until the batch is written or closed.
This is intentionally not part of the public batch.Interface; it is a best-effort optimization used by higher-level layers (e.g. cached streaming).
type DB ¶
type DB struct {
// contains filtered or unexported fields
}
func (*DB) AcquireSnapshot ¶
AcquireSnapshot returns a new snapshot.
func (*DB) Commit ¶
Commit persists the new root (Sync=true by default). It is usually called internally by Batch.Write, or externally by callers that manage the root manually. The `Commit` signature assumes manual root management: an external caller may not know which pages were retired, so nil is accepted for the retired set in the manual case.
func (*DB) CompactIndex ¶
CompactIndex rewrites the entire B-Tree sequentially to the end of the file. This improves Full Scan performance by restoring physical locality. Note: This operation causes file growth as old pages are not immediately reclaimed (they are leaked to the freelist but not reused during this append-only build).
func (*DB) DeleteSync ¶
DeleteSync removes a key and syncs.
func (*DB) FragmentationReport ¶
FragmentationReport returns best-effort structural stats about the user index that help diagnose scan regressions after churn.
func (*DB) GetAppend ¶
GetAppend appends the value for the key to dst and returns the new slice. If the key is not found, it returns dst and ErrKeyNotFound.
func (*DB) GetMany ¶ added in v0.4.0
GetMany returns values for keys.
Semantics: Returns safe copies of values. Missing keys are returned as nil entries with no error.
func (*DB) GetManyParallelPlan ¶ added in v0.4.0
GetManyParallelPlan reports how this backend would schedule GetMany for the provided key count.
func (*DB) GetUnsafe ¶
GetUnsafe returns the value for a key.
Semantics: Despite its name, this method returns a safe copy of the value. For zero-copy views tied to a snapshot lifetime, use Snapshot.GetUnsafe.
func (*DB) InlineThreshold ¶
func (*DB) InlineThresholdForKey ¶ added in v0.4.0
func (*DB) Iterator ¶
func (db *DB) Iterator(start, end []byte) (iterator.UnsafeIterator, error)
Iterator returns an iterator.
func (*DB) IteratorWithOptions ¶ added in v0.4.0
func (db *DB) IteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)
IteratorWithOptions returns an iterator with explicit value materialization controls.
func (*DB) LeafGenerationGC ¶ added in v0.5.0
func (db *DB) LeafGenerationGC(ctx context.Context, opts LeafGenerationGCOptions) (LeafGenerationGCStats, error)
func (*DB) LeafGenerationPack ¶ added in v0.5.0
func (db *DB) LeafGenerationPack(ctx context.Context, opts LeafGenerationPackOptions) (stats LeafGenerationPackStats, err error)
LeafGenerationPack rewrites live LeafRef pages from sealed source generations into a fresh leaf-log output so the old generations can later be reclaimed by ordinary generation GC.
func (*DB) LeafGenerationPackFromPlan ¶ added in v0.5.0
func (db *DB) LeafGenerationPackFromPlan(ctx context.Context, opts LeafGenerationPackFromPlanOptions) (LeafGenerationPackStats, error)
LeafGenerationPackFromPlan computes the current plan, selects a bounded candidate prefix, then packs those sealed generations.
func (*DB) LeafGenerationPackRunOnce ¶ added in v0.5.0
func (db *DB) LeafGenerationPackRunOnce(ctx context.Context, opts LeafGenerationPackFromPlanOptions) (LeafGenerationPackRunOnceStats, error)
LeafGenerationPackRunOnce computes the current plan, applies bounded selection, and either runs one pack pass or reports why it skipped.
func (*DB) LeafGenerationPlan ¶ added in v0.5.0
func (db *DB) LeafGenerationPlan(ctx context.Context, opts LeafGenerationPlanOptions) (LeafGenerationPlan, error)
LeafGenerationPlan estimates reclaim opportunities for sealed leaf generations by scanning the current live tree once and attributing reachable LeafRef pages back to manifest generations.
func (*DB) MarkValueLogZombie ¶
MarkValueLogZombie marks a value-log segment as zombie so it can be removed once all snapshots release it.
func (*DB) NewBatchWithSize ¶
NewBatchWithSize accepts the public cosmos-db style size hint. Small values are treated like exact entry reserves; larger values are normalized as approximate byte budgets and capped to avoid preallocating one entry per byte. The normalization is intentionally discontinuous at the cutover: `publicBatchHintExactEntryReserveMax` still means "reserve that many entries", while the next value is treated as a byte budget and normalized downward.
func (*DB) NoteLeafGenerationRecordLength ¶ added in v0.5.0
func (*DB) RefreshValueLogSet ¶ added in v0.3.0
RefreshValueLogSet publishes a new DBState with the current value-log set (excluding zombies) without creating a new commit.
func (*DB) RegisterValueLogSegment ¶ added in v0.4.0
RegisterValueLogSegment registers a newly created value-log segment with the backend read manager without scanning the filesystem. Cached mode uses this when it rotates the shared value log so outer-leaf commits can publish a current ValueLogSet via CurrentSetNoRefresh.
func (*DB) ReverseIterator ¶
func (db *DB) ReverseIterator(start, end []byte) (iterator.UnsafeIterator, error)
ReverseIterator returns a reverse iterator.
func (*DB) ReverseIteratorWithOptions ¶ added in v0.4.0
func (db *DB) ReverseIteratorWithOptions(start, end []byte, opts IteratorOptions) (iterator.UnsafeIterator, error)
ReverseIteratorWithOptions returns a reverse iterator with explicit value materialization controls.
func (*DB) SetCurrentValueLogReadBarrier ¶ added in v0.4.0
SetCurrentValueLogReadBarrier installs a callback that will be invoked before backend-internal reads of segments still marked currentWritable.
func (*DB) SetLeafPageLog ¶ added in v0.4.0
func (db *DB) SetLeafPageLog(log LeafPageLog)
SetLeafPageLog installs the value-log appender used for value-log-backed leaf pages. It is typically wired by the cached layer after opening the backend.
func (*DB) SetZipperParallelMergePressureSource ¶ added in v0.4.0
func (db *DB) SetZipperParallelMergePressureSource(src zipper.ParallelMergePressureSource)
SetZipperParallelMergePressureSource installs an optional pressure signal for future zipper generations and the current live zipper.
func (*DB) VacuumIndexOnline ¶
VacuumIndexOnline rebuilds the index into a new file and swaps it in with a short writer pause. Old snapshots remain valid by pinning the previous index generation until readers drain; disk space is reclaimed once the old mmap is closed.
func (*DB) ValueLogGC ¶ added in v0.3.0
func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogGCStats, error)
ValueLogGC deletes fully-unreferenced value-log segments from value_vlog.
It scans the user + system trees for value-log pointers, computes referenced value_vlog segments, and removes segments that are:
- not referenced,
- not the currently-active segment per lane,
- and not pinned by active snapshots.
func (*DB) ValueLogRewriteChunkPlan ¶ added in v0.4.0
func (db *DB) ValueLogRewriteChunkPlan(ctx context.Context, opts ValueLogRewriteOnlineOptions, chunkBytes int64) (ValueLogRewriteChunkPlan, error)
func (*DB) ValueLogRewriteOnline ¶ added in v0.4.0
func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnlineOptions) (stats ValueLogRewriteStats, err error)
ValueLogRewriteOnline rewrites pointer-backed values in bounded commit batches, then atomically swaps keys to rewritten pointers.
func (*DB) ValueLogRewritePlan ¶ added in v0.4.0
func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlineOptions) (ValueLogRewritePlan, error)
ValueLogRewritePlan returns the segments that would be selected for sparse online rewrite given opts. It performs the same live-byte estimation work as ValueLogRewriteOnline sparse selection, but does not modify the DB.
type DBIterator ¶
type DBIterator struct {
// contains filtered or unexported fields
}
DBIterator wraps tree.Iterator and holds a Snapshot.
func (*DBIterator) Close ¶
func (it *DBIterator) Close() error
func (*DBIterator) DebugStats ¶
func (it *DBIterator) DebugStats() (queueLen int, sourcesUsed int)
func (*DBIterator) Domain ¶
func (it *DBIterator) Domain() (start, end []byte)
func (*DBIterator) Error ¶
func (it *DBIterator) Error() error
func (*DBIterator) IsDeleted ¶
func (it *DBIterator) IsDeleted() bool
func (*DBIterator) Key ¶
func (it *DBIterator) Key() []byte
func (*DBIterator) KeyCopy ¶
func (it *DBIterator) KeyCopy(dst []byte) []byte
func (*DBIterator) Next ¶
func (it *DBIterator) Next()
func (*DBIterator) UnsafeEntry ¶
func (it *DBIterator) UnsafeEntry() ([]byte, page.ValuePtr, byte)
func (*DBIterator) UnsafeKey ¶
func (it *DBIterator) UnsafeKey() []byte
func (*DBIterator) UnsafeValue ¶
func (it *DBIterator) UnsafeValue() []byte
func (*DBIterator) Valid ¶
func (it *DBIterator) Valid() bool
func (*DBIterator) Value ¶
func (it *DBIterator) Value() []byte
func (*DBIterator) ValueCopy ¶
func (it *DBIterator) ValueCopy(dst []byte) []byte
type DurabilityMode ¶ added in v0.3.0
type DurabilityMode uint8
DurabilityMode configures cached-mode durability semantics.
These modes are explicit and intentionally replace the previous boolean combination of DisableWAL + RelaxedSync + AllowUnsafe.
const (
	// DurabilityDurable enables WAL (journal) and uses fsync for sync operations.
	DurabilityDurable DurabilityMode = iota
	// DurabilityWALOnRelaxed keeps WAL enabled but disables fsync (crash-consistent).
	DurabilityWALOnRelaxed
	// DurabilityWALOffRelaxed disables WAL and fsync (unsafe; recent writes
	// may be lost and sync calls may defer backend publication until a later
	// checkpoint/flush boundary).
	DurabilityWALOffRelaxed
)
type FormatConfig ¶ added in v0.4.0
type FormatConfig struct {
Version int `json:"version"`
IndexOuterLeavesInValueLog bool `json:"index_outer_leaves_in_vlog"`
LeafPrefixCompression bool `json:"leaf_prefix_compression"`
IndexColumnarLeaves bool `json:"index_columnar_leaves"`
IndexPackedValuePtr bool `json:"index_packed_valueptr"`
IndexInternalBaseDelta bool `json:"index_internal_base_delta"`
IndexAdaptiveLeafEncoding bool `json:"index_adaptive_leaf_encoding"`
ValueLogCompression string `json:"vlog_compression"`
ValueLogBlockCodec string `json:"vlog_block_codec"`
ValueLogAutoPolicy string `json:"vlog_auto_policy"`
}
FormatConfig captures the format-affecting knobs that maintenance tooling should preserve when rewriting index/value-log state.
This file is best-effort and pre-alpha; callers should tolerate it being absent. Versioned files written by SaveFormatConfig are expected to be fully populated; if new fields are added in the future, the version should be bumped so older binaries do not accidentally apply zero-values.
func LoadFormatConfig ¶ added in v0.4.0
func LoadFormatConfig(dir string) (FormatConfig, bool, error)
LoadFormatConfig loads the best-effort persisted format config for dir. The returned bool reports whether the file was found.
func (FormatConfig) ApplyIndexFormatToOptions ¶ added in v0.4.0
func (cfg FormatConfig) ApplyIndexFormatToOptions(opts *Options)
ApplyIndexFormatToOptions overwrites index-format-affecting knobs in opts from cfg.
This is intentionally narrower than ApplyToOptions: it is safe for normal DB opens where callers may want to tune runtime policies (e.g. value-log compression) via env vars or flags, while still ensuring the index encoding matches on-disk state.
func (FormatConfig) ApplyToOptions ¶ added in v0.4.0
func (cfg FormatConfig) ApplyToOptions(opts *Options)
ApplyToOptions overwrites format-affecting knobs in opts from cfg.
Callers should treat cfg as best-effort (it may be absent) and may apply explicit overrides after this call.
type IntegrityMode ¶ added in v0.3.0
type IntegrityMode uint8
IntegrityMode configures value-log read integrity checks.
It intentionally replaces the previous DisableReadChecksum boolean.
const (
	// IntegrityVerify enables checksum verification on value-log reads.
	IntegrityVerify IntegrityMode = iota
	// IntegritySkipChecksums disables checksum verification on value-log reads (unsafe).
	IntegritySkipChecksums
)
type Iterator ¶
type Iterator interface {
Valid() bool
Next()
Key() []byte
Value() []byte
KeyCopy(dst []byte) []byte
ValueCopy(dst []byte) []byte
Close() error
Error() error
// Reset resets the iterator for reuse.
Reset(start, end []byte)
}
Iterator is the internal interface for iteration.
type IteratorMode ¶ added in v0.4.0
type IteratorMode = tree.IteratorMode
type IteratorOptions ¶ added in v0.4.0
type IteratorOptions = tree.IteratorOptions
type LeafGenerationGCOptions ¶ added in v0.5.0
type LeafGenerationGCOptions struct {
DryRun bool
}
type LeafGenerationGCStats ¶ added in v0.5.0
type LeafGenerationPackFromPlanOptions ¶ added in v0.5.0
type LeafGenerationPackFromPlanOptions struct {
Sync bool
Force bool
MinPublishedAgeCommits uint64
MinCandidateGenerations int
MinExpectedReclaimBytes int64
MinExpectedReclaimRatioPPM int
MinReclaimPerByteCopiedPPM int
MaxGenerations int
MaxBytesToCopy int64
ReserveRIDs func(count int) (start uint64, err error)
}
LeafGenerationPackFromPlanOptions combines planner thresholds, bounded selection limits, and pack execution settings for the manual from-plan path.
type LeafGenerationPackOptions ¶ added in v0.5.0
type LeafGenerationPackRunOnceStats ¶ added in v0.5.0
type LeafGenerationPackRunOnceStats struct {
Plan LeafGenerationPlan
Selection LeafGenerationPackSelection
Pack LeafGenerationPackStats
Ran bool
SkipReason string
}
LeafGenerationPackRunOnceStats describes one bounded admission/evaluation pass for leaf-generation packing.
type LeafGenerationPackSelectOptions ¶ added in v0.5.0
type LeafGenerationPackSelectOptions struct {
Force bool
MinExpectedReclaimBytes int64
MinExpectedReclaimRatioPPM int
MaxGenerations int
MaxBytesToCopy int64
MinReclaimPerByteCopiedPPM int
}
LeafGenerationPackSelectOptions bounds a selected ranked subset of plan candidates.
type LeafGenerationPackSelection ¶ added in v0.5.0
type LeafGenerationPackSelection struct {
GenerationIDs []uint64
Generations []LeafGenerationPlanGeneration
BytesTotal int64
BytesLive int64
BytesDead int64
BytesToCopy int64
LivePages int
ExpectedReclaimBytes int64
ExpectedReclaimRatioPPM int
ExpectedReclaimPerByteCopiedPPM int
}
LeafGenerationPackSelection summarizes a bounded subset of pack candidates.
func SelectLeafGenerationPackCandidates ¶ added in v0.5.0
func SelectLeafGenerationPackCandidates(plan LeafGenerationPlan, opts LeafGenerationPackSelectOptions) (LeafGenerationPackSelection, error)
SelectLeafGenerationPackCandidates selects a bounded subset from an eligible leaf-generation plan. For bounded windows it maximizes reclaimable bytes within the requested generation/copy limits, then emits the chosen generations in their original plan order.
type LeafGenerationPackStats ¶ added in v0.5.0
type LeafGenerationPackStats struct {
GenerationsRequested int
GenerationsMatched int
SourceGenerationIDs []uint64
SourceFilesRequested int
SourceFileIDs []uint32
SourceBytesTotal int64
SourceBytesLive int64
SourceBytesDead int64
SourceBytesToCopy int64
ExpectedReclaimBytes int64
ExpectedReclaimRatioPPM int
ExpectedReclaimPerByteCopiedPPM int
LeafPagesCopied int
BytesCopied int64
InternalPagesVisited int
SubtreesPruned int
CreatedFileIDs []uint32
WallTimeNanos int64
}
type LeafGenerationPlan ¶ added in v0.5.0
type LeafGenerationPlan struct {
CurrentCommitSeq uint64
CurrentGenerationID uint64
Generations []LeafGenerationPlanGeneration
Candidates []LeafGenerationPlanGeneration
CandidateGenerationIDs []uint64
CandidateBytesTotal int64
CandidateBytesLive int64
CandidateBytesDead int64
CandidateBytesToCopy int64
CandidateLivePages int
ExpectedReclaimBytes int64
ExpectedReclaimRatioPPM int
ExpectedReclaimPerByteCopiedPPM int
Admission string
}
type LeafGenerationPlanGeneration ¶ added in v0.5.0
type LeafGenerationPlanGeneration struct {
GenerationID uint64
State string
FileIDs []uint32
FileCount int
BytesTotal int64
BytesLive int64
BytesDead int64
BytesToCopy int64
LivePages int
AgeCommits uint64
PinnedCount uint64
DeadRatioPPM int
LiveRatioPPM int
WholeGenerationGCEligible bool
Eligible bool
SkipReason string
}
type LeafGenerationPlanOptions ¶ added in v0.5.0
type LeafPageLog ¶ added in v0.4.0
type LeafPageLog interface {
AppendLeafPage(leafPage []byte) (page.LeafLogPtr, error)
Flush() error
Sync() error
}
LeafPageLog appends and flushes B+Tree leaf pages stored in the value log.
This is used when Options.IndexOuterLeavesInValueLog is enabled. Implementations are expected to reuse the existing value-log record encoding and compression semantics (i.e. they should append normal value-log records and return LeafLogPtr references).
type Options ¶
type Options struct {
Dir string
// IgnoreFormatConfig disables best-effort persisted format.json loading in
// TreeDB open paths that auto-apply index-format knobs from disk (e.g.
// treedb.Open, treedb.OpenBackend) and in offline maintenance helpers
// (VacuumIndexOffline, ValueLogRewriteOffline, treemap vacuum/rewrite/vlog-gc).
IgnoreFormatConfig bool
// ReadOnly opens the database without acquiring an exclusive lock and without
// modifying on-disk state (no recovery truncation, no WAL replay, no background
// maintenance). Only read operations are supported.
ReadOnly bool
ChunkSize int64 // Default 256KiB
// DictDBChunkSize controls the mmap chunk size used for the `dictdb/` side
// store when TreeDB is opened via the public `treedb.Open` wrapper.
//
// It is intentionally independent of ChunkSize so benchmarks and callers can
// tune the main index pager without inflating dictdb disk usage.
//
// Values <= 0 use a default of 64KiB.
DictDBChunkSize int64
// TemplateDBChunkSize controls the mmap chunk size used for the `templatedb/`
// side store when template compression is enabled.
//
// Values <= 0 use a default of 64KiB.
TemplateDBChunkSize int64
KeepRecent uint64 // Default 10000
// PagerSyncConcurrency controls how many goroutines may msync dirty chunks
// in parallel during Sync. Values <= 0 use the default (1).
PagerSyncConcurrency int
// PagerMmapPopulate enables MAP_POPULATE on Linux when mmapping index.db
// chunks. This can reduce minor-fault overhead under random access patterns
// at the cost of increased work at map/grow time.
PagerMmapPopulate bool
// PagerPrefetchOnRead enables best-effort prefetch hints (madvise WILLNEED)
// for mmapped index chunks (Linux only). When enabled, TreeDB may issue
// prefetch requests opportunistically (e.g. before rewriting child pages
// during checkpoint/merge). It is a no-op on unsupported platforms.
PagerPrefetchOnRead bool
// Durability configures cached-mode durability semantics.
//
// The default (zero) is DurabilityDurable.
Durability DurabilityMode
// DisableBackgroundPrune keeps pruning on the commit critical path (legacy
// behavior). When false (default), a bounded background pruner frees pages
// asynchronously to reduce commit latency under churn.
DisableBackgroundPrune bool
// PruneInterval controls how often the background pruner wakes up (0 uses a
// default).
PruneInterval time.Duration
// PruneMaxPages bounds how many pages are freed per pruner tick (0 uses a
// default; <0 means unlimited).
PruneMaxPages int
// PruneMaxDuration bounds how long a pruner tick may run (0 uses a default;
// <0 means unlimited).
PruneMaxDuration time.Duration
FlushThreshold int64
// MemtableMode selects the cached-mode memtable implementation.
// Supported values: "skiplist", "hash_sorted", "btree", "append_only", "adaptive".
MemtableMode string
// MemtableShards controls the number of mutable memtable shards in cached
// mode. Values <= 0 use a runtime-dependent default.
MemtableShards int
// DomainIngressWorkers enables experimental domain-local ingress workers in
// cached mode. Values <= 0 keep the legacy direct write path.
DomainIngressWorkers int
// DomainIngressQueueSize configures the per-worker ingress queue length when
// DomainIngressWorkers is enabled. Values <= 0 use a default.
DomainIngressQueueSize int
// PreferAppendAlloc makes the page allocator ignore the freelist and append
// new pages instead. This can improve scan locality under churn at the cost
// of file growth (space is reclaimed later via vacuum).
PreferAppendAlloc bool
// FreelistRegionPages and FreelistRegionRadius bias freelist reuse toward
// nearby page regions to improve locality. Leave both at 0 to disable the
// bias (default). If either is set, missing values will use defaults.
// Set FreelistRegionRadius < 0 to force-disable the bias.
FreelistRegionPages uint64
FreelistRegionRadius int
// LeafFillTargetPPM and InternalFillTargetPPM control how full newly-written
// B+Tree pages are allowed to become before forcing a split (soft-full).
// Lower values reduce split churn and slow re-fragmentation under updates, at
// the cost of higher page count (more index bytes).
//
// Values are in parts-per-million where 1_000_000 means "allow full pages"
// (current behavior). Zero uses the default (1_000_000).
LeafFillTargetPPM uint32
InternalFillTargetPPM uint32
// MaintenanceOpsPerCoalesce controls the maintenance budget during zipper
// merge. It bounds coalesce work to roughly len(ops)/K operations per batch.
// 0 uses the default; negative disables the budget (full maintenance).
MaintenanceOpsPerCoalesce int
// LeafPrefixCompression enables prefix-compressed leaf nodes for new pages.
LeafPrefixCompression bool
// IndexColumnarLeaves enables the experimental columnar leaf encoding for new pages.
IndexColumnarLeaves bool
// IndexPackedValuePtr enables the experimental packed 12-byte ValuePtr encoding
// for pointer entries in new leaf pages.
//
// Packed pointers store ValuePtr.Offset as u32 on disk. Callers must ensure
// value-log segments are rotated such that offsets remain representable.
IndexPackedValuePtr bool
// IndexInternalBaseDelta enables the experimental internal-node base-delta encoding.
IndexInternalBaseDelta bool
// IndexOuterLeavesInValueLog stores B+Tree leaf pages (the pages containing
// key/value entries) in the persistent value log instead of index.db.
//
// When enabled, internal nodes store encoded value-log pointers for leaf
// children. This is pre-alpha and changes on-disk format/assumptions.
IndexOuterLeavesInValueLog bool
// IndexAdaptiveLeafEncoding enables per-page adaptive selection of leaf
// encoding flags using deterministic heuristics from key/value shape.
//
// This option only affects newly-written leaf pages.
IndexAdaptiveLeafEncoding bool
// MaxQueuedMemtables controls how much immutable-memtable backlog the cached
// layer will allow before applying backpressure (i.e. forcing flush work on
// writers). A negative value disables backpressure entirely (higher short-term
// ingest, but potentially unbounded flush debt). Zero uses the default.
MaxQueuedMemtables int
// SlowdownBacklogSeconds begins applying writer backpressure when queued flush
// backlog exceeds this many seconds of estimated flush work (0 disables).
SlowdownBacklogSeconds float64
// StopBacklogSeconds blocks writers when queued flush backlog exceeds this many
// seconds of estimated flush work (0 disables).
StopBacklogSeconds float64
// MaxBacklogBytes is an absolute cap on queued flush backlog bytes (0 disables).
MaxBacklogBytes int64
// WriterFlushMaxMemtables bounds how much queued work a writer will help flush
// per write when backpressure is active (0 uses a default).
WriterFlushMaxMemtables int
// WriterFlushMaxDuration bounds how long a writer will spend helping flush per
// write when backpressure is active (0 disables the time bound).
WriterFlushMaxDuration time.Duration
// FlushBuildConcurrency controls how many goroutines may be used to build a
// combined flush batch from multiple immutable memtables in cached mode.
// Values <= 1 disable parallelism.
FlushBuildConcurrency int
// FlushBuildMinEntries gates the parallel build path by total entries.
// Values <= 0 use a default of 16k.
FlushBuildMinEntries int
// FlushBuildMinUnits gates the parallel build path by number of queued units.
// Values <= 0 use a default of 2.
FlushBuildMinUnits int
// FlushBuildChunkCap controls the maximum entries per build chunk.
// A value of 0 enables adaptive chunk sizing, values < 0 use the fixed default of 8192,
// and values > 0 set an explicit cap.
FlushBuildChunkCap int
// FlushBuildChunkTargetBytes controls adaptive chunk sizing (bytes per chunk).
// Values <= 0 use a default of 2MiB.
FlushBuildChunkTargetBytes int
// FlushBuildChunkMinBytes clamps adaptive chunk sizes (minimum bytes).
// Values <= 0 use a default of 1MiB.
FlushBuildChunkMinBytes int
// FlushBuildChunkMaxBytes clamps adaptive chunk sizes (maximum bytes).
// Values <= 0 use a default of 4MiB.
FlushBuildChunkMaxBytes int
// FlushBuildPrefetchUnits controls how many memtables to start building ahead
// of the consumer. Values <= 0 use FlushBuildConcurrency.
FlushBuildPrefetchUnits int
// FlushBackendMaxEntries caps how many operations are buffered into a single
// backend batch before committing it and continuing with a fresh batch.
//
// This increases backend commit cadence during very large flushes, which can
// reduce index.db high-watermark growth under small KeepRecent windows by
// making retired pages eligible for reuse sooner.
//
// 0 uses the internal default. Negative disables chunking (single backend
// commit per flush).
FlushBackendMaxEntries int
// FlushBackendMaxBatches caps how many intermediate backend commits a single
// flush may emit (0=default, <0=disable cap).
FlushBackendMaxBatches int
// JournalLanes controls the number of active commit/value log lanes (0=default).
// Max supported lanes is 255; value-log segment sequence per lane is capped at 8,388,607.
JournalLanes int
// WALMaxSegmentBytes caps the size of a single WAL segment payload.
// 0 uses the default limit.
WALMaxSegmentBytes int64
// JournalCompression enables best-effort zstd compression for cached-mode
// journal/commitlog segments (metadata only).
//
// The redo log will only keep compressed bytes when they are smaller than the
// raw payload, so compression never causes size amplification.
JournalCompression bool
// ValueLog configures value-log pointer behavior and read integrity.
ValueLog ValueLogOptions
// NotifyError is an optional hook for background maintenance failures.
NotifyError func(error)
// VerifyOnRead forces checksum verification on every index page read,
// bypassing the verified-page cache.
VerifyOnRead bool
// DisableSideStores skips opening dictdb/templatedb side stores.
// This is intended for internal side-store usage (e.g. templatedb itself).
DisableSideStores bool
// DisablePiggybackCompaction disables opportunistic defragmentation during writes.
// When false (default), nodes are rewritten if their siblings are physically
// distant, keeping the tree clustered. Set to true to maximize write speed.
DisablePiggybackCompaction bool
// BackgroundCheckpointInterval enables periodic durable checkpoints in cached
// mode. A checkpoint creates a backend sync boundary and trims
// cached-mode WAL segments to keep `wal/` growth bounded.
//
// Semantics:
// - `0` uses a default.
// - `<0` disables the periodic interval trigger.
BackgroundCheckpointInterval time.Duration
// BackgroundCheckpointIdleDuration triggers an opportunistic checkpoint after
// a period of write-idleness in cached mode.
//
// Semantics:
// - `0` uses a default.
// - `<0` disables the idle trigger.
BackgroundCheckpointIdleDuration time.Duration
// BackgroundIndexVacuumInterval enables periodic online index vacuum passes.
// `0` uses a default; `<0` disables.
BackgroundIndexVacuumInterval time.Duration
// BackgroundIndexVacuumSpanRatioPPM sets the span ratio threshold that
// triggers a vacuum pass (0 uses a default).
BackgroundIndexVacuumSpanRatioPPM uint32
// MaxWALBytes triggers an immediate checkpoint in cached mode when the sum of
// WAL segment sizes exceeds this many bytes (0 uses a default; <0 disables the
// size trigger). This is an operational safety cap; it does not make each
// individual write durable (use *Sync APIs for that).
MaxWALBytes int64
}
type Snapshot ¶
type Snapshot struct {
// contains filtered or unexported fields
}
func (*Snapshot) GetAppend ¶ added in v0.4.0
GetAppend appends the value for key to dst and returns the grown slice. If key is not found, it returns dst and tree.ErrKeyNotFound.
func (*Snapshot) GetEntryExact ¶ added in v0.4.0
GetEntryExact is an alias for GetEntry.
type SnapshotPool ¶
type SnapshotPool struct {
// contains filtered or unexported fields
}
SnapshotPool manages a pool of Snapshot objects to reduce allocation overhead.
func NewSnapshotPool ¶
func NewSnapshotPool() *SnapshotPool
func (*SnapshotPool) Get ¶
func (p *SnapshotPool) Get() *Snapshot
func (*SnapshotPool) Put ¶
func (p *SnapshotPool) Put(s *Snapshot)
type ValueLogAutoPolicy ¶ added in v0.3.0
type ValueLogAutoPolicy uint8
ValueLogAutoPolicy controls auto-mode dict vs block selection bias.
const (
ValueLogAutoBalanced ValueLogAutoPolicy = iota
ValueLogAutoThroughput
ValueLogAutoSize
)
type ValueLogBlockCodec ¶ added in v0.3.0
type ValueLogBlockCodec uint8
ValueLogBlockCodec selects the block codec used for block compression modes.
const (
ValueLogBlockSnappy ValueLogBlockCodec = iota
ValueLogBlockLZ4
)
type ValueLogCompressionMode ¶ added in v0.3.0
type ValueLogCompressionMode uint8
ValueLogCompressionMode selects value-log compression behavior in cached mode.
const (
// ValueLogCompressionOff stores value-log grouped frames uncompressed.
//
// Zero is intentionally reserved as "unset/default".
// db.Open normalizes zero to ValueLogCompressionAuto.
ValueLogCompressionOff ValueLogCompressionMode = iota + 1
// ValueLogCompressionBlock uses block compression without dictionaries.
ValueLogCompressionBlock
// ValueLogCompressionDict uses dictionary compression when available.
ValueLogCompressionDict
// ValueLogCompressionAuto adaptively chooses off/block/dict.
ValueLogCompressionAuto
)
type ValueLogDictClassMode ¶ added in v0.4.0
type ValueLogDictClassMode uint8
ValueLogDictClassMode controls whether dictionary state is shared across all value-log payloads or split by payload class.
const (
// ValueLogDictClassSingle keeps one shared dictionary stream for all
// value-log payloads.
ValueLogDictClassSingle ValueLogDictClassMode = iota
// ValueLogDictClassSplitOuterLeaf keeps separate dictionary streams for
// outer-leaf payloads and single-value payloads.
ValueLogDictClassSplitOuterLeaf
)
type ValueLogDomainThreshold ¶ added in v0.4.0
type ValueLogDomainThreshold struct {
// Prefix selects the key domain this override applies to.
Prefix []byte
// InlineThreshold is the maximum inline value size for keys in Prefix.
// Values larger than this threshold are eligible for value-log pointers.
// Zero forces all non-empty values in this domain to pointer placement.
InlineThreshold int
}
ValueLogDomainThreshold overrides inline-vs-pointer placement policy for keys under a domain prefix.
A key belongs to the first matching prefix after normalization (longest-prefix wins).
func NormalizeValueLogDomainThresholds ¶ added in v0.4.0
func NormalizeValueLogDomainThresholds(in []ValueLogDomainThreshold) []ValueLogDomainThreshold
NormalizeValueLogDomainThresholds returns a deterministic longest-prefix-first copy suitable for hot-path threshold lookups.
It filters invalid entries (empty prefix or negative thresholds) and de-duplicates identical prefixes after sorting.
type ValueLogGCOptions ¶ added in v0.3.0
type ValueLogGCOptions struct {
DryRun bool
// ProtectedPaths preserves legacy callers that provide a single merged set
// of protected paths. Prefer the specific ProtectedInUsePaths and
// ProtectedRetainedPaths fields for blocker classification.
ProtectedPaths []string
// ProtectedInUsePaths are paths that may still be referenced by mutable
// in-memory state during online maintenance.
ProtectedInUsePaths []string
// ProtectedRetainedPaths are paths pinned by pointer lifecycle retention.
ProtectedRetainedPaths []string
// ObservedSourceFileIDs enables per-classification probe counters for a
// caller-provided subset of segment IDs (for example, rewrite-selected
// source segments). IDs not present in the current set are ignored.
ObservedSourceFileIDs []uint32
// ObservedSourceAssumeUnreferenced indicates ObservedSourceFileIDs are
// already known to be unreferenced. When true, ValueLogGC skips the
// reachability scan and only classifies (and, if !DryRun, zombifies) the
// observed IDs; it does not attempt to reclaim other segments.
ObservedSourceAssumeUnreferenced bool
}
ValueLogGCOptions controls value-log garbage collection.
type ValueLogGCStats ¶ added in v0.3.0
type ValueLogGCStats struct {
SegmentsTotal int
SegmentsReferenced int
SegmentsActive int
SegmentsProtected int
SegmentsProtectedInUse int
SegmentsProtectedRetained int
SegmentsProtectedOverlap int
SegmentsProtectedOther int
SegmentsEligible int
SegmentsDeleted int
SegmentsPending int
BytesTotal int64
BytesReferenced int64
BytesActive int64
BytesProtected int64
BytesProtectedInUse int64
BytesProtectedRetained int64
BytesProtectedOverlap int64
BytesProtectedOther int64
BytesEligible int64
BytesDeleted int64
BytesPending int64
ObservedSourceSegments int
ObservedSourceSegmentsReferenced int
ObservedSourceSegmentsActive int
ObservedSourceSegmentsProtected int
ObservedSourceSegmentsProtectedInUse int
ObservedSourceSegmentsProtectedRetained int
ObservedSourceSegmentsProtectedOverlap int
ObservedSourceSegmentsProtectedOther int
ObservedSourceSegmentsEligible int
ObservedSourceSegmentsDeleted int
ObservedSourceSegmentsPending int
ObservedSourceBytes int64
ObservedSourceBytesReferenced int64
ObservedSourceBytesActive int64
ObservedSourceBytesProtected int64
ObservedSourceBytesProtectedInUse int64
ObservedSourceBytesProtectedRetained int64
ObservedSourceBytesProtectedOverlap int64
ObservedSourceBytesProtectedOther int64
ObservedSourceBytesEligible int64
ObservedSourceBytesDeleted int64
ObservedSourceBytesPending int64
}
ValueLogGCStats summarizes value-log GC work.
type ValueLogGenerationConfig ¶ added in v0.4.0
type ValueLogGenerationConfig struct {
// Policy selects generation behavior. Off preserves current behavior.
Policy ValueLogGenerationPolicy
// LeafSegmentTargetBytes configures target segment size for leaf_vlog
// generations when outer leaves are stored out-of-line.
//
// 0 uses the implementation default leaf-generation target.
LeafSegmentTargetBytes int64
// HotSegmentTargetBytes configures target segment size for hot generation.
// 0 uses implementation default.
HotSegmentTargetBytes int64
// WarmSegmentTargetBytes configures target segment size for warm generation.
// 0 uses implementation default.
WarmSegmentTargetBytes int64
// ColdSegmentTargetBytes configures target segment size for cold generation.
// 0 uses implementation default.
ColdSegmentTargetBytes int64
// RewriteBudgetBytesPerSec bounds background incremental rewrite bandwidth.
// 0 disables byte-budget trigger.
RewriteBudgetBytesPerSec int64
// RewriteBudgetRecordsPerSec bounds background incremental rewrite records/s.
// 0 disables record-budget trigger.
RewriteBudgetRecordsPerSec int
// RewriteTriggerStaleRatioPPM triggers rewrite when stale/live ratio exceeds
// threshold (parts-per-million, 0 disables).
RewriteTriggerStaleRatioPPM uint32
// RewriteTriggerTotalBytes triggers rewrite when total retained bytes exceeds
// threshold (0 disables).
RewriteTriggerTotalBytes int64
// RewriteTriggerChurnPerSec triggers rewrite when churn rate exceeds
// threshold (0 disables).
RewriteTriggerChurnPerSec int64
// RewriteMinSegmentAge gates online rewrite to source segments that are at
// least this old.
//
// 0 uses the implementation default.
RewriteMinSegmentAge time.Duration
}
ValueLogGenerationConfig configures generational value-log behavior.
type ValueLogGenerationPolicy ¶ added in v0.4.0
type ValueLogGenerationPolicy uint8
ValueLogGenerationPolicy controls generation-aware value-log placement. PR1 scaffolding: behavior remains legacy append-only until allocator/rewrite phases land; this policy is currently configuration + observability only.
const (
// ValueLogGenerationDefault selects the library default (currently
// hot/warm/cold in cached mode).
//
// This is intentionally the zero value so callers can opt into the default
// behavior without explicitly setting a policy.
ValueLogGenerationDefault ValueLogGenerationPolicy = iota
// ValueLogGenerationOff keeps legacy single-generation behavior (no
// background generation maintenance).
ValueLogGenerationOff
// ValueLogGenerationHotWarmCold enables hot/warm/cold generation policy.
ValueLogGenerationHotWarmCold
)
type ValueLogOptions ¶ added in v0.3.0
type ValueLogOptions struct {
// Compression selects value-log compression behavior.
Compression ValueLogCompressionMode
// BlockCodec selects the block codec for block compression.
BlockCodec ValueLogBlockCodec
// BlockTargetCompressedBytes guides grouped block size adaptation.
//
// 0 uses a default.
BlockTargetCompressedBytes int
// IncompressibleHoldBytes configures auto-mode suppression duration after
// repeated incompressible probes.
//
// 0 uses a default.
IncompressibleHoldBytes int
// IncompressibleProbeIntervalBytes controls probe cadence while
// incompressible hold is active.
//
// 0 uses a default.
IncompressibleProbeIntervalBytes int
// AutoPolicy controls auto-mode bias (throughput, balanced, size).
AutoPolicy ValueLogAutoPolicy
// DictClassMode controls dictionary-state partitioning:
// 0=single (default shared dict stream), 1=split_outer_leaf.
DictClassMode ValueLogDictClassMode
// PointerThreshold controls when value-log pointers are used.
// Values <= 0 use a default threshold. In cached mode, relaxed durability
// settings may choose a smaller default to avoid large-scale update cliffs by
// pushing moderate values into the value log.
PointerThreshold int
// Generational configures generation-aware value-log placement and rewrite
// scheduling. PR1 wires config and stats only; behavior remains legacy until
// follow-on phases land.
Generational ValueLogGenerationConfig
// ForcePointers stores all values out-of-line in the value log (no inline values).
ForcePointers bool
// DomainInlineThresholds provides optional per-domain overrides for
// inline-vs-pointer placement. These overrides are evaluated by
// longest-prefix match and fall back to PointerThreshold/default behavior
// when no domain matches.
DomainInlineThresholds []ValueLogDomainThreshold
// RawWritevMinAvgBytes controls raw grouped-frame writev usage.
//
// 0 enables adaptive mode (no average-bytes floor).
RawWritevMinAvgBytes int
// RawWritevMinBatchRecords controls minimum grouped records before raw writev
// is considered.
//
// <=0 uses the default.
RawWritevMinBatchRecords int
// ReadIntegrity configures checksum verification on value-log reads.
ReadIntegrity IntegrityMode
// MaxRetainedBytes emits a warning when retained value-log bytes exceed this
// threshold (0 disables warnings). Cached mode only.
MaxRetainedBytes int64
// MaxRetainedBytesHard disables value-log pointers for new large values once
// retained bytes exceed this threshold (0 disables the cap).
MaxRetainedBytesHard int64
// DictLookup provides dictionary bytes for value-log decoding.
DictLookup valuelog.DictLookup
// DictCurrentForClass resolves the current dictionary ID for a payload class.
// Offline/maintenance rewrite uses this to seed class-specific rewrite codecs.
DictCurrentForClass func(context.Context, string) (uint64, error)
// DictLeafPayloadMode reports whether a published leaf dictionary expects raw
// 4KiB leaf pages (useRawPages=true) or compact split-leaf payloads
// (useRawPages=false). The returned ok flag is false when no explicit mode is
// recorded and callers should fall back to legacy defaults.
DictLeafPayloadMode func(context.Context, uint64) (useRawPages bool, ok bool, err error)
// DictPut persists dictionary bytes and returns the stable dictionary ID.
// Offline/maintenance rewrite may use this to bootstrap a class-specific dict
// before rewriting into dict-compressed frames.
DictPut func(context.Context, []byte) (uint64, error)
// DictSetCurrentForClass marks a dictionary ID as the current dict for the
// provided payload class. Rewrite bootstrap uses this after publishing a new
// class-specific dict.
DictSetCurrentForClass func(context.Context, string, uint64) error
// DictSetLeafPayloadMode records whether a published leaf dictionary expects
// raw 4KiB leaf pages or compact split-leaf payloads.
DictSetLeafPayloadMode func(context.Context, uint64, bool) error
// DictTrain configures background dictionary training for value-log frame
// compression in cached mode.
DictTrain compression.TrainConfig
// DictAdaptiveRatio enables best-effort adaptive disable/pause of value-log
// dictionary compression when payload compression ratios degrade (0 disables).
DictAdaptiveRatio float64
// DictMetricsWindowBytes controls the rolling window size for ratio tracking (0=default).
DictMetricsWindowBytes int
// DictMetricsMinRecords controls how many records must be observed in a window
// before adaptive pause triggers (0=default).
DictMetricsMinRecords int
// DictMetricsPauseBytes controls how long to pause dict compression after a degraded
// window is detected (0=default).
DictMetricsPauseBytes int
// DictIncompressibleHoldBytes enables classifier-driven hold mode for
// high-entropy streams. While hold mode is active, dict attempts and trainer
// collection are bypassed until hold bytes are consumed.
//
// 0 uses profile/default hold configuration; <0 explicitly disables hold
// mode and opts out of profile defaults.
DictIncompressibleHoldBytes int
// DictProbeIntervalBytes controls periodic probe attempts while
// incompressible hold mode is active.
//
// <=0 uses a default derived from hold bytes.
DictProbeIntervalBytes int
// DictMinPayloadSavingsRatio rejects newly trained dictionaries whose payload
// ratio does not improve by at least this fraction (0 uses a cached-mode
// throughput-oriented default: 0.02 normally, 0.05 with ForcePointers or
// WAL disabled).
DictMinPayloadSavingsRatio float64
// DictMaxK clamps the maximum group size (K) used for value-log dict-compressed
// frames.
//
// Larger K can improve compression ratio (more cross-record matches) and can
// reduce framing overhead, but may increase CPU and tail latency due to larger
// encode/decode units.
//
// Values <= 0 use the default (32). Values above the engine maximum are clamped.
DictMaxK int
// DictFrameEncodeLevel controls the zstd encoder level used for dict-compressed
// value-log frames.
//
// Values <= 0 use the default (SpeedFastest).
DictFrameEncodeLevel zstd.EncoderLevel
// DictFrameEnableEntropy enables entropy coding for dict-compressed value-log
// frames (higher ratio, lower throughput).
//
// Default is false (throughput-focused: no-entropy compression).
DictFrameEnableEntropy bool
// CompressionAutotune configures the wall-time value-log compression autotuner.
CompressionAutotune valuelog.AutotuneOptions
// TemplateMode controls template-based compression for value-log values.
TemplateMode template.Mode
// TemplateConfig controls template creation and encoding behavior.
TemplateConfig template.Config
// TemplateReadStrict controls strict template decode behavior.
TemplateReadStrict bool
// TemplateStore provides template routing/definition lookups for template
// encoding (for example in offline rewrite prepass experiments).
TemplateStore template.Store
// TemplateLookup provides template definition bytes for value-log decoding.
TemplateLookup valuelog.TemplateLookup
// TemplateDecodeOptions controls decode caps for template payloads.
TemplateDecodeOptions template.DecodeOptions
}
ValueLogOptions configures value-log pointer behavior and optional compression/dict tuning.
type ValueLogRewriteChunkPlan ¶ added in v0.4.0
type ValueLogRewriteChunkPlan struct {
ChunkBytes int64
SourceChunks []ValueLogRewritePlanChunk
ChunksTotal int
ChunksSelected int
BytesTotal int64
BytesLive int64
BytesStale int64
SelectedBytesTotal int64
SelectedBytesLive int64
SelectedBytesStale int64
AgeBlockedChunks int
AgeBlockedBytesTotal int64
AgeBlockedBytesLive int64
AgeBlockedBytesStale int64
AgeBlockedMinRemainingAge time.Duration
}
ValueLogRewriteChunkPlan mirrors ValueLogRewritePlan, but at chunk granularity. It is intended for future incremental rewrite scheduling work.
type ValueLogRewriteLocalityPolicy ¶ added in v0.4.0
type ValueLogRewriteLocalityPolicy string
ValueLogRewriteLocalityPolicy controls pointer rewrite ordering.
const (
// ValueLogRewriteLocalityDefault preserves scan/input order.
ValueLogRewriteLocalityDefault ValueLogRewriteLocalityPolicy = "default"
// ValueLogRewriteLocalityGrouped orders by old segment+offset locality.
ValueLogRewriteLocalityGrouped ValueLogRewriteLocalityPolicy = "grouped"
)
type ValueLogRewriteOnlineOptions ¶ added in v0.4.0
type ValueLogRewriteOnlineOptions struct {
// BatchSize bounds pointer swaps per commit.
BatchSize int
// SyncEachBatch forces fsync durability boundaries for each rewritten batch.
SyncEachBatch bool
// MaxSegmentBytes bounds new value-log segment size during rewrite.
// <=0 uses a default.
MaxSegmentBytes int64
// LocalityPolicy controls ordering of rewritten pointer candidates within
// each batch.
LocalityPolicy ValueLogRewriteLocalityPolicy
// SourceFileIDs restricts rewrite to pointers currently referencing these
// value-log segment IDs. Missing IDs are ignored.
SourceFileIDs []uint32
// SourceChunks restricts rewrite to explicit value-log chunks. When non-empty,
// it takes precedence over SourceFileIDs and sparse segment selection.
SourceChunks []ValueLogRewritePlanChunk
// SourceChunkBytes is the chunk width used to interpret SourceChunks.
SourceChunkBytes int64
// ProtectedPaths are value-log segment paths that must not be marked zombie
// during rewrite cleanup.
//
// When non-empty, cleanup also avoids zombifying currently-active pre-existing
// segments (cached-mode maintenance), since concurrent writers may still be
// appending records whose pointers are not yet visible in the backend index.
ProtectedPaths []string
// MaxSourceSegments bounds the number of source segments selected by sparse
// segment selection. Applies only when SourceFileIDs is empty.
MaxSourceSegments int
// MaxSourceBytes bounds estimated live bytes selected by sparse segment
// selection. Applies only when SourceFileIDs is empty.
MaxSourceBytes int64
// MaxCopiedBytes bounds the selected source bytes actually rewritten in this
// pass. <=0 disables the bound.
MaxCopiedBytes int64
// MinSegmentStaleRatio requires stale_bytes/segment_size to be at least this
// value (0..1) when sparse segment selection is used.
MinSegmentStaleRatio float64
// MinSegmentStaleBytes requires estimated stale bytes to be at least this
// threshold when sparse segment selection is used.
MinSegmentStaleBytes int64
// MinSegmentAge excludes very recent source segments from sparse selection.
// This is useful for cached maintenance so freshly-written segments are not
// immediately churned by rewrite during sustained ingest.
MinSegmentAge time.Duration
// ReserveRIDs allocates a contiguous RID range for rewrite-created records.
// Cached-mode callers should provide the live runtime allocator here so
// online rewrite and foreground writes share one RID namespace.
ReserveRIDs func(count int) (start uint64, err error)
}
ValueLogRewriteOnlineOptions controls online rewrite behavior.
type ValueLogRewritePlan ¶ added in v0.4.0
type ValueLogRewritePlan struct {
// SourceFileIDs are the selected value-log segment IDs. The slice is sorted.
SourceFileIDs []uint32
// SelectedSegments summarizes per-segment live/stale estimates for the
// selected SourceFileIDs when live-byte estimation was performed.
//
// When present, it is ordered by FileID ascending.
SelectedSegments []ValueLogRewritePlanSegment
SegmentsTotal int
SegmentsSelected int
BytesTotal int64
BytesLive int64
BytesStale int64
SelectedBytesTotal int64
SelectedBytesLive int64
SelectedBytesStale int64
// AgeBlocked* summarizes candidate segments excluded by MinSegmentAge while
// evaluating sparse rewrite candidates. These counters are age-filter
// diagnostics, not a guarantee that every counted segment would otherwise
// satisfy stale/live rewrite thresholds.
AgeBlockedSegments int
AgeBlockedBytesTotal int64
AgeBlockedBytesLive int64
AgeBlockedBytesStale int64
AgeBlockedMinRemainingAge time.Duration
}
type ValueLogRewritePlanChunk ¶ added in v0.4.0
type ValueLogRewritePlanChunk struct {
FileID uint32
ChunkOffset int64
BytesTotal int64
BytesLive int64
BytesStale int64
StaleRatio float64
}
ValueLogRewritePlanChunk summarizes one sub-file chunk of a value-log segment. This is a planning primitive for future incremental rewrite work; it does not yet change execution.
type ValueLogRewritePlanSegment ¶ added in v0.4.0
type ValueLogRewritePlanSegment struct {
FileID uint32
BytesTotal int64
BytesLive int64
BytesStale int64
StaleRatio float64
}
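ValueLogRewritePlanSegment carries the per-segment byte totals and stale ratio reported in ValueLogRewritePlan.SelectedSegments. ValueLogRewritePlan summarizes which segments a sparse online rewrite would target given the current value-log set and selection knobs.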
It is intended for cached-mode maintenance schedulers to decide whether a rewrite run is worth performing without forcing the rewrite implementation to do expensive live-byte estimation work twice.
type ValueLogRewriteStats ¶ added in v0.3.0
type ValueLogRewriteStats struct {
SegmentsBefore int
SegmentsAfter int
BytesBefore int64
BytesAfter int64
RecordsCopied int
// Value* counters track key/value-pointer payload copied by the main rewrite
// pointer swap path.
ValueRecordsCopied int
ValueBytesCopied int64
// SourceSegmentsRequested is the number of source segments selected for this
// rewrite run after applying selection filters.
SourceSegmentsRequested int
// SourceChunksRequested is the number of explicit source chunks selected for
// this rewrite run when chunk-restricted execution is used.
SourceChunksRequested int
// SourceSegmentsStillReferenced is the subset of selected source segments
// that remained referenced after rewrite pointer swaps and cleanup.
SourceSegmentsStillReferenced int
// SourceSegmentsUnreferenced is the subset of selected source segments that
// became unreferenced after rewrite pointer swaps and cleanup.
SourceSegmentsUnreferenced int
// SourceBytesRequested is the total bytes across selected source segments.
SourceBytesRequested int64
// SourceBytesStillReferenced is the bytes of selected source segments that
// remained referenced after rewrite pointer swaps and cleanup.
SourceBytesStillReferenced int64
// SourceBytesUnreferenced is the bytes of selected source segments that
// became unreferenced after rewrite pointer swaps and cleanup.
SourceBytesUnreferenced int64
// SourceBytesProcessed is the bounded subset of selected source bytes
// actually rewritten in this pass. When zero, the rewrite either copied
// nothing or ran without a per-pass source-byte bound.
SourceBytesProcessed int64
// SourceFileIDsStillReferenced records which selected source segments
// remained referenced after cleanup.
SourceFileIDsStillReferenced []uint32
// SourceFileIDsUnreferenced records which selected source segments became
// fully unreferenced after cleanup.
SourceFileIDsUnreferenced []uint32
TemplateRecordsAttempted int
TemplateRecordsKept int
TemplateInputBytes int64
TemplateOutputBytes int64
TemplatePointerRecordsAttempted int
TemplatePointerRecordsKept int
TemplatePointerInputBytes int64
TemplatePointerOutputBytes int64
TemplatePointerReasons map[string]uint64
TemplateOuterLeafRecordsAttempted int
TemplateOuterLeafRecordsKept int
TemplateOuterLeafInputBytes int64
TemplateOuterLeafOutputBytes int64
TemplateOuterLeafReasons map[string]uint64
}
ValueLogRewriteStats summarizes rewrite compaction results.
func ValueLogRewriteOffline ¶ added in v0.3.0
func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error)
ValueLogRewriteOffline rewrites value-log pointers into new segments and swaps index.db to reference the new log. This is an offline operation (requires exclusive lock and a clean commitlog).
type WritePolicy ¶
type WritePolicy struct {
FlushThreshold int64 // Size of memtable before flush
InlineThreshold int // Max size of value to store inline
}
WritePolicy defines the heuristics and thresholds for write operations.
func DefaultWritePolicy ¶
func DefaultWritePolicy() WritePolicy
DefaultWritePolicy returns the default policy.
Source Files
¶
- alloc_tracker.go
- api.go
- batch.go
- commit_combiner.go
- db.go
- debug_commit_timing.go
- errors.go
- format_config.go
- fragmentation.go
- fragmentation_validate.go
- index_gen.go
- index_gen_db.go
- index_swap.go
- layout.go
- leaf_generation_gc.go
- leaf_generation_manifest.go
- leaf_generation_pack.go
- leaf_generation_pack_from_plan.go
- leaf_generation_pack_rewrite.go
- leaf_generation_pack_run_once.go
- leaf_generation_pack_select.go
- leaf_generation_plan.go
- leaf_generation_record_length_index.go
- leaf_generation_runtime.go
- leaf_generation_stats.go
- leaf_page_log.go
- open_readonly.go
- permissions.go
- policy.go
- pools.go
- prune.go
- publish_watermark_metrics.go
- vacuum_offline.go
- vacuum_online.go
- value_placement.go
- value_reader.go
- vlog_gc.go
- vlog_gc_incremental.go
- vlog_health.go
- vlog_rewrite.go
- vlog_rewrite_chunk_plan.go
- wal_recovery.go