Documentation
¶
Index ¶
- Constants
- Variables
- func VacuumIndexOffline(opts Options) error
- func ValidateFragmentationReport(rep map[string]string) error
- func ValueReaderForState(state *DBState) tree.SlabReader
- type Batch
- func (b *Batch) Close() error
- func (b *Batch) Delete(key []byte) error
- func (b *Batch) DeleteView(key []byte) error
- func (b *Batch) GetByteSize() (int, error)
- func (b *Batch) Replay(fn func(batch.Entry) error) error
- func (b *Batch) Reset()
- func (b *Batch) Set(key, value []byte) error
- func (b *Batch) SetOps(ops []batch.Entry) error
- func (b *Batch) SetPointer(key []byte, ptr page.ValuePtr) error
- func (b *Batch) SetView(key, value []byte) error
- func (b *Batch) Write() error
- func (b *Batch) WriteSync() error
- type CompactionOp
- type DB
- func (db *DB) AcquireSnapshot() *Snapshot
- func (db *DB) ApplyCompaction(ops []CompactionOp) error
- func (db *DB) ApplyCompactionMicroBatches(ops []CompactionOp, maxOps int) error
- func (db *DB) Close() error
- func (db *DB) Commit(newRootID uint64) error
- func (db *DB) CompactIndex() error
- func (db *DB) CompactSlabsIndexSwap(ctx context.Context, slabIDs []uint32, opts IndexSwapCompactionOptions) error
- func (db *DB) Delete(key []byte) error
- func (db *DB) DeleteSync(key []byte) error
- func (db *DB) FragmentationReport() (map[string]string, error)
- func (db *DB) Get(key []byte) ([]byte, error)
- func (db *DB) GetAppend(key, dst []byte) ([]byte, error)
- func (db *DB) GetUnsafe(key []byte) ([]byte, error)
- func (db *DB) Has(key []byte) (bool, error)
- func (db *DB) InlineThreshold() int
- func (db *DB) Iterator(start, end []byte) (iterator.UnsafeIterator, error)
- func (db *DB) MarkValueLogZombie(id uint32) error
- func (db *DB) NewBatch() batch.Interface
- func (db *DB) NewBatchWithSize(size int) batch.Interface
- func (db *DB) Pager() *pager.Pager
- func (db *DB) Print() error
- func (db *DB) Prune()
- func (db *DB) RefreshSlabSet() error
- func (db *DB) ReverseIterator(start, end []byte) (iterator.UnsafeIterator, error)
- func (db *DB) Set(key, value []byte) error
- func (db *DB) SetSync(key, value []byte) error
- func (db *DB) SlabManager() *slab.SlabManager
- func (db *DB) State() *DBState
- func (db *DB) Stats() map[string]string
- func (db *DB) VacuumIndexOnline(ctx context.Context) error
- func (db *DB) Zipper() *zipper.Zipper
- type DBIterator
- func (it *DBIterator) Close() error
- func (it *DBIterator) DebugStats() (queueLen int, sourcesUsed int)
- func (it *DBIterator) Domain() (start, end []byte)
- func (it *DBIterator) Error() error
- func (it *DBIterator) IsDeleted() bool
- func (it *DBIterator) Key() []byte
- func (it *DBIterator) KeyCopy(dst []byte) []byte
- func (it *DBIterator) Next()
- func (it *DBIterator) Seek(key []byte)
- func (it *DBIterator) UnsafeEntry() ([]byte, page.ValuePtr, byte)
- func (it *DBIterator) UnsafeKey() []byte
- func (it *DBIterator) UnsafeValue() []byte
- func (it *DBIterator) Valid() bool
- func (it *DBIterator) Value() []byte
- func (it *DBIterator) ValueCopy(dst []byte) []byte
- type DBState
- type IndexSwapCompactionOptions
- type IndexSwapCompactionStats
- type Iterator
- type Mode
- type Options
- type Snapshot
- func (s *Snapshot) Close() error
- func (s *Snapshot) Get(key []byte) ([]byte, error)
- func (s *Snapshot) GetEntry(key []byte) (node.LeafEntry, error)
- func (s *Snapshot) GetUnsafe(key []byte) ([]byte, error)
- func (s *Snapshot) Has(key []byte) (bool, error)
- func (s *Snapshot) Pager() *pager.Pager
- func (s *Snapshot) State() *DBState
- type SnapshotPool
- type WritePolicy
Constants ¶
const ( MetaPage0ID = 0 MetaPage1ID = 1 KeepRecent = 10000 )
Variables ¶
var ( // ErrLocked indicates the database directory is already opened by another process. ErrLocked = lockfile.ErrLocked // ErrUnsafeOptions indicates unsafe durability/integrity options were set without acknowledgement. ErrUnsafeOptions = errors.New("treedb: unsafe options require AllowUnsafe") // ErrReadOnly indicates a write was attempted on a read-only DB handle. ErrReadOnly = errors.New("treedb: read-only") )
var ErrVacuumInProgress = errors.New("online vacuum already in progress")
var ErrVacuumUnsupported = errors.New("online vacuum unsupported on this platform")
Functions ¶
func VacuumIndexOffline ¶
VacuumIndexOffline rewrites index.db into a fresh file and swaps it in.
This is intended to reclaim space (reduce `index.db` chunk count) and restore locality after long churn. It is an offline operation (requires exclusive open lock).
func ValidateFragmentationReport ¶
ValidateFragmentationReport validates basic invariants on a FragmentationReport output map. It is intended for tests and operational "health" tooling.
func ValueReaderForState ¶
func ValueReaderForState(state *DBState) tree.SlabReader
ValueReaderForState returns a reader that can resolve slab and value-log pointers.
Types ¶
type Batch ¶
type Batch struct {
// contains filtered or unexported fields
}
Batch implements the cosmos-db Batch interface.
func (*Batch) DeleteView ¶
DeleteView records a Delete without copying the key bytes. Callers must treat key as immutable until the batch is written or closed.
func (*Batch) GetByteSize ¶
func (*Batch) SetPointer ¶
SetPointer records a pointer without copying the value bytes.
func (*Batch) SetView ¶
SetView records a Put without copying key/value bytes. Callers must treat key/value as immutable until the batch is written or closed.
This is intentionally not part of the public batch.Interface; it is a best-effort optimization used by higher-level layers (e.g. cached streaming).
type CompactionOp ¶
type DB ¶
type DB struct {
// contains filtered or unexported fields
}
func (*DB) AcquireSnapshot ¶
AcquireSnapshot returns a new snapshot.
func (*DB) ApplyCompaction ¶
func (db *DB) ApplyCompaction(ops []CompactionOp) error
ApplyCompaction applies pointer updates to the current tree. It uses micro-batching to bound time under the writer lock.
func (*DB) ApplyCompactionMicroBatches ¶
func (db *DB) ApplyCompactionMicroBatches(ops []CompactionOp, maxOps int) error
ApplyCompactionMicroBatches applies compaction pointer updates in chunks of at most maxOps per commit. This bounds writer pauses and keeps the system responsive under large compactions.
func (*DB) Commit ¶
Commit persists the new root (Sync=true by default). Note: This is usually called internally by Batch.Write, or externally for manual root management. Callers performing manual root management may not know which pages were retired; a nil retired set is accepted in that case.
func (*DB) CompactIndex ¶
CompactIndex rewrites the entire B-Tree sequentially to the end of the file. This improves Full Scan performance by restoring physical locality. Note: This operation causes file growth because old pages are not immediately reclaimed (they are returned to the freelist but not reused during this append-only rebuild).
func (*DB) CompactSlabsIndexSwap ¶
func (db *DB) CompactSlabsIndexSwap(ctx context.Context, slabIDs []uint32, opts IndexSwapCompactionOptions) error
CompactSlabsIndexSwap compacts one or more slab files by rebuilding the user index into a new file and swapping it in with a short writer pause.
This avoids large COW B-Tree churn during compaction pointer updates by materializing the post-compaction pointer view directly in index.db.new.
Behavior notes:
- The active slab cannot be compacted.
- Missing slab IDs are ignored (idempotent).
- Any concurrent writes are replayed into the new index, but only if the key actually changed since the base snapshot (to avoid undoing pointer remaps).
func (*DB) DeleteSync ¶
DeleteSync removes a key and syncs.
func (*DB) FragmentationReport ¶
FragmentationReport returns best-effort structural stats about the user index that help diagnose scan regressions after churn.
func (*DB) GetAppend ¶
GetAppend appends the value for the key to dst and returns the new slice. If the key is not found, it returns dst and ErrKeyNotFound.
func (*DB) GetUnsafe ¶
GetUnsafe returns the value for a key.
Semantics: Returns a safe copy of the value. For zero-copy views tied to a snapshot lifetime, use Snapshot.GetUnsafe.
func (*DB) InlineThreshold ¶
func (*DB) Iterator ¶
func (db *DB) Iterator(start, end []byte) (iterator.UnsafeIterator, error)
Iterator returns an iterator.
func (*DB) MarkValueLogZombie ¶
MarkValueLogZombie marks a value-log segment as zombie so it can be removed once all snapshots release it.
func (*DB) RefreshSlabSet ¶
RefreshSlabSet publishes a new DBState with the current SlabSet (excluding zombies) without creating a new commit. This is used by background compaction so that future snapshots stop pinning compacted slabs immediately.
func (*DB) ReverseIterator ¶
func (db *DB) ReverseIterator(start, end []byte) (iterator.UnsafeIterator, error)
ReverseIterator returns a reverse iterator.
func (*DB) SlabManager ¶
func (db *DB) SlabManager() *slab.SlabManager
func (*DB) VacuumIndexOnline ¶
VacuumIndexOnline rebuilds the index into a new file and swaps it in with a short writer pause. Old snapshots remain valid by pinning the previous index generation until readers drain; disk space is reclaimed once the old mmap is closed.
type DBIterator ¶
type DBIterator struct {
// contains filtered or unexported fields
}
DBIterator wraps tree.Iterator and holds a Snapshot.
func (*DBIterator) Close ¶
func (it *DBIterator) Close() error
func (*DBIterator) DebugStats ¶
func (it *DBIterator) DebugStats() (queueLen int, sourcesUsed int)
func (*DBIterator) Domain ¶
func (it *DBIterator) Domain() (start, end []byte)
func (*DBIterator) Error ¶
func (it *DBIterator) Error() error
func (*DBIterator) IsDeleted ¶
func (it *DBIterator) IsDeleted() bool
func (*DBIterator) Key ¶
func (it *DBIterator) Key() []byte
func (*DBIterator) KeyCopy ¶
func (it *DBIterator) KeyCopy(dst []byte) []byte
func (*DBIterator) Next ¶
func (it *DBIterator) Next()
func (*DBIterator) UnsafeEntry ¶
func (it *DBIterator) UnsafeEntry() ([]byte, page.ValuePtr, byte)
func (*DBIterator) UnsafeKey ¶
func (it *DBIterator) UnsafeKey() []byte
func (*DBIterator) UnsafeValue ¶
func (it *DBIterator) UnsafeValue() []byte
func (*DBIterator) Valid ¶
func (it *DBIterator) Valid() bool
func (*DBIterator) Value ¶
func (it *DBIterator) Value() []byte
func (*DBIterator) ValueCopy ¶
func (it *DBIterator) ValueCopy(dst []byte) []byte
type IndexSwapCompactionOptions ¶
type IndexSwapCompactionOptions struct {
// CopyBytesPerSec limits slab copy IO. 0 disables throttling.
CopyBytesPerSec int64
// CopyBurstBytes is the limiter burst size. 0 uses a 1-second burst.
CopyBurstBytes int64
// Assist is an optional hook invoked periodically during compaction work.
// It must be fast and must not assume any DB locks are held.
Assist func()
// Stats captures timing and byte counters for the compaction run.
Stats *IndexSwapCompactionStats
}
IndexSwapCompactionOptions configures compaction that rebuilds the index into a second file and swaps it in atomically.
type IndexSwapCompactionStats ¶
type IndexSwapCompactionStats struct {
TotalNanos uint64
BuildNanos uint64
CatchupNanos uint64
FinalizeNanos uint64
RemapCount uint64
RemapBytes uint64
SlabWriteBytes int
SlabDeadBytes int
}
IndexSwapCompactionStats summarizes compaction work for observability.
type Iterator ¶
type Iterator interface {
Valid() bool
Next()
Key() []byte
Value() []byte
KeyCopy(dst []byte) []byte
ValueCopy(dst []byte) []byte
Close() error
Error() error
// Reset resets the iterator for reuse.
Reset(start, end []byte)
}
Iterator is the internal interface for iteration.
type Options ¶
type Options struct {
Dir string
// ReadOnly opens the database without acquiring an exclusive lock and without
// modifying on-disk state (no recovery truncation, no WAL replay, no background
// maintenance). Only read operations are supported.
ReadOnly bool
ChunkSize int64 // Default 256MB
KeepRecent uint64 // Default 10000
// DisableBackgroundPrune keeps pruning on the commit critical path (legacy
// behavior). When false (default), a bounded background pruner frees pages
// asynchronously to reduce commit latency under churn.
DisableBackgroundPrune bool
// PruneInterval controls how often the background pruner wakes up (0 uses a
// default).
PruneInterval time.Duration
// PruneMaxPages bounds how many pages are freed per pruner tick (0 uses a
// default; <0 means unlimited).
PruneMaxPages int
// PruneMaxDuration bounds how long a pruner tick may run (0 uses a default;
// <0 means unlimited).
PruneMaxDuration time.Duration
// BackgroundCompactionInterval enables background slab compaction when > 0.
// Background compaction is managed by the public wrapper (TreeDB/Open) so it
// can coordinate with the caching layer.
BackgroundCompactionInterval time.Duration
BackgroundCompactionMaxSlabs int
BackgroundCompactionDeadRatio float64
BackgroundCompactionMinBytes uint64
BackgroundCompactionMicroBatch int
BackgroundCompactionCopyBytesPerSec int64
BackgroundCompactionCopyBurstBytes int64
BackgroundCompactionRotateBeforeWrite bool
// BackgroundCompactionIndexSwap compacts slabs by rebuilding the index into a
// new file and swapping it in once per pass (two-index-file approach). This
// can drastically reduce index churn during large slab pointer rewrites.
BackgroundCompactionIndexSwap bool
// BackgroundIndexVacuumInterval enables background index vacuum when > 0.
// The worker uses FragmentationReport span ratio to decide if a rebuild is
// warranted; see BackgroundIndexVacuumSpanRatioPPM.
BackgroundIndexVacuumInterval time.Duration
// BackgroundIndexVacuumSpanRatioPPM is the span ratio threshold (ppm) that
// triggers a background index vacuum. Zero uses a default.
BackgroundIndexVacuumSpanRatioPPM uint32
Mode Mode // Default ModeCached
FlushThreshold int64
// MemtableMode selects the cached-mode memtable implementation.
// Supported values: "skiplist", "hash_sorted", "btree", "adaptive".
MemtableMode string
// MemtableShards controls the number of mutable memtable shards in cached
// mode. Values <= 0 use a runtime-dependent default.
MemtableShards int
// PreferAppendAlloc makes the page allocator ignore the freelist and append
// new pages instead. This can improve scan locality under churn at the cost
// of file growth (space is reclaimed later via vacuum).
PreferAppendAlloc bool
// FreelistRegionPages and FreelistRegionRadius bias freelist reuse toward
// nearby page regions to improve locality. Leave both at 0 to disable the
// bias (default). If either is set, missing values will use defaults.
// Set FreelistRegionRadius < 0 to force-disable the bias.
FreelistRegionPages uint64
FreelistRegionRadius int
// LeafFillTargetPPM and InternalFillTargetPPM control how full newly-written
// B+Tree pages are allowed to become before forcing a split (soft-full).
// Lower values reduce split churn and slow re-fragmentation under updates, at
// the cost of higher page count (more index bytes).
//
// Values are in parts-per-million where 1_000_000 means "allow full pages"
// (current behavior). Zero uses the default (1_000_000).
LeafFillTargetPPM uint32
InternalFillTargetPPM uint32
// LeafPrefixCompression enables prefix-compressed leaf nodes for new pages.
LeafPrefixCompression bool
// MaxQueuedMemtables controls how much immutable-memtable backlog the cached
// layer will allow before applying backpressure (i.e. forcing flush work on
// writers). A negative value disables backpressure entirely (higher short-term
// ingest, but potentially unbounded flush debt). Zero uses the default.
MaxQueuedMemtables int
// SlowdownBacklogSeconds begins applying writer backpressure when queued flush
// backlog exceeds this many seconds of estimated flush work (0 disables).
SlowdownBacklogSeconds float64
// StopBacklogSeconds blocks writers when queued flush backlog exceeds this many
// seconds of estimated flush work (0 disables).
StopBacklogSeconds float64
// MaxBacklogBytes is an absolute cap on queued flush backlog bytes (0 disables).
MaxBacklogBytes int64
// WriterFlushMaxMemtables bounds how much queued work a writer will help flush
// per write when backpressure is active (0 uses a default).
WriterFlushMaxMemtables int
// WriterFlushMaxDuration bounds how long a writer will spend helping flush per
// write when backpressure is active (0 disables the time bound).
WriterFlushMaxDuration time.Duration
// FlushBuildConcurrency controls how many goroutines may be used to build a
// combined flush batch from multiple immutable memtables in cached mode.
// Values <= 1 disable parallelism.
FlushBuildConcurrency int
// DisableSlabTailRepairOnOpen disables best-effort recovery that truncates
// partial/corrupt tail records on the active slab. Disabling may reduce open
// latency for very large slabs but risks starting up with committed pointers
// that decode to checksum errors after a crash.
DisableSlabTailRepairOnOpen bool
// AllowUnsafe acknowledges unsafe durability/integrity options.
// When false, Open will reject options that disable WAL, fsync, checksums,
// or slab tail repair.
AllowUnsafe bool
// DisableWAL disables the Write-Ahead Log in cached mode.
// This improves performance but sacrifices durability: a crash will revert
// the database to the last Checkpoint (backend flush).
DisableWAL bool
// DisableValueLog forces cached-mode WAL to remain in legacy mode (no value-log pointers).
DisableValueLog bool
// SplitValueLog stores WAL records in wal/ while large values go to vlog/
// segments, and WAL entries reference them via pointers.
SplitValueLog bool
// WALMaxSegmentBytes caps the size of a single WAL segment payload.
// 0 uses the default limit.
WALMaxSegmentBytes int64
// MemtableValueLogPointers avoids storing large values in the memtable and
// serves them by pointer from the value log (WAL/vlog). Requires WAL/value-log.
MemtableValueLogPointers bool
// ValueLogPointerThreshold controls when WAL/vlog pointers are used.
// Values <= 0 use the default inline threshold (256 bytes).
ValueLogPointerThreshold int
// ForceValuePointers stores all values out-of-line in slabs (no inline values).
ForceValuePointers bool
// MaxValueLogRetainedBytes emits a warning when retained value-log bytes exceed
// this threshold (0 disables warnings). Cached mode only.
MaxValueLogRetainedBytes int64
// MaxValueLogRetainedBytesHard disables value-log pointers for new large
// values once retained bytes exceed this threshold (0 disables the cap).
MaxValueLogRetainedBytesHard int64
// RelaxedSync disables fsync on CommitSync and SetSync operations.
// This improves performance for synchronous workloads but provides only
// crash consistency (OS buffer cache), not true durability.
RelaxedSync bool
// NotifyError is an optional hook for background maintenance failures.
NotifyError func(error)
// DisableReadChecksum skips CRC verification on slab reads.
// This improves read performance (especially for large values) but risks
// returning silent data corruption if the disk/memory is compromised.
DisableReadChecksum bool
// SlabCompression configures compression for slab-stored values.
SlabCompression slab.CompressionOptions
// VerifyOnRead forces checksum verification on every index page read,
// bypassing the verified-page cache.
VerifyOnRead bool
// DisablePiggybackCompaction disables opportunistic defragmentation during writes.
// When false (default), nodes are rewritten if their siblings are physically
// distant, keeping the tree clustered. Set to true to maximize write speed.
DisablePiggybackCompaction bool
// BackgroundCheckpointInterval enables periodic durable checkpoints in cached
// mode. A checkpoint creates a backend sync boundary and trims
// cached-mode WAL segments to keep `wal/` growth bounded.
//
// Semantics:
// - `0` uses a default.
// - `<0` disables the periodic interval trigger.
BackgroundCheckpointInterval time.Duration
// BackgroundCheckpointIdleDuration triggers an opportunistic checkpoint after
// a period of write-idleness in cached mode.
//
// Semantics:
// - `0` uses a default.
// - `<0` disables the idle trigger.
BackgroundCheckpointIdleDuration time.Duration
// MaxWALBytes triggers an immediate checkpoint in cached mode when the sum of
// WAL segment sizes exceeds this many bytes (0 uses a default; <0 disables the
// size trigger). This is an operational safety cap; it does not make each
// individual write durable (use *Sync APIs for that).
MaxWALBytes int64
}
type Snapshot ¶
type Snapshot struct {
// contains filtered or unexported fields
}
type SnapshotPool ¶
type SnapshotPool struct {
// contains filtered or unexported fields
}
SnapshotPool manages a pool of Snapshot objects to reduce allocation overhead.
func NewSnapshotPool ¶
func NewSnapshotPool() *SnapshotPool
func (*SnapshotPool) Get ¶
func (p *SnapshotPool) Get() *Snapshot
func (*SnapshotPool) Put ¶
func (p *SnapshotPool) Put(s *Snapshot)
type WritePolicy ¶
type WritePolicy struct {
FlushThreshold int64 // Size of memtable before flush
InlineThreshold int // Max size of value to store inline
}
WritePolicy defines the heuristics and thresholds for write operations.
func DefaultWritePolicy ¶
func DefaultWritePolicy() WritePolicy
DefaultWritePolicy returns the default policy.