Documentation
¶
Overview ¶
Package autodiff implements automatic differentiation using the decorator pattern.
AutodiffBackend wraps any Backend implementation (CPU, GPU, etc.) and adds gradient tracking capabilities through a GradientTape.
Architecture:
- Decorator pattern: AutodiffBackend[B] wraps any Backend implementation
- GradientTape: Records operations during forward pass
- Operation interface: Each op (Add, Mul, MatMul) implements backward pass
- Reverse-mode AD: Computes gradients efficiently using chain rule
Usage:
// Wrap any backend with autodiff
cpuBackend := cpu.New()
autodiffBackend := autodiff.New(cpuBackend)
// Use with tensors
x := tensor.FromSlice([]float32{2.0}, tensor.Shape{1}, autodiffBackend)
y := x.Mul(x) // y = x²
// Compute gradients
y.Backward()
fmt.Println(x.Grad()) // dy/dx = 2x = 4.0
Index ¶
- func Backward[T tensor.DType, B BackwardCapable](t *tensor.Tensor[T, B], backend B) map[*tensor.RawTensor]*tensor.RawTensor
- type AutodiffBackend
- func (b *AutodiffBackend[B]) Add(a, c *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) AddScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *AutodiffBackend[B]) And(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Argmax(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Cast(x *tensor.RawTensor, dtype tensor.DataType) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Cat(tensors []*tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Chunk(x *tensor.RawTensor, n, dim int) []*tensor.RawTensor
- func (b *AutodiffBackend[B]) Conv2D(input, kernel *tensor.RawTensor, stride, padding int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Cos(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) CrossEntropy(logits, targets *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Device() tensor.Device
- func (b *AutodiffBackend[B]) Div(a, c *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) DivScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Equal(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Exp(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Expand(x *tensor.RawTensor, shape tensor.Shape) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Gather(x *tensor.RawTensor, dim int, index *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) GetTape() *GradientTape
- func (b *AutodiffBackend[B]) Greater(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) GreaterEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Inner() B
- func (b *AutodiffBackend[B]) Log(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Lower(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) LowerEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) MatMul(a, c *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) MaxPool2D(input *tensor.RawTensor, kernelSize, stride int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) MeanDim(x *tensor.RawTensor, dim int, keepDim bool) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Mul(a, c *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) MulScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Name() string
- func (b *AutodiffBackend[B]) NoGrad(fn func())
- func (b *AutodiffBackend[B]) Not(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) NotEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Or(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) ReLU(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Reshape(t *tensor.RawTensor, newShape tensor.Shape) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Rsqrt(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) SiLU(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Sigmoid(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Sin(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Softmax(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Sqrt(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Squeeze(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Sub(a, c *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) SubScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Sum(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) SumDim(x *tensor.RawTensor, dim int, keepDim bool) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Tanh(x *tensor.RawTensor) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Tape() *GradientTape
- func (b *AutodiffBackend[B]) Transpose(t *tensor.RawTensor, axes ...int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Unsqueeze(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *AutodiffBackend[B]) Where(condition, x, y *tensor.RawTensor) *tensor.RawTensor
- type BackwardCapable
- type GradientTape
- func (t *GradientTape) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) map[*tensor.RawTensor]*tensor.RawTensor
- func (t *GradientTape) Clear()
- func (t *GradientTape) IsRecording() bool
- func (t *GradientTape) NumOps() int
- func (t *GradientTape) Record(op ops.Operation)
- func (t *GradientTape) StartRecording()
- func (t *GradientTape) StopRecording()
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func Backward ¶
func Backward[T tensor.DType, B BackwardCapable](t *tensor.Tensor[T, B], backend B) map[*tensor.RawTensor]*tensor.RawTensor
Backward computes gradients for a tensor using the AutodiffBackend's tape.
This helper function extracts the tape from an AutodiffBackend and computes gradients for the given tensor.
Parameters:
- t: The output tensor to compute gradients for
- backend: The backend (must be AutodiffBackend or implement BackwardCapable)
Returns a map from RawTensor to its gradient.
Example:
backend := autodiff.New(cpu.New())
backend.Tape().StartRecording()
x := tensor.Ones[float32](Shape{2}, backend)
y := x.Mul(x) // y = x²
gradients := autodiff.Backward(y, backend)
grad := gradients[x.Raw()] // Get gradient for x
Types ¶
type AutodiffBackend ¶
AutodiffBackend wraps a Backend and adds automatic differentiation. It implements the tensor.Backend interface and records operations in a GradientTape.
Type parameter B must satisfy the tensor.Backend interface.
func New ¶
func New[B tensor.Backend](backend B) *AutodiffBackend[B]
New creates a new AutodiffBackend wrapping the given backend.
func (*AutodiffBackend[B]) Add ¶
func (b *AutodiffBackend[B]) Add(a, c *tensor.RawTensor) *tensor.RawTensor
Add performs element-wise addition and records the operation.
func (*AutodiffBackend[B]) AddScalar ¶ added in v0.3.0
AddScalar adds a scalar to tensor elements (autodiff proxy).
func (*AutodiffBackend[B]) And ¶ added in v0.3.0
func (b *AutodiffBackend[B]) And(a, other *tensor.RawTensor) *tensor.RawTensor
And performs element-wise logical AND (autodiff proxy).
func (*AutodiffBackend[B]) Argmax ¶ added in v0.3.0
Argmax returns indices of maximum values along a dimension (autodiff proxy).
func (*AutodiffBackend[B]) Cast ¶ added in v0.3.0
Cast converts tensor to a different data type (autodiff proxy).
func (*AutodiffBackend[B]) Cat ¶ added in v0.3.0
Cat concatenates tensors along a dimension (passthrough - no autodiff yet).
func (*AutodiffBackend[B]) Chunk ¶ added in v0.3.0
Chunk splits tensor into equal parts (passthrough - no autodiff yet).
func (*AutodiffBackend[B]) Conv2D ¶
func (b *AutodiffBackend[B]) Conv2D(input, kernel *tensor.RawTensor, stride, padding int) *tensor.RawTensor
Conv2D performs 2D convolution and records the operation.
CRITICAL: Conv2D must be recorded on tape for gradient flow! Just like Transpose, Conv2D creates new tensors and without recording, gradients won't flow back to the kernel/input parameters.
func (*AutodiffBackend[B]) Cos ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Cos(x *tensor.RawTensor) *tensor.RawTensor
Cos computes element-wise cosine and records the operation.
func (*AutodiffBackend[B]) CrossEntropy ¶
func (b *AutodiffBackend[B]) CrossEntropy(logits, targets *tensor.RawTensor) *tensor.RawTensor
CrossEntropy computes cross-entropy loss for classification.
Forward:
Loss = mean(-log_softmax(logits)[targets])
Uses the log-sum-exp trick for numerical stability.
Backward:
∂L/∂logits = (softmax(logits) - y_one_hot) / batch_size
Parameters:
- logits: Model predictions [batch_size, num_classes]
- targets: Ground truth class indices [batch_size]
Returns:
- Scalar loss value (mean over batch)
func (*AutodiffBackend[B]) Device ¶
func (b *AutodiffBackend[B]) Device() tensor.Device
Device returns the compute device.
func (*AutodiffBackend[B]) Div ¶
func (b *AutodiffBackend[B]) Div(a, c *tensor.RawTensor) *tensor.RawTensor
Div performs element-wise division and records the operation.
func (*AutodiffBackend[B]) DivScalar ¶ added in v0.3.0
DivScalar divides tensor elements by a scalar (autodiff proxy).
func (*AutodiffBackend[B]) Equal ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Equal(a, other *tensor.RawTensor) *tensor.RawTensor
Equal performs element-wise equality comparison (autodiff proxy).
func (*AutodiffBackend[B]) Exp ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Exp(x *tensor.RawTensor) *tensor.RawTensor
Exp computes element-wise exponential and records the operation.
func (*AutodiffBackend[B]) Expand ¶ added in v0.3.0
Expand broadcasts tensor to a larger shape (autodiff proxy).
func (*AutodiffBackend[B]) Gather ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Gather(x *tensor.RawTensor, dim int, index *tensor.RawTensor) *tensor.RawTensor
Gather selects elements along dim using index tensor (passthrough - no autodiff yet).
func (*AutodiffBackend[B]) GetTape ¶
func (b *AutodiffBackend[B]) GetTape() *GradientTape
GetTape returns the gradient tape (implements BackwardCapable interface).
func (*AutodiffBackend[B]) Greater ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Greater(a, other *tensor.RawTensor) *tensor.RawTensor
Greater performs element-wise greater-than comparison (autodiff proxy).
func (*AutodiffBackend[B]) GreaterEqual ¶ added in v0.3.0
func (b *AutodiffBackend[B]) GreaterEqual(a, other *tensor.RawTensor) *tensor.RawTensor
GreaterEqual performs element-wise greater-or-equal comparison (autodiff proxy).
func (*AutodiffBackend[B]) Inner ¶
func (b *AutodiffBackend[B]) Inner() B
Inner returns the wrapped backend for direct access.
func (*AutodiffBackend[B]) Log ¶
func (b *AutodiffBackend[B]) Log(x *tensor.RawTensor) *tensor.RawTensor
Log computes element-wise natural logarithm.
Forward:
output = log(input)
Backward:
∂L/∂input = ∂L/∂output * (1 / input)
Note: Input values must be positive. For numerical stability with values close to zero, consider using LogWithEpsilon operation instead.
func (*AutodiffBackend[B]) Lower ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Lower(a, other *tensor.RawTensor) *tensor.RawTensor
Lower performs element-wise less-than comparison (autodiff proxy).
func (*AutodiffBackend[B]) LowerEqual ¶ added in v0.3.0
func (b *AutodiffBackend[B]) LowerEqual(a, other *tensor.RawTensor) *tensor.RawTensor
LowerEqual performs element-wise less-or-equal comparison (autodiff proxy).
func (*AutodiffBackend[B]) MatMul ¶
func (b *AutodiffBackend[B]) MatMul(a, c *tensor.RawTensor) *tensor.RawTensor
MatMul performs matrix multiplication and records the operation.
func (*AutodiffBackend[B]) MaxPool2D ¶
func (b *AutodiffBackend[B]) MaxPool2D(input *tensor.RawTensor, kernelSize, stride int) *tensor.RawTensor
MaxPool2D performs 2D max pooling and records the operation.
CRITICAL: MaxPool2D must be recorded on tape for gradient flow! During backward pass, gradients only flow to positions that had max values. MaxPool2DOp stores max indices during forward pass for correct gradient routing.
func (*AutodiffBackend[B]) MeanDim ¶ added in v0.3.0
MeanDim computes mean along a dimension and records the operation.
func (*AutodiffBackend[B]) Mul ¶
func (b *AutodiffBackend[B]) Mul(a, c *tensor.RawTensor) *tensor.RawTensor
Mul performs element-wise multiplication and records the operation.
func (*AutodiffBackend[B]) MulScalar ¶ added in v0.3.0
MulScalar multiplies tensor elements by a scalar (autodiff proxy).
func (*AutodiffBackend[B]) Name ¶
func (b *AutodiffBackend[B]) Name() string
Name returns the backend name.
func (*AutodiffBackend[B]) NoGrad ¶ added in v0.3.0
func (b *AutodiffBackend[B]) NoGrad(fn func())
NoGrad temporarily disables gradient recording for inference.
This is useful for:
- Inference/evaluation (no need to track gradients)
- Gradient-free operations (e.g., updating exponential moving averages)
- Memory optimization (gradient tape doesn't grow)
The function executes the provided function with gradient recording disabled, then restores the previous recording state.
Example:
// Inference mode
backend.NoGrad(func() {
output := model.Forward(input) // No gradients recorded
predictions := output.ArgMax()
})
// Training continues normally
loss := model.Forward(trainInput)
loss.Backward() // Gradients computed
func (*AutodiffBackend[B]) Not ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Not(x *tensor.RawTensor) *tensor.RawTensor
Not performs element-wise logical NOT (autodiff proxy).
func (*AutodiffBackend[B]) NotEqual ¶ added in v0.3.0
func (b *AutodiffBackend[B]) NotEqual(a, other *tensor.RawTensor) *tensor.RawTensor
NotEqual performs element-wise inequality comparison (autodiff proxy).
func (*AutodiffBackend[B]) Or ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Or(a, other *tensor.RawTensor) *tensor.RawTensor
Or performs element-wise logical OR (autodiff proxy).
func (*AutodiffBackend[B]) ReLU ¶
func (b *AutodiffBackend[B]) ReLU(x *tensor.RawTensor) *tensor.RawTensor
ReLU applies ReLU activation and records the operation.
func (*AutodiffBackend[B]) Reshape ¶
Reshape reshapes a tensor and records the operation.
CRITICAL: Like Transpose, Reshape must be recorded on tape! Without recording, gradients won't flow back to reshaped parameters.
Example: Conv2D bias
- bias parameter: [out_channels]
- reshaped for broadcasting: [1, out_channels, 1, 1]
- Without ReshapeOp: gradient computed for reshaped tensor only
- With ReshapeOp: gradient propagates back to original bias parameter
func (*AutodiffBackend[B]) Rsqrt ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Rsqrt(x *tensor.RawTensor) *tensor.RawTensor
Rsqrt computes element-wise reciprocal square root and records the operation.
func (*AutodiffBackend[B]) SiLU ¶ added in v0.3.0
func (b *AutodiffBackend[B]) SiLU(x *tensor.RawTensor) *tensor.RawTensor
SiLU applies SiLU (Swish) activation: f(x) = x * sigmoid(x).
SiLU (Sigmoid Linear Unit), also known as Swish, is widely used in modern transformer architectures (LLaMA, Mistral, GPT-Neo).
Forward:
output = x * sigmoid(x)
Backward:
dy/dx = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
func (*AutodiffBackend[B]) Sigmoid ¶
func (b *AutodiffBackend[B]) Sigmoid(x *tensor.RawTensor) *tensor.RawTensor
Sigmoid applies sigmoid activation: σ(x) = 1 / (1 + exp(-x)).
func (*AutodiffBackend[B]) Sin ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Sin(x *tensor.RawTensor) *tensor.RawTensor
Sin computes element-wise sine and records the operation.
func (*AutodiffBackend[B]) Softmax ¶
Softmax applies softmax activation along the specified dimension.
Parameters:
- x: Input tensor
- dim: Dimension along which to compute softmax (-1 for last dimension)
Forward (for each row):
softmax(x)_i = exp(x_i - max(x)) / Σ_j exp(x_j - max(x))
The max-shifting ensures numerical stability (prevents overflow).
Backward:
The Jacobian of softmax is complex, but the gradient simplifies to: ∂L/∂x_j = softmax_j * (∂L/∂softmax_j - Σ_i (∂L/∂softmax_i * softmax_i))
func (*AutodiffBackend[B]) Sqrt ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Sqrt(x *tensor.RawTensor) *tensor.RawTensor
Sqrt computes element-wise square root and records the operation.
func (*AutodiffBackend[B]) Squeeze ¶ added in v0.3.0
Squeeze removes a dimension (recorded via Reshape).
func (*AutodiffBackend[B]) Sub ¶
func (b *AutodiffBackend[B]) Sub(a, c *tensor.RawTensor) *tensor.RawTensor
Sub performs element-wise subtraction and records the operation.
func (*AutodiffBackend[B]) SubScalar ¶ added in v0.3.0
SubScalar subtracts a scalar from tensor elements (autodiff proxy).
func (*AutodiffBackend[B]) Sum ¶ added in v0.3.0
func (b *AutodiffBackend[B]) Sum(x *tensor.RawTensor) *tensor.RawTensor
Sum reduces tensor to a single scalar by summing all elements (autodiff proxy).
func (*AutodiffBackend[B]) SumDim ¶ added in v0.3.0
SumDim sums tensor along a dimension and records the operation.
func (*AutodiffBackend[B]) Tanh ¶
func (b *AutodiffBackend[B]) Tanh(x *tensor.RawTensor) *tensor.RawTensor
Tanh applies hyperbolic tangent activation.
func (*AutodiffBackend[B]) Tape ¶
func (b *AutodiffBackend[B]) Tape() *GradientTape
Tape returns the gradient tape for manual control. Useful for:
- Starting/stopping recording
- Clearing tape between iterations
- Inspecting recorded operations
func (*AutodiffBackend[B]) Transpose ¶
Transpose transposes a tensor and records the operation.
CRITICAL: Even though conceptually transpose is a "view", the underlying backend may create a new tensor (e.g., CPU backend copies data). We MUST record this operation so gradients flow back correctly.
For example, in Linear layer:
w := weightParameter
wT := w.Transpose()  // Creates NEW tensor!
output := input.MatMul(wT) // MatMul records operation with wT
Without recording Transpose:
- Backward computes grad for wT (new tensor)
- Optimizer looks for grad of w (original parameter)
- NO GRADIENT FOUND! Parameters don't update!
With TransposeOp:
- Backward computes grad for wT
- TransposeOp.Backward propagates grad back to w
- Optimizer finds grad for w ✓
type BackwardCapable ¶
type BackwardCapable interface {
tensor.Backend
// GetTape returns the gradient tape for backward computation.
GetTape() *GradientTape
}
BackwardCapable is an interface for backends that support backward pass. AutodiffBackend implements this interface.
type GradientTape ¶
type GradientTape struct {
// contains filtered or unexported fields
}
GradientTape records operations during the forward pass and computes gradients during the backward pass using reverse-mode automatic differentiation.
Usage:
tape := NewGradientTape()
tape.StartRecording()
// ... perform operations ...
gradients := tape.Backward(outputGrad, backend)
func NewGradientTape ¶
func NewGradientTape() *GradientTape
NewGradientTape creates a new gradient tape.
func (*GradientTape) Backward ¶
func (t *GradientTape) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) map[*tensor.RawTensor]*tensor.RawTensor
Backward computes gradients for all inputs by walking the tape in reverse.
Algorithm:
- Start with the output gradient (typically ones for scalar loss)
- Walk operations in reverse order
- For each operation, compute input gradients using chain rule
- Accumulate gradients when the same tensor is used multiple times
Returns a map from RawTensor to its accumulated gradient.
Example:
// y = (x + 2) * 3
// dy/dx = 3
tape := NewGradientTape()
tape.StartRecording()
// ... operations recorded ...
gradients := tape.Backward(ones, backend)
dydx := gradients[x.Raw()]
func (*GradientTape) Clear ¶
func (t *GradientTape) Clear()
Clear resets the tape, removing all recorded operations. Recording state is preserved.
func (*GradientTape) IsRecording ¶
func (t *GradientTape) IsRecording() bool
IsRecording returns true if the tape is currently recording operations.
func (*GradientTape) NumOps ¶
func (t *GradientTape) NumOps() int
NumOps returns the number of recorded operations.
func (*GradientTape) Record ¶
func (t *GradientTape) Record(op ops.Operation)
Record adds an operation to the tape. Only records if the tape is currently recording.
func (*GradientTape) StartRecording ¶
func (t *GradientTape) StartRecording()
StartRecording enables operation recording.
func (*GradientTape) StopRecording ¶
func (t *GradientTape) StopRecording()
StopRecording disables operation recording.