ops

package

v0.7.11 Latest Latest Go to latest Published: Feb 27, 2026 License: Apache-2.0 Imports: 3 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/born-ml/born

Links

Open Source Insights

Documentation ¶

Overview ¶

Package ops defines operation interfaces and implementations for automatic differentiation.

Each operation implements the Operation interface, which provides:

Forward pass: computed by the backend
Backward pass: computes gradients for inputs given output gradient

Supported operations:

AddOp: element-wise addition (d(a+b)/da = 1, d(a+b)/db = 1)
SubOp: element-wise subtraction
MulOp: element-wise multiplication (d(a*b)/da = b, d(a*b)/db = a)
DivOp: element-wise division
MatMulOp: matrix multiplication (d(A@B)/dA = grad@B^T, d(A@B)/dB = A^T@grad)
ReLUOp: rectified linear unit activation (d(ReLU(x))/dx = 1 if x > 0, else 0)

Index ¶

func CrossEntropyForward(logits, targets *tensor.RawTensor, device tensor.Device) *tensor.RawTensor
func Exp(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor
func Log(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor
func Softmax(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor
type AddOp
- func NewAddOp(a, b, output *tensor.RawTensor) *AddOp
- func (op *AddOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *AddOp) Inputs() []*tensor.RawTensor
- func (op *AddOp) Output() *tensor.RawTensor
type BatchMatMulOp
- func NewBatchMatMulOp(a, b, output *tensor.RawTensor) *BatchMatMulOp
- func (op *BatchMatMulOp) Backward(grad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *BatchMatMulOp) Inputs() []*tensor.RawTensor
- func (op *BatchMatMulOp) Output() *tensor.RawTensor
type CatOp
- func NewCatOp(inputs []*tensor.RawTensor, dim int, sizes []int, output *tensor.RawTensor) *CatOp
- func (op *CatOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *CatOp) Inputs() []*tensor.RawTensor
- func (op *CatOp) Output() *tensor.RawTensor
type ChunkOp
- func NewChunkOp(input *tensor.RawTensor, n, dim int, outputs []*tensor.RawTensor) *ChunkOp
- func (op *ChunkOp) Backward(_ *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor
- func (op *ChunkOp) BackwardMulti(gradOutputs []*tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *ChunkOp) Inputs() []*tensor.RawTensor
- func (op *ChunkOp) Output() *tensor.RawTensor
- func (op *ChunkOp) Outputs() []*tensor.RawTensor
type Conv2DOp
- func NewConv2DOp(input, kernel, output *tensor.RawTensor, stride, padding int) *Conv2DOp
- func (op *Conv2DOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *Conv2DOp) Inputs() []*tensor.RawTensor
- func (op *Conv2DOp) Output() *tensor.RawTensor
type CosOp
- func NewCosOp(input, output *tensor.RawTensor) *CosOp
- func (op *CosOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *CosOp) Inputs() []*tensor.RawTensor
- func (op *CosOp) Output() *tensor.RawTensor
type CrossEntropyOp
- func NewCrossEntropyOp(logits, targets, output *tensor.RawTensor) *CrossEntropyOp
- func (op *CrossEntropyOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor
- func (op *CrossEntropyOp) Inputs() []*tensor.RawTensor
- func (op *CrossEntropyOp) Output() *tensor.RawTensor
type DivOp
- func NewDivOp(a, b, output *tensor.RawTensor) *DivOp
- func (op *DivOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *DivOp) Inputs() []*tensor.RawTensor
- func (op *DivOp) Output() *tensor.RawTensor
type EmbeddingOp
- func NewEmbeddingOp(weight, indices, output *tensor.RawTensor) *EmbeddingOp
- func (op *EmbeddingOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *EmbeddingOp) Inputs() []*tensor.RawTensor
- func (op *EmbeddingOp) Output() *tensor.RawTensor
type ExpOp
- func NewExpOp(input, output *tensor.RawTensor) *ExpOp
- func (op *ExpOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *ExpOp) Inputs() []*tensor.RawTensor
- func (op *ExpOp) Output() *tensor.RawTensor
type GatherOp
- func NewGatherOp(input *tensor.RawTensor, dim int, index, output *tensor.RawTensor) *GatherOp
- func (op *GatherOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *GatherOp) Inputs() []*tensor.RawTensor
- func (op *GatherOp) Output() *tensor.RawTensor
type LogOp
- func NewLogOp(input, output *tensor.RawTensor) *LogOp
- func (op *LogOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor
- func (op *LogOp) Inputs() []*tensor.RawTensor
- func (op *LogOp) Output() *tensor.RawTensor
type LogSoftmaxOp
- func NewLogSoftmaxOp(input, output *tensor.RawTensor, softmaxData []float32) *LogSoftmaxOp
- func (op *LogSoftmaxOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor
- func (op *LogSoftmaxOp) Inputs() []*tensor.RawTensor
- func (op *LogSoftmaxOp) Output() *tensor.RawTensor
type LogWithEpsilonOp
- func NewLogWithEpsilonOp(input, output *tensor.RawTensor, epsilon float64) *LogWithEpsilonOp
- func (op *LogWithEpsilonOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor
- func (op *LogWithEpsilonOp) Inputs() []*tensor.RawTensor
- func (op *LogWithEpsilonOp) Output() *tensor.RawTensor
type MatMulOp
- func NewMatMulOp(a, b, output *tensor.RawTensor) *MatMulOp
- func (op *MatMulOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *MatMulOp) Inputs() []*tensor.RawTensor
- func (op *MatMulOp) Output() *tensor.RawTensor
type MaxPool2DOp
- func NewMaxPool2DOp(input, output *tensor.RawTensor, kernelSize, stride int) *MaxPool2DOp
- func (op *MaxPool2DOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *MaxPool2DOp) Inputs() []*tensor.RawTensor
- func (op *MaxPool2DOp) Output() *tensor.RawTensor
type MeanDimOp
- func NewMeanDimOp(x, output *tensor.RawTensor, dim int, keepDim bool) *MeanDimOp
- func (op *MeanDimOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *MeanDimOp) Inputs() []*tensor.RawTensor
- func (op *MeanDimOp) Output() *tensor.RawTensor
type MulOp
- func NewMulOp(a, b, output *tensor.RawTensor) *MulOp
- func (op *MulOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *MulOp) Inputs() []*tensor.RawTensor
- func (op *MulOp) Output() *tensor.RawTensor
type MultiOutputOperation
type Operation
type ReLUOp
- func NewReLUOp(input, output *tensor.RawTensor) *ReLUOp
- func (op *ReLUOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *ReLUOp) Inputs() []*tensor.RawTensor
- func (op *ReLUOp) Output() *tensor.RawTensor
type ReshapeOp
- func NewReshapeOp(input, output *tensor.RawTensor) *ReshapeOp
- func (op *ReshapeOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *ReshapeOp) Inputs() []*tensor.RawTensor
- func (op *ReshapeOp) Output() *tensor.RawTensor
type RsqrtOp
- func NewRsqrtOp(input, output *tensor.RawTensor) *RsqrtOp
- func (op *RsqrtOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *RsqrtOp) Inputs() []*tensor.RawTensor
- func (op *RsqrtOp) Output() *tensor.RawTensor
type SiLUOp
- func NewSiLUOp(input, output *tensor.RawTensor) *SiLUOp
- func (op *SiLUOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SiLUOp) Inputs() []*tensor.RawTensor
- func (op *SiLUOp) Output() *tensor.RawTensor
type SigmoidOp
- func NewSigmoidOp(input, output *tensor.RawTensor) *SigmoidOp
- func (op *SigmoidOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SigmoidOp) Inputs() []*tensor.RawTensor
- func (op *SigmoidOp) Output() *tensor.RawTensor
type SinOp
- func NewSinOp(input, output *tensor.RawTensor) *SinOp
- func (op *SinOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SinOp) Inputs() []*tensor.RawTensor
- func (op *SinOp) Output() *tensor.RawTensor
type SoftmaxOp
- func NewSoftmaxOp(input, output *tensor.RawTensor, dim int) *SoftmaxOp
- func (op *SoftmaxOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SoftmaxOp) Inputs() []*tensor.RawTensor
- func (op *SoftmaxOp) Output() *tensor.RawTensor
type SqrtOp
- func NewSqrtOp(input, output *tensor.RawTensor) *SqrtOp
- func (op *SqrtOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SqrtOp) Inputs() []*tensor.RawTensor
- func (op *SqrtOp) Output() *tensor.RawTensor
type SubOp
- func NewSubOp(a, b, output *tensor.RawTensor) *SubOp
- func (op *SubOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SubOp) Inputs() []*tensor.RawTensor
- func (op *SubOp) Output() *tensor.RawTensor
type SumDimOp
- func NewSumDimOp(x, output *tensor.RawTensor, dim int, keepDim bool) *SumDimOp
- func (op *SumDimOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *SumDimOp) Inputs() []*tensor.RawTensor
- func (op *SumDimOp) Output() *tensor.RawTensor
type TanhOp
- func NewTanhOp(input, output *tensor.RawTensor) *TanhOp
- func (op *TanhOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *TanhOp) Inputs() []*tensor.RawTensor
- func (op *TanhOp) Output() *tensor.RawTensor
type TransposeOp
- func NewTransposeOp(input, output *tensor.RawTensor, axes []int) *TransposeOp
- func (op *TransposeOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *TransposeOp) Inputs() []*tensor.RawTensor
- func (op *TransposeOp) Output() *tensor.RawTensor
type WhereOp
- func NewWhereOp(condition, x, y, output *tensor.RawTensor) *WhereOp
- func (op *WhereOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
- func (op *WhereOp) Inputs() []*tensor.RawTensor
- func (op *WhereOp) Output() *tensor.RawTensor

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func CrossEntropyForward ¶

func CrossEntropyForward(logits, targets *tensor.RawTensor, device tensor.Device) *tensor.RawTensor

CrossEntropyForward computes cross-entropy loss (helper function).

This is a helper for use outside autodiff context. For autodiff support, use AutodiffBackend with CrossEntropyOp.

Parameters:

logits: [batch_size, num_classes]
targets: [batch_size] (class indices)

Returns:

Scalar loss tensor (mean over batch)

func Exp ¶

func Exp(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor

Exp computes element-wise exponential (helper for softmax).

Forward: output = exp(input) Backward: ∂L/∂input = ∂L/∂output * exp(input) = ∂L/∂output * output

Note: This is a helper function, not a full Operation. For autodiff support, use ExpOp (to be implemented if needed).

func Log ¶

func Log(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor

Log computes element-wise natural logarithm (helper function).

Forward: output = log(input)

Note: This is a helper function for use outside autodiff. For autodiff support, use backend.Log() which records LogOp.

func Softmax ¶

func Softmax(input *tensor.RawTensor, device tensor.Device) *tensor.RawTensor

Softmax computes softmax along last dimension (helper function).

This is a helper for use outside autodiff. For autodiff support, use backend.Softmax() which records SoftmaxOp.

Types ¶

type AddOp ¶

type AddOp struct {
	// contains filtered or unexported fields
}

AddOp represents an element-wise addition operation: output = a + b.

Backward pass:

d(a+b)/da = 1, so grad_a = outputGrad
d(a+b)/db = 1, so grad_b = outputGrad

Note: If broadcasting was used in forward pass, gradients must be reduced (summed) along the broadcast dimensions to match input shapes.

func (*AddOp) Backward ¶

func (op *AddOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for addition. Since d(a+b)/da = d(a+b)/db = 1, the gradient flows equally to both inputs.

func (*AddOp) Inputs ¶

func (op *AddOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*AddOp) Output ¶

func (op *AddOp) Output() *tensor.RawTensor

Output returns the output tensor a + b.

type BatchMatMulOp ¶ added in v0.4.0

type BatchMatMulOp struct {
	// contains filtered or unexported fields
}

BatchMatMulOp represents a batched matrix multiplication operation: output = a @ b.

Backward pass:

d(A@B)/dA = outputGrad @ B^T
d(A@B)/dB = A^T @ outputGrad

Where @ denotes batched matrix multiplication and ^T denotes transpose.

func NewBatchMatMulOp ¶ added in v0.4.0

func NewBatchMatMulOp(a, b, output *tensor.RawTensor) *BatchMatMulOp

NewBatchMatMulOp creates a new BatchMatMulOp.

func (*BatchMatMulOp) Backward ¶ added in v0.4.0

func (op *BatchMatMulOp) Backward(grad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for batch matmul. Given C = A @ B:

dL/dA = dL/dC @ B^T
dL/dB = A^T @ dL/dC

func (*BatchMatMulOp) Inputs ¶ added in v0.4.0

func (op *BatchMatMulOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*BatchMatMulOp) Output ¶ added in v0.4.0

func (op *BatchMatMulOp) Output() *tensor.RawTensor

Output returns the output tensor a @ b.

type CatOp ¶ added in v0.5.0

type CatOp struct {
	// contains filtered or unexported fields
}

CatOp represents a concatenation operation along a dimension.

Forward: output = Cat([input1, input2, ...], dim)

Backward:

Split gradOutput along dim at input boundaries and distribute to each input.
Each input receives the gradient slice corresponding to its contribution.

Example:

inputs: [[1,2], [3,4,5]] along dim=0
output: [[1,2], [3,4,5]] (shape depends on concat)
gradOutput: [dL/d1, dL/d2, dL/d3, dL/d4, dL/d5]
gradInput1: [dL/d1, dL/d2]
gradInput2: [dL/d3, dL/d4, dL/d5]

func NewCatOp ¶ added in v0.5.0

func NewCatOp(inputs []*tensor.RawTensor, dim int, sizes []int, output *tensor.RawTensor) *CatOp

NewCatOp creates a new cat operation.

func (*CatOp) Backward ¶ added in v0.5.0

func (op *CatOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for the input tensors.

The gradient is split along the concatenation dimension based on the original input sizes. Each input receives its corresponding slice of the output gradient.

Algorithm:

Split gradOutput along dim at boundaries defined by sizes
Return one gradient slice per input tensor

func (*CatOp) Inputs ¶ added in v0.5.0

func (op *CatOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*CatOp) Output ¶ added in v0.5.0

func (op *CatOp) Output() *tensor.RawTensor

Output returns the output tensor.

type ChunkOp ¶ added in v0.5.0

type ChunkOp struct {
	// contains filtered or unexported fields
}

ChunkOp represents a chunk operation that splits a tensor into n equal parts.

Forward: outputs = Chunk(input, n, dim)

Backward:

Concatenate all output gradients back together along dim.
gradInput = Cat([gradOutput1, gradOutput2, ...], dim)

Example:

input: [1,2,3,4,5,6] along dim=0, n=3
outputs: [[1,2], [3,4], [5,6]]
gradOutputs: [[dL/d1, dL/d2], [dL/d3, dL/d4], [dL/d5, dL/d6]]
gradInput: [dL/d1, dL/d2, dL/d3, dL/d4, dL/d5, dL/d6]

func NewChunkOp ¶ added in v0.5.0

func NewChunkOp(input *tensor.RawTensor, n, dim int, outputs []*tensor.RawTensor) *ChunkOp

NewChunkOp creates a new chunk operation.

func (*ChunkOp) Backward ¶ added in v0.5.0

func (op *ChunkOp) Backward(_ *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor

Backward computes gradients for the input tensor.

Since Chunk splits a tensor, the backward pass concatenates the gradients of all output chunks back together.

Algorithm:

Collect gradients for all output chunks
Concatenate them along the same dimension
Return the concatenated gradient

Note: The caller must provide gradients for all outputs.

func (*ChunkOp) BackwardMulti ¶ added in v0.5.0

func (op *ChunkOp) BackwardMulti(gradOutputs []*tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

BackwardMulti computes gradients for the input tensor given all output gradients.

This is the proper backward pass for chunk that takes all output gradients.

func (*ChunkOp) Inputs ¶ added in v0.5.0

func (op *ChunkOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor.

func (*ChunkOp) Output ¶ added in v0.5.0

func (op *ChunkOp) Output() *tensor.RawTensor

Output returns the first output tensor. Note: ChunkOp has multiple outputs, but the Operation interface expects a single output. We return the first chunk here, but the tape needs special handling for multi-output ops.

func (*ChunkOp) Outputs ¶ added in v0.5.0

func (op *ChunkOp) Outputs() []*tensor.RawTensor

Outputs returns all output tensors (implements MultiOutputOperation).

type Conv2DOp ¶

type Conv2DOp struct {
	// contains filtered or unexported fields
}

Conv2DOp records a 2D convolution operation for autodiff.

Forward: output = Conv2D(input, kernel, stride, padding)

Backward (gradients):

d_input: "transposed convolution" or "deconvolution" of d_output with kernel
d_kernel: convolution of input with d_output

References:

"A guide to convolution arithmetic for deep learning" (Dumoulin & Visin, 2016)
CS231n: Convolutional Neural Networks for Visual Recognition

func NewConv2DOp ¶

func NewConv2DOp(input, kernel, output *tensor.RawTensor, stride, padding int) *Conv2DOp

NewConv2DOp creates a new Conv2D operation.

func (*Conv2DOp) Backward ¶

func (op *Conv2DOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for Conv2D.

This is pure orchestration - delegates computation to backend.

Given:

outputGrad: ∂L/∂output [N, C_out, H_out, W_out]

Compute:

inputGrad: ∂L/∂input [N, C_in, H, W]
kernelGrad: ∂L/∂kernel [C_out, C_in, K_h, K_w]

References:

Burn framework: crates/burn-autodiff/src/ops/module.rs (conv2d backward)

func (*Conv2DOp) Inputs ¶

func (op *Conv2DOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*Conv2DOp) Output ¶

func (op *Conv2DOp) Output() *tensor.RawTensor

Output returns the output tensor.

type CosOp ¶ added in v0.3.0

type CosOp struct {
	// contains filtered or unexported fields
}

CosOp represents the cosine operation: y = cos(x).

Backward pass:

d(cos(x))/dx = -sin(x)
grad_input = grad_output * (-sin(input))

func NewCosOp ¶ added in v0.3.0

func NewCosOp(input, output *tensor.RawTensor) *CosOp

NewCosOp creates a new CosOp.

func (*CosOp) Backward ¶ added in v0.3.0

func (op *CosOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for cos.

Since d(cos(x))/dx = -sin(x): grad_input = grad_output * (-sin(input)).

func (*CosOp) Inputs ¶ added in v0.3.0

func (op *CosOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*CosOp) Output ¶ added in v0.3.0

func (op *CosOp) Output() *tensor.RawTensor

Output returns the output tensor cos(x).

type CrossEntropyOp ¶

type CrossEntropyOp struct {
	// contains filtered or unexported fields
}

CrossEntropyOp represents the cross-entropy loss operation.

Forward:

Loss = mean(-log_softmax(logits)[targets])

Where log_softmax uses the log-sum-exp trick for numerical stability:

log_softmax(z) = z - (max(z) + log(Σ exp(z - max(z))))

Backward:

∂L/∂logits = (softmax(logits) - y_one_hot) / batch_size

This elegant gradient formula is the key reason why softmax + cross-entropy are often fused together in modern frameworks (PyTorch, TensorFlow, Burn).

Assumptions:

Logits shape: [batch_size, num_classes] (2D)
Targets shape: [batch_size] (1D, class indices)
Output: scalar loss (mean over batch)

func NewCrossEntropyOp ¶

func NewCrossEntropyOp(logits, targets, output *tensor.RawTensor) *CrossEntropyOp

NewCrossEntropyOp creates a new cross-entropy operation.

func (*CrossEntropyOp) Backward ¶

func (op *CrossEntropyOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor

Backward computes the gradient with respect to logits.

Gradient formula:

∂L/∂logits[b,i] = (softmax(logits[b])[i] - y_one_hot[b,i]) / batch_size

Where y_one_hot[b,i] = 1 if i == targets[b], else 0.

Note: The gradient is averaged over the batch size because the forward pass computes mean loss.

func (*CrossEntropyOp) Inputs ¶

func (op *CrossEntropyOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*CrossEntropyOp) Output ¶

func (op *CrossEntropyOp) Output() *tensor.RawTensor

Output returns the output tensor.

type DivOp ¶

type DivOp struct {
	// contains filtered or unexported fields
}

DivOp represents an element-wise division operation: output = a / b.

Backward pass:

d(a/b)/da = 1/b, so grad_a = outputGrad / b
d(a/b)/db = -a/b², so grad_b = -outputGrad * a / b²

func NewDivOp ¶

func NewDivOp(a, b, output *tensor.RawTensor) *DivOp

NewDivOp creates a new DivOp.

func (*DivOp) Backward ¶

func (op *DivOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for division.

func (*DivOp) Inputs ¶

func (op *DivOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*DivOp) Output ¶

func (op *DivOp) Output() *tensor.RawTensor

Output returns the output tensor a / b.

type EmbeddingOp ¶ added in v0.3.0

type EmbeddingOp struct {
	// contains filtered or unexported fields
}

EmbeddingOp represents an embedding lookup operation.

Forward: output[i] = weight[indices[i]]

Backward:

For each index i, accumulate grad_output[i] to grad_weight[indices[i]]
This is a scatter-add operation where gradients for the same index are summed.

Example:

indices = [0, 1, 0]  // index 0 appears twice
grad_output = [[1,2], [3,4], [5,6]]
grad_weight[0] = [1,2] + [5,6] = [6,8]  // Accumulated!
grad_weight[1] = [3,4]

func NewEmbeddingOp ¶ added in v0.3.0

func NewEmbeddingOp(weight, indices, output *tensor.RawTensor) *EmbeddingOp

NewEmbeddingOp creates a new embedding operation.

func (*EmbeddingOp) Backward ¶ added in v0.3.0

func (op *EmbeddingOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for the embedding weights.

Gradient computation:

For each position i in output, grad_output[i] flows back to weight[indices[i]]
Multiple indices pointing to the same embedding accumulate gradients

Algorithm:

Create grad_weight tensor (same shape as weight) initialized to zeros
For each index i: - Read index value: idx = indices[i] - Add grad_output[i] to grad_weight[idx]
Return grad_weight

func (*EmbeddingOp) Inputs ¶ added in v0.3.0

func (op *EmbeddingOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors (weight and indices). Note: Only weight needs gradient; indices are integer indices.

func (*EmbeddingOp) Output ¶ added in v0.3.0

func (op *EmbeddingOp) Output() *tensor.RawTensor

Output returns the output tensor.

type ExpOp ¶ added in v0.3.0

type ExpOp struct {
	// contains filtered or unexported fields
}

ExpOp represents the exponential operation: y = exp(x).

Backward pass:

d(exp(x))/dx = exp(x) = y
grad_input = grad_output * output

func NewExpOp ¶ added in v0.3.0

func NewExpOp(input, output *tensor.RawTensor) *ExpOp

NewExpOp creates a new ExpOp.

func (*ExpOp) Backward ¶ added in v0.3.0

func (op *ExpOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for exp.

Since d(exp(x))/dx = exp(x), and we already have exp(x) as output: grad_input = grad_output * output.

func (*ExpOp) Inputs ¶ added in v0.3.0

func (op *ExpOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*ExpOp) Output ¶ added in v0.3.0

func (op *ExpOp) Output() *tensor.RawTensor

Output returns the output tensor exp(x).

type GatherOp ¶ added in v0.5.0

type GatherOp struct {
	// contains filtered or unexported fields
}

GatherOp represents a gather operation that selects elements along a dimension.

Forward: output = Gather(input, dim, index)

Backward:

Scatter-add gradOutput to gradInput at positions specified by index.
gradInput is initialized to zeros and gradients are accumulated at indexed positions.

Example:

input: [10, 20, 30, 40]
index: [2, 0, 3] along dim=0
output: [30, 10, 40]
gradOutput: [dL/d30, dL/d10, dL/d40]
gradInput: [dL/d10, 0, dL/d30, dL/d40]  (scattered back to original positions)

func NewGatherOp ¶ added in v0.5.0

func NewGatherOp(input *tensor.RawTensor, dim int, index, output *tensor.RawTensor) *GatherOp

NewGatherOp creates a new gather operation.

func (*GatherOp) Backward ¶ added in v0.5.0

func (op *GatherOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for the input tensor.

Gradient computation:

Create gradInput (same shape as input) initialized to zeros
For each position in output, scatter-add gradOutput to gradInput[index]
Multiple indices pointing to the same position accumulate gradients

Algorithm:

Create zero-initialized gradInput with input shape
For each element in gradOutput: - Read corresponding index value - Add gradOutput element to gradInput at indexed position
Return gradInput

func (*GatherOp) Inputs ¶ added in v0.5.0

func (op *GatherOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor. Note: index tensor doesn't need gradient.

func (*GatherOp) Output ¶ added in v0.5.0

func (op *GatherOp) Output() *tensor.RawTensor

Output returns the output tensor.

type LogOp ¶

type LogOp struct {
	// contains filtered or unexported fields
}

LogOp represents element-wise natural logarithm operation.

Forward:

output = log(input)

Backward:

∂L/∂input = ∂L/∂output * (1 / input)

The gradient is the reciprocal of the input, scaled by the output gradient.

func NewLogOp ¶

func NewLogOp(input, output *tensor.RawTensor) *LogOp

NewLogOp creates a new log operation.

func (*LogOp) Backward ¶

func (op *LogOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor

Backward computes the gradient with respect to input.

Gradient formula:

∂L/∂input[i] = ∂L/∂output[i] * (1 / input[i])

Note: This assumes input > 0 (log is only defined for positive values). In practice, a small epsilon (e.g., 1e-8) is often added for numerical stability.

func (*LogOp) Inputs ¶

func (op *LogOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*LogOp) Output ¶

func (op *LogOp) Output() *tensor.RawTensor

Output returns the output tensor.

type LogSoftmaxOp ¶

type LogSoftmaxOp struct {
	// contains filtered or unexported fields
}

LogSoftmaxOp represents the log-softmax operation.

Forward:

log_softmax(x)_i = x_i - max(x) - log(Σ_j exp(x_j - max(x)))

This is more numerically stable than computing softmax then log.

Backward:

∂L/∂x_j = ∂L/∂log_softmax_j - softmax_j * Σ_i ∂L/∂log_softmax_i

Note: We need to cache both log_softmax (output) and softmax for backward.

func NewLogSoftmaxOp ¶

func NewLogSoftmaxOp(input, output *tensor.RawTensor, softmaxData []float32) *LogSoftmaxOp

NewLogSoftmaxOp creates a new log-softmax operation.

Parameters:

input: Input logits
output: Log-softmax output
softmaxData: Pre-computed softmax (needed for backward)

func (*LogSoftmaxOp) Backward ¶

func (op *LogSoftmaxOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor

Backward computes gradient for log-softmax.

Formula:

∂L/∂x[b,j] = ∂L/∂log_softmax[b,j] - softmax[b,j] * Σ_i ∂L/∂log_softmax[b,i]

func (*LogSoftmaxOp) Inputs ¶

func (op *LogSoftmaxOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*LogSoftmaxOp) Output ¶

func (op *LogSoftmaxOp) Output() *tensor.RawTensor

Output returns the output tensor.

type LogWithEpsilonOp ¶

type LogWithEpsilonOp struct {
	// contains filtered or unexported fields
}

LogWithEpsilonOp represents log with numerical stability epsilon.

Forward:

output = log(input + epsilon)

This is numerically more stable when input might be very close to zero.

func NewLogWithEpsilonOp ¶

func NewLogWithEpsilonOp(input, output *tensor.RawTensor, epsilon float64) *LogWithEpsilonOp

NewLogWithEpsilonOp creates a log operation with epsilon for stability.

func (*LogWithEpsilonOp) Backward ¶

func (op *LogWithEpsilonOp) Backward(outputGrad *tensor.RawTensor, _ tensor.Backend) []*tensor.RawTensor

Backward computes gradient: ∂L/∂input = ∂L/∂output / (input + epsilon).

func (*LogWithEpsilonOp) Inputs ¶

func (op *LogWithEpsilonOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*LogWithEpsilonOp) Output ¶

func (op *LogWithEpsilonOp) Output() *tensor.RawTensor

Output returns the output tensor.

type MatMulOp ¶

type MatMulOp struct {
	// contains filtered or unexported fields
}

MatMulOp represents a matrix multiplication operation: output = a @ b.

Backward pass:

d(A@B)/dA = outputGrad @ B^T
d(A@B)/dB = A^T @ outputGrad

Where @ denotes matrix multiplication and ^T denotes transpose.

func NewMatMulOp ¶

func NewMatMulOp(a, b, output *tensor.RawTensor) *MatMulOp

NewMatMulOp creates a new MatMulOp.

func (*MatMulOp) Backward ¶

func (op *MatMulOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for matrix multiplication.

func (*MatMulOp) Inputs ¶

func (op *MatMulOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*MatMulOp) Output ¶

func (op *MatMulOp) Output() *tensor.RawTensor

Output returns the output tensor a @ b.

type MaxPool2DOp ¶

type MaxPool2DOp struct {
	// contains filtered or unexported fields
}

MaxPool2DOp records a max pooling operation for autodiff.

Forward:

output[n,c,h,w] = max(input[n,c,h*stride+kh,w*stride+kw] for kh,kw in kernel)

Backward:

Input gradient: Gradients flow only to positions that had the max value
For each output position, only one input position receives gradient
All other positions in pooling window receive zero gradient

Example (2x2 pool, stride=2):

Input:  [[1, 2],  Output: [4]  Input Grad: [[0, 0],
         [3, 4]]                             [0, grad]]

Unlike Conv2D which has learnable parameters, MaxPool2D only has input gradients.

func NewMaxPool2DOp ¶

func NewMaxPool2DOp(input, output *tensor.RawTensor, kernelSize, stride int) *MaxPool2DOp

NewMaxPool2DOp creates a new MaxPool2D operation.

CRITICAL: Must compute and store max indices during forward pass! Without max indices, backward pass cannot route gradients correctly.

func (*MaxPool2DOp) Backward ¶

func (op *MaxPool2DOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for MaxPool2D.

Gradient routing:

Initialize input gradient to zeros
For each output gradient value
Route it to the input position that had the max value (stored in maxIndices)
All other positions in pooling window remain zero

This implements the subgradient of the max function:

∂max(x_i)/∂x_j = 1 if j = argmax(x_i), else 0

This is pure orchestration - delegates computation to backend.

References:

Burn framework: crates/burn-autodiff/src/ops/module.rs (max_pool2d_backward)

func (*MaxPool2DOp) Inputs ¶

func (op *MaxPool2DOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*MaxPool2DOp) Output ¶

func (op *MaxPool2DOp) Output() *tensor.RawTensor

Output returns the output tensor.

type MeanDimOp ¶ added in v0.3.0

type MeanDimOp struct {
	// contains filtered or unexported fields
}

MeanDimOp represents a reduction mean operation along a dimension: output = mean(x, dim).

Forward:

y = mean(x, dim, keepDim) = sum(x, dim, keepDim) / size[dim]

Backward:

grad_x = broadcast(grad_y, x.shape) / size[dim]

If keepDim=false, we need to unsqueeze grad_y first to match broadcasting requirements.

func NewMeanDimOp ¶ added in v0.3.0

func NewMeanDimOp(x, output *tensor.RawTensor, dim int, keepDim bool) *MeanDimOp

NewMeanDimOp creates a new MeanDimOp.

func (*MeanDimOp) Backward ¶ added in v0.3.0

func (op *MeanDimOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for mean reduction.

The gradient flows by broadcasting grad_output to match input shape, then dividing by the size of the reduced dimension.

func (*MeanDimOp) Inputs ¶ added in v0.3.0

func (op *MeanDimOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [x].

func (*MeanDimOp) Output ¶ added in v0.3.0

func (op *MeanDimOp) Output() *tensor.RawTensor

Output returns the output tensor mean(x, dim).

type MulOp ¶

type MulOp struct {
	// contains filtered or unexported fields
}

MulOp represents an element-wise multiplication operation: output = a * b.

Backward pass:

d(a*b)/da = b, so grad_a = outputGrad * b
d(a*b)/db = a, so grad_b = outputGrad * a

func NewMulOp ¶

func NewMulOp(a, b, output *tensor.RawTensor) *MulOp

NewMulOp creates a new MulOp.

func (*MulOp) Backward ¶

func (op *MulOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for multiplication.

func (*MulOp) Inputs ¶

func (op *MulOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*MulOp) Output ¶

func (op *MulOp) Output() *tensor.RawTensor

Output returns the output tensor a * b.

type MultiOutputOperation ¶ added in v0.5.0

type MultiOutputOperation interface {
	Operation

	// Outputs returns all output tensors produced by this operation.
	Outputs() []*tensor.RawTensor

	// BackwardMulti computes gradients for inputs given gradients for ALL outputs.
	// This is used instead of Backward for multi-output operations.
	//
	// Example for ChunkOp (splits [a,b,c,d] into [a,b] and [c,d]):
	//   outputGrads: [grad_chunk1, grad_chunk2]
	//   returns: [grad_input] where grad_input = Cat(outputGrads)
	BackwardMulti(outputGrads []*tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor
}

MultiOutputOperation represents an operation that produces multiple outputs. Examples: Chunk (splits tensor into multiple parts), Split.

The tape handles these specially by collecting gradients for ALL outputs before calling BackwardMulti.

type Operation ¶

type Operation interface {
	// Backward computes gradients for inputs given the output gradient.
	// Returns a slice of gradients corresponding to each input tensor.
	//
	// Example for AddOp:
	//   inputs: [a, b]
	//   outputGrad: dL/d(a+b)
	//   returns: [dL/d(a+b), dL/d(a+b)] (gradient flows equally to both inputs)
	Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

	// Inputs returns the input tensors for this operation.
	Inputs() []*tensor.RawTensor

	// Output returns the output tensor produced by this operation.
	Output() *tensor.RawTensor
}

Operation represents a differentiable operation in the computation graph. Each operation records its inputs and output during the forward pass, and computes input gradients during the backward pass.

type ReLUOp ¶

type ReLUOp struct {
	// contains filtered or unexported fields
}

ReLUOp represents a ReLU (Rectified Linear Unit) activation: output = max(0, x).

Backward pass:

d(ReLU(x))/dx = 1 if x > 0, else 0

The gradient is computed by creating a mask where input > 0, then multiplying the output gradient by this mask.

func NewReLUOp ¶

func NewReLUOp(input, output *tensor.RawTensor) *ReLUOp

NewReLUOp creates a new ReLUOp.

func (*ReLUOp) Backward ¶

func (op *ReLUOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for ReLU.

func (*ReLUOp) Inputs ¶

func (op *ReLUOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*ReLUOp) Output ¶

func (op *ReLUOp) Output() *tensor.RawTensor

Output returns the output tensor max(0, x).

type ReshapeOp ¶

type ReshapeOp struct {
	// contains filtered or unexported fields
}

ReshapeOp records a reshape operation for autodiff.

Forward: output = Reshape(input, newShape)

Backward:

d_input: Reshape(d_output, input.shape())

Reshape backward is simple: reshape the output gradient back to the original input shape.

func NewReshapeOp ¶

func NewReshapeOp(input, output *tensor.RawTensor) *ReshapeOp

NewReshapeOp creates a new Reshape operation.

func (*ReshapeOp) Backward ¶

func (op *ReshapeOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for Reshape.

The gradient of reshape is simple: reshape the output gradient back to the input shape. No actual computation needed.

func (*ReshapeOp) Inputs ¶

func (op *ReshapeOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*ReshapeOp) Output ¶

func (op *ReshapeOp) Output() *tensor.RawTensor

Output returns the output tensor.

type RsqrtOp ¶ added in v0.3.0

type RsqrtOp struct {
	// contains filtered or unexported fields
}

RsqrtOp represents the reciprocal square root operation: y = 1/sqrt(x).

Backward pass:

d(1/sqrt(x))/dx = -0.5 * x^(-3/2) = -0.5 * (1/sqrt(x))^3 = -0.5 * y^3
grad_input = grad_output * (-0.5) * output^3

func NewRsqrtOp ¶ added in v0.3.0

func NewRsqrtOp(input, output *tensor.RawTensor) *RsqrtOp

NewRsqrtOp creates a new RsqrtOp.

func (*RsqrtOp) Backward ¶ added in v0.3.0

func (op *RsqrtOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for rsqrt.

Since d(1/sqrt(x))/dx = -0.5 * y^3, where y = 1/sqrt(x): grad_input = grad_output * (-0.5) * output^3.

func (*RsqrtOp) Inputs ¶ added in v0.3.0

func (op *RsqrtOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*RsqrtOp) Output ¶ added in v0.3.0

func (op *RsqrtOp) Output() *tensor.RawTensor

Output returns the output tensor 1/sqrt(x).

type SiLUOp ¶ added in v0.3.0

type SiLUOp struct {
	// contains filtered or unexported fields
}

SiLUOp represents the SiLU (Swish) activation operation: y = x * sigmoid(x).

Also known as Swish activation, widely used in modern transformers (LLaMA, Mistral, GPT-Neo).

func NewSiLUOp ¶ added in v0.3.0

func NewSiLUOp(input, output *tensor.RawTensor) *SiLUOp

NewSiLUOp creates a new SiLU operation.

func (*SiLUOp) Backward ¶ added in v0.3.0

func (op *SiLUOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes the gradient for SiLU.

For y = x * sigmoid(x):

dy/dx = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
      = sigmoid(x) * (1 + x * (1 - sigmoid(x)))

We compute the gradient directly for numerical accuracy.

func (*SiLUOp) Inputs ¶ added in v0.3.0

func (op *SiLUOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*SiLUOp) Output ¶ added in v0.3.0

func (op *SiLUOp) Output() *tensor.RawTensor

Output returns the output tensor.

type SigmoidOp ¶

type SigmoidOp struct {
	// contains filtered or unexported fields
}

SigmoidOp represents the sigmoid activation operation: σ(x) = 1 / (1 + exp(-x)).

func NewSigmoidOp ¶

func NewSigmoidOp(input, output *tensor.RawTensor) *SigmoidOp

NewSigmoidOp creates a new sigmoid operation.

func (*SigmoidOp) Backward ¶

func (op *SigmoidOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes the gradient for sigmoid.

For σ(x) = 1 / (1 + exp(-x)): dσ/dx = σ(x) * (1 - σ(x))

Since we have the output σ(x) already computed, we can use it: grad_input = grad_output * output * (1 - output).

func (*SigmoidOp) Inputs ¶

func (op *SigmoidOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*SigmoidOp) Output ¶

func (op *SigmoidOp) Output() *tensor.RawTensor

Output returns the output tensor.

type SinOp ¶ added in v0.3.0

type SinOp struct {
	// contains filtered or unexported fields
}

SinOp represents the sine operation: y = sin(x).

Backward pass:

d(sin(x))/dx = cos(x)
grad_input = grad_output * cos(input)

func NewSinOp ¶ added in v0.3.0

func NewSinOp(input, output *tensor.RawTensor) *SinOp

NewSinOp creates a new SinOp.

func (*SinOp) Backward ¶ added in v0.3.0

func (op *SinOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for sin.

Since d(sin(x))/dx = cos(x): grad_input = grad_output * cos(input).

func (*SinOp) Inputs ¶ added in v0.3.0

func (op *SinOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*SinOp) Output ¶ added in v0.3.0

func (op *SinOp) Output() *tensor.RawTensor

Output returns the output tensor sin(x).

type SoftmaxOp ¶

type SoftmaxOp struct {
	// contains filtered or unexported fields
}

SoftmaxOp represents the softmax operation along a specified dimension.

Forward (for each slice along dim):

softmax(x)_i = exp(x_i - max(x)) / Σ_j exp(x_j - max(x))

The max-shifting ensures numerical stability (prevents overflow).

Backward:

The Jacobian of softmax is:
∂softmax_i/∂x_j = softmax_i * (δ_ij - softmax_j)

Chain rule gives:
∂L/∂x_j = Σ_i (∂L/∂softmax_i) * softmax_i * (δ_ij - softmax_j)
        = softmax_j * (∂L/∂softmax_j - Σ_i (∂L/∂softmax_i * softmax_i))

Simplified formula:
∂L/∂x = y * (upstream_grad - sum(y * upstream_grad, dim=axis, keepdim=True))

Supports:

N-dimensional tensors (2D, 3D, 4D, etc.)
Softmax applied along any dimension (positive or negative indexing)

func NewSoftmaxOp ¶

func NewSoftmaxOp(input, output *tensor.RawTensor, dim int) *SoftmaxOp

NewSoftmaxOp creates a new softmax operation.

func (*SoftmaxOp) Backward ¶

func (op *SoftmaxOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes the gradient with respect to input.

Uses the simplified formula:

∂L/∂x = y * (upstream_grad - sum(y * upstream_grad, dim=axis, keepdim=True))

Where:

y is the softmax output (op.output)
upstream_grad is the gradient from the next layer
sum is performed along the same dimension as softmax (op.dim)

This formula works for N-dimensional tensors (2D, 3D, 4D, etc.).

func (*SoftmaxOp) Inputs ¶

func (op *SoftmaxOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*SoftmaxOp) Output ¶

func (op *SoftmaxOp) Output() *tensor.RawTensor

Output returns the output tensor.

type SqrtOp ¶ added in v0.3.0

type SqrtOp struct {
	// contains filtered or unexported fields
}

SqrtOp represents the square root operation: y = sqrt(x).

Backward pass:

d(sqrt(x))/dx = 1 / (2 * sqrt(x)) = 0.5 / y
grad_input = grad_output * 0.5 / output

func NewSqrtOp ¶ added in v0.3.0

func NewSqrtOp(input, output *tensor.RawTensor) *SqrtOp

NewSqrtOp creates a new SqrtOp.

func (*SqrtOp) Backward ¶ added in v0.3.0

func (op *SqrtOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for sqrt.

Since d(sqrt(x))/dx = 0.5 / sqrt(x), and we have sqrt(x) as output: grad_input = grad_output * 0.5 / output.

func (*SqrtOp) Inputs ¶ added in v0.3.0

func (op *SqrtOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensor [x].

func (*SqrtOp) Output ¶ added in v0.3.0

func (op *SqrtOp) Output() *tensor.RawTensor

Output returns the output tensor sqrt(x).

type SubOp ¶

type SubOp struct {
	// contains filtered or unexported fields
}

SubOp represents an element-wise subtraction operation: output = a - b.

Backward pass:

d(a-b)/da = 1, so grad_a = outputGrad
d(a-b)/db = -1, so grad_b = -outputGrad

func NewSubOp ¶

func NewSubOp(a, b, output *tensor.RawTensor) *SubOp

NewSubOp creates a new SubOp.

func (*SubOp) Backward ¶

func (op *SubOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for subtraction.

func (*SubOp) Inputs ¶

func (op *SubOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [a, b].

func (*SubOp) Output ¶

func (op *SubOp) Output() *tensor.RawTensor

Output returns the output tensor a - b.

type SumDimOp ¶ added in v0.3.0

type SumDimOp struct {
	// contains filtered or unexported fields
}

SumDimOp represents a reduction sum operation along a dimension: output = sum(x, dim).

Forward:

y = sum(x, dim, keepDim)

Backward:

grad_x = broadcast(grad_y, x.shape)

If keepDim=false, we need to unsqueeze grad_y first to match broadcasting requirements.

func NewSumDimOp ¶ added in v0.3.0

func NewSumDimOp(x, output *tensor.RawTensor, dim int, keepDim bool) *SumDimOp

NewSumDimOp creates a new SumDimOp.

func (*SumDimOp) Backward ¶ added in v0.3.0

func (op *SumDimOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradients for sum reduction.

The gradient flows by broadcasting grad_output to match input shape. Since sum just accumulates values, each input element contributes 1.0 to the output, so the gradient is simply broadcast back.

func (*SumDimOp) Inputs ¶ added in v0.3.0

func (op *SumDimOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors [x].

func (*SumDimOp) Output ¶ added in v0.3.0

func (op *SumDimOp) Output() *tensor.RawTensor

Output returns the output tensor sum(x, dim).

type TanhOp ¶

type TanhOp struct {
	// contains filtered or unexported fields
}

TanhOp represents the hyperbolic tangent activation: tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)).

func NewTanhOp ¶

func NewTanhOp(input, output *tensor.RawTensor) *TanhOp

NewTanhOp creates a new tanh operation.

func (*TanhOp) Backward ¶

func (op *TanhOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes the gradient for tanh.

For tanh(x): d(tanh(x))/dx = 1 - tanh²(x)

Since we have the output tanh(x) already computed: grad_input = grad_output * (1 - output²).

func (*TanhOp) Inputs ¶

func (op *TanhOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*TanhOp) Output ¶

func (op *TanhOp) Output() *tensor.RawTensor

Output returns the output tensor.

type TransposeOp ¶

type TransposeOp struct {
	// contains filtered or unexported fields
}

TransposeOp represents a transpose operation.

Forward:

output = transpose(input, axes)

Backward:

∂L/∂input = transpose(∂L/∂output, inverse_axes)

The gradient of transpose is transpose with inverse axes.

func NewTransposeOp ¶

func NewTransposeOp(input, output *tensor.RawTensor, axes []int) *TransposeOp

NewTransposeOp creates a new TransposeOp.

func (*TransposeOp) Backward ¶

func (op *TransposeOp) Backward(outputGrad *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes input gradient for transpose.

The gradient of transpose is transpose with inverted axes. For example, if forward uses axes [1, 0] (swap), then backward also uses [1, 0].

func (*TransposeOp) Inputs ¶

func (op *TransposeOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors.

func (*TransposeOp) Output ¶

func (op *TransposeOp) Output() *tensor.RawTensor

Output returns the output tensor.

type WhereOp ¶ added in v0.5.0

type WhereOp struct {
	// contains filtered or unexported fields
}

WhereOp represents a conditional selection: output = where(cond, x, y).

Forward: output[i] = x[i] if cond[i] else y[i]

Backward:

grad_x = where(cond, grad_out, 0)
grad_y = where(cond, 0, grad_out)

The condition tensor has no gradient (it's boolean).

func NewWhereOp ¶ added in v0.5.0

func NewWhereOp(condition, x, y, output *tensor.RawTensor) *WhereOp

NewWhereOp creates a new where operation.

func (*WhereOp) Backward ¶ added in v0.5.0

func (op *WhereOp) Backward(gradOutput *tensor.RawTensor, backend tensor.Backend) []*tensor.RawTensor

Backward computes gradients for x and y.

grad_x = where(cond, grad_out, 0)  -- gradient flows only where cond is true
grad_y = where(cond, 0, grad_out)  -- gradient flows only where cond is false

func (*WhereOp) Inputs ¶ added in v0.5.0

func (op *WhereOp) Inputs() []*tensor.RawTensor

Inputs returns the input tensors (x and y). Note: condition is not included as it has no gradient (boolean).

func (*WhereOp) Output ¶ added in v0.5.0

func (op *WhereOp) Output() *tensor.RawTensor

Output returns the output tensor.

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Overview ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func CrossEntropyForward ¶

func Exp ¶

func Log ¶

func Softmax ¶

Types ¶

type AddOp ¶

func NewAddOp ¶

func (*AddOp) Backward ¶

func (*AddOp) Inputs ¶

func (*AddOp) Output ¶

type BatchMatMulOp ¶ added in v0.4.0

func NewBatchMatMulOp ¶ added in v0.4.0

func (*BatchMatMulOp) Backward ¶ added in v0.4.0

func (*BatchMatMulOp) Inputs ¶ added in v0.4.0

func (*BatchMatMulOp) Output ¶ added in v0.4.0

type CatOp ¶ added in v0.5.0

func NewCatOp ¶ added in v0.5.0

func (*CatOp) Backward ¶ added in v0.5.0

func (*CatOp) Inputs ¶ added in v0.5.0

func (*CatOp) Output ¶ added in v0.5.0

type ChunkOp ¶ added in v0.5.0

func NewChunkOp ¶ added in v0.5.0

func (*ChunkOp) Backward ¶ added in v0.5.0

func (*ChunkOp) BackwardMulti ¶ added in v0.5.0

func (*ChunkOp) Inputs ¶ added in v0.5.0

func (*ChunkOp) Output ¶ added in v0.5.0

func (*ChunkOp) Outputs ¶ added in v0.5.0

type Conv2DOp ¶

func NewConv2DOp ¶

func (*Conv2DOp) Backward ¶

func (*Conv2DOp) Inputs ¶

func (*Conv2DOp) Output ¶

type CosOp ¶ added in v0.3.0

func NewCosOp ¶ added in v0.3.0

func (*CosOp) Backward ¶ added in v0.3.0

func (*CosOp) Inputs ¶ added in v0.3.0

func (*CosOp) Output ¶ added in v0.3.0

type CrossEntropyOp ¶

func NewCrossEntropyOp ¶

func (*CrossEntropyOp) Backward ¶

func (*CrossEntropyOp) Inputs ¶

func (*CrossEntropyOp) Output ¶

type DivOp ¶

func NewDivOp ¶

func (*DivOp) Backward ¶

func (*DivOp) Inputs ¶

func (*DivOp) Output ¶

type EmbeddingOp ¶ added in v0.3.0

func NewEmbeddingOp ¶ added in v0.3.0

func (*EmbeddingOp) Backward ¶ added in v0.3.0

func (*EmbeddingOp) Inputs ¶ added in v0.3.0

func (*EmbeddingOp) Output ¶ added in v0.3.0

type ExpOp ¶ added in v0.3.0

func NewExpOp ¶ added in v0.3.0

func (*ExpOp) Backward ¶ added in v0.3.0

func (*ExpOp) Inputs ¶ added in v0.3.0

func (*ExpOp) Output ¶ added in v0.3.0

type GatherOp ¶ added in v0.5.0

func NewGatherOp ¶ added in v0.5.0

func (*GatherOp) Backward ¶ added in v0.5.0

func (*GatherOp) Inputs ¶ added in v0.5.0

func (*GatherOp) Output ¶ added in v0.5.0

type LogOp ¶

func NewLogOp ¶

func (*LogOp) Backward ¶

func (*LogOp) Inputs ¶

func (*LogOp) Output ¶

type LogSoftmaxOp ¶

func NewLogSoftmaxOp ¶

func (*LogSoftmaxOp) Backward ¶

func (*LogSoftmaxOp) Inputs ¶

func (*LogSoftmaxOp) Output ¶

type LogWithEpsilonOp ¶

func NewLogWithEpsilonOp ¶