Documentation ¶
Overview ¶
Package webgpu implements the WebGPU backend for GPU-accelerated tensor operations. It uses go-webgpu (github.com/go-webgpu/webgpu) for zero-CGO WebGPU bindings.
The package also provides embedded WGSL compute shaders for tensor operations.
Index ¶
- func IsAvailable() (available bool)
- func ListAdapters() (adapters []*wgpu.AdapterInfoGo, err error)
- type Backend
- func (b *Backend) AdapterInfo() *wgpu.AdapterInfoGo
- func (b *Backend) Add(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) AddBackwardGPU(_, _, grad *GPUTensor) (*GPUTensor, *GPUTensor)
- func (b *Backend) AddGPU(a, c *GPUTensor) *GPUTensor
- func (b *Backend) AddScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *Backend) And(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Argmax(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *Backend) BatchMatMul(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Cast(x *tensor.RawTensor, dtype tensor.DataType) *tensor.RawTensor
- func (b *Backend) Cat(tensors []*tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *Backend) Chunk(x *tensor.RawTensor, n, dim int) []*tensor.RawTensor
- func (b *Backend) Conv2D(input, kernel *tensor.RawTensor, stride, padding int) *tensor.RawTensor
- func (b *Backend) Conv2DInputBackward(input, kernel, grad *tensor.RawTensor, stride, padding int) *tensor.RawTensor
- func (b *Backend) Conv2DKernelBackward(input, kernel, grad *tensor.RawTensor, stride, padding int) *tensor.RawTensor
- func (b *Backend) Cos(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Device() tensor.Device
- func (b *Backend) Div(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) DivBackwardGPU(a, c, grad *GPUTensor) (*GPUTensor, *GPUTensor)
- func (b *Backend) DivGPU(a, c *GPUTensor) *GPUTensor
- func (b *Backend) DivScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *Backend) Embedding(weight, indices *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Equal(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Exp(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Expand(x *tensor.RawTensor, newShape tensor.Shape) *tensor.RawTensor
- func (b *Backend) FlashAttentionGPU(q, k, v *tensor.RawTensor, scale float32, causal bool, blockSize int) (*tensor.RawTensor, error)
- func (b *Backend) FlushCommands()
- func (b *Backend) FromRawTensor(t *tensor.RawTensor) *GPUTensor
- func (b *Backend) Gather(input *tensor.RawTensor, dim int, indices *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Greater(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) GreaterEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Log(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Lower(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) LowerEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) MatMul(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) MatMulBackwardGPU(a, c, grad *GPUTensor) (*GPUTensor, *GPUTensor)
- func (b *Backend) MatMulGPU(a, c *GPUTensor) *GPUTensor
- func (b *Backend) MaxPool2D(input *tensor.RawTensor, kernelSize, stride int) *tensor.RawTensor
- func (b *Backend) MaxPool2DBackward(input, grad *tensor.RawTensor, maxIndices []int, kernelSize, stride int) *tensor.RawTensor
- func (b *Backend) MeanDim(x *tensor.RawTensor, dim int, keepDim bool) *tensor.RawTensor
- func (b *Backend) MemoryStats() MemoryStats
- func (b *Backend) Mul(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) MulBackwardGPU(a, c, grad *GPUTensor) (*GPUTensor, *GPUTensor)
- func (b *Backend) MulGPU(a, c *GPUTensor) *GPUTensor
- func (b *Backend) MulScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *Backend) Name() string
- func (b *Backend) NewBatch() *CommandBatch
- func (b *Backend) Not(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) NotEqual(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) OnesGPU(shape tensor.Shape, dtype tensor.DataType) *GPUTensor
- func (b *Backend) Or(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) RandGPU(shape tensor.Shape, dtype tensor.DataType) *GPUTensor
- func (b *Backend) ReLU(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) ReLUBackwardGPU(input, grad *GPUTensor) *GPUTensor
- func (b *Backend) ReLUGPU(t *GPUTensor) *GPUTensor
- func (b *Backend) ReadGPUBuffer(bufferPtr unsafe.Pointer, size uint64) ([]byte, error)
- func (b *Backend) Release()
- func (b *Backend) ReleaseGPUBuffer(bufferPtr unsafe.Pointer)
- func (b *Backend) Reshape(t *tensor.RawTensor, newShape tensor.Shape) *tensor.RawTensor
- func (b *Backend) Rsqrt(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) SetLazyMode(enabled bool)
- func (b *Backend) SetMaxBatchSize(size int)
- func (b *Backend) Sigmoid(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) SigmoidBackwardGPU(output, grad *GPUTensor) *GPUTensor
- func (b *Backend) SigmoidGPU(t *GPUTensor) *GPUTensor
- func (b *Backend) Sin(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Softmax(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *Backend) SoftmaxBackwardGPU(output, grad *GPUTensor, dim int) *GPUTensor
- func (b *Backend) SoftmaxGPU(t *GPUTensor, dim int) *GPUTensor
- func (b *Backend) Sqrt(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) Squeeze(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *Backend) Sub(a, other *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) SubBackwardGPU(_, _, grad *GPUTensor) (*GPUTensor, *GPUTensor)
- func (b *Backend) SubGPU(a, c *GPUTensor) *GPUTensor
- func (b *Backend) SubScalar(x *tensor.RawTensor, scalar any) *tensor.RawTensor
- func (b *Backend) Sum(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) SumDim(x *tensor.RawTensor, dim int, keepDim bool) *tensor.RawTensor
- func (b *Backend) SumDimGPU(t *GPUTensor, dim int, keepDim bool) *GPUTensor
- func (b *Backend) Tanh(x *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) TanhBackwardGPU(output, grad *GPUTensor) *GPUTensor
- func (b *Backend) TanhGPU(t *GPUTensor) *GPUTensor
- func (b *Backend) Transpose(t *tensor.RawTensor, axes ...int) *tensor.RawTensor
- func (b *Backend) TransposeGPU(t *GPUTensor, axes ...int) *GPUTensor
- func (b *Backend) Unsqueeze(x *tensor.RawTensor, dim int) *tensor.RawTensor
- func (b *Backend) UploadTensor(raw *tensor.RawTensor) *GPUTensor
- func (b *Backend) Where(condition, x, y *tensor.RawTensor) *tensor.RawTensor
- func (b *Backend) ZerosGPU(shape tensor.Shape, dtype tensor.DataType) *GPUTensor
- type BufferPool
- type BufferSize
- type CommandBatch
- type GPUTape
- func (tape *GPUTape) Backward(loss *GPUTensor) map[*GPUTensor]*GPUTensor
- func (tape *GPUTape) Clear()
- func (tape *GPUTape) Disable()
- func (tape *GPUTape) Enable()
- func (tape *GPUTape) IsEnabled() bool
- func (tape *GPUTape) NumOps() int
- func (tape *GPUTape) Record(name string, inputs []*GPUTensor, output *GPUTensor, ...)
- type GPUTensor
- func (t *GPUTensor) Backward()
- func (t *GPUTensor) Buffer() *wgpu.Buffer
- func (t *GPUTensor) ByteSize() uint64
- func (t *GPUTensor) DType() tensor.DataType
- func (t *GPUTensor) Eval() *GPUTensor
- func (t *GPUTensor) Grad() *GPUTensor
- func (t *GPUTensor) Item() float32
- func (t *GPUTensor) NumElements() int
- func (t *GPUTensor) Release()
- func (t *GPUTensor) RequiresGrad() bool
- func (t *GPUTensor) SetRequiresGrad(requires bool) *GPUTensor
- func (t *GPUTensor) Shape() tensor.Shape
- func (t *GPUTensor) ToCPU() *tensor.RawTensor
- func (t *GPUTensor) ZeroGrad()
- type MemoryStats
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func IsAvailable ¶
func IsAvailable() (available bool)
IsAvailable checks if WebGPU is available on this system.
func ListAdapters ¶
func ListAdapters() (adapters []*wgpu.AdapterInfoGo, err error)
ListAdapters returns information about all available GPU adapters.
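A minimal availability probe combining both functions. This is a sketch: the import path is a placeholder (the module path is not shown on this page), and AdapterInfoGo field names come from go-webgpu, so the whole struct is printed rather than assuming them:

    package main

    import (
        "fmt"
        "log"

        webgpu "example.com/yourapp/backend/webgpu" // placeholder import path
    )

    func main() {
        if !webgpu.IsAvailable() {
            log.Fatal("WebGPU is not available on this system")
        }
        adapters, err := webgpu.ListAdapters()
        if err != nil {
            log.Fatalf("listing adapters: %v", err)
        }
        for i, info := range adapters {
            // Print the full struct; field names are go-webgpu's, not documented here.
            fmt.Printf("adapter %d: %+v\n", i, info)
        }
    }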
Types ¶
type Backend ¶
type Backend struct {
    // Lazy mode: when true, operations return lazy tensors that keep data on GPU
    // until Data() is explicitly called. This is the key optimization for
    // Phase 3 Integration - eliminates the readBuffer() bottleneck.
    // Default: true for optimal performance.
    LazyMode bool
    // contains filtered or unexported fields
}
Backend implements tensor operations on GPU using WebGPU.
func New ¶
New creates a new WebGPU backend. Returns an error if WebGPU is not available or initialization fails.
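A typical lifecycle sketch, assuming New takes no arguments and returns (*Backend, error) as its doc comment implies (the signature itself is not shown on this page):

    b, err := webgpu.New()
    if err != nil {
        log.Fatalf("initializing WebGPU backend: %v", err)
    }
    defer b.Release() // required: frees all WebGPU resources

    fmt.Printf("running on: %+v\n", b.AdapterInfo())
    fmt.Println("backend:", b.Name(), "device:", b.Device())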
func (*Backend) AdapterInfo ¶
func (b *Backend) AdapterInfo() *wgpu.AdapterInfoGo
AdapterInfo returns information about the GPU adapter.
func (*Backend) Add ¶
Add performs element-wise addition on GPU. Supports float32 and int32 dtypes. In LazyMode (default), returns a lazy tensor that keeps data on GPU.
func (*Backend) AddBackwardGPU ¶ added in v0.6.0
AddBackwardGPU computes gradients for element-wise addition. d(a+b)/da = 1, d(a+b)/db = 1.
func (*Backend) AddGPU ¶ added in v0.6.0
AddGPU performs element-wise addition on GPU tensors. Data stays on GPU - no CPU transfer occurs.
func (*Backend) And ¶ added in v0.3.0
And performs element-wise logical AND on GPU. Supports mixed dtypes by casting to float32 (for boolean tensors from different sources).
func (*Backend) Argmax ¶ added in v0.3.0
Argmax returns indices of maximum values along dimension on GPU.
func (*Backend) BatchMatMul ¶ added in v0.4.0
BatchMatMul performs batched matrix multiplication on GPU. Supports 3D tensors [batch, M, K] @ [batch, K, N] -> [batch, M, N] and 4D tensors [batch, heads, M, K] @ [batch, heads, K, N].
func (*Backend) Cast ¶ added in v0.3.0
Cast converts tensor to different data type. Supports float32 and int32 as target types.
func (*Backend) Chunk ¶ added in v0.3.0
Chunk splits tensor into n equal parts along the specified dimension.
func (*Backend) Conv2D ¶
Conv2D performs 2D convolution on GPU. Input shape: [batch, in_channels, height, width]. Kernel shape: [out_channels, in_channels, kH, kW].
func (*Backend) Conv2DInputBackward ¶ added in v0.7.1
func (b *Backend) Conv2DInputBackward(input, kernel, grad *tensor.RawTensor, stride, padding int) *tensor.RawTensor
Conv2DInputBackward computes gradient with respect to input for Conv2D. Not yet implemented for WebGPU backend.
func (*Backend) Conv2DKernelBackward ¶ added in v0.7.1
func (b *Backend) Conv2DKernelBackward(input, kernel, grad *tensor.RawTensor, stride, padding int) *tensor.RawTensor
Conv2DKernelBackward computes gradient with respect to kernel for Conv2D. Not yet implemented for WebGPU backend.
func (*Backend) Div ¶
Div performs element-wise division on GPU. Supports float32 and int32 dtypes. In LazyMode (default), returns a lazy tensor that keeps data on GPU.
func (*Backend) DivBackwardGPU ¶ added in v0.6.0
DivBackwardGPU computes gradients for element-wise division. d(a/b)/da = 1/b, d(a/b)/db = -a/b^2.
func (*Backend) DivGPU ¶ added in v0.6.0
DivGPU performs element-wise division on GPU tensors. Data stays on GPU - no CPU transfer occurs.
func (*Backend) Embedding ¶ added in v0.5.1
Embedding performs embedding lookup on GPU. weight: [num_embeddings, embedding_dim], indices: int32 tensor. Returns: [...indices_shape, embedding_dim].
func (*Backend) Equal ¶ added in v0.3.0
Equal performs element-wise equality comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) Expand ¶ added in v0.3.0
Expand broadcasts tensor to new shape. GPU-accelerated for up to 6D tensors.
func (*Backend) FlashAttentionGPU ¶ added in v0.7.0
func (b *Backend) FlashAttentionGPU(q, k, v *tensor.RawTensor, scale float32, causal bool, blockSize int) (*tensor.RawTensor, error)
FlashAttentionGPU executes Flash Attention 2 on GPU using WebGPU.
This implementation uses tiled computation with online softmax to achieve O(N) memory complexity instead of O(N²) for standard attention.
Parameters:
- q: Query tensor [batch, seqLen, numHeads, headDim]
- k: Key tensor [batch, kvLen, numHeads, headDim]
- v: Value tensor [batch, kvLen, numHeads, headDim]
- scale: Attention scale factor (typically 1/sqrt(headDim))
- causal: Whether to apply causal masking
- blockSize: Tile size for blocked computation (64 or 128)
Returns:
- *tensor.RawTensor: Output tensor [batch, seqLen, numHeads, headDim]
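A hedged call sketch. newRandQKV is a hypothetical helper standing in for the tensor package's constructors, which this page does not document; everything else uses the signature above:

    batch, seqLen, numHeads, headDim := 1, 128, 8, 64
    q := newRandQKV(batch, seqLen, numHeads, headDim) // hypothetical helper
    k := newRandQKV(batch, seqLen, numHeads, headDim)
    v := newRandQKV(batch, seqLen, numHeads, headDim)

    scale := float32(1.0 / math.Sqrt(float64(headDim))) // the typical 1/sqrt(headDim)
    out, err := b.FlashAttentionGPU(q, k, v, scale, true, 64) // causal, blockSize=64
    if err != nil {
        log.Fatalf("flash attention: %v", err)
    }
    _ = out // [batch, seqLen, numHeads, headDim]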
func (*Backend) FlushCommands ¶ added in v0.6.0
func (b *Backend) FlushCommands()
FlushCommands submits all pending command buffers to the GPU queue. Call this when you need to ensure all queued operations are executed. Note: This is called automatically before reading data from GPU buffers.
func (*Backend) FromRawTensor ¶ added in v0.6.0
FromRawTensor uploads a CPU tensor to GPU memory. This creates a new GPUTensor with data copied from the RawTensor.
func (*Backend) Gather ¶ added in v0.3.0
func (b *Backend) Gather(input *tensor.RawTensor, dim int, indices *tensor.RawTensor) *tensor.RawTensor
Gather selects elements along dim using index tensor on GPU.
func (*Backend) Greater ¶ added in v0.3.0
Greater performs element-wise greater-than comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) GreaterEqual ¶ added in v0.3.0
GreaterEqual performs element-wise greater-or-equal comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) Lower ¶ added in v0.3.0
Lower performs element-wise less-than comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) LowerEqual ¶ added in v0.3.0
LowerEqual performs element-wise less-or-equal comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) MatMulBackwardGPU ¶ added in v0.6.0
MatMulBackwardGPU computes gradients for matrix multiplication. d(A@B)/dA = grad@B^T, d(A@B)/dB = A^T@grad.
func (*Backend) MatMulGPU ¶ added in v0.6.0
MatMulGPU performs matrix multiplication on GPU tensors. Data stays on GPU - no CPU transfer occurs.
func (*Backend) MaxPool2D ¶
MaxPool2D performs 2D max pooling on GPU. Input shape: [batch, channels, height, width].
func (*Backend) MaxPool2DBackward ¶ added in v0.7.1
func (b *Backend) MaxPool2DBackward(input, grad *tensor.RawTensor, maxIndices []int, kernelSize, stride int) *tensor.RawTensor
MaxPool2DBackward computes gradient with respect to input for MaxPool2D. Not yet implemented for WebGPU backend.
func (*Backend) MemoryStats ¶
func (b *Backend) MemoryStats() MemoryStats
MemoryStats returns current GPU memory usage statistics.
func (*Backend) Mul ¶
Mul performs element-wise multiplication on GPU. Supports float32 and int32 dtypes. In LazyMode (default), returns a lazy tensor that keeps data on GPU.
func (*Backend) MulBackwardGPU ¶ added in v0.6.0
MulBackwardGPU computes gradients for element-wise multiplication. d(a*b)/da = b, d(a*b)/db = a.
func (*Backend) MulGPU ¶ added in v0.6.0
MulGPU performs element-wise multiplication on GPU tensors. Data stays on GPU - no CPU transfer occurs.
func (*Backend) MulScalar ¶ added in v0.3.0
MulScalar multiplies tensor elements by a scalar on GPU.
func (*Backend) NewBatch ¶ added in v0.6.0
func (b *Backend) NewBatch() *CommandBatch
NewBatch creates a new command batch for accumulating operations. The batch will use a single CommandEncoder for all operations.
func (*Backend) NotEqual ¶ added in v0.3.0
NotEqual performs element-wise inequality comparison on GPU. Always returns float32 tensor (0.0 for false, 1.0 for true).
func (*Backend) OnesGPU ¶ added in v0.6.0
OnesGPU creates a GPU tensor filled with ones. Data is initialized to ones on CPU then uploaded to GPU.
func (*Backend) Or ¶ added in v0.3.0
Or performs element-wise logical OR on GPU. Supports mixed dtypes by casting to float32 (for boolean tensors from different sources).
func (*Backend) RandGPU ¶ added in v0.6.0
RandGPU creates a random GPU tensor with uniform distribution [0, 1). Data is generated on CPU using math/rand then uploaded to GPU.
func (*Backend) ReLUBackwardGPU ¶ added in v0.6.0
ReLUBackwardGPU computes gradients for ReLU activation. d(ReLU(x))/dx = 1 if x > 0, else 0. grad_input = grad * (input > 0).
func (*Backend) ReLUGPU ¶ added in v0.6.0
ReLUGPU applies ReLU activation on GPU: max(0, x). Data stays on GPU - no CPU transfer occurs.
func (*Backend) ReadGPUBuffer ¶ added in v0.6.0
ReadGPUBuffer implements tensor.LazyBackend interface. Reads data from a GPU buffer to CPU memory. bufferPtr must be *wgpu.Buffer.
func (*Backend) Release ¶
func (b *Backend) Release()
Release releases all WebGPU resources. Must be called when the backend is no longer needed.
func (*Backend) ReleaseGPUBuffer ¶ added in v0.6.0
ReleaseGPUBuffer implements tensor.LazyBackend interface. Releases a GPU buffer when no longer needed. bufferPtr must be *wgpu.Buffer.
func (*Backend) Reshape ¶
Reshape returns a tensor with new shape. This is typically a metadata-only operation (zero-copy).
func (*Backend) SetLazyMode ¶ added in v0.6.0
SetLazyMode enables or disables lazy evaluation mode. When enabled (default), operations return lazy tensors that keep data on GPU until Data() is explicitly called. This dramatically improves performance by eliminating unnecessary GPU→CPU transfers. When disabled, operations immediately transfer results to CPU (slower).
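A sketch of the difference, assuming x and y are *tensor.RawTensor values built elsewhere and that lazy results expose the Data() accessor described above:

    b.SetLazyMode(true) // default
    c := b.Add(x, y) // lazy: result lives in a GPU buffer
    d := b.Mul(c, y) // chains on GPU, no intermediate readback
    _ = d.Data()     // the explicit Data() call triggers the one GPU→CPU transfer

    b.SetLazyMode(false)
    e := b.Add(x, y) // eager: result is copied to CPU immediately (slower)
    _ = e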
func (*Backend) SetMaxBatchSize ¶ added in v0.6.0
SetMaxBatchSize sets the maximum number of commands to accumulate before auto-flush. Set to 0 (default) to disable auto-flush limit. Typical values: 32-128 for balanced latency/throughput.
func (*Backend) SigmoidBackwardGPU ¶ added in v0.6.0
SigmoidBackwardGPU computes gradients for sigmoid activation. d(sigmoid(x))/dx = sigmoid(x) * (1 - sigmoid(x)).
func (*Backend) SigmoidGPU ¶ added in v0.6.0
SigmoidGPU applies sigmoid activation on GPU: 1 / (1 + exp(-x)). Data stays on GPU - no CPU transfer occurs.
func (*Backend) Softmax ¶
Softmax applies softmax along the specified dimension. Supports N-dimensional tensors with dim=-1 (last dimension).
func (*Backend) SoftmaxBackwardGPU ¶ added in v0.6.0
SoftmaxBackwardGPU computes gradients for softmax activation. d_input[i] = s[i] * (grad[i] - sum(s * grad)) where s = softmax output.
func (*Backend) SoftmaxGPU ¶ added in v0.6.0
SoftmaxGPU applies softmax activation along the specified dimension. For now, only last dimension (dim=-1) is supported efficiently on GPU. Data stays on GPU - no CPU transfer occurs.
func (*Backend) Squeeze ¶ added in v0.3.0
Squeeze removes a dimension of size 1 at the specified position.
func (*Backend) Sub ¶
Sub performs element-wise subtraction on GPU. Supports float32 and int32 dtypes. In LazyMode (default), returns a lazy tensor that keeps data on GPU.
func (*Backend) SubBackwardGPU ¶ added in v0.6.0
SubBackwardGPU computes gradients for element-wise subtraction. d(a-b)/da = 1, d(a-b)/db = -1.
func (*Backend) SubGPU ¶ added in v0.6.0
SubGPU performs element-wise subtraction on GPU tensors. Data stays on GPU - no CPU transfer occurs.
func (*Backend) SubScalar ¶ added in v0.3.0
SubScalar subtracts a scalar from tensor elements on GPU.
func (*Backend) SumDim ¶ added in v0.3.0
SumDim sums along a dimension. Currently implemented on CPU, since general reduction operations are complex to implement efficiently on GPU.
func (*Backend) SumDimGPU ¶ added in v0.6.0
SumDimGPU computes sum along the last dimension. Input: [batch, dim], Output: [batch].
func (*Backend) TanhBackwardGPU ¶ added in v0.6.0
TanhBackwardGPU computes gradients for tanh activation. d(tanh(x))/dx = 1 - tanh(x)^2.
func (*Backend) TanhGPU ¶ added in v0.6.0
TanhGPU applies tanh activation on GPU. Data stays on GPU - no CPU transfer occurs.
func (*Backend) Transpose ¶
Transpose transposes the tensor by permuting its dimensions. GPU-accelerated for 2D (optimized) and ND tensors (up to 6D).
func (*Backend) TransposeGPU ¶ added in v0.6.0
TransposeGPU transposes a 2D tensor on GPU. Data stays on GPU - no CPU transfer occurs.
func (*Backend) Unsqueeze ¶ added in v0.3.0
Unsqueeze adds a dimension of size 1 at the specified position.
func (*Backend) UploadTensor ¶ added in v0.6.0
UploadTensor uploads a CPU tensor to GPU memory. Returns a GPUTensor that can be used for lazy GPU operations.
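The round trip in miniature, assuming raw is an existing *tensor.RawTensor:

    gt := b.UploadTensor(raw) // CPU → GPU copy
    defer gt.Release()        // GPU buffers are freed explicitly

    act := b.ReLUGPU(gt) // stays on GPU; no transfer
    defer act.Release()

    back := act.ToCPU() // explicit GPU → CPU copy when the result is needed
    _ = back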
type BufferPool ¶
type BufferPool struct {
// contains filtered or unexported fields
}
BufferPool manages GPU buffer reuse to reduce allocation overhead. Buffers are categorized by size and usage flags.
func NewBufferPool ¶
func NewBufferPool(device *wgpu.Device) *BufferPool
NewBufferPool creates a new buffer pool for the given device.
func (*BufferPool) Acquire ¶
func (p *BufferPool) Acquire(size uint64, usage wgpu.BufferUsage) *wgpu.Buffer
Acquire gets a buffer from the pool or creates a new one. Returns a buffer that matches or exceeds the requested size and usage.
func (*BufferPool) Clear ¶
func (p *BufferPool) Clear()
Clear releases all pooled buffers. Should be called when the backend is released.
func (*BufferPool) Release ¶
func (p *BufferPool) Release(buffer *wgpu.Buffer, size uint64, usage wgpu.BufferUsage)
Release returns a buffer to the pool for reuse. If the pool is full, the buffer is immediately released.
func (*BufferPool) Stats ¶
func (p *BufferPool) Stats() (allocated, released, hits, misses uint64, pooledCount int)
Stats returns statistics about buffer pool usage.
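A sketch of the acquire/release cycle. The usage flag is passed in as a parameter so that no go-webgpu constant names are assumed:

    func poolRoundTrip(p *webgpu.BufferPool, usage wgpu.BufferUsage) {
        buf := p.Acquire(1024, usage) // at least 1 KiB, matching usage
        // ... encode GPU work against buf ...
        p.Release(buf, 1024, usage) // return the buffer to the pool for reuse

        allocated, released, hits, misses, pooled := p.Stats()
        fmt.Printf("alloc=%d released=%d hits=%d misses=%d pooled=%d\n",
            allocated, released, hits, misses, pooled)
    }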
type BufferSize ¶
type BufferSize int
BufferSize represents different buffer size categories for pooling.
const (
    // SmallBuffer for tensors < 4KB.
    SmallBuffer BufferSize = iota
    // MediumBuffer for tensors 4KB-1MB.
    MediumBuffer
    // LargeBuffer for tensors > 1MB.
    LargeBuffer
)
type CommandBatch ¶ added in v0.6.0
type CommandBatch struct {
// contains filtered or unexported fields
}
CommandBatch accumulates GPU operations for single submission. Instead of submitting each operation separately (causing GPU overhead), we collect all operations in a batch and submit them together.
func (*CommandBatch) Add ¶ added in v0.6.0
func (batch *CommandBatch) Add(name string, output *GPUTensor, execFunc func()) *CommandBatch
Add adds an operation to the batch. The operation function should encode the compute pass but NOT submit it. Returns the batch for method chaining.
func (*CommandBatch) Count ¶ added in v0.6.0
func (batch *CommandBatch) Count() int
Count returns the number of operations in the batch.
func (*CommandBatch) Submit ¶ added in v0.6.0
func (batch *CommandBatch) Submit()
Submit executes all batched operations in a single GPU submission. This dramatically reduces GPU overhead compared to submitting each operation separately.
Example performance difference:
    3 separate submissions: encode → submit → wait (×3)              = ~1.5ms overhead
    1 batched submission:   encode → encode → encode → submit → wait = ~0.5ms overhead
The batch is consumed after Submit() and cannot be reused.
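A batching sketch, assuming out1 and out2 are pre-allocated *GPUTensor outputs and each closure only encodes its compute pass:

    batch := b.NewBatch()
    batch.Add("step1", out1, func() { /* encode pass writing to out1; do not submit */ }).
        Add("step2", out2, func() { /* encode pass writing to out2; do not submit */ })
    fmt.Println("queued ops:", batch.Count()) // 2
    batch.Submit() // one GPU submission for all queued work; the batch is now consumed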
type GPUTape ¶ added in v0.6.0
type GPUTape struct {
// contains filtered or unexported fields
}
GPUTape records GPU operations for backward pass. All operations and gradients stay on GPU for maximum performance.
func NewGPUTape ¶ added in v0.6.0
NewGPUTape creates a new gradient tape for GPU operations.
func (*GPUTape) Backward ¶ added in v0.6.0
Backward computes gradients for all inputs by walking the tape in reverse. All operations stay on GPU - no CPU transfers occur.
Algorithm:
- Start with loss gradient (typically ones for scalar loss)
- Walk operations in reverse order
- For each operation, compute input gradients using chain rule
- Accumulate gradients when the same tensor is used multiple times
Returns a map from GPUTensor to its accumulated gradient (also GPUTensor).
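A lifecycle sketch, assuming NewGPUTape takes no arguments (its signature is not shown here) and that loss is a scalar *GPUTensor produced by recorded operations:

    tape := webgpu.NewGPUTape()
    tape.Enable() // start recording

    // ... forward pass: GPU operations are recorded onto the tape ...

    grads := tape.Backward(loss) // map from input tensor to accumulated gradient
    for in, g := range grads {
        fmt.Printf("grad for tensor of shape %v: shape %v\n", in.Shape(), g.Shape())
    }
    tape.Clear() // drop recorded ops; the enabled/disabled state is preserved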
func (*GPUTape) Clear ¶ added in v0.6.0
func (tape *GPUTape) Clear()
Clear resets the tape, removing all recorded operations. Recording state is preserved.
func (*GPUTape) Disable ¶ added in v0.6.0
func (tape *GPUTape) Disable()
Disable disables operation recording.
func (*GPUTape) Enable ¶ added in v0.6.0
func (tape *GPUTape) Enable()
Enable enables operation recording.
func (*GPUTape) IsEnabled ¶ added in v0.6.0
IsEnabled returns true if the tape is currently recording operations.
type GPUTensor ¶ added in v0.6.0
type GPUTensor struct {
// contains filtered or unexported fields
}
GPUTensor holds tensor data in GPU memory without transferring to CPU. This enables efficient GPU-to-GPU operations without the overhead of readBuffer() calls.
func (*GPUTensor) Backward ¶ added in v0.6.0
func (t *GPUTensor) Backward()
Backward computes gradients for this tensor. This is a convenience method that creates a gradient of ones and calls tape.Backward().
func (*GPUTensor) Buffer ¶ added in v0.6.0
Buffer returns the underlying GPU buffer. This is exposed for internal backend operations.
func (*GPUTensor) Eval ¶ added in v0.6.0
Eval forces computation of lazy tensor using batched submission. Collects all dependencies and submits them in a single GPU command buffer. This reduces GPU overhead compared to submitting each operation separately.
func (*GPUTensor) Grad ¶ added in v0.6.0
Grad returns the accumulated gradient for this tensor. Returns nil if no gradient has been computed.
func (*GPUTensor) Item ¶ added in v0.6.0
Item returns the single scalar value from a tensor. This is useful for extracting loss values during training. Panics if tensor has more than one element.
func (*GPUTensor) NumElements ¶ added in v0.6.0
NumElements returns the total number of elements in the tensor.
func (*GPUTensor) Release ¶ added in v0.6.0
func (t *GPUTensor) Release()
Release releases the GPU buffer and frees memory. This should be called when the tensor is no longer needed.
func (*GPUTensor) RequiresGrad ¶ added in v0.6.0
RequiresGrad returns whether this tensor requires gradient computation.
func (*GPUTensor) SetRequiresGrad ¶ added in v0.6.0
SetRequiresGrad sets whether this tensor requires gradient computation. Returns the tensor for method chaining. Note: PyTorch uses requires_grad_ (underscore suffix for in-place). In Go, we use SetRequiresGrad for clarity.
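An end-to-end sketch of the GPU-resident autograd path. tensor.Float32 and the tensor.Shape literal are assumed forms of the tensor package's API, which this page does not document:

    x := b.OnesGPU(tensor.Shape{1, 8}, tensor.Float32).SetRequiresGrad(true)
    defer x.Release()

    h := b.TanhGPU(x)                 // forward op; data stays on GPU
    loss := b.SumDimGPU(h, -1, false) // [1, 8] -> [1] reduction over the last dim
    loss.Backward()                   // seeds with ones and walks the tape

    fmt.Println("loss =", loss.Item()) // scalar readback; panics if >1 element
    if g := x.Grad(); g != nil {
        fmt.Println("dL/dx shape:", g.Shape())
    }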
type MemoryStats ¶
type MemoryStats struct {
    // Total bytes allocated since backend creation
    TotalAllocatedBytes uint64
    // Peak memory usage in bytes
    PeakMemoryBytes uint64
    // Number of currently active buffers
    ActiveBuffers int64
    // Buffer pool statistics
    PoolAllocated uint64
    PoolReleased  uint64
    PoolHits      uint64
    PoolMisses    uint64
    PooledBuffers int
}
MemoryStats represents GPU memory usage statistics.
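Reading the counters is direct, since all fields are exported:

    stats := b.MemoryStats()
    fmt.Printf("allocated=%dB peak=%dB active=%d\n",
        stats.TotalAllocatedBytes, stats.PeakMemoryBytes, stats.ActiveBuffers)
    fmt.Printf("pool: alloc=%d released=%d hits=%d misses=%d pooled=%d\n",
        stats.PoolAllocated, stats.PoolReleased, stats.PoolHits, stats.PoolMisses,
        stats.PooledBuffers)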