Documentation
¶
Overview ¶
Package httpclient provides a typed Go client for consuming the go-llama REST API.
Create a client with:
client, err := httpclient.New("http://localhost:8080/api/gollama")
if err != nil {
panic(err)
}
Then use the client to manage models and perform inference:
// List all models
models, err := client.ListModels(ctx)
// Get a specific model
model, err := client.GetModel(ctx, "llama-7b")
// Download a model from URL with progress
model, err := client.PullModel(ctx, "hf://microsoft/DialoGPT-medium",
httpclient.WithProgressCallback(func(filename string, received, total uint64) error {
if total > 0 {
pct := float64(received) * 100.0 / float64(total)
fmt.Printf("Downloading %s: %.1f%%\n", filename, pct)
}
return nil
}))
// Load a model into memory
model, err := client.LoadModel(ctx, "llama-7b",
httpclient.WithGpu(0),
httpclient.WithLayers(32))
// Unload a model from memory
model, err := client.UnloadModel(ctx, "llama-7b")
// Generate text completion
result, err := client.Complete(ctx, "llama-7b", "Once upon a time",
httpclient.WithMaxTokens(100),
httpclient.WithTemperature(0.7))
// Stream completion tokens
result, err := client.Complete(ctx, "llama-7b", "Once upon a time",
httpclient.WithChunkCallback(func(chunk *schema.CompletionChunk) error {
fmt.Print(chunk.Text)
return nil
}))
// Generate embeddings
result, err := client.Embed(ctx, "embedding-model", []string{"Hello", "World"})
// Tokenize text
tokens, err := client.Tokenize(ctx, "llama-7b", "Hello, world!")
// Detokenize tokens
result, err := client.Detokenize(ctx, "llama-7b", tokens.Tokens)
Index ¶
- type Client
- func (c *Client) Chat(ctx context.Context, model string, messages []schema.ChatMessage, opts ...Opt) (*schema.ChatResponse, error)
- func (c *Client) Complete(ctx context.Context, model, prompt string, opts ...Opt) (*schema.CompletionResponse, error)
- func (c *Client) DeleteModel(ctx context.Context, id string) error
- func (c *Client) Detokenize(ctx context.Context, model string, tokens []schema.Token, opts ...Opt) (*schema.DetokenizeResponse, error)
- func (c *Client) Embed(ctx context.Context, model string, input []string, opts ...Opt) (*schema.EmbedResponse, error)
- func (c *Client) GetModel(ctx context.Context, id string) (*schema.CachedModel, error)
- func (c *Client) ListModels(ctx context.Context) ([]*schema.CachedModel, error)
- func (c *Client) LoadModel(ctx context.Context, name string, opts ...Opt) (*schema.CachedModel, error)
- func (c *Client) PullModel(ctx context.Context, url string, opts ...Opt) (*schema.CachedModel, error)
- func (c *Client) Tokenize(ctx context.Context, model, text string, opts ...Opt) (*schema.TokenizeResponse, error)
- func (c *Client) UnloadModel(ctx context.Context, id string) (*schema.CachedModel, error)
- type Opt
- func WithAddSpecial(addSpecial bool) Opt
- func WithChatChunkCallback(callback func(*schema.ChatChunk) error) Opt
- func WithChunkCallback(callback func(*schema.CompletionChunk) error) Opt
- func WithGpu(gpu int32) Opt
- func WithLayers(layers int32) Opt
- func WithMaxTokens(maxTokens int32) Opt
- func WithMlock(mlock bool) Opt
- func WithMmap(mmap bool) Opt
- func WithNormalize(normalize bool) Opt
- func WithParseSpecial(parseSpecial bool) Opt
- func WithPrefixCache(prefixCache bool) Opt
- func WithProgressCallback(callback func(filename string, bytesReceived, totalBytes uint64) error) Opt
- func WithRemoveSpecial(removeSpecial bool) Opt
- func WithRepeatLastN(repeatLastN int32) Opt
- func WithRepeatPenalty(repeatPenalty float32) Opt
- func WithSeed(seed uint32) Opt
- func WithStop(stop ...string) Opt
- func WithSystem(system string) Opt
- func WithTemperature(temperature float32) Opt
- func WithTopK(topK int32) Opt
- func WithTopP(topP float32) Opt
- func WithUnparseSpecial(unparseSpecial bool) Opt
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Client ¶
Client is a llama HTTP client that wraps the base HTTP client and provides typed methods for interacting with the llama API.
func New ¶
New creates a new llama HTTP client with the given base URL and options. The url parameter should point to the llama API endpoint, e.g. "http://localhost:8080/api/gollama".
func (*Client) Chat ¶
func (c *Client) Chat(ctx context.Context, model string, messages []schema.ChatMessage, opts ...Opt) (*schema.ChatResponse, error)
Chat generates a response for the given chat messages. Use WithChatChunkCallback to receive streaming chunks as they are generated.
Example:
result, err := client.Chat(ctx, "llama-7b",
	[]schema.ChatMessage{{Role: "user", Content: "What is 2+2?"}},
	httpclient.WithMaxTokens(100),
	httpclient.WithTemperature(0.7))
func (*Client) Complete ¶
func (c *Client) Complete(ctx context.Context, model, prompt string, opts ...Opt) (*schema.CompletionResponse, error)
Complete generates a text completion for the given prompt. Use WithChunkCallback to receive streaming chunks as they are generated.
Example:
result, err := client.Complete(ctx, "llama-7b", "Once upon a time",
httpclient.WithMaxTokens(100),
httpclient.WithTemperature(0.7))
func (*Client) DeleteModel ¶
DeleteModel deletes a model from disk.
func (*Client) Detokenize ¶
func (c *Client) Detokenize(ctx context.Context, model string, tokens []schema.Token, opts ...Opt) (*schema.DetokenizeResponse, error)
Detokenize converts tokens back into text using the specified model.
Example:
result, err := client.Detokenize(ctx, "llama-7b", tokens)
func (*Client) Embed ¶
func (c *Client) Embed(ctx context.Context, model string, input []string, opts ...Opt) (*schema.EmbedResponse, error)
Embed generates embeddings for the given input texts.
Example:
result, err := client.Embed(ctx, "embedding-model", []string{"Hello", "World"})
func (*Client) ListModels ¶
ListModels returns a list of all available models from the llama API.
func (*Client) LoadModel ¶
func (c *Client) LoadModel(ctx context.Context, name string, opts ...Opt) (*schema.CachedModel, error)
LoadModel loads a model into memory with the given options.
func (*Client) PullModel ¶
func (c *Client) PullModel(ctx context.Context, url string, opts ...Opt) (*schema.CachedModel, error)
PullModel downloads and caches a model from a URL. Use WithProgressCallback to receive download progress updates.
Example:
model, err := client.PullModel(ctx, "hf://microsoft/DialoGPT-medium",
httpclient.WithProgressCallback(func(filename string, received, total uint64) error {
if total > 0 {
pct := float64(received) * 100.0 / float64(total)
fmt.Printf("Downloading %s: %.1f%%\n", filename, pct)
}
return nil
}))
func (*Client) Tokenize ¶
func (c *Client) Tokenize(ctx context.Context, model, text string, opts ...Opt) (*schema.TokenizeResponse, error)
Tokenize converts text into tokens using the specified model.
Example:
result, err := client.Tokenize(ctx, "llama-7b", "Hello, world!")
func (*Client) UnloadModel ¶
UnloadModel unloads a model from memory and returns the unloaded model.
type Opt ¶
type Opt func(*opt) error
Opt is an option to set on the client request.
func WithAddSpecial ¶
WithAddSpecial enables or disables adding BOS/EOS tokens during tokenization.
func WithChatChunkCallback ¶
WithChatChunkCallback sets a callback function to receive streaming chat chunks.
func WithChunkCallback ¶
func WithChunkCallback(callback func(*schema.CompletionChunk) error) Opt
WithChunkCallback sets a callback function to receive streaming chunks. This enables streaming support for text completion.
func WithLayers ¶
WithLayers sets the number of layers to offload to GPU. Use -1 to offload all layers.
func WithMaxTokens ¶
WithMaxTokens sets the maximum number of tokens to generate.
func WithNormalize ¶
WithNormalize enables or disables L2 normalization of embeddings.
func WithParseSpecial ¶
WithParseSpecial enables or disables parsing special tokens in input text.
func WithPrefixCache ¶
WithPrefixCache enables or disables prefix caching optimization.
func WithProgressCallback ¶
func WithProgressCallback(callback func(filename string, bytesReceived, totalBytes uint64) error) Opt
WithProgressCallback sets a callback function to receive progress updates. This enables streaming support for model pull operations.
func WithRemoveSpecial ¶
WithRemoveSpecial enables or disables removing BOS/EOS tokens during detokenization.
func WithRepeatLastN ¶
WithRepeatLastN sets the repeat penalty window size.
func WithRepeatPenalty ¶
WithRepeatPenalty sets the repeat penalty (1.0 = disabled).
func WithSystem ¶
WithSystem sets the system message/prompt for chat requests.
func WithTemperature ¶
WithTemperature sets the sampling temperature. Valid range is [0, 2] inclusive.
func WithUnparseSpecial ¶
WithUnparseSpecial enables or disables rendering special tokens as text during detokenization.