Documentation
¶
Overview ¶
Package tensorlake provides a Go SDK for the Tensorlake API.
Tensorlake enables document parsing, structured data extraction, and page classification for various document formats including PDF, DOCX, PPTX, images, and more, as well as cloud sandbox management.
The SDK covers three main areas:
- Document APIs: file management, parsing, extraction, classification, and datasets
- Sandbox APIs: create, manage, snapshot, suspend/resume cloud sandboxes
- Sandbox interaction: file operations, PTY terminal sessions, and process management
Getting Started ¶
Create a client with your API key:
c := tensorlake.NewClient(
tensorlake.WithBaseURL("https://api.your-domain.com"),
tensorlake.WithAPIKey("your-api-key"),
)
Uploading a File ¶
Upload a file to the project:
file, err := os.Open("path/to/your/file.pdf")
if err != nil {
log.Fatal(err)
}
defer file.Close()
r, err := c.UploadFile(context.Background(), &tensorlake.UploadFileRequest{
FileBytes: file,
FileName: "your-file.pdf",
Labels: map[string]string{"category": "label-1", "subcategory": "label-2"},
})
if err != nil {
log.Fatal(err)
}
Parsing a Document ¶
Parse an uploaded file and retrieve the results:
// Start parsing using the file ID from upload
parseJob, err := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
FileSource: tensorlake.FileSource{
FileId: r.FileId,
},
Labels: map[string]string{"type": "invoice"},
})
if err != nil {
log.Fatal(err)
}
// Retrieve parse results with streaming updates
result, err := c.GetParseResult(context.Background(), parseJob.ParseId,
tensorlake.WithSSE(true),
tensorlake.WithOnUpdate(func(eventName string, r *tensorlake.ParseResult) {
log.Printf("Parse status: %s", eventName)
}),
)
if err != nil {
log.Fatal(err)
}
// Access the parsed content
for _, page := range result.Pages {
log.Printf("Page %d: %s", page.PageNumber, page.Markdown)
}
Index ¶
- Constants
- type Chunk
- type ChunkingStrategy
- type ClassifyDocumentRequest
- type Client
- func (c *Client) ClassifyDocument(ctx context.Context, in *ClassifyDocumentRequest) (*ParseJob, error)
- func (c *Client) CloseProcessStdin(ctx context.Context, sandboxID string, pid int32) error
- func (c *Client) ConnectPTY(ctx context.Context, sandboxID, sessionID, token string) (*PTYConn, error)
- func (c *Client) ConnectPTYWithURL(ctx context.Context, wsURL, token string) (*PTYConn, error)
- func (c *Client) CreateDataset(ctx context.Context, in *CreateDatasetRequest) (*CreateDatasetResponse, error)
- func (c *Client) CreatePTY(ctx context.Context, sandboxID string, in *CreatePTYRequest) (*CreatePTYResponse, error)
- func (c *Client) CreateSandbox(ctx context.Context, in *CreateSandboxRequest) (*CreateSandboxResponse, error)
- func (c *Client) DeleteDataset(ctx context.Context, datasetId string) error
- func (c *Client) DeleteFile(ctx context.Context, fileId string) error
- func (c *Client) DeleteParseJob(ctx context.Context, parseId string) error
- func (c *Client) DeleteSandbox(ctx context.Context, sandboxID string) error
- func (c *Client) DeleteSandboxFile(ctx context.Context, sandboxID, path string) error
- func (c *Client) ExtractDocument(ctx context.Context, in *ExtractDocumentRequest) (*ParseJob, error)
- func (c *Client) FollowProcessOutput(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
- func (c *Client) FollowProcessStderr(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
- func (c *Client) FollowProcessStdout(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
- func (c *Client) GetDataset(ctx context.Context, in *GetDatasetRequest) (*Dataset, error)
- func (c *Client) GetFileMetadata(ctx context.Context, fileId string) (*FileInfo, error)
- func (c *Client) GetPTY(ctx context.Context, sandboxID, sessionID string) (*PTYSessionInfo, error)
- func (c *Client) GetParseResult(ctx context.Context, parseId string, opts ...GetParseResultOption) (*ParseResult, error)
- func (c *Client) GetProcess(ctx context.Context, sandboxID string, pid int32) (*ProcessInfo, error)
- func (c *Client) GetProcessOutput(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
- func (c *Client) GetProcessStderr(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
- func (c *Client) GetProcessStdout(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
- func (c *Client) GetSandbox(ctx context.Context, sandboxID string) (*SandboxInfo, error)
- func (c *Client) IterDatasetData(ctx context.Context, datasetId string, batchSize int) iter.Seq2[ParseResult, error]
- func (c *Client) IterDatasets(ctx context.Context, batchSize int) iter.Seq2[Dataset, error]
- func (c *Client) IterFiles(ctx context.Context, batchSize int) iter.Seq2[FileInfo, error]
- func (c *Client) IterParseJobs(ctx context.Context, batchSize int) iter.Seq2[ParseResult, error]
- func (c *Client) KillPTY(ctx context.Context, sandboxID, sessionID string) error
- func (c *Client) KillProcess(ctx context.Context, sandboxID string, pid int32) error
- func (c *Client) ListDatasetData(ctx context.Context, in *ListDatasetDataRequest) (*PaginationResult[ParseResult], error)
- func (c *Client) ListDatasets(ctx context.Context, in *ListDatasetsRequest) (*PaginationResult[Dataset], error)
- func (c *Client) ListFiles(ctx context.Context, in *ListFilesRequest) (*PaginationResult[FileInfo], error)
- func (c *Client) ListPTY(ctx context.Context, sandboxID string) (*PTYListResponse, error)
- func (c *Client) ListParseJobs(ctx context.Context, in *ListParseJobsRequest) (*PaginationResult[ParseResult], error)
- func (c *Client) ListProcesses(ctx context.Context, sandboxID string) (*ProcessListResponse, error)
- func (c *Client) ListSandboxDirectory(ctx context.Context, sandboxID, path string) (*SandboxDirectoryListResponse, error)
- func (c *Client) ListSandboxes(ctx context.Context, in *ListSandboxesRequest) (*ListSandboxesResponse, error)
- func (c *Client) ParseDataset(ctx context.Context, in *ParseDatasetRequest) (*ParseJob, error)
- func (c *Client) ParseDocument(ctx context.Context, in *ParseDocumentRequest) (*ParseJob, error)
- func (c *Client) ReadDocument(ctx context.Context, in *ReadDocumentRequest) (*ParseJob, error)
- func (c *Client) ReadSandboxFile(ctx context.Context, sandboxID, path string) ([]byte, error)
- func (c *Client) ResizePTY(ctx context.Context, sandboxID, sessionID string, in *ResizePTYRequest) error
- func (c *Client) ResumeSandbox(ctx context.Context, sandboxID string) error
- func (c *Client) SignalProcess(ctx context.Context, sandboxID string, pid int32, in *SignalProcessRequest) error
- func (c *Client) SnapshotSandbox(ctx context.Context, sandboxID string, in *SnapshotSandboxRequest) (*SnapshotSandboxResponse, error)
- func (c *Client) StartProcess(ctx context.Context, sandboxID string, in *StartProcessRequest) (*ProcessInfo, error)
- func (c *Client) SuspendSandbox(ctx context.Context, sandboxID string) error
- func (c *Client) UpdateDataset(ctx context.Context, in *UpdateDatasetRequest) (*Dataset, error)
- func (c *Client) UpdateSandbox(ctx context.Context, sandboxID string, in *UpdateSandboxRequest) (*SandboxInfo, error)
- func (c *Client) UploadFile(ctx context.Context, in *UploadFileRequest) (*FileUploadResponse, error)
- func (c *Client) WriteProcessStdin(ctx context.Context, sandboxID string, pid int32, data io.Reader) error
- func (c *Client) WriteSandboxFile(ctx context.Context, sandboxID, path string, content io.Reader) error
- type ContainerResourcesInfo
- type CreateDatasetRequest
- type CreateDatasetResponse
- type CreatePTYRequest
- type CreatePTYResponse
- type CreateSandboxRequest
- type CreateSandboxResponse
- type Dataset
- type DatasetParseJobAnalytics
- type DatasetStatus
- type EnrichmentOptions
- type ErrorCode
- type ErrorResponse
- type ExtractDocumentRequest
- type FileInfo
- type FileSource
- type FileUploadResponse
- type GPUResources
- type GetDatasetRequest
- type GetParseResultOption
- type GetParseResultOptions
- type JobType
- type ListDatasetDataRequest
- type ListDatasetsRequest
- type ListFilesRequest
- type ListParseJobsRequest
- type ListSandboxesRequest
- type ListSandboxesResponse
- type MergeTableActions
- type MergedTable
- type MimeType
- type ModelProvider
- type OCRPipelineProvider
- type Option
- type OutputMode
- type PTYConn
- type PTYListResponse
- type PTYMessage
- type PTYMessageType
- type PTYSessionInfo
- type Page
- type PageClass
- type PageClassConfig
- type PageDimensions
- type PageFragment
- type PageFragmentContent
- type PageFragmentFigure
- type PageFragmentHeader
- type PageFragmentSignature
- type PageFragmentTable
- type PageFragmentTableCell
- type PageFragmentText
- type PageFragmentType
- type PaginationDirection
- type PaginationResult
- type ParseConfiguration
- type ParseDatasetRequest
- type ParseDocumentRequest
- type ParseEventName
- type ParseJob
- type ParseResult
- type ParseResultOptions
- type ParseResultUpdateFunc
- type ParseStatus
- type ParsingOptions
- type PartitionStrategy
- type ProcessInfo
- type ProcessListResponse
- type ProcessOutputEvent
- type ProcessOutputResponse
- type ProcessStatus
- type ReadDocumentRequest
- type ResizePTYRequest
- type SandboxDirectoryEntry
- type SandboxDirectoryListResponse
- type SandboxInfo
- type SandboxNetworkAccessControl
- type SandboxPendingReason
- type SandboxProxyError
- type SandboxResourceOverrides
- type SandboxStatus
- type SignalProcessRequest
- type SnapshotContentMode
- type SnapshotSandboxRequest
- type SnapshotSandboxResponse
- type StartProcessRequest
- type StdinMode
- type StructuredData
- type StructuredExtractionOptions
- type TableOutputMode
- type TableParsingFormat
- type UnionValues
- type UpdateDatasetRequest
- type UpdateSandboxRequest
- type UploadFileRequest
- type Usage
Constants ¶
const (
// EndpointEU is the European endpoint.
EndpointEU string = "https://api.eu.tensorlake.ai/documents/v2"
// EndpointUS is the United States endpoint.
EndpointUS string = "https://api.tensorlake.ai/documents/v2"
)
const (
// DefaultSandboxProxyBaseURL is the default base URL for the sandbox file proxy.
// The sandbox ID is prepended as a subdomain.
DefaultSandboxProxyBaseURL = "https://sandbox.tensorlake.ai"
)
const (
// SandboxAPIBaseURL is the default base URL for sandbox management operations.
SandboxAPIBaseURL = "https://api.tensorlake.ai/sandboxes"
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ChunkingStrategy ¶
type ChunkingStrategy string
ChunkingStrategy determines how the document is chunked into smaller pieces.
Every text block, image, table, etc. is considered a fragment.
const (
// ChunkingStrategyNone: No chunking is applied.
ChunkingStrategyNone ChunkingStrategy = "none"
// ChunkingStrategyPage: The document is chunked by page.
ChunkingStrategyPage ChunkingStrategy = "page"
// ChunkingStrategySection: The document is chunked into sections.
// Title and section headers are used as chunking markers.
ChunkingStrategySection ChunkingStrategy = "section"
// ChunkingStrategyFragment: Each page element is converted into markdown form.
ChunkingStrategyFragment ChunkingStrategy = "fragment"
)
type ClassifyDocumentRequest ¶
type ClassifyDocumentRequest struct {
FileSource
PageClassifications []PageClassConfig `json:"page_classifications"`
PageRange string `json:"page_range,omitempty"`
MimeType string `json:"mime_type,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
// FileName is the name of the file. This is populated when file_id is used.
//
// Optional.
FileName string `json:"file_name,omitempty"`
}
ClassifyDocumentRequest holds options for classifying a document.
type Client ¶
type Client struct {
// contains filtered or unexported fields
}
Client is a Tensorlake API client.
func (*Client) ClassifyDocument ¶
func (c *Client) ClassifyDocument(ctx context.Context, in *ClassifyDocumentRequest) (*ParseJob, error)
ClassifyDocument submits a document for page classification.
See also: Classify Document API Reference
func (*Client) CloseProcessStdin ¶ added in v0.3.0
CloseProcessStdin closes a process's stdin stream (sends EOF).
The process must have been started with StdinMode "pipe".
See also: Close Process Stdin API Reference
func (*Client) ConnectPTY ¶ added in v0.3.0
func (c *Client) ConnectPTY(ctx context.Context, sandboxID, sessionID, token string) (*PTYConn, error)
ConnectPTY opens a WebSocket connection to a PTY session.
The token is obtained from Client.CreatePTY. After connecting, the caller must call PTYConn.Ready to signal readiness before reading output.
See also: PTY WebSocket API Reference
func (*Client) ConnectPTYWithURL ¶ added in v0.3.0
ConnectPTYWithURL opens a WebSocket connection using an explicit base URL. This is primarily for testing with non-standard URLs.
func (*Client) CreateDataset ¶
func (c *Client) CreateDataset(ctx context.Context, in *CreateDatasetRequest) (*CreateDatasetResponse, error)
CreateDataset creates a new dataset.
See also: Create Dataset API Reference
func (*Client) CreatePTY ¶ added in v0.3.0
func (c *Client) CreatePTY(ctx context.Context, sandboxID string, in *CreatePTYRequest) (*CreatePTYResponse, error)
CreatePTY creates a new PTY session in a sandbox.
Returns a session ID and token for WebSocket authentication. The maximum number of concurrent PTY sessions per sandbox is 64.
See also: Create PTY Session API Reference
func (*Client) CreateSandbox ¶ added in v0.3.0
func (c *Client) CreateSandbox(ctx context.Context, in *CreateSandboxRequest) (*CreateSandboxResponse, error)
CreateSandbox creates a new sandbox.
To restore from a snapshot, set SnapshotId in the request.
See also: Create Sandbox API Reference
func (*Client) DeleteDataset ¶
DeleteDataset deletes a dataset from Tensorlake.
See also: Delete Dataset API Reference
func (*Client) DeleteFile ¶
DeleteFile deletes a file from Tensorlake Cloud.
See also: Delete File API Reference
func (*Client) DeleteParseJob ¶
DeleteParseJob deletes a previously submitted parse job. This will remove the parse job and its associated settings from the system. Deleting a parse job does not delete the original file used for parsing, nor does it affect any other parse jobs that may have been created from the same file.
See also: Delete Parse Job API Reference
func (*Client) DeleteSandbox ¶ added in v0.3.0
DeleteSandbox terminates a sandbox.
This operation is idempotent — terminating an already-terminated sandbox returns success.
See also: Delete Sandbox API Reference
func (*Client) DeleteSandboxFile ¶ added in v0.3.0
DeleteSandboxFile deletes a file from a sandbox.
See also: Delete Sandbox File API Reference
func (*Client) ExtractDocument ¶
func (c *Client) ExtractDocument(ctx context.Context, in *ExtractDocumentRequest) (*ParseJob, error)
ExtractDocument submits a document for structured data extraction.
See also: Extract Document API Reference
func (*Client) FollowProcessOutput ¶ added in v0.3.0
func (c *Client) FollowProcessOutput(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
FollowProcessOutput opens an SSE stream that replays captured output (stdout + stderr merged) then streams live output lines until the process exits.
The returned iterator yields ProcessOutputEvent for each line. The Stream field is set to "stdout" or "stderr" to identify the source. Iteration ends when the server sends an "eof" event.
See also: Follow Process Output API Reference
func (*Client) FollowProcessStderr ¶ added in v0.3.0
func (c *Client) FollowProcessStderr(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
FollowProcessStderr opens an SSE stream that replays captured stderr then streams live stderr lines until the process exits.
The returned iterator yields ProcessOutputEvent for each line. The Stream field is not set (stderr-only endpoint). Iteration ends when the server sends an "eof" event.
See also: Follow Process Stderr API Reference
func (*Client) FollowProcessStdout ¶ added in v0.3.0
func (c *Client) FollowProcessStdout(ctx context.Context, sandboxID string, pid int32) iter.Seq2[ProcessOutputEvent, error]
FollowProcessStdout opens an SSE stream that replays captured stdout then streams live stdout lines until the process exits.
The returned iterator yields ProcessOutputEvent for each line. The Stream field is not set (stdout-only endpoint). Iteration ends when the server sends an "eof" event.
See also: Follow Process Stdout API Reference
func (*Client) GetDataset ¶
GetDataset retrieves details for a specific dataset.
See also: Get Dataset API Reference
func (*Client) GetFileMetadata ¶
GetFileMetadata retrieves metadata for a specific file.
See also: Get File Metadata API Reference
func (*Client) GetPTY ¶ added in v0.3.0
GetPTY retrieves details for a specific PTY session.
See also: Get PTY Session API Reference
func (*Client) GetParseResult ¶
func (c *Client) GetParseResult(ctx context.Context, parseId string, opts ...GetParseResultOption) (*ParseResult, error)
GetParseResult retrieves the result of a parse job. The response will include: 1) parsed content (markdown or pages); 2) structured extraction results (if schemas are provided during the parse request); 3) page classification results (if page classifications are provided during the parse request).
When the job finishes successfully, the response will contain pages (chunks of the page), chunks (text chunks extracted from the document), and structured data (every schema_name provided in the parse request as a key).
See also: Get Parse Result API Reference
func (*Client) GetProcess ¶ added in v0.3.0
GetProcess retrieves details for a specific process in a sandbox.
See also: Get Process API Reference
func (*Client) GetProcessOutput ¶ added in v0.3.0
func (c *Client) GetProcessOutput(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
GetProcessOutput retrieves all captured output (stdout + stderr merged) from a process.
See also: Get Process Output API Reference
func (*Client) GetProcessStderr ¶ added in v0.3.0
func (c *Client) GetProcessStderr(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
GetProcessStderr retrieves all captured stderr lines from a process.
See also: Get Process Stderr API Reference
func (*Client) GetProcessStdout ¶ added in v0.3.0
func (c *Client) GetProcessStdout(ctx context.Context, sandboxID string, pid int32) (*ProcessOutputResponse, error)
GetProcessStdout retrieves all captured stdout lines from a process.
See also: Get Process Stdout API Reference
func (*Client) GetSandbox ¶ added in v0.3.0
GetSandbox retrieves details for a specific sandbox.
See also: Get Sandbox API Reference
func (*Client) IterDatasetData ¶
func (c *Client) IterDatasetData(ctx context.Context, datasetId string, batchSize int) iter.Seq2[ParseResult, error]
IterDatasetData iterates over all parse results in the dataset identified by datasetId.
func (*Client) IterDatasets ¶
IterDatasets iterates over all datasets in the organization.
func (*Client) IterParseJobs ¶
IterParseJobs iterates over all parse jobs in the project.
func (*Client) KillPTY ¶ added in v0.3.0
KillPTY terminates a PTY session.
The daemon sends SIGHUP initially, then escalates to SIGKILL if the session persists after a grace period.
See also: Kill PTY Session API Reference
func (*Client) KillProcess ¶ added in v0.3.0
KillProcess kills a process in a sandbox.
See also: Kill Process API Reference
func (*Client) ListDatasetData ¶
func (c *Client) ListDatasetData(ctx context.Context, in *ListDatasetDataRequest) (*PaginationResult[ParseResult], error)
ListDatasetData lists all the parse jobs associated with a specific dataset. This endpoint allows you to retrieve the status and metadata of each parse job that has been submitted under the specified dataset.
See also: List Dataset Data API Reference
func (*Client) ListDatasets ¶
func (c *Client) ListDatasets(ctx context.Context, in *ListDatasetsRequest) (*PaginationResult[Dataset], error)
ListDatasets lists all datasets in the organization.
See also: List Datasets API Reference
func (*Client) ListFiles ¶
func (c *Client) ListFiles(ctx context.Context, in *ListFilesRequest) (*PaginationResult[FileInfo], error)
ListFiles lists files in the Tensorlake project.
This operation allows you to list every file that has been uploaded to the project specified by the API key used in the request. The response will include metadata about each file, such as the file ID, name, size, and type. We use cursor-based pagination to return the files in pages. A page has the following fields:
- Items: An array of file metadata, each containing the fields described below.
- HasMore: A boolean indicating whether there are more files available beyond the current page.
- NextCursor: A base64-encoded cursor for the next page of results. If HasMore is false, this field will be null.
- PrevCursor: A base64-encoded cursor for the previous page of results. If this is the first page, this field will be null.
See also: List Files API Reference
func (*Client) ListPTY ¶ added in v0.3.0
ListPTY lists all PTY sessions in a sandbox.
The PTY token is not included in list responses.
See also: List PTY Sessions API Reference
func (*Client) ListParseJobs ¶
func (c *Client) ListParseJobs(ctx context.Context, in *ListParseJobsRequest) (*PaginationResult[ParseResult], error)
ListParseJobs lists parse jobs in the Tensorlake project.
See also: List Parse Jobs API Reference
func (*Client) ListProcesses ¶ added in v0.3.0
ListProcesses lists all tracked processes in a sandbox.
See also: List Processes API Reference
func (*Client) ListSandboxDirectory ¶ added in v0.3.0
func (c *Client) ListSandboxDirectory(ctx context.Context, sandboxID, path string) (*SandboxDirectoryListResponse, error)
ListSandboxDirectory lists the contents of a directory in a sandbox.
Entries are sorted with directories first, then alphabetically.
See also: List Sandbox Directory API Reference
func (*Client) ListSandboxes ¶ added in v0.3.0
func (c *Client) ListSandboxes(ctx context.Context, in *ListSandboxesRequest) (*ListSandboxesResponse, error)
ListSandboxes lists sandboxes in the project.
See also: List Sandboxes API Reference
func (*Client) ParseDataset ¶
ParseDataset parses a document using a dataset's configuration.
See also: Parse Dataset API Reference
func (*Client) ParseDocument ¶
ParseDocument submits a document for comprehensive parsing (read, extract, and classify).
See also: Parse Document API Reference
func (*Client) ReadDocument ¶
ReadDocument submits an uploaded file, an internet-reachable URL, or any kind of raw text for document parsing. If you have configured a webhook, we will notify you when the job is complete, be it a success or a failure. The API will convert the document into markdown, and provide document layout information. Once submitted, the API will return a parse response with a parse_id field. You can query the status and results of the parse operation with the Get Parse Result endpoint.
See also: Read Document API Reference
func (*Client) ReadSandboxFile ¶ added in v0.3.0
ReadSandboxFile reads a file from a sandbox.
The response is the raw file content as bytes.
See also: Read Sandbox File API Reference
func (*Client) ResizePTY ¶ added in v0.3.0
func (c *Client) ResizePTY(ctx context.Context, sandboxID, sessionID string, in *ResizePTYRequest) error
ResizePTY resizes a PTY session's terminal dimensions.
Rows are clamped to 1..500, cols to 1..1000 server-side.
See also: Resize PTY Session API Reference
func (*Client) ResumeSandbox ¶ added in v0.3.0
ResumeSandbox resumes a suspended sandbox.
Returns nil on success (both 200 already-running and 202 resume-initiated).
See also: Resume Sandbox API Reference
func (*Client) SignalProcess ¶ added in v0.3.0
func (c *Client) SignalProcess(ctx context.Context, sandboxID string, pid int32, in *SignalProcessRequest) error
SignalProcess sends a POSIX signal to a process in a sandbox.
See also: Signal Process API Reference
func (*Client) SnapshotSandbox ¶ added in v0.3.0
func (c *Client) SnapshotSandbox(ctx context.Context, sandboxID string, in *SnapshotSandboxRequest) (*SnapshotSandboxResponse, error)
SnapshotSandbox creates a snapshot of a sandbox.
See also: Snapshot Sandbox API Reference
func (*Client) StartProcess ¶ added in v0.3.0
func (c *Client) StartProcess(ctx context.Context, sandboxID string, in *StartProcessRequest) (*ProcessInfo, error)
StartProcess starts a new process in a sandbox.
See also: Start Process API Reference
func (*Client) SuspendSandbox ¶ added in v0.3.0
SuspendSandbox suspends a named sandbox.
Only named sandboxes can be suspended. Ephemeral sandboxes return an error. Returns nil on success (both 200 already-suspended and 202 suspend-initiated).
See also: Suspend Sandbox API Reference
func (*Client) UpdateDataset ¶
UpdateDataset updates a dataset's settings.
See also: Update Dataset API Reference
func (*Client) UpdateSandbox ¶ added in v0.3.0
func (c *Client) UpdateSandbox(ctx context.Context, sandboxID string, in *UpdateSandboxRequest) (*SandboxInfo, error)
UpdateSandbox updates a sandbox's settings.
See also: Update Sandbox API Reference
func (*Client) UploadFile ¶
func (c *Client) UploadFile(ctx context.Context, in *UploadFileRequest) (*FileUploadResponse, error)
UploadFile uploads a file to Tensorlake Cloud.
The file will be associated with the project specified by the API key used in the request.
The file can be any of the following types:
- PDF
- Word (DOCX)
- Spreadsheets (XLS, XLSX, XLSM, CSV)
- Presentations (PPTX, Apple Keynote)
- Images (PNG, JPG, JPEG)
- Raw text (plain text, HTML)
The file type is automatically detected based on the Content-Type header. If the Content-Type header is not provided, the file extension will be used to infer the type. If the file type cannot be determined, it will default to application/octet-stream.
We only keep one copy of the file, so uploading the same file multiple times will return the same file_id.
Labels ¶
Labels can be added to the file to help categorize the parse jobs associated with it. Labels are key-value pairs that can be used to filter and organize files. These should be provided in a labels text field in the multipart form data. Labels are optional, but they can be very useful for organizing and managing parse jobs.
Limits ¶
There is an upload limit of 1 GB per file.
See also: Upload File API Reference
func (*Client) WriteProcessStdin ¶ added in v0.3.0
func (c *Client) WriteProcessStdin(ctx context.Context, sandboxID string, pid int32, data io.Reader) error
WriteProcessStdin writes raw bytes to a process's stdin.
The process must have been started with StdinMode "pipe".
See also: Write Process Stdin API Reference
func (*Client) WriteSandboxFile ¶ added in v0.3.0
func (c *Client) WriteSandboxFile(ctx context.Context, sandboxID, path string, content io.Reader) error
WriteSandboxFile writes a file to a sandbox.
Parent directories are created automatically if they do not exist. The content is written as raw bytes.
See also: Write Sandbox File API Reference
type ContainerResourcesInfo ¶ added in v0.3.0
type ContainerResourcesInfo struct {
CPUs float64 `json:"cpus"`
MemoryMB int64 `json:"memory_mb"`
EphemeralDiskMB int64 `json:"ephemeral_disk_mb"`
}
ContainerResourcesInfo describes the resource allocation of a sandbox.
type CreateDatasetRequest ¶
type CreateDatasetRequest struct {
// The name of the dataset.
//
// The name can only contain alphanumeric characters, hyphens, and
// underscores.
//
// The name must be unique within the organization and project context.
//
// Example:
// "invoices_dataset"
Name string `json:"name"`
// A description of the dataset.
//
// This field is optional and can be used to provide additional context
// about the dataset.
//
// Example:
// "This dataset contains all invoices from 2023."
Description string `json:"description,omitempty"`
// The properties of this object define the configuration for the document
// parsing process.
//
// Tensorlake provides sane defaults that work well for most
// documents, so this object is not required. However, every document
// is different, and you may want to customize the parsing process to
// better suit your needs.
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
// The properties of this object define the configuration for structured
// data extraction.
//
// If this object is present, the API will perform structured data
// extraction on the document.
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
// The properties of this object define the configuration for page
// classification.
//
// If this object is present, the API will perform page classification on
// the document.
PageClassifications []PageClassConfig `json:"page_classifications,omitempty"`
// The properties of this object help to extend the output of the document
// parsing process with additional information.
//
// This includes summarization of tables and figures, which can help to
// provide a more comprehensive understanding of the document.
//
// This object is not required, and the API will use default settings if it
// is not present.
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
}
CreateDatasetRequest holds options for creating a dataset.
type CreateDatasetResponse ¶
type CreateDatasetResponse struct {
// Name is the name of the dataset.
Name string `json:"name"`
// DatasetId is the ID of the created dataset.
DatasetId string `json:"dataset_id"`
// CreatedAt is the creation date and time of the dataset.
CreatedAt string `json:"created_at"`
}
CreateDatasetResponse represents the response from creating a dataset.
type CreatePTYRequest ¶ added in v0.3.0
type CreatePTYRequest struct {
// Command is the executable to run (e.g. "/bin/bash").
//
// Required.
Command string `json:"command"`
// Args are command-line arguments (e.g. ["-l"]).
Args []string `json:"args,omitempty"`
// Env sets environment variables for the session.
Env map[string]string `json:"env,omitempty"`
// WorkingDir is the initial working directory.
WorkingDir string `json:"working_dir,omitempty"`
// Rows is the terminal height. Default: 24. Clamped to 1..500.
Rows int32 `json:"rows,omitempty"`
// Cols is the terminal width. Default: 80. Clamped to 1..1000.
Cols int32 `json:"cols,omitempty"`
}
CreatePTYRequest holds options for creating a PTY session.
type CreatePTYResponse ¶ added in v0.3.0
type CreatePTYResponse struct {
// SessionId is the unique PTY session identifier.
SessionId string `json:"session_id"`
// Token is used for WebSocket connection authentication.
Token string `json:"token"`
}
CreatePTYResponse represents the response from creating a PTY session.
type CreateSandboxRequest ¶ added in v0.3.0
type CreateSandboxRequest struct {
Name string `json:"name,omitempty"`
Image string `json:"image,omitempty"`
SnapshotId string `json:"snapshot_id,omitempty"`
Entrypoint []string `json:"entrypoint,omitempty"`
TimeoutSecs *int64 `json:"timeout_secs,omitempty"`
SecretNames []string `json:"secret_names,omitempty"`
TemplateId string `json:"template_id,omitempty"`
AllowUnauthenticatedAccess *bool `json:"allow_unauthenticated_access,omitempty"`
ExposedPorts []int32 `json:"exposed_ports,omitempty"`
Resources *SandboxResourceOverrides `json:"resources,omitempty"`
Network *SandboxNetworkAccessControl `json:"network,omitempty"`
}
CreateSandboxRequest holds options for creating a sandbox.
type CreateSandboxResponse ¶ added in v0.3.0
type CreateSandboxResponse struct {
SandboxId string `json:"sandbox_id"`
Status SandboxStatus `json:"status"`
PendingReason SandboxPendingReason `json:"pending_reason,omitempty"`
}
CreateSandboxResponse represents the response from creating a sandbox.
type Dataset ¶
type Dataset struct {
// Name is the name of the dataset.
Name string `json:"name"`
// DatasetId is the identifier of the dataset.
DatasetId string `json:"dataset_id"`
// Description is an optional description of the dataset.
Description string `json:"description,omitempty"`
// Status is the dataset status (see DatasetStatus).
Status DatasetStatus `json:"status"`
// CreatedAt is the creation timestamp, kept as the raw string from the API
// (format not shown here; presumably RFC 3339, TODO confirm).
CreatedAt string `json:"created_at"`
// UpdatedAt is the last-update timestamp, kept as the raw string from the
// API (format not shown here; presumably RFC 3339, TODO confirm).
UpdatedAt string `json:"updated_at"`
// Analytics holds parse job analytics. Nil when not present.
Analytics *DatasetParseJobAnalytics `json:"analytics,omitempty"`
}
Dataset represents a dataset.
type DatasetParseJobAnalytics ¶ added in v0.2.0
type DatasetParseJobAnalytics struct {
// TotalProcessingParseJobs is the count of processing parse jobs.
TotalProcessingParseJobs int `json:"total_processing_parse_jobs"`
// TotalPendingParseJobs is the count of pending parse jobs.
TotalPendingParseJobs int `json:"total_pending_parse_jobs"`
// TotalErrorParseJobs is the count of parse jobs that ended in error.
TotalErrorParseJobs int `json:"total_error_parse_jobs"`
// TotalSuccessfulParseJobs is the count of successful parse jobs.
TotalSuccessfulParseJobs int `json:"total_successful_parse_jobs"`
// TotalJobs is the total job count (presumably the sum of the counts
// above; TODO confirm).
TotalJobs int `json:"total_jobs"`
}
DatasetParseJobAnalytics contains analytics about parse jobs in a dataset.
type DatasetStatus ¶
// DatasetStatus is the status of a dataset, carried as a string on the wire.
type DatasetStatus string

// Known DatasetStatus values.
const (
	// DatasetStatusIdle is the "idle" status.
	DatasetStatusIdle DatasetStatus = "idle"
	// DatasetStatusProcessing is the "processing" status.
	DatasetStatusProcessing DatasetStatus = "processing"
)
type EnrichmentOptions ¶
type EnrichmentOptions struct {
// FigureSummarization enables summary generation for parsed figures.
// The default is false.
FigureSummarization bool `json:"figure_summarization,omitempty"`
// FigureSummarizationPrompt is the prompt to guide the figure summarization.
// If not provided, a default prompt will be used. It is not required to provide a prompt.
// The prompt only has effect if [FigureSummarization] is set to `true`.
FigureSummarizationPrompt string `json:"figure_summarization_prompt,omitempty"`
// TableSummarization enables summary generation for parsed tables.
// The default is false.
TableSummarization bool `json:"table_summarization,omitempty"`
// TableSummarizationPrompt is the prompt to guide the table summarization.
// If not provided, a default prompt will be used. It is not required to provide a prompt.
// The prompt only has effect if [TableSummarization] is set to `true`.
TableSummarizationPrompt string `json:"table_summarization_prompt,omitempty"`
// IncludeFullPageImage includes the full page image in addition to the cropped table and figure images.
// This provides Language Models context about the table and figure they are summarizing in addition to the cropped images, and could improve the summarization quality.
// The default is false.
IncludeFullPageImage bool `json:"include_full_page_image,omitempty"`
// TableCellGrounding enables grounding of table cells with bounding boxes.
// The default is false.
TableCellGrounding bool `json:"table_cell_grounding,omitempty"`
// ChartExtraction enables extraction of data from charts.
// The default is false.
ChartExtraction bool `json:"chart_extraction,omitempty"`
// KeyValueExtraction enables extraction of key-value pairs.
// The default is false.
KeyValueExtraction bool `json:"key_value_extraction,omitempty"`
}
EnrichmentOptions holds configuration for document enrichment.
type ErrorCode ¶
type ErrorCode string
ErrorCode represents error codes for the Document AI API.
These codes are used to identify specific error conditions in the API. They can be used for programmatic handling of errors.
// Known ErrorCode values. Each constant's wire value is its string literal.
const (
	ErrorCodeQuotaExceeded ErrorCode = "QUOTA_EXCEEDED"
	ErrorCodeInvalidJSONSchema ErrorCode = "INVALID_JSON_SCHEMA"
	ErrorCodeInvalidConfiguration ErrorCode = "INVALID_CONFIGURATION"
	// NOTE(review): the Go name says "PageClass" while the wire value says
	// "PAGE_CLASSIFICATION"; match on the constant, not on a guessed string.
	ErrorCodeInvalidPageClass ErrorCode = "INVALID_PAGE_CLASSIFICATION"
	ErrorCodeEntityNotFound ErrorCode = "ENTITY_NOT_FOUND"
	ErrorCodeEntityAlreadyExists ErrorCode = "ENTITY_ALREADY_EXISTS"
	ErrorCodeInvalidFile ErrorCode = "INVALID_FILE"
	ErrorCodeInvalidPageRange ErrorCode = "INVALID_PAGE_RANGE"
	ErrorCodeInvalidMimeType ErrorCode = "INVALID_MIME_TYPE"
	ErrorCodeInvalidDatasetName ErrorCode = "INVALID_DATASET_NAME"
	ErrorCodeInternalError ErrorCode = "INTERNAL_ERROR"
	ErrorCodeInvalidMultipart ErrorCode = "INVALID_MULTIPART"
	ErrorCodeMultipartStreamEnd ErrorCode = "MULTIPART_STREAM_END"
	ErrorCodeInvalidQueryParams ErrorCode = "INVALID_QUERY_PARAMS"
	ErrorCodeInvalidJobState ErrorCode = "INVALID_JOB_STATE"
	ErrorCodeClientDisconnect ErrorCode = "CLIENT_DISCONNECT"
	ErrorCodeInvalidID ErrorCode = "INVALID_ID"
)
type ErrorResponse ¶
type ErrorResponse struct {
// Message is a human-readable error message.
Message string `json:"message"`
// Code is the error code for programmatic handling.
Code ErrorCode `json:"code"`
// Timestamp is the Unix epoch timestamp in milliseconds when the error occurred.
Timestamp int64 `json:"timestamp,omitempty"`
// TraceId is the trace ID of the error.
TraceId string `json:"trace_id,omitempty"`
// Details is the details of the error.
Details any `json:"details,omitempty"`
}
ErrorResponse represents an error returned by the Tensorlake API.
func (*ErrorResponse) Error ¶
func (e *ErrorResponse) Error() string
type ExtractDocumentRequest ¶
type ExtractDocumentRequest struct {
// FileSource identifies the document (FileId, FileURL, or RawText).
FileSource
// StructuredExtractionOptions is the list of structured extraction
// configurations. The tag has no omitempty, so the key is always written
// (a nil slice encodes as null).
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options"`
// PageRange selects the pages to process (presumably the same
// comma-separated syntax as ParseDocumentRequest.PageRange; TODO confirm).
PageRange string `json:"page_range,omitempty"`
// MimeType is the MIME type of the file.
//
// NOTE(review): this is a plain string, whereas ParseDocumentRequest and
// ParseDatasetRequest use the MimeType type.
MimeType string `json:"mime_type,omitempty"`
// Labels is a set of key-value labels attached to the request.
Labels map[string]string `json:"labels,omitempty"`
// FileName is the name of the file. This is populated when file_id is used.
//
// Optional.
FileName string `json:"file_name,omitempty"`
}
ExtractDocumentRequest holds options for extracting structured data from a document.
type FileInfo ¶
type FileInfo struct {
// FileId is the ID of the file.
FileId string `json:"file_id"`
// FileName is the name of the file. Optional.
FileName string `json:"file_name,omitempty"`
// MimeType is the MIME type of the file.
MimeType MimeType `json:"mime_type"`
// FileSize is the size of the file (presumably in bytes; TODO confirm).
FileSize int64 `json:"file_size"`
// ChecksumSHA256 is the SHA-256 checksum of the file as a string (the
// text encoding, e.g. hex, is not visible here).
ChecksumSHA256 string `json:"checksum_sha256"`
// CreatedAt is the creation timestamp, kept as the raw string from the API.
//
// NOTE(review): FileUploadResponse.CreatedAt is a time.Time, while this
// field is a string.
CreatedAt string `json:"created_at"`
// Labels is the set of key-value labels attached to the file.
Labels map[string]string `json:"labels,omitempty"`
}
FileInfo represents metadata about a file.
type FileSource ¶
type FileSource struct {
// ID of the file previously uploaded to Tensorlake.
// Has tensorlake- (V1) or file_ (V2) prefix.
// Example: "file_abc123xyz"
FileId string `json:"file_id,omitempty"`
// External URL of the file to parse. Must be publicly accessible.
// Examples: "https://pub-226479de18b2493f96b64c6674705dd8.r2.dev/real-estate-purchase-all-signed.pdf"
FileURL string `json:"file_url,omitempty"`
// The raw text content to parse.
// Examples: "This is the document content..."
RawText string `json:"raw_text,omitempty"`
}
FileSource represents the source of a document (FileId, FileURL, or RawText).
func (*FileSource) SourceProvided ¶
func (fs *FileSource) SourceProvided() bool
SourceProvided reports whether exactly one source is provided.
type FileUploadResponse ¶
type FileUploadResponse struct {
// FileId is the ID of the created file.
// Use this ID to reference the file in parse, datasets, and other operations.
FileId string `json:"file_id"`
// CreatedAt is the creation date and time of the file.
// This is in RFC 3339 format.
CreatedAt time.Time `json:"created_at"`
}
FileUploadResponse represents the response from uploading a file.
type GPUResources ¶ added in v0.3.0
GPUResources specifies GPU allocation.
type GetDatasetRequest ¶ added in v0.2.0
type GetDatasetRequest struct {
// DatasetId is the unique identifier for the dataset.
//
// Required.
DatasetId string
// IncludeAnalytics includes parse job analytics in the response when set to true.
//
// Optional.
IncludeAnalytics bool
}
GetDatasetRequest holds options for retrieving a dataset.
type GetParseResultOption ¶
type GetParseResultOption func(*GetParseResultOptions)
GetParseResultOption is a function that configures the GetParseResultOptions.
func WithOnUpdate ¶
func WithOnUpdate(onUpdate ParseResultUpdateFunc) GetParseResultOption
WithOnUpdate sets the callback function that receives intermediate parse result updates during SSE streaming. It will be called for each SSE event received.
func WithOptions ¶ added in v0.1.1
func WithOptions(enable bool) GetParseResultOption
func WithSSE ¶
func WithSSE(enable bool) GetParseResultOption
WithSSE enables Server-Sent Events (SSE) for streaming updates.
type GetParseResultOptions ¶
type GetParseResultOptions struct {
// contains filtered or unexported fields
}
type ListDatasetDataRequest ¶
type ListDatasetDataRequest struct {
// DatasetId identifies the dataset. It is excluded from JSON encoding
// (tag "-").
DatasetId string `json:"-"`
// Cursor is the pagination cursor.
Cursor string `json:"cursor,omitempty"`
// Direction is the pagination direction (see PaginationDirection).
Direction PaginationDirection `json:"direction,omitempty"`
// Limit is the maximum number of results. A zero value is omitted.
Limit int `json:"limit,omitempty"`
// Status is a parse status to filter by (presumably; TODO confirm).
Status ParseStatus `json:"status,omitempty"`
// ParseId is a parse job ID to filter by (presumably; TODO confirm).
ParseId string `json:"parse_id,omitempty"`
// FileName is a file name to filter by (presumably; TODO confirm).
FileName string `json:"file_name,omitempty"`
// The four fields below are timestamps in RFC 3339 format.
CreatedAfter string `json:"created_after,omitempty"` // RFC3339
CreatedBefore string `json:"created_before,omitempty"` // RFC3339
FinishedAfter string `json:"finished_after,omitempty"` // RFC3339
FinishedBefore string `json:"finished_before,omitempty"` // RFC3339
}
ListDatasetDataRequest holds options for listing dataset parse jobs.
type ListDatasetsRequest ¶
// NOTE(review): unlike the other list requests in this package, these fields
// carry no JSON tags.
type ListDatasetsRequest struct {
// Cursor is the pagination cursor.
Cursor string
// Direction is the pagination direction (see PaginationDirection).
Direction PaginationDirection
// Limit is the maximum number of results.
Limit int
// Status is a dataset status to filter by (presumably; TODO confirm).
Status DatasetStatus
// Name is a dataset name to filter by (presumably; TODO confirm).
Name string
}
ListDatasetsRequest holds options for listing datasets.
type ListFilesRequest ¶
type ListFilesRequest struct {
// Cursor is the cursor to use for pagination.
// This is a base64-encoded string representing a timestamp.
// It is used to paginate through the results.
//
// Optional.
Cursor string `json:"cursor,omitempty"`
// Direction is the direction of pagination.
//
// This can be either next or prev.
// next means to get the next page of results,
// while prev means to get the previous page of results.
//
// Optional.
Direction PaginationDirection `json:"direction,omitempty"`
// Limit is the maximum number of results to return.
//
// This is a non-negative integer. A zero value is omitted from the
// request (omitempty), in which case a default value will be used.
//
// Required range: x >= 0.
Limit int `json:"limit,omitempty"`
// FileName is the name to filter results by.
// This is a case-sensitive substring that will be matched against the file names.
// If provided, only files with names containing this substring will be returned.
FileName string `json:"file_name,omitempty"`
// CreatedAfter is the date and time to filter results by.
// The date should be in RFC 3339 format.
CreatedAfter string `json:"created_after,omitempty"`
// CreatedBefore is the date and time to filter results by.
// The date should be in RFC 3339 format.
CreatedBefore string `json:"created_before,omitempty"`
}
ListFilesRequest holds options for listing files.
type ListParseJobsRequest ¶
// ListParseJobsRequest holds options for listing parse jobs.
type ListParseJobsRequest struct {
// Cursor is the pagination cursor.
Cursor string `json:"cursor,omitempty"`
// Direction is the pagination direction (see PaginationDirection).
Direction PaginationDirection `json:"direction,omitempty"`
// DatasetName is a dataset name to filter by (presumably; TODO confirm).
DatasetName string `json:"dataset_name,omitempty"`
// Limit is the maximum number of results. A zero value is omitted.
Limit int `json:"limit,omitempty"`
// FileName is a file name to filter by (presumably; TODO confirm).
//
// NOTE(review): the JSON key here is "filename", while ListFilesRequest and
// ListDatasetDataRequest use "file_name". Confirm this matches the API.
FileName string `json:"filename,omitempty"`
// Status is a parse status to filter by (presumably; TODO confirm).
Status ParseStatus `json:"status,omitempty"`
// Id is a parse job ID to filter by (presumably; TODO confirm).
Id string `json:"id,omitempty"`
// The four fields below are timestamp strings (presumably RFC 3339, as
// in ListDatasetDataRequest; TODO confirm).
CreatedAfter string `json:"created_after,omitempty"`
CreatedBefore string `json:"created_before,omitempty"`
FinishedAfter string `json:"finished_after,omitempty"`
FinishedBefore string `json:"finished_before,omitempty"`
}
type ListSandboxesRequest ¶ added in v0.3.0
type ListSandboxesRequest struct {
// Limit is the maximum number of results. A zero value is omitted.
Limit int `json:"limit,omitempty"`
// Cursor is the pagination cursor.
Cursor string `json:"cursor,omitempty"`
// Direction is the pagination direction.
//
// NOTE(review): this is a plain string, while the document list requests
// use PaginationDirection ("next"/"prev"). Presumably the same values
// apply; TODO confirm.
Direction string `json:"direction,omitempty"`
// Status is a sandbox status to filter by (presumably; TODO confirm).
// It is a plain string rather than SandboxStatus.
Status string `json:"status,omitempty"`
}
ListSandboxesRequest holds options for listing sandboxes.
type ListSandboxesResponse ¶ added in v0.3.0
type ListSandboxesResponse struct {
// Sandboxes is the list of sandboxes in this page of results.
Sandboxes []SandboxInfo `json:"sandboxes"`
// PrevCursor is the cursor for the previous page. Optional.
PrevCursor string `json:"prev_cursor,omitempty"`
// NextCursor is the cursor for the next page. Optional.
NextCursor string `json:"next_cursor,omitempty"`
}
ListSandboxesResponse represents the response from listing sandboxes.
type MergeTableActions ¶ added in v0.2.0
type MergeTableActions struct {
// Pages is the list of page numbers that were merged.
Pages []int `json:"pages,omitempty"`
// TargetColumns is the target column count for the merged table.
TargetColumns *int `json:"target_columns,omitempty"`
}
MergeTableActions describes the merge operations performed on the table.
type MergedTable ¶ added in v0.2.0
type MergedTable struct {
// MergedTableId is the unique identifier for the merged table.
MergedTableId string `json:"merged_table_id"`
// MergedTableHTML is the HTML representation of the merged table.
MergedTableHTML string `json:"merged_table_html"`
// StartPage is the first page of the merged table.
StartPage int `json:"start_page"`
// EndPage is the last page of the merged table.
EndPage int `json:"end_page"`
// PagesMerged is the number of pages that were merged.
PagesMerged int `json:"pages_merged"`
// Summary is an optional summary of the merged table.
Summary string `json:"summary,omitempty"`
// MergeActions describes the merge operations performed.
MergeActions *MergeTableActions `json:"merge_actions,omitempty"`
}
MergedTable represents a table that was merged across multiple pages.
type MimeType ¶
type MimeType string
MimeType represents supported MIME types for document parsing.
const ( // MimeTypeTXT represents plain text files. MimeTypeTXT MimeType = "text/plain" // MimeTypeCSV represents a comma-separated values files. MimeTypeCSV MimeType = "text/csv" // MimeTypeHTML represents HTML files. MimeTypeHTML MimeType = "text/html" // MimeTypeJPEG represents JPEG image files. MimeTypeJPEG MimeType = "image/jpeg" // MimeTypePNG represents PNG image files. MimeTypePNG MimeType = "image/png" // MimeTypePDF represents Portable Document Format files. MimeTypePDF MimeType = "application/pdf" // MimeTypeDOCX represents Microsoft Word documents. MimeTypeDOCX MimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" // MimeTypePPTX represents Microsoft PowerPoint presentations. MimeTypePPTX MimeType = "application/vnd.openxmlformats-officedocument.presentationml.presentation" // MimeTypeKEYNOTE represents Apple Keynote presentations. MimeTypeKEYNOTE MimeType = "application/vnd.apple.keynote" // MimeTypeXLS represents Microsoft Excel spreadsheets (legacy format). MimeTypeXLS MimeType = "application/vnd.ms-excel" // MimeTypeXLSX represents Microsoft Excel spreadsheets. MimeTypeXLSX MimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" // MimeTypeXLSM represents Microsoft Excel spreadsheets (macros enabled). MimeTypeXLSM MimeType = "application/vnd.ms-excel.sheet.macroenabled.12" // MimeTypeDOC represents legacy Microsoft Word documents. MimeTypeDOC MimeType = "application/msword" // MimeTypePPT represents legacy Microsoft PowerPoint presentations. MimeTypePPT MimeType = "application/vnd.ms-powerpoint" // MimeTypeTIFF represents TIFF image files. MimeTypeTIFF MimeType = "image/tiff" // MimeTypeMD represents Markdown files. MimeTypeMD MimeType = "text/markdown" // MimeTypeXMD represents Markdown files (alternative MIME type). MimeTypeXMD MimeType = "text/x-markdown" // MimeTypeXML represents XML files. MimeTypeXML MimeType = "text/xml" // MimeTypeRTF represents Rich Text Format files. 
MimeTypeRTF MimeType = "text/rtf" // MimeTypeRTFApp represents Rich Text Format files (application MIME type). MimeTypeRTFApp MimeType = "application/rtf" // MimeTypeOCTET represents arbitrary binary data. MimeTypeOCTET MimeType = "application/octet-stream" // MimeTypePKCS7 represents S/MIME encrypted messages. MimeTypePKCS7 MimeType = "application/pkcs7-mime" // MimeTypeXPKCS7 represents S/MIME encrypted messages (alternative MIME type). MimeTypeXPKCS7 MimeType = "application/x-pkcs7-mime" // MimeTypePKCS7Sig represents S/MIME digital signatures. MimeTypePKCS7Sig MimeType = "application/pkcs7-signature" )
type ModelProvider ¶
type ModelProvider string
ModelProvider represents the LLM provider to use for structured data extraction.
const ( // ModelProviderTensorlake represents private models, running on Tensorlake infrastructure. ModelProviderTensorlake ModelProvider = "tensorlake" // ModelProviderGemini3 represents Google Gemini 3 models. ModelProviderGemini3 ModelProvider = "gemini3" // ModelProviderSonnet represents Anthropic Sonnet models. ModelProviderSonnet ModelProvider = "sonnet" // ModelProviderGPT4oMini represents OpenAI GPT-4o-mini model. ModelProviderGPT4oMini ModelProvider = "gpt4o_mini" )
type OCRPipelineProvider ¶
type OCRPipelineProvider string
OCRPipelineProvider represents the different models for OCR (Optical Character Recognition).
const ( // OCRPipelineProviderDefault is the default OCR model (same as model01). OCRPipelineProviderDefault OCRPipelineProvider = "" // OCRPipelineProviderTensorlake01 is fast but could have lower accuracy on complex tables. // It's good for legal documents with footnotes. OCRPipelineProviderTensorlake01 OCRPipelineProvider = "model01" // OCRPipelineProviderTensorlake02 is slower but could have higher accuracy on complex tables. // It's good for financial documents with merged cells. OCRPipelineProviderTensorlake02 OCRPipelineProvider = "model02" // OCRPipelineProviderTensorlake03 is a compact model delivered to on-premise users. // It takes about 2 minutes to startup on Tensorlake's Cloud because it's meant // for testing for users who are eventually going to deploy this model on // dedicated hardware in their own datacenter. OCRPipelineProviderTensorlake03 OCRPipelineProvider = "model03" // OCRPipelineProviderGemini3 calls Google Gemini 3 API for OCR processing. OCRPipelineProviderGemini3 OCRPipelineProvider = "gemini3" // OCRPipelineProviderTensorlake06 is a newer model variant. OCRPipelineProviderTensorlake06 OCRPipelineProvider = "model06" )
type Option ¶
type Option func(*Client)
Option defines a configuration option for the Client.
func WithAPIKey ¶
WithAPIKey sets the API key to use for the client.
func WithBaseURL ¶
WithBaseURL sets the base URL to use for the client.
func WithHTTPClient ¶
WithHTTPClient sets the HTTP client to use for the client.
func WithSandboxAPIBaseURL ¶ added in v0.3.0
WithSandboxAPIBaseURL sets the base URL for sandbox management API calls. Default: https://api.tensorlake.ai/sandboxes
Example: WithSandboxAPIBaseURL("https://api-tensorlake.orange.sixt.com/sandboxes")
func WithSandboxProxyBaseURL ¶ added in v0.3.0
WithSandboxProxyBaseURL sets the base URL template for sandbox file proxy calls. The sandbox ID is prepended as a subdomain. The value should include the scheme and domain but NOT the sandbox ID subdomain.
Default: https://sandbox.tensorlake.ai
For a sandbox with ID "abc123", the file API URL becomes:
https://abc123.sandbox.tensorlake.ai/api/v1/files
Example: WithSandboxProxyBaseURL("https://sandbox-tensorlake.orange.sixt.com") would produce: https://abc123.sandbox-tensorlake.orange.sixt.com/api/v1/files
type OutputMode ¶ added in v0.3.0
type OutputMode string
OutputMode determines how stdout/stderr is handled for a process.
// Known OutputMode values.
const (
	// OutputModeCapture is the "capture" mode.
	OutputModeCapture OutputMode = "capture"
	// OutputModeDiscard is the "discard" mode.
	OutputModeDiscard OutputMode = "discard"
)
type PTYConn ¶ added in v0.3.0
type PTYConn struct {
// contains filtered or unexported fields
}
PTYConn represents an active WebSocket connection to a PTY session.
PTYConn wraps a WebSocket connection and provides typed methods for the PTY binary protocol. After creating a PTYConn with Client.ConnectPTY, the caller must call PTYConn.Close when done.
The PTY WebSocket protocol uses binary frames with a single-byte opcode prefix:
- 0x00 Data: terminal I/O (both directions)
- 0x01 Resize: client → server terminal resize (uint16 BE cols + uint16 BE rows)
- 0x02 Ready: client → server readiness signal (must be sent first)
- 0x03 Exit: server → client process exit (int32 BE exit code)
func (*PTYConn) Read ¶ added in v0.3.0
func (pc *PTYConn) Read(ctx context.Context) (*PTYMessage, error)
Read reads the next message from the PTY session.
Returns PTYMessageData for terminal output and PTYMessageExit when the process exits. After receiving an exit message, the WebSocket will be closed by the server.
func (*PTYConn) Ready ¶ added in v0.3.0
Ready sends the READY signal to the server, indicating the client is ready to receive terminal output. This must be called immediately after connecting, before reading any data.
If Ready is not sent, the server buffers output up to 1 MB then disconnects.
type PTYListResponse ¶ added in v0.3.0
type PTYListResponse struct {
Sessions []PTYSessionInfo `json:"sessions"`
}
PTYListResponse represents the response from listing PTY sessions.
type PTYMessage ¶ added in v0.3.0
type PTYMessage struct {
// Type is either PTYMessageData or PTYMessageExit.
Type PTYMessageType
// Data contains the terminal output bytes. Only set when Type is PTYMessageData.
Data []byte
// ExitCode contains the process exit code. Only set when Type is PTYMessageExit.
ExitCode int32
}
PTYMessage represents a message received from the PTY WebSocket.
type PTYMessageType ¶ added in v0.3.0
type PTYMessageType int
PTYMessageType distinguishes between data and exit messages.
const ( // PTYMessageData indicates terminal output data. PTYMessageData PTYMessageType = iota // PTYMessageExit indicates the process has exited. PTYMessageExit )
type PTYSessionInfo ¶ added in v0.3.0
type PTYSessionInfo struct {
// SessionId is the unique PTY session identifier.
SessionId string `json:"session_id"`
// PID is the process ID reported for the session.
PID int32 `json:"pid"`
// Command is the executable of the session.
Command string `json:"command"`
// Args are the command-line arguments of the session.
Args []string `json:"args"`
// Rows is the terminal height.
Rows int32 `json:"rows"`
// Cols is the terminal width.
Cols int32 `json:"cols"`
// CreatedAt is the creation timestamp as an integer. The unit is not
// visible here (seconds vs. milliseconds; TODO confirm).
CreatedAt int64 `json:"created_at"`
// EndedAt is the end timestamp, in the same unit as CreatedAt
// (presumably). Nil when not present.
EndedAt *int64 `json:"ended_at,omitempty"`
// ExitCode is the process exit code. Nil when not present.
ExitCode *int32 `json:"exit_code,omitempty"`
// IsAlive reports whether the session is alive.
IsAlive bool `json:"is_alive"`
}
PTYSessionInfo represents metadata about a PTY session.
type Page ¶
type Page struct {
// Dimensions is a 2-element vector representing the width and height of
// the page in points.
Dimensions []int `json:"dimensions,omitempty"`
// PageDimensions holds the width and height of the page in points as
// named fields (see PageDimensions).
//
// NOTE(review): with encoding/json, omitempty has no effect on a
// non-pointer struct value, so this key is always written when a Page is
// marshaled.
PageDimensions PageDimensions `json:"page_dimensions,omitempty"`
// PageFragments is the list of fragments extracted from the page.
// Each fragment represents a distinct section of the page, such as titles,
// paragraphs, tables, figures, etc.
PageFragments []PageFragment `json:"page_fragments,omitempty"`
// PageNumber is the 1-indexed page number in the document.
PageNumber int `json:"page_number"`
// ClassificationReason contains the reason for the classification, if
// the page was classified into a specific class.
ClassificationReason string `json:"classification_reason,omitempty"`
}
Page represents a page in the parsed document.
type PageClass ¶
type PageClass struct {
// PageClass is the name of the page class given in the parse request.
// This value should match one of the class names provided in the
// page classification options of the parse request
// (ParseDocumentRequest.PageClassificationOptions).
//
// Required.
PageClass string `json:"page_class"`
// PageNumbers is a list of page numbers (1-indexed) where
// the page class was detected.
//
// Required.
PageNumbers []int `json:"page_numbers"`
// ClassificationReasons is a map of classification reasons per page number.
// The key is the page number, and the value is the reason for the classification.
ClassificationReasons map[int]string `json:"classification_reasons,omitempty"`
}
PageClass extracted from the document.
type PageClassConfig ¶
type PageClassConfig struct {
// Name is the name of the page class.
Name string `json:"name"`
// Description is the description of the page class to guide the model
// to classify the pages. Describe what the model should look for in
// the page to classify it.
Description string `json:"description,omitempty"`
}
type PageDimensions ¶
type PageDimensions struct {
// Width is the width of the page in points.
Width int `json:"width"`
// Height is the height of the page in points.
Height int `json:"height"`
}
PageDimensions represents the dimensions of a page.
type PageFragment ¶
type PageFragment struct {
// FragmentType is the kind of fragment (see PageFragmentType).
FragmentType PageFragmentType `json:"fragment_type"`
// Content is the fragment payload (see PageFragmentContent).
Content PageFragmentContent `json:"content"`
// ReadingOrder is the fragment's position in reading order. A zero value
// is omitted when marshaling.
ReadingOrder int64 `json:"reading_order,omitempty"`
// BoundingBox is the fragment's bounding box, under the JSON key "bbox".
// The map keys are not visible here; TODO confirm the coordinate names.
BoundingBox map[string]float64 `json:"bbox,omitempty"`
}
PageFragment represents a fragment of a page in the parsed document.
type PageFragmentContent ¶
type PageFragmentContent struct {
// One of these will be set depending on the JSON input:
Text *PageFragmentText `json:"text,omitempty"`
Header *PageFragmentHeader `json:"header,omitempty"`
Table *PageFragmentTable `json:"table,omitempty"`
Figure *PageFragmentFigure `json:"figure,omitempty"`
Signature *PageFragmentSignature `json:"signature,omitempty"`
}
type PageFragmentFigure ¶
type PageFragmentHeader ¶
type PageFragmentSignature ¶
type PageFragmentSignature struct {
Content string `json:"content"`
}
type PageFragmentTable ¶
type PageFragmentTable struct {
Content string `json:"content"`
Cells []PageFragmentTableCell `json:"cells"`
HTML string `json:"html,omitempty"`
Markdown string `json:"markdown,omitempty"`
Summary string `json:"summary,omitempty"`
}
type PageFragmentTableCell ¶
type PageFragmentText ¶
type PageFragmentText struct {
Content string `json:"content"`
}
type PageFragmentType ¶
type PageFragmentType string
PageFragmentType represents the type of a page fragment.
// Known PageFragmentType values. Each constant's wire value is its string literal.
const (
	PageFragmentTypeSectionHeader PageFragmentType = "section_header"
	PageFragmentTypeTitle PageFragmentType = "title"
	PageFragmentTypeText PageFragmentType = "text"
	PageFragmentTypeTable PageFragmentType = "table"
	PageFragmentTypeFigure PageFragmentType = "figure"
	PageFragmentTypeFormula PageFragmentType = "formula"
	PageFragmentTypeForm PageFragmentType = "form"
	PageFragmentTypeKeyValueRegion PageFragmentType = "key_value_region"
	PageFragmentTypeDocumentIndex PageFragmentType = "document_index"
	PageFragmentTypeListItem PageFragmentType = "list_item"
	PageFragmentTypeTableCaption PageFragmentType = "table_caption"
	PageFragmentTypeFigureCaption PageFragmentType = "figure_caption"
	PageFragmentTypeFormulaCaption PageFragmentType = "formula_caption"
	PageFragmentTypePageHeader PageFragmentType = "page_header"
	PageFragmentTypePageNumber PageFragmentType = "page_number"
	PageFragmentTypeSignature PageFragmentType = "signature"
	PageFragmentTypeStrikethrough PageFragmentType = "strikethrough"
	PageFragmentTypeBarcode PageFragmentType = "barcode"
	PageFragmentTypeChart PageFragmentType = "chart"
	PageFragmentTypeTrackedChanges PageFragmentType = "tracked_changes"
	PageFragmentTypeComments PageFragmentType = "comments"
)
type PaginationDirection ¶
// PaginationDirection is the direction in which to page through results.
type PaginationDirection string

// Known PaginationDirection values.
const (
	// PaginationDirectionNext is the "next" direction.
	PaginationDirectionNext PaginationDirection = "next"
	// PaginationDirectionPrev is the "prev" direction.
	PaginationDirectionPrev PaginationDirection = "prev"
)
type PaginationResult ¶
// PaginationResult is one page of items of type T together with the cursors
// returned alongside it.
type PaginationResult[T any] struct {
	// Items holds the entries of this page.
	Items []T `json:"items"`
	// HasMore is the has_more flag from the response.
	HasMore bool `json:"has_more"`
	// NextCursor is the cursor for the next page. Omitted when empty.
	NextCursor string `json:"next_cursor,omitempty"`
	// PrevCursor is the cursor for the previous page. Omitted when empty.
	PrevCursor string `json:"prev_cursor,omitempty"`
}
PaginationResult represents the result of a pagination operation.
type ParseConfiguration ¶ added in v0.2.0
type ParseConfiguration struct {
// ParsingOptions is the parsing configuration. Nil when not present.
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
// StructuredExtractionOptions is the list of structured extraction
// configurations.
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
// PageClassifications is the list of page class configurations.
PageClassifications []PageClassConfig `json:"page_classifications,omitempty"`
// EnrichmentOptions is the enrichment configuration. Nil when not present.
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
}
ParseConfiguration contains the full configuration used for a parse job.
type ParseDatasetRequest ¶
type ParseDatasetRequest struct {
// DatasetId identifies the dataset. It is excluded from JSON encoding
// (tag "-").
DatasetId string `json:"-"`
// FileSource identifies the document (FileId, FileURL, or RawText).
FileSource
// PageRange selects the pages to parse (presumably the same
// comma-separated syntax as ParseDocumentRequest.PageRange; TODO confirm).
PageRange string `json:"page_range,omitempty"`
// FileName is the name of the file. Optional.
FileName string `json:"file_name,omitempty"`
// MimeType is the MIME type of the file. Optional.
MimeType MimeType `json:"mime_type,omitempty"`
// Labels is a set of key-value labels attached to the request.
Labels map[string]string `json:"labels,omitempty"`
}
ParseDatasetRequest holds options for parsing a document with a dataset.
type ParseDocumentRequest ¶
// ParseDocumentRequest holds options for parsing a document.
type ParseDocumentRequest struct {
// FileSource identifies the document to parse (FileId, FileURL, or RawText).
FileSource
// ParsingOptions defines the configuration for the document parsing
// process.
//
// Tensorlake provides sane defaults that work well for most
// documents, so this object is not required. However, every document
// is different, and you may want to customize the parsing process to
// better suit your needs.
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
// EnrichmentOptions extends the output of the document parsing process
// with additional information.
//
// This includes summarization of tables and figures, which can help to
// provide a more comprehensive understanding of the document.
//
// This object is not required, and the API will use default settings if it
// is not present.
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
// StructuredExtractionOptions is the options for structured data extraction.
//
// The properties of this object define the configuration for structured
// data extraction.
//
// If this object is present, the API will perform structured data
// extraction on the document.
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
// PageClassificationOptions is the options for page classification.
//
// The properties of this object define the configuration for page
// classification.
//
// If this object is present, the API will perform page classification on
// the document.
//
// Note that the JSON key is page_classifications.
PageClassificationOptions []PageClassConfig `json:"page_classifications,omitempty"`
// PageRange is a comma-separated list of page numbers or
// ranges to parse (e.g., '1,2,3-5'). Default: all pages.
// Examples: "1-5,8,10"
PageRange string `json:"page_range,omitempty"`
// Labels is additional metadata to identify the read request. The labels
// are returned in the read response.
Labels map[string]string `json:"labels,omitempty"`
// FileName is the name of the file. This is populated when file_id is used.
//
// Optional.
FileName string `json:"file_name,omitempty"`
// MimeType is the MIME type of the file. This is used to determine how to process the file.
MimeType MimeType `json:"mime_type,omitempty"`
}
type ParseEventName ¶
type ParseEventName string
ParseEventName is the name of the SSE event.
// SSE event names. Each constant's wire value is its string literal.
const (
	SSEEventParseQueued ParseEventName = "parse_queued"
	SSEEventParseUpdate ParseEventName = "parse_update"
	SSEEventParseDone ParseEventName = "parse_done"
	SSEEventParseFailed ParseEventName = "parse_failed"
)
The possible SSE events. See also: https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/documentai/_parse.py#L499
type ParseJob ¶
type ParseJob struct {
// ParseId is the unique identifier for the parse job.
// This is the ID that can be used to track the status of the parse job.
// Used in the GET /documents/v2/parse/{parse_id} endpoint to retrieve
// the status and results of the parse job.
ParseId string `json:"parse_id"`
// CreatedAt is the creation date and time of the parse job.
CreatedAt string `json:"created_at"`
}
ParseJob represents a parse job.
type ParseResult ¶
type ParseResult struct {
// ParseId is the unique identifier for the parse job. This is the same
// value returned from ReadDocument or ParseDocument.
// Example: "parse_abcd1234"
ParseId string `json:"parse_id"`
// ParsedPagesCount is the number of pages in the document that were
// parsed successfully. Required range: x >= 0. Example: 5
ParsedPagesCount int `json:"parsed_pages_count"`
// Status is the current status of the parse job. It indicates whether the
// job is pending, in progress, completed, or failed, and can be used to
// track the progress of the parse operation. See the ParseStatus
// constants for the possible values.
Status ParseStatus `json:"status"`
// CreatedAt is the date and time when the parse job was created, in
// RFC 3339 format. It can be used to track when the parse job was
// initiated. Example: "2023-10-01T12:00:00Z"
CreatedAt string `json:"created_at"`
// Error describes an error that occurred during any part of the parse
// execution. It is only populated if the parse operation failed.
Error string `json:"error,omitempty"`
// FinishedAt is the date and time when the parse job finished, in
// RFC 3339 format. It is empty if the parse job is still in progress
// or pending.
FinishedAt string `json:"finished_at,omitempty"`
// Labels are the labels associated with the parse job.
//
// These are the key-value pairs submitted with the parse request.
//
// They can be used to categorize or tag the parse job for easier
// identification and filtering.
//
// Labels is nil if no labels were provided in the request.
Labels map[string]string `json:"labels,omitempty"`
// TotalPages is the total number of pages in the document that was parsed.
TotalPages int `json:"total_pages,omitempty"`
// MessageUpdate is the message update for the parse job.
MessageUpdate string `json:"message_update,omitempty"`
// PdfBase64 is the base64-encoded PDF content of the parsed document.
PdfBase64 string `json:"pdf_base64,omitempty"`
// TasksCompletedCount is the number of tasks completed for the parse job.
// It is a pointer so that an absent value (nil) can be told apart from 0.
TasksCompletedCount *int `json:"tasks_completed_count,omitempty"`
// TasksTotalCount is the total number of tasks for the parse job.
// It is a pointer so that an absent value (nil) can be told apart from 0.
TasksTotalCount *int `json:"tasks_total_count,omitempty"`
// DatasetId is the dataset id if the parse job was scheduled from a
// dataset. This is the identifier used in URLs and API endpoints to
// refer to the dataset.
DatasetId string `json:"dataset_id,omitempty"`
// Chunks are the chunks of the document.
//
// Each Chunk contains one chunk of the document.
//
// The number of chunks depends on the chunking strategy used during
// parsing.
Chunks []Chunk `json:"chunks,omitempty"`
// Pages is the list of pages parsed from the document.
//
// Each page has a list of fragments, which are detected objects such as
// tables, text, figures, section headers, etc.
//
// The detected text, the structure of the table (if the fragment is a
// table), and the bounding box of the object are also returned.
Pages []Page `json:"pages"`
// PageClasses are the page classes extracted from the document.
//
// The API describes this as a mapping from the page class names provided
// in the parse request under the page_classification_options field to the
// page numbers (1-indexed) where each page class appears.
//
// This is used to categorize pages in the document based on the
// classify options provided.
//
// NOTE(review): the Go field is a slice of PageClass, not a map; the
// wording above appears to mirror the REST API description — confirm
// against the PageClass definition.
PageClasses []PageClass `json:"page_classes,omitempty"`
// StructuredData is the structured data extracted from the document.
//
// The API describes this as a mapping from the schema names provided in
// the parse request to StructuredData objects containing the structured
// data extracted from the document.
//
// The number of structured data objects depends on the partition
// strategy:
//   - None: one structured data object for the entire document.
//   - Page: one structured data object for each page.
//
// NOTE(review): the Go field is a slice; each element carries its own
// SchemaName — confirm the mapping wording against the API reference.
StructuredData []StructuredData `json:"structured_data,omitempty"`
// MergedTables contains tables that were merged across multiple pages.
MergedTables []MergedTable `json:"merged_tables,omitempty"`
// Options contains the options used for the parse job.
Options *ParseResultOptions `json:"options,omitempty"`
// Usage is the resource usage associated with the parse job.
//
// This includes details such as number of pages parsed, tokens used for
// OCR and extraction, etc.
//
// Usage is only populated for successful jobs.
//
// Billing is based on the resource usage.
Usage Usage `json:"usage"`
}
ParseResult represents the result of a parse job.
type ParseResultOptions ¶ added in v0.1.1
type ParseResultOptions struct {
// NOTE(review): the field descriptions below are inferred from field
// names, types, and JSON tags; confirm against the Tensorlake API
// reference.

// FileSource is embedded; it presumably identifies the parsed input
// (file ID, file URL, or raw text).
FileSource
// FileName is presumably the name of the parsed file.
FileName string `json:"file_name"`
// FileLabels are presumably the labels attached to the file.
FileLabels map[string]string `json:"file_labels"`
// MimeType is presumably the MIME type of the parsed file.
MimeType MimeType `json:"mime_type"`
// TraceId is presumably an identifier for tracing the job — TODO confirm.
TraceId string `json:"trace_id"`
// PageRange is presumably the page range that was requested for parsing.
PageRange string `json:"page_range"`
// JobType is the type of the job; see the JobType definition for values.
JobType JobType `json:"job_type"`
// Configuration is presumably the parse configuration used for the job.
Configuration *ParseConfiguration `json:"configuration"`
// Usage is optional and omitted from JSON when nil.
Usage *Usage `json:"usage,omitempty"`
// MessageUpdate is omitted from JSON when empty.
MessageUpdate string `json:"message_update,omitempty"`
}
ParseResultOptions contains the options used for the parse job, including the file ID, file URL, raw text, mime type, and structured extraction options, etc.
type ParseResultUpdateFunc ¶
type ParseResultUpdateFunc func(name ParseEventName, result *ParseResult)
ParseResultUpdateFunc is a callback function that receives intermediate parse result updates during SSE streaming. It will be called for each SSE event received.
type ParseStatus ¶
type ParseStatus string
ParseStatus indicates the status of the parse job.
const ( // ParseStatusFailure means the job has failed. ParseStatusFailure ParseStatus = "failure" // ParseStatusPending means the job is waiting to be processed. ParseStatusPending ParseStatus = "pending" // ParseStatusProcessing means the job is currently being processed. ParseStatusProcessing ParseStatus = "processing" // ParseStatusSuccessful means the job has been successfully completed and the results are available. ParseStatusSuccessful ParseStatus = "successful" // ParseStatusDetectingLayout means the job is detecting the layout of the document. ParseStatusDetectingLayout ParseStatus = "detecting_layout" // ParseStatusLayoutDetected means the layout of the document has been detected. ParseStatusLayoutDetected ParseStatus = "detected_layout" // ParseStatusExtractingData means the job is extracting the data from the document. ParseStatusExtractingData ParseStatus = "extracting_data" // ParseStatusExtractedData means the data has been extracted from the document. ParseStatusExtractedData ParseStatus = "extracted_data" // ParseStatusFormattingOutput means the output is being formatted. ParseStatusFormattingOutput ParseStatus = "formatting_output" // ParseStatusFormattedOutput means the output has been formatted. ParseStatusFormattedOutput ParseStatus = "formatted_output" )
type ParsingOptions ¶
type ParsingOptions struct {
// ChunkingStrategy determines how the document is chunked into smaller
// pieces. Different strategies can be used to optimize the parsing process;
// choose the one that best fits your use case. The default is `None`,
// which means no chunking is applied.
ChunkingStrategy ChunkingStrategy `json:"chunking_strategy,omitempty"`
// CrossPageHeaderDetection enables header-hierarchy detection across pages.
// When set to `true`, the parser will consider headers from different pages
// when determining the hierarchy of headers within a single page.
CrossPageHeaderDetection bool `json:"cross_page_header_detection,omitempty"`
// DisableLayoutDetection disables bounding box detection for the document.
// This leads to faster document parsing.
DisableLayoutDetection bool `json:"disable_layout_detection,omitempty"`
// OCRModel indicates the model to use for OCR (Optical Character Recognition).
//
//   - model01: It's fast but could have lower accuracy on complex tables.
//     It's good for legal documents with footnotes.
//   - model02: It's slower but could have higher accuracy on complex tables.
//     It's good for financial documents with merged cells.
//   - model03: A compact model that we deliver to on-premise users.
//     It takes about 2 minutes to start up on Tensorlake's Cloud
//     because it's meant for testing for users who are eventually
//     going to deploy this model on dedicated hardware in their
//     own datacenter.
OCRModel OCRPipelineProvider `json:"ocr_model,omitempty"`
// RemoveStrikethroughLines enables the detection and removal of
// strikethrough text in the document. This flag incurs additional billing costs.
RemoveStrikethroughLines bool `json:"remove_strikethrough_lines,omitempty"`
// SignatureDetection enables the detection of signatures in the document.
// This flag incurs additional billing costs.
// The default is false.
SignatureDetection bool `json:"signature_detection,omitempty"`
// SkewDetection enables detection and correction of skewed or rotated
// pages in the document. Setting this to true will increase the processing
// time of the document. The default is false.
SkewDetection bool `json:"skew_detection,omitempty"`
// TableOutputMode is the format for the tables extracted from the document.
// The default is HTML.
TableOutputMode TableOutputMode `json:"table_output_mode,omitempty"`
// TableParsingFormat determines which model the system uses to identify
// and extract tables from the document. The default is tsr.
TableParsingFormat TableParsingFormat `json:"table_parsing_format,omitempty"`
// IgnoreSections contains a set of page fragment types to ignore during
// parsing.
//
// This can be used to skip certain types of content that are not relevant
// for the parsing process, such as headers, footers, or other
// non-essential elements.
//
// The default is an empty set.
IgnoreSections []PageFragmentType `json:"ignore_sections,omitempty"`
// IncludeImages embeds images from the document in the markdown output.
// The default is false.
IncludeImages bool `json:"include_images,omitempty"`
// BarcodeDetection enables barcode detection in the document.
// Setting this to true will increase the processing time of the document.
// The default is false.
BarcodeDetection bool `json:"barcode_detection,omitempty"`
// MergeTables enables merging of tables that span across multiple pages.
// The default is false.
MergeTables bool `json:"merge_tables,omitempty"`
}
ParsingOptions holds configuration for document parsing.
type PartitionStrategy ¶
type PartitionStrategy string
PartitionStrategy determines how documents are partitioned before structured data extraction.
The API will return one structured data object per partition.
const ( // PartitionStrategyNone: No partitioning is applied. // The entire document is treated as a single unit for extraction. PartitionStrategyNone PartitionStrategy = "none" // PartitionStrategyPage: The document is partitioned by individual pages. // Each page is treated as a separate unit for extraction. PartitionStrategyPage PartitionStrategy = "page" // PartitionStrategySection: The document is partitioned into sections based on // detected section headers. Each section is treated as a separate unit for extraction. PartitionStrategySection PartitionStrategy = "section" // PartitionStrategyFragment: The document is partitioned by individual page elements. // Each fragment is treated as a separate unit for extraction. PartitionStrategyFragment PartitionStrategy = "fragment" // PartitionStrategyPatterns: The document is partitioned based on user-defined // start and end patterns. PartitionStrategyPatterns PartitionStrategy = "patterns" )
type ProcessInfo ¶ added in v0.3.0
type ProcessInfo struct {
// NOTE(review): the field descriptions below are inferred from field
// names, types, and JSON tags; confirm against the sandbox API reference.

// PID is the process identifier.
PID int32 `json:"pid"`
// Status is the current state of the process; see the ProcessStatus
// constants for the possible values.
Status ProcessStatus `json:"status"`
// ExitCode is optional and omitted from JSON when nil; presumably set
// only once the process has exited.
ExitCode *int32 `json:"exit_code,omitempty"`
// Signal is optional and omitted from JSON when nil; presumably the
// signal that terminated the process.
Signal *int32 `json:"signal,omitempty"`
// StdinWritable presumably reports whether the process's stdin can
// still be written to.
StdinWritable bool `json:"stdin_writable"`
// Command is the executable of the process.
Command string `json:"command"`
// Args are the command-line arguments of the process.
Args []string `json:"args"`
// StartedAt is the start time as an integer timestamp; the unit is not
// stated here — TODO confirm.
StartedAt int64 `json:"started_at"`
// EndedAt is optional and omitted from JSON when nil; presumably the
// end time in the same unit as StartedAt.
EndedAt *int64 `json:"ended_at,omitempty"`
}
ProcessInfo represents metadata about a sandbox process.
type ProcessListResponse ¶ added in v0.3.0
type ProcessListResponse struct {
// Processes holds one ProcessInfo per listed process.
Processes []ProcessInfo `json:"processes"`
}
ProcessListResponse represents the response from listing processes.
type ProcessOutputEvent ¶ added in v0.3.0
type ProcessOutputEvent struct {
// Line is the single line of output carried by this event.
Line string `json:"line"`
// Timestamp is an integer timestamp for the line; the unit is not stated
// here — TODO confirm.
Timestamp int64 `json:"timestamp"`
// Stream is "stdout" or "stderr". It is only present in follow-output
// events and is omitted from JSON when empty.
Stream string `json:"stream,omitempty"`
}
ProcessOutputEvent represents a single line of output from a follow stream.
type ProcessOutputResponse ¶ added in v0.3.0
type ProcessOutputResponse struct {
// PID is the identifier of the process the output belongs to.
PID int32 `json:"pid"`
// Lines are the captured output lines.
Lines []string `json:"lines"`
// LineCount is presumably the number of entries in Lines — TODO confirm.
LineCount int32 `json:"line_count"`
}
ProcessOutputResponse represents captured output lines from a process.
type ProcessStatus ¶ added in v0.3.0
type ProcessStatus string
ProcessStatus represents the current state of a sandbox process.
const ( ProcessStatusRunning ProcessStatus = "running" ProcessStatusExited ProcessStatus = "exited" ProcessStatusSignaled ProcessStatus = "signaled" )
type ReadDocumentRequest ¶
type ReadDocumentRequest struct {
// FileSource is embedded and identifies the document to read.
FileSource
// ParsingOptions defines the configuration for the document parsing
// process.
//
// Tensorlake provides sane defaults that work well for most
// documents, so this object is not required. However, every document
// is different, and you may want to customize the parsing process to
// better suit your needs.
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
// EnrichmentOptions extends the output of the document parsing process
// with additional information.
//
// This includes summarization of tables and figures, which can help to
// provide a more comprehensive understanding of the document.
//
// This object is not required, and the API will use default settings if it
// is not present.
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
// Labels are additional metadata to identify the read request. The labels
// are returned in the read response.
Labels map[string]string `json:"labels,omitempty"`
// FileName is the name of the file. Only populated when using file_id.
// Example: "document.pdf"
FileName string `json:"file_name,omitempty"`
// PageRange is a comma-separated list of page numbers or
// ranges to parse (e.g., '1,2,3-5'). Default: all pages.
// Example: "1-5,8,10"
PageRange string `json:"page_range,omitempty"`
// MimeType is the MIME type of the file. This is used to determine how to
// process the file.
MimeType MimeType `json:"mime_type,omitempty"`
}
ReadDocumentRequest holds the input parameters for reading/parsing a document.
type ResizePTYRequest ¶ added in v0.3.0
ResizePTYRequest holds the terminal dimensions for a resize operation.
type SandboxDirectoryEntry ¶ added in v0.3.0
type SandboxDirectoryEntry struct {
// Name is the name of the file or directory.
Name string `json:"name"`
// IsDir indicates whether this entry is a directory.
IsDir bool `json:"is_dir"`
// Size is the file size in bytes. Nil for directories.
Size *int64 `json:"size,omitempty"`
// ModifiedAt is the last modification time in milliseconds since epoch. Nil if unavailable.
ModifiedAt *int64 `json:"modified_at,omitempty"`
}
SandboxDirectoryEntry represents a file or directory entry in a sandbox.
type SandboxDirectoryListResponse ¶ added in v0.3.0
type SandboxDirectoryListResponse struct {
// Path is the directory path that was listed.
Path string `json:"path"`
// Entries contains the directory entries, sorted with directories first, then alphabetically.
Entries []SandboxDirectoryEntry `json:"entries"`
}
SandboxDirectoryListResponse represents the response from listing a sandbox directory.
type SandboxInfo ¶ added in v0.3.0
type SandboxInfo struct {
// NOTE(review): the field descriptions below are inferred from field
// names, types, and JSON tags; confirm against the sandbox API reference.

// Id is the sandbox identifier.
Id string `json:"id"`
// Namespace is presumably the namespace the sandbox belongs to.
Namespace string `json:"namespace"`
// Image is presumably the container image of the sandbox; omitted from
// JSON when empty.
Image string `json:"image,omitempty"`
// Status is the current state of the sandbox; see the SandboxStatus
// constants for the possible values.
Status SandboxStatus `json:"status"`
// PendingReason is omitted from JSON when empty.
// NOTE(review): typed as a plain string although the package also
// declares SandboxPendingReason — confirm whether the values match.
PendingReason string `json:"pending_reason,omitempty"`
// Outcome is omitted from JSON when empty; its possible values are not
// visible here — TODO confirm.
Outcome string `json:"outcome,omitempty"`
// CreatedAt is the creation time as an integer timestamp; the unit is
// not stated here — TODO confirm.
CreatedAt int64 `json:"created_at"`
// ContainerId is omitted from JSON when empty.
ContainerId string `json:"container_id,omitempty"`
// ExecutorId is omitted from JSON when empty.
ExecutorId string `json:"executor_id,omitempty"`
// Resources describes the container resources of the sandbox.
Resources ContainerResourcesInfo `json:"resources"`
// TimeoutSecs is presumably the sandbox timeout in seconds.
TimeoutSecs int64 `json:"timeout_secs"`
// SandboxURL is omitted from JSON when empty.
SandboxURL string `json:"sandbox_url,omitempty"`
// PoolId is omitted from JSON when empty.
PoolId string `json:"pool_id,omitempty"`
// NetworkPolicy is optional and omitted from JSON when nil.
NetworkPolicy *SandboxNetworkAccessControl `json:"network_policy,omitempty"`
// AllowUnauthenticatedAccess is always serialized (no omitempty).
AllowUnauthenticatedAccess bool `json:"allow_unauthenticated_access"`
// ExposedPorts is omitted from JSON when empty.
ExposedPorts []int32 `json:"exposed_ports,omitempty"`
// TemplateId is omitted from JSON when empty.
TemplateId string `json:"template_id,omitempty"`
// Name is omitted from JSON when empty.
Name string `json:"name,omitempty"`
}
SandboxInfo represents detailed information about a sandbox.
type SandboxNetworkAccessControl ¶ added in v0.3.0
type SandboxNetworkAccessControl struct {
// AllowInternetAccess is always serialized (no omitempty), so false is
// sent explicitly.
AllowInternetAccess bool `json:"allow_internet_access"`
// AllowOut is presumably a list of allowed outbound destinations; the
// entry format is not visible here — TODO confirm.
AllowOut []string `json:"allow_out,omitempty"`
// DenyOut is presumably a list of denied outbound destinations; the
// entry format is not visible here — TODO confirm.
DenyOut []string `json:"deny_out,omitempty"`
}
SandboxNetworkAccessControl configures network access for a sandbox.
type SandboxPendingReason ¶ added in v0.3.0
type SandboxPendingReason string
SandboxPendingReason describes why a sandbox is in pending state.
const ( SandboxPendingReasonScheduling SandboxPendingReason = "scheduling" SandboxPendingReasonWaitingForContainer SandboxPendingReason = "waiting_for_container" SandboxPendingReasonNoExecutorsAvailable SandboxPendingReason = "no_executors_available" SandboxPendingReasonNoResourcesAvailable SandboxPendingReason = "no_resources_available" SandboxPendingReasonPoolAtCapacity SandboxPendingReason = "pool_at_capacity" )
type SandboxProxyError ¶ added in v0.3.0
type SandboxProxyError struct {
// Err is a human-readable error message.
Err string `json:"error"`
// Code is an optional machine-readable error code.
Code string `json:"code,omitempty"`
}
SandboxProxyError represents an error returned by the sandbox proxy API.
func (*SandboxProxyError) Error ¶ added in v0.3.0
func (e *SandboxProxyError) Error() string
type SandboxResourceOverrides ¶ added in v0.3.0
type SandboxResourceOverrides struct {
// CPUs is the CPU allocation; omitted from JSON when zero.
CPUs float64 `json:"cpus,omitempty"`
// MemoryMB is the memory allocation, presumably in megabytes; omitted
// from JSON when zero.
MemoryMB int64 `json:"memory_mb,omitempty"`
// EphemeralDiskMB is the ephemeral disk allocation; omitted from JSON
// when zero.
//
// Deprecated: server ignores this field.
EphemeralDiskMB int64 `json:"ephemeral_disk_mb,omitempty"`
// GPUs lists the GPU resources; omitted from JSON when empty.
GPUs []GPUResources `json:"gpus,omitempty"`
}
SandboxResourceOverrides configures resource allocation for a sandbox.
type SandboxStatus ¶ added in v0.3.0
type SandboxStatus string
SandboxStatus represents the current state of a sandbox.
const ( SandboxStatusPending SandboxStatus = "pending" SandboxStatusRunning SandboxStatus = "running" SandboxStatusSnapshotting SandboxStatus = "snapshotting" SandboxStatusSuspending SandboxStatus = "suspending" SandboxStatusSuspended SandboxStatus = "suspended" SandboxStatusTerminated SandboxStatus = "terminated" )
type SignalProcessRequest ¶ added in v0.3.0
type SignalProcessRequest struct {
// Signal is the signal to send to the process, as an integer; presumably
// a POSIX signal number — TODO confirm.
Signal int32 `json:"signal"`
}
SignalProcessRequest holds the signal to send to a process.
type SnapshotContentMode ¶ added in v0.3.0
type SnapshotContentMode string
SnapshotContentMode determines what content is captured in a snapshot.
const ( SnapshotContentModeFull SnapshotContentMode = "full" SnapshotContentModeFilesystemOnly SnapshotContentMode = "filesystem_only" )
type SnapshotSandboxRequest ¶ added in v0.3.0
type SnapshotSandboxRequest struct {
// SnapshotContentMode determines what content is captured in the
// snapshot; see the SnapshotContentMode constants. Omitted from JSON
// when empty.
SnapshotContentMode SnapshotContentMode `json:"snapshot_content_mode,omitempty"`
}
SnapshotSandboxRequest holds options for snapshotting a sandbox.
type SnapshotSandboxResponse ¶ added in v0.3.0
type SnapshotSandboxResponse struct {
// SnapshotId is the identifier of the snapshot.
SnapshotId string `json:"snapshot_id"`
// Status is the snapshot status as a plain string; its possible values
// are not visible here — TODO confirm.
Status string `json:"status"`
}
SnapshotSandboxResponse represents the response from snapshotting a sandbox.
type StartProcessRequest ¶ added in v0.3.0
type StartProcessRequest struct {
// Command is the executable to run.
//
// Required.
Command string `json:"command"`
// Args are command-line arguments.
Args []string `json:"args,omitempty"`
// Env sets environment variables for the process.
Env map[string]string `json:"env,omitempty"`
// WorkingDir is the working directory for the process.
WorkingDir string `json:"working_dir,omitempty"`
// StdinMode determines how stdin is handled. Default: "closed".
StdinMode StdinMode `json:"stdin_mode,omitempty"`
// StdoutMode determines how stdout is handled. Default: "capture".
StdoutMode OutputMode `json:"stdout_mode,omitempty"`
// StderrMode determines how stderr is handled. Default: "capture".
StderrMode OutputMode `json:"stderr_mode,omitempty"`
}
StartProcessRequest holds options for starting a process in a sandbox.
type StdinMode ¶ added in v0.3.0
type StdinMode string
StdinMode determines how stdin is handled for a process.
type StructuredData ¶
type StructuredData struct {
// Data is a JSON object containing the structured data extracted from the document.
// The schema is specified in the StructuredExtractionOptions.JSONSchema field.
// It is kept as raw JSON so callers can decode it into their own type.
Data json.RawMessage `json:"data"`
// PageNumbers contains the page numbers the data refers to. In JSON it is
// either a single integer or an array of integers.
// Example: [1, 2, 3] or 1
PageNumbers UnionValues[int] `json:"page_numbers"`
// SchemaName is the name of the schema used to extract the structured data.
// It is specified in the StructuredExtractionOptions.SchemaName field.
SchemaName string `json:"schema_name,omitempty"`
}
StructuredData holds structured data extracted from the document. In the API response, the structured data is described as a map where the keys are the schema names provided in the parse request, and the values are StructuredData objects containing the structured data extracted from the document.
type StructuredExtractionOptions ¶
type StructuredExtractionOptions struct {
// SchemaName is the name of the schema. This is used to tag the
// structured data output with a name in the response.
SchemaName string `json:"schema_name"`
// JSONSchema is the JSON schema to guide structured data extraction from
// the file.
//
// This schema should be a valid JSON schema that defines the structure of
// the data to be extracted.
//
// The API supports a subset of the JSON schema specification.
//
// This value must be provided if structured_extraction is present in the
// request.
JSONSchema *jsonschema.Schema `json:"json_schema"` // Can be any JSON schema structure
// PartitionStrategy is the strategy used to partition the document before
// structured data extraction. The API will return one structured data
// object per partition. This is useful when you want to extract certain
// fields from every page.
PartitionStrategy PartitionStrategy `json:"partition_strategy,omitempty"`
// ModelProvider is the model provider to use for structured data
// extraction.
//
// The default is tensorlake, which uses our private model, and runs on
// our servers.
ModelProvider ModelProvider `json:"model_provider,omitempty"`
// PageClasses filters the pages of the document to be used for structured
// data extraction by providing a list of page classes.
PageClasses []string `json:"page_classes,omitempty"`
// Prompt is the prompt to use for structured data extraction.
//
// If not provided, the default prompt will be used.
Prompt string `json:"prompt,omitempty"`
// ProvideCitations enables visual citations in the structured data
// output. When enabled, the output includes the bounding boxes of the
// regions of the document from which the structured data was extracted.
ProvideCitations bool `json:"provide_citations,omitempty"`
// SkipOCR skips converting the document blob to OCR text before
// structured data extraction.
//
// If set to true, the API will skip the OCR step and directly extract
// structured data from the document.
SkipOCR bool `json:"skip_ocr,omitempty"`
}
StructuredExtractionOptions holds configuration for structured data extraction.
type TableOutputMode ¶
type TableOutputMode string
TableOutputMode is the format for tables extracted from the document.
const ( // TableOutputModeHTML outputs tables as HTML strings. TableOutputModeHTML TableOutputMode = "html" // TableOutputModeMarkdown outputs tables as Markdown strings. TableOutputModeMarkdown TableOutputMode = "markdown" )
type TableParsingFormat ¶
type TableParsingFormat string
TableParsingFormat determines which model the system uses to identify and extract tables from the document.
const ( // TableParsingFormatTSR identifies the structure of the table first, // then the cells of the tables. Better suited for clean, grid-like tables. TableParsingFormatTSR TableParsingFormat = "tsr" // TableParsingFormatVLM uses a vision language model to identify // and extract the cells of the tables. Better suited for tables // with merged cells or irregular structures. TableParsingFormatVLM TableParsingFormat = "vlm" )
type UnionValues ¶
type UnionValues[T any] []T
UnionValues is a union of values of type T. It can be a single value or an array of values.
func (UnionValues[T]) MarshalJSON ¶
func (v UnionValues[T]) MarshalJSON() ([]byte, error)
MarshalJSON marshals a UnionValues into a JSON array.
func (*UnionValues[T]) UnmarshalJSON ¶
func (v *UnionValues[T]) UnmarshalJSON(b []byte) error
UnmarshalJSON unmarshals a JSON array or a single value into a UnionValues.
type UpdateDatasetRequest ¶
type UpdateDatasetRequest struct {
// DatasetId identifies the dataset to update. The `json:"-"` tag keeps it
// out of the JSON body; presumably it is sent in the request URL
// instead — TODO confirm.
DatasetId string `json:"-"`
// Description is the dataset description; omitted from JSON when empty.
Description string `json:"description,omitempty"`
// ParsingOptions is optional and omitted from JSON when nil.
ParsingOptions *ParsingOptions `json:"parsing_options,omitempty"`
// StructuredExtractionOptions is omitted from JSON when empty.
StructuredExtractionOptions []StructuredExtractionOptions `json:"structured_extraction_options,omitempty"`
// PageClassifications is omitted from JSON when empty.
PageClassifications []PageClassConfig `json:"page_classifications,omitempty"`
// EnrichmentOptions is optional and omitted from JSON when nil.
EnrichmentOptions *EnrichmentOptions `json:"enrichment_options,omitempty"`
}
UpdateDatasetRequest holds options for updating a dataset.
type UpdateSandboxRequest ¶ added in v0.3.0
type UpdateSandboxRequest struct {
// AllowUnauthenticatedAccess is a pointer so that nil (field omitted from
// the JSON body) can be told apart from an explicit false.
AllowUnauthenticatedAccess *bool `json:"allow_unauthenticated_access,omitempty"`
// ExposedPorts is omitted from JSON when empty.
ExposedPorts []int32 `json:"exposed_ports,omitempty"`
}
UpdateSandboxRequest holds options for updating a sandbox.
type UploadFileRequest ¶
type UploadFileRequest struct {
// FileBytes is the reader for the file to upload.
//
// Required.
FileBytes io.Reader `json:"file_bytes"`
// FileName is the name of the file to upload.
//
// Optional.
FileName string `json:"file_name"`
// Labels are the labels to add to the file.
//
// Optional.
Labels map[string]string `json:"labels,omitempty"`
}
UploadFileRequest holds options for uploading a file.
type Usage ¶ added in v0.1.1
type Usage struct {
// NOTE(review): the field descriptions below are inferred from field
// names and JSON tags; confirm against the Tensorlake API reference.

// PagesParsed is the number of pages parsed.
PagesParsed int `json:"pages_parsed"`
// SignatureDetectedPages is presumably the number of pages counted for
// signature detection.
SignatureDetectedPages int `json:"signature_detected_pages"`
// StrikethroughDetectedPages is presumably the number of pages counted
// for strikethrough detection.
StrikethroughDetectedPages int `json:"strikethrough_detected_pages"`
// OCRInputTokensUsed is the number of input tokens used for OCR.
OCRInputTokensUsed int `json:"ocr_input_tokens_used"`
// OCROutputTokensUsed is the number of output tokens used for OCR.
OCROutputTokensUsed int `json:"ocr_output_tokens_used"`
// ExtractionInputTokensUsed is the number of input tokens used for
// extraction.
ExtractionInputTokensUsed int `json:"extraction_input_tokens_used"`
// ExtractionOutputTokensUsed is the number of output tokens used for
// extraction.
ExtractionOutputTokensUsed int `json:"extraction_output_tokens_used"`
// SummarizationInputTokensUsed is presumably the number of input tokens
// used for summarization.
SummarizationInputTokensUsed int `json:"summarization_input_tokens_used"`
// SummarizationOutputTokensUsed is presumably the number of output
// tokens used for summarization.
SummarizationOutputTokensUsed int `json:"summarization_output_tokens_used"`
}
Usage contains resource usage associated with the parse job. This includes details such as number of pages parsed, tokens used for OCR and extraction, etc. Usage is only populated for successful jobs. Billing is based on the resource usage.
Source Files
¶
- client.go
- dataset_create.go
- dataset_delete.go
- dataset_get.go
- dataset_list.go
- dataset_parse.go
- dataset_update.go
- doc.go
- enum.go
- errors.go
- file_delete.go
- file_list.go
- file_metadata.go
- file_upload.go
- opt.go
- parse_classify.go
- parse_delete.go
- parse_extract.go
- parse_get.go
- parse_list.go
- parse_parse.go
- parse_read.go
- sandbox.go
- sandbox_api.go
- sandbox_create.go
- sandbox_delete.go
- sandbox_file_delete.go
- sandbox_file_list.go
- sandbox_file_read.go
- sandbox_file_write.go
- sandbox_get.go
- sandbox_list.go
- sandbox_process.go
- sandbox_process_close_stdin.go
- sandbox_process_follow_output.go
- sandbox_process_follow_stderr.go
- sandbox_process_follow_stdout.go
- sandbox_process_get.go
- sandbox_process_kill.go
- sandbox_process_list.go
- sandbox_process_output.go
- sandbox_process_signal.go
- sandbox_process_start.go
- sandbox_process_stderr.go
- sandbox_process_stdin.go
- sandbox_process_stdout.go
- sandbox_pty_create.go
- sandbox_pty_get.go
- sandbox_pty_kill.go
- sandbox_pty_list.go
- sandbox_pty_resize.go
- sandbox_pty_websocket.go
- sandbox_resume.go
- sandbox_snapshot.go
- sandbox_suspend.go
- sandbox_update.go
- types.go
Directories
¶
| Path | Synopsis |
|---|---|
| examples | |
| sandbox-benchmark (command) | Command sandbox-benchmark stress-tests sandbox creation latency by running a series of concurrency levels, launching N sandboxes at each level and measuring create API response time and time-to-running. |
| sandbox-terminal (command) | Command sandbox-terminal creates a Tensorlake sandbox and connects an interactive terminal session to it via PTY over WebSocket. |
| internal | |