Documentation
¶
Index ¶
- Constants
- Variables
- func ConvertDevicesToGPUInfo(devices []*hvapi.DeviceInfo) []api.GPUInfo
- func ConvertMetricsToGPUMetrics(metrics map[string]*hvapi.GPUUsageMetrics) map[string]*api.GPUMetrics
- func CreateMockGPUs(count int) []api.GPUInfo
- func DetectVendorFromLibPath(libPath string) string
- func DownloadOrFindAccelerator() (string, error)
- type Agent
- type AlreadyRegisteredError
- type LocalStatus
- type NotRegisteredError
Constants ¶
const (
	// EnvConnectionInfoPath is the environment variable name for connection info directory path
	// Workers should write their connections to: {TF_CLIENT_INFO_PATH}/{workerID}.txt
	EnvConnectionInfoPath = "TF_CLIENT_INFO_PATH"

	// Hard limiter environment variables for Fractional GPU support
	// These only take effect when isolation_mode is "hard"

	// HardSMLimiterEnv sets compute limit in percent (1-100)
	HardSMLimiterEnv = "TF_CUDA_SM_PERCENT_LIMIT"
	// HardMemLimiterEnv sets memory limit in megabytes
	HardMemLimiterEnv = "TF_GPU_MEMORY_LIMIT"

	// URL auth environment variables

	// EnvURLAuth enables URL-based authentication on workers
	EnvURLAuth = "TF_ENABLE_URL_AUTH"
	// EnvAuthorizedKeyPath points to a file containing authorized share codes (one per line)
	EnvAuthorizedKeyPath = "TF_AUTHORIZED_KEY_PATH"
)
Variables ¶
var ErrAlreadyRegistered = &AlreadyRegisteredError{}
ErrAlreadyRegistered indicates the agent is already registered
var ErrNotRegistered = &NotRegisteredError{}
ErrNotRegistered indicates the agent is not registered
Functions ¶
func ConvertDevicesToGPUInfo ¶
func ConvertDevicesToGPUInfo(devices []*hvapi.DeviceInfo) []api.GPUInfo
ConvertDevicesToGPUInfo converts hypervisor DeviceInfo to API GPUInfo
func ConvertMetricsToGPUMetrics ¶
func ConvertMetricsToGPUMetrics(metrics map[string]*hvapi.GPUUsageMetrics) map[string]*api.GPUMetrics
ConvertMetricsToGPUMetrics converts hypervisor metrics to API metrics
func CreateMockGPUs ¶
CreateMockGPUs creates mock GPU info for testing without real hardware
func DetectVendorFromLibPath ¶
DetectVendorFromLibPath extracts vendor name from library path
func DownloadOrFindAccelerator ¶
DownloadOrFindAccelerator detects the vendor, then finds or downloads the accelerator library. Priority: 1) config, 2) system detection, 3) download from CDN if not found.
Types ¶
type Agent ¶
type Agent struct {
// contains filtered or unexported fields
}
Agent manages the GPU agent lifecycle
func NewAgentWithHypervisor ¶
func NewAgentWithHypervisor(client *api.Client, configMgr *config.Manager, hvMgr hypervisor.HypervisorManager, workerBinaryPath string) *Agent
NewAgentWithHypervisor creates a new agent with hypervisor manager
func (*Agent) GetHypervisorManager ¶
func (a *Agent) GetHypervisorManager() hypervisor.HypervisorManager
GetHypervisorManager returns the hypervisor manager if available
func (*Agent) GetReconciler ¶
func (a *Agent) GetReconciler() *hypervisor.Reconciler
GetReconciler returns the reconciler if available
type AlreadyRegisteredError ¶ added in v1.1.21
type AlreadyRegisteredError struct{}
AlreadyRegisteredError is returned when the agent is already registered
func (*AlreadyRegisteredError) Error ¶ added in v1.1.21
func (e *AlreadyRegisteredError) Error() string
type LocalStatus ¶
LocalStatus represents the local agent status
func GetLocalStatus ¶
func GetLocalStatus(paths *platform.Paths) LocalStatus
GetLocalStatus checks the local agent status by reading the PID file and checking the process.
type NotRegisteredError ¶
type NotRegisteredError struct{}
NotRegisteredError is returned when the agent is not registered
func (*NotRegisteredError) Error ¶
func (e *NotRegisteredError) Error() string