Documentation
¶
Index ¶
- Variables
- type CapacityError
- type CedanaCluster
- type CedanaState
- type CheckpointReason
- type CheckpointState
- type CheckpointType
- type ClientInfo
- type Commands
- type GPU
- type GpuInfo
- type Instance
- type Job
- type JobFile
- type JobState
- type MetaState
- type PricingModel
- type ProcessInfo
- type Provider
- type ProviderEvent
- type SerializedInstance
- type ServerCommand
- type UserCommands
- type UserInstanceSpecs
Constants ¶
This section is empty.
Variables ¶
// InstanceStates maps an instance state name to its integer code.
var InstanceStates = map[string]int{
	"pending":       0,
	"running":       1,
	"shutting-down": 2,
	"terminated":    3,
	"stopping":      4,
}
// ProviderNames lists the provider name strings declared by this package,
// in declaration order.
var ProviderNames = []string{"aws", "gcp", "azure", "paperspace"}
Functions ¶
This section is empty.
Types ¶
type CapacityError ¶
func (CapacityError) Error ¶
func (e CapacityError) Error() string
type CedanaCluster ¶
type CedanaState ¶
// CedanaState groups client information, process information and checkpoint
// metadata into a single value. Every field carries matching json and
// mapstructure tags.
type CedanaState struct {
	ClientInfo     ClientInfo     `json:"client_info" mapstructure:"client_info"`
	ProcessInfo    ProcessInfo    `json:"process_info" mapstructure:"process_info"`
	CheckpointType CheckpointType `json:"checkpoint_type" mapstructure:"checkpoint_type"`
	// CheckpointPath is either a local or a remote checkpoint location
	// (filesystem path vs. URL).
	CheckpointPath string `json:"checkpoint_path" mapstructure:"checkpoint_path"`
	// CheckpointState is the process state at the time of the checkpoint.
	CheckpointState CheckpointState `json:"checkpoint_state" mapstructure:"checkpoint_state"`
}
CedanaState encapsulates a CRIU checkpoint and includes the filesystem state needed for a full restore. It is typically serialized and sent over the wire.
type CheckpointReason ¶
// CheckpointReason is a string-typed identifier for why a checkpoint was taken.
type CheckpointReason string

// Known CheckpointReason values.
const (
	CheckpointReasonInstanceTermination CheckpointReason = "instance_termination"
	CheckpointReasonJobTermination      CheckpointReason = "job_termination"
	CheckpointReasonHeartbeat           CheckpointReason = "heartbeat"
)
type CheckpointState ¶
// CheckpointState is a string-typed outcome of a checkpoint or restore.
type CheckpointState string

// Known CheckpointState values.
const (
	CheckpointSuccess CheckpointState = "CHECKPOINTED"
	CheckpointFailed  CheckpointState = "CHECKPOINT_FAILED"
	RestoreSuccess    CheckpointState = "RESTORED"
	RestoreFailed     CheckpointState = "RESTORE_FAILED"
)
type CheckpointType ¶
// CheckpointType is a string-typed identifier for the checkpoint mechanism.
type CheckpointType string

// Known CheckpointType values.
const (
	CheckpointTypeNone    CheckpointType = "none"
	CheckpointTypeCRIU    CheckpointType = "criu"
	CheckpointTypePytorch CheckpointType = "pytorch"
)
type ClientInfo ¶
// ClientInfo holds identifying and resource fields for a client: ID,
// hostname, platform, OS, uptime and remaining memory. The units of Uptime
// and RemainingMemory are not specified in this declaration.
type ClientInfo struct {
	Id              string `json:"id" mapstructure:"id"`
	Hostname        string `json:"hostname" mapstructure:"hostname"`
	Platform        string `json:"platform" mapstructure:"platform"`
	OS              string `json:"os" mapstructure:"os"`
	Uptime          uint64 `json:"uptime" mapstructure:"uptime"`
	RemainingMemory uint64 `json:"remaining_memory" mapstructure:"remaining_memory"`
}
type Instance ¶
// Instance describes a single compute instance. It embeds gorm.Model and
// carries provider, hardware, location, pricing and state fields.
type Instance struct {
	gorm.Model
	// CedanaID is excluded from JSON (tag "-"). Cedana ID used for NATS messages.
	CedanaID string `json:"-"`
	// AllocatedID is the ID allocated by the provider; not to be used as a key.
	AllocatedID      string  `json:"allocated_id"`
	Provider         string  `json:"provider"`
	InstanceType     string  `json:"InstanceType"`
	AcceleratorName  string  `json:"AcceleratorName"`
	AcceleratorCount int     `json:"AcceleratorCount"`
	VCPUs            float64 `json:"vCPUs"`
	MemoryGiB        float64 `json:"MemoryGiB"`
	GPUs             string  `json:"GPU"`
	Region           string  `json:"Region"`
	AvailabilityZone string  `json:"AvailabilityZone"`
	Price            float64 `json:"Price"`
	IPAddress        string  `json:"ip_addr"`
	State            string  `json:"state"`
	// Tag is excluded from JSON (tag "-"). It tags the instance as orch or client.
	Tag string `json:"-"`
}
func (*Instance) DeserializeSelf ¶
func (*Instance) SerializeSelf ¶
type Job ¶
// Job is a database-backed record (it embeds gorm.Model) describing a job:
// its ID, job file path, serialized instances, state, checkpoint status and
// bucket.
type Job struct {
	gorm.Model
	// NOTE(review): the previous comment here read "ignore json unmarshal",
	// but the tag is "job_id" rather than "-", so encoding/json does read
	// and write this field - confirm the intent.
	JobID string `json:"job_id"`
	// JobFilePath is the absolute path of the job file.
	JobFilePath string `json:"job_file_path"`
	// Instances holds the serialized instances.
	// TODO: need to figure out associations!!
	Instances    string   `json:"instances"`
	State        JobState `json:"state"`
	Checkpointed bool     `json:"checkpointed"`
	// LastCheckpointedAt is the time of the latest checkpoint.
	LastCheckpointedAt time.Time `json:"last_checkpointed_at"`
	Bucket             string    `json:"bucket"`
}
Foreign keys are awkward in GORM, so for now a Job just carries its instance IDs.
func (*Job) AppendInstance ¶
these should ideally be called from the db - keeps things consistent
func (*Job) GetInstanceIds ¶
func (j *Job) GetInstanceIds() ([]SerializedInstance, error)
type JobFile ¶
// JobFile is the decoded form of a job specification. Its fields are mapped
// by mapstructure keys: job file path, working directory, a containerized
// flag, instance specs, and the setup, task and restored-task command sets.
type JobFile struct {
	JobFilePath string `mapstructure:"job_file_path"`
	// TODO NR - data storage abstractions
	WorkDir           string            `mapstructure:"work_dir"`
	Containerized     bool              `mapstructure:"containerized"`
	UserInstanceSpecs UserInstanceSpecs `mapstructure:"instance_specs"`
	SetupCommands     Commands          `mapstructure:"setup"`
	Task              Commands          `mapstructure:"task"`
	RestoredTask      Commands          `mapstructure:"restored_task"`
}
JobFile is the job type used to run a job on an instance. It is user-defined and should be supplied as a YAML spec.
func InitJobFile ¶
type MetaState ¶
// MetaState pairs a provider event with the reason for a checkpoint. Both
// fields carry matching json and mapstructure tags.
type MetaState struct {
	Event            ProviderEvent    `json:"provider_event" mapstructure:"provider_event"`
	CheckpointReason CheckpointReason `json:"checkpoint_reason" mapstructure:"checkpoint_reason"`
}
type PricingModel ¶
// PricingModel is implemented by anything that can produce a list of
// instances via GetPrices.
type PricingModel interface {
	// GetPrices returns a slice of Instance values.
	// Presumably each has its Price field populated - verify against implementations.
	GetPrices() []Instance
}
PricingModel populates Instance.Price
type ProcessInfo ¶
// ProcessInfo records facts about a single process: its PID, open files and
// connections, memory usage and run status. Every field carries matching
// json and mapstructure tags.
type ProcessInfo struct {
	PID                     int32 `json:"pid" mapstructure:"pid"`
	AttachedToHardwareAccel bool  `json:"attached_to_hardware_accel" mapstructure:"attached_to_hardware_accel"`
	// OpenFds is the list of open FDs.
	OpenFds                []process.OpenFilesStat `json:"open_fds" mapstructure:"open_fds"`
	OpenWriteOnlyFilePaths []string                `json:"open_write_only" mapstructure:"open_write_only"`
	// OpenConnections is the list of open network connections.
	OpenConnections []net.ConnectionStat `json:"open_connections" mapstructure:"open_connections"`
	// MemoryPercent is the percentage of total RAM used.
	MemoryPercent float32 `json:"memory_percent" mapstructure:"memory_percent"`
	IsRunning     bool    `json:"is_running" mapstructure:"is_running"`
	Status        string  `json:"status" mapstructure:"status"`
}
type Provider ¶
// Provider is the set of operations a provider implementation exposes:
// naming itself, creating, destroying and describing instances, and
// reporting instance status.
type Provider interface {
	// Name returns a string; presumably the provider's name - verify
	// against implementations.
	Name() string

	// CreateInstance takes a candidate instance and returns an instance or
	// an error.
	// NOTE(review): the previous comment described this as taking a list of
	// "optimal" instances, multiple being passed to circumvent capacity
	// issues; the signature accepts a single candidate - confirm which is
	// intended.
	CreateInstance(Candidate *Instance) (*Instance, error)

	DestroyInstance(i Instance) error

	// Anywhere DescribeInstance is called, the entry in the db should be
	// updated with the latest information.
	DescribeInstance(Instances []*Instance, filter string) error

	// GetInstanceStatus should encapsulate all events or state changes on
	// the instance. It is used for regular state polling, so keep
	// efficiency in mind when designing for a provider.
	GetInstanceStatus(i Instance) (*ProviderEvent, error)
}
Types for commodity providers (e.g. AWS, GCP). Provider is the generic interface for a commodity provider; each provider we broker between (AWS, GCP, etc.) implements it.
type ProviderEvent ¶
// ProviderEvent carries an instance ID and fault code together with
// termination details.
type ProviderEvent struct {
	InstanceID string `json:"instance_id"`
	FaultCode  string `json:"fault_code"`
	// The fields below are derivatives of the fields above; the fault code
	// is kept for any downstream processing.
	MarkedForTermination bool  `json:"marked_for_termination"`
	TerminationTime      int64 `json:"termination_time"`
}
type SerializedInstance ¶
// SerializedInstance is the serialized form of an instance reference: it
// holds only the instance ID, under the JSON key "instance_id".
type SerializedInstance struct {
	InstanceID string `json:"instance_id"`
}
Only the instance ID is serialized; the full instance can be found by a reverse lookup on that ID.
type ServerCommand ¶
// ServerCommand bundles a command string, a heartbeat flag and a CedanaState
// value. Every field carries matching json and mapstructure tags.
type ServerCommand struct {
	Command     string      `json:"command" mapstructure:"command"`
	Heartbeat   bool        `json:"heartbeat" mapstructure:"heartbeat"`
	CedanaState CedanaState `json:"cedana_state" mapstructure:"cedana_state"`
}
type UserCommands ¶
// UserCommands groups the command sets keyed, via mapstructure, as setup,
// post_setup, pre_checkpoint, post_checkpoint, pre_restore and post_restore.
type UserCommands struct {
	SetupCommands     Commands `mapstructure:"setup"`
	PostSetupCommands Commands `mapstructure:"post_setup"`
	PreCheckpoint     Commands `mapstructure:"pre_checkpoint"`
	PostCheckpoint    Commands `mapstructure:"post_checkpoint"`
	PreRestore        Commands `mapstructure:"pre_restore"`
	PostRestore       Commands `mapstructure:"post_restore"`
}
due to key-value nature of yaml, need a nested commands struct