Documentation
¶
Index ¶
- func AddAllocationAcceleratorData(ctx context.Context, accData model.AcceleratorData) error
- func InsertNTSCAllocationWorkspaceRecord(ctx context.Context, allocationID model.AllocationID, workspaceID int, ...) error
- func InsertTrialAllocationWorkspaceRecord(ctx context.Context, experimentID int, allocationID model.AllocationID) error
- type AllocationExited
- type AllocationService
- type AllocationSignal
- type AllocationState
- type AllocationUnfulfilledError
- type AlreadyCancelledError
- type BehaviorDisabledError
- type BehaviorUnsupportedError
- type NoAllocationError
- type RendezvousInfoOrError
- type RendezvousWatcher
- type StaleContainerError
- type StaleResourcesError
- type StaleResourcesReceivedError
- type TimeoutExceededError
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AddAllocationAcceleratorData ¶
func AddAllocationAcceleratorData(ctx context.Context, accData model.AcceleratorData, ) error
AddAllocationAcceleratorData stores accelerator data for an allocation.
func InsertNTSCAllocationWorkspaceRecord ¶
func InsertNTSCAllocationWorkspaceRecord( ctx context.Context, allocationID model.AllocationID, workspaceID int, workspaceName string, ) error
InsertNTSCAllocationWorkspaceRecord inserts a record linking an NTSC task's allocation to its respective workspace.
func InsertTrialAllocationWorkspaceRecord ¶
func InsertTrialAllocationWorkspaceRecord( ctx context.Context, experimentID int, allocationID model.AllocationID, ) error
InsertTrialAllocationWorkspaceRecord inserts a record linking a trial's allocation to its respective workspace and experiment.
Types ¶
type AllocationExited ¶
type AllocationExited struct {
// UserRequestedStop is when a container unexpectedly exits with 0.
UserRequestedStop bool
Err error
FinalState AllocationState
}
AllocationExited summarizes the exit status of an allocation.
func (*AllocationExited) String ¶
func (a *AllocationExited) String() string
type AllocationService ¶
type AllocationService interface {
GetAllAllocationIDs() []model.AllocationID
StartAllocation(
logCtx logger.Context,
req sproto.AllocateRequest,
db db.DB,
rm rm.ResourceManager,
specifier tasks.TaskSpecifier,
onExit func(*AllocationExited),
) error
AwaitTermination(id model.AllocationID)
Signal(
id model.AllocationID,
sig AllocationSignal,
reason string,
) error
State(id model.AllocationID) (AllocationState, error)
SetReady(ctx context.Context, id model.AllocationID) error
SetWaiting(ctx context.Context, id model.AllocationID) error
SetProxyAddress(
ctx context.Context,
id model.AllocationID,
addr string,
) error
GetAllocation(
ctx context.Context,
allocallocationID string,
) (*model.Allocation, error)
SetAcceleratorData(
ctx context.Context,
accData model.AcceleratorData,
) error
WatchRendezvous(
ctx context.Context,
id model.AllocationID,
rID sproto.ResourcesID,
) (*trialv1.RendezvousInfo, error)
SetResourcesAsDaemon(
ctx context.Context,
id model.AllocationID,
rID sproto.ResourcesID,
) error
AllGather(
ctx context.Context,
allocationID model.AllocationID,
id uuid.UUID,
numPeers int,
data any,
) ([]any, error)
WatchPreemption(ctx context.Context, id model.AllocationID) (bool, error)
AckPreemption(ctx context.Context, id model.AllocationID) error
SendLog(
ctx context.Context,
id model.AllocationID,
log *sproto.ContainerLog,
)
WaitForRestore(ctx context.Context, id model.AllocationID) error
Detach(id model.AllocationID) error
}
AllocationService allows callers to launch, direct and query allocations.
var DefaultService AllocationService = newAllocationService()
DefaultService is the singleton default allocationService.
type AllocationSignal ¶
type AllocationSignal string
AllocationSignal is the type of signals that can be sent to an allocation.
const (
// KillAllocation is the signal to kill an allocation; analogous to SIGKILL.
KillAllocation AllocationSignal = "kill"
// TerminateAllocation is the signal to terminate an allocation; analogous to SIGTERM.
TerminateAllocation AllocationSignal = "terminate"
)
type AllocationState ¶
type AllocationState struct {
State model.AllocationState
Resources map[sproto.ResourcesID]sproto.ResourcesSummary
Ready bool
Addresses map[sproto.ResourcesID][]cproto.Address
Containers map[sproto.ResourcesID][]cproto.Container
}
AllocationState requests allocation state. A copy is filled and returned.
func (AllocationState) SingleContainer ¶
func (a AllocationState) SingleContainer() *cproto.Container
SingleContainer returns a single random container from the allocation state.
func (AllocationState) SingleContainerAddresses ¶
func (a AllocationState) SingleContainerAddresses() []cproto.Address
SingleContainerAddresses returns a single random container's addresses from the allocation state.
type AllocationUnfulfilledError ¶
type AllocationUnfulfilledError struct {
Action string
}
AllocationUnfulfilledError is returned when an operation is tried without an active allocation.
func (AllocationUnfulfilledError) Error ¶
func (e AllocationUnfulfilledError) Error() string
type AlreadyCancelledError ¶
type AlreadyCancelledError struct{}
AlreadyCancelledError is returned to the allocation when it tries to take an action but has an unread cancellation in its inbox.
func (AlreadyCancelledError) Error ¶
func (e AlreadyCancelledError) Error() string
type BehaviorDisabledError ¶
type BehaviorDisabledError struct {
Behavior string
}
BehaviorDisabledError is returned when an operation is tried without the behavior being enabled.
func (BehaviorDisabledError) Error ¶
func (e BehaviorDisabledError) Error() string
type BehaviorUnsupportedError ¶
type BehaviorUnsupportedError struct {
Behavior string
}
BehaviorUnsupportedError is returned when an operation is tried without the behavior being supported.
func (BehaviorUnsupportedError) Error ¶
func (e BehaviorUnsupportedError) Error() string
type NoAllocationError ¶
type NoAllocationError struct {
Action string
}
NoAllocationError is returned when an operation is tried without a requested allocation.
func (NoAllocationError) Error ¶
func (e NoAllocationError) Error() string
type RendezvousInfoOrError ¶
type RendezvousInfoOrError struct {
Info *trialv1.RendezvousInfo
Err error
}
RendezvousInfoOrError contains either rendezvous info or an error from failing to materialize it.
type RendezvousWatcher ¶
type RendezvousWatcher struct {
C <-chan RendezvousInfoOrError
}
RendezvousWatcher contains a channel which can be polled for rendezvous info.
type StaleContainerError ¶
StaleContainerError is returned when an operation was attempted by a stale container.
func (StaleContainerError) Error ¶
func (e StaleContainerError) Error() string
type StaleResourcesError ¶
type StaleResourcesError struct {
ID sproto.ResourcesID
}
StaleResourcesError is returned when an operation was attempted by stale resources.
func (StaleResourcesError) Error ¶
func (e StaleResourcesError) Error() string
type StaleResourcesReceivedError ¶
type StaleResourcesReceivedError struct{}
StaleResourcesReceivedError is returned when the scheduler gives an allocation resources between when it requests them and when it decides, for some reason or another, that they are not needed.
func (StaleResourcesReceivedError) Error ¶
func (e StaleResourcesReceivedError) Error() string
type TimeoutExceededError ¶
type TimeoutExceededError struct {
Message string
}
TimeoutExceededError is returned, with a bit of detail, when a timeout is exceeded.
func (TimeoutExceededError) Error ¶
func (e TimeoutExceededError) Error() string