Documentation
¶
Index ¶
- Constants
- func CreateHostConfig(config *HostConfig) error
- func CreatePipelineConfig(cfg *PipelineConfig) error
- func CreateProject(project *Project) error
- func CreateSnapshot(snapshot *Snapshot) error
- func CreateTask(task *Task) error
- func DeleteHostConfig(id string) error
- func DeletePipelineConfig(id string) error
- func DeleteProject(id string) error
- func DeleteSnapshot(snapshot *Snapshot) error
- func DeleteTask(id string) error
- func GetAllRegisteredJoints() map[string]interface{}
- func GetHostStatus(status int) (error, map[string]interface{})
- func GetPermissionsByRole(role string) (*hashset.Set, error)
- func GetTaskStatus(host string) (error, map[string]interface{})
- func GetTaskStatusText(status int) string
- func RegisterPipeJoint(joint Joint)
- func RegisterPipeJointWithName(jointName string, joint Joint)
- func UpdateHostConfig(config *HostConfig) error
- func UpdatePipelineConfig(id string, cfg *PipelineConfig) error
- func UpdateProject(project *Project) error
- func UpdateTask(task *Task) error
- type Aggregation
- type Bucket
- type Context
- type Host
- type HostConfig
- type IndexDocument
- type Joint
- type JointConfig
- type KV
- type LinkGroup
- type PageLink
- type ParaKey
- type Parameters
- func (para *Parameters) Get(key ParaKey) interface{}
- func (para *Parameters) GetBool(key ParaKey, defaultV bool) bool
- func (para *Parameters) GetInt(key ParaKey, defaultV int) (int, bool)
- func (para *Parameters) GetInt64(key ParaKey, defaultV int64) (int64, bool)
- func (para *Parameters) GetInt64OrDefault(key ParaKey, defaultV int64) int64
- func (para *Parameters) GetIntOrDefault(key ParaKey, defaultV int) int
- func (para *Parameters) GetMap(key ParaKey) (map[string]interface{}, bool)
- func (para *Parameters) GetOrDefault(key ParaKey, val interface{}) interface{}
- func (para *Parameters) GetString(key ParaKey) (string, bool)
- func (para *Parameters) GetStringArray(key ParaKey) ([]string, bool)
- func (para *Parameters) GetStringOrDefault(key ParaKey, val string) string
- func (para *Parameters) GetTime(key ParaKey) (time.Time, bool)
- func (para *Parameters) Has(key ParaKey) bool
- func (para *Parameters) MustGet(key ParaKey) interface{}
- func (para *Parameters) MustGetBytes(key ParaKey) []byte
- func (para *Parameters) MustGetInt(key ParaKey) int
- func (para *Parameters) MustGetInt64(key ParaKey) int64
- func (para *Parameters) MustGetMap(key ParaKey) map[string]interface{}
- func (para *Parameters) MustGetString(key ParaKey) string
- func (para *Parameters) MustGetTime(key ParaKey) time.Time
- func (para *Parameters) Set(key ParaKey, value interface{})
- type Pipeline
- func (pipe *Pipeline) Context(s *Context) *Pipeline
- func (pipe *Pipeline) End(s Joint) *Pipeline
- func (pipe *Pipeline) GetContext() *Context
- func (pipe *Pipeline) GetCurrentJoint() string
- func (pipe *Pipeline) GetID() string
- func (pipe *Pipeline) Join(s Joint) *Pipeline
- func (pipe *Pipeline) Run() *Context
- func (pipe *Pipeline) Start(s Joint) *Pipeline
- type PipelineConfig
- type Project
- type Snapshot
- type Task
- func GetFailedTasks(offset time.Time) (int, []Task, error)
- func GetPendingNewFetchTasks(offset time.Time) (int, []Task, error)
- func GetPendingUpdateFetchTasks(offset time.Time) (int, []Task, error)
- func GetTask(id string) (Task, error)
- func GetTaskByField(k, v string) ([]Task, error)
- func GetTaskList(from, size int, host string, status int) (int, []Task, error)
- func NewTask(url, ref string, depth int, breadth int) *Task
Constants ¶
View Source
// Role names. Presumably the values accepted by GetPermissionsByRole, which
// takes a role string — TODO confirm against its implementation.
const (
	ROLE_GUEST string = "guest"
	ROLE_ADMIN string = "admin"
)
View Source
// Permission identifiers. Each entry sits under a comment naming the role it
// is listed for.
const (
	// GUEST
	PERMISSION_SNAPSHOT_VIEW string = "view_snapshot"

	// ADMIN
	PERMISSION_ADMIN_MINIMAL string = "admin_minimal"
)
View Source
// PipelineConfigBucket is the string "PipelineConfig"; presumably the name of
// the storage bucket holding PipelineConfig records — TODO confirm.
const PipelineConfigBucket = "PipelineConfig"
View Source
// Task status codes. Presumably the values stored in Task.Status and accepted
// by GetTaskStatusText / GetTaskList, which take a status int — TODO confirm.
// NOTE(review): the listing is in name order, not value order, and no constant
// with value 1 appears in it.
// Task404 has value 4.
const Task404 int = 4
View Source
// TaskCreated has value 0, which is also the zero value of an int status.
const TaskCreated int = 0
View Source
// TaskDuplicated has value 7.
const TaskDuplicated int = 7
View Source
// TaskFailed has value 2.
const TaskFailed int = 2
View Source
// TaskInterrupted has value 8.
const TaskInterrupted int = 8
View Source
// TaskPendingFetch has value 9.
const TaskPendingFetch int = 9
View Source
// TaskRedirected has value 5.
const TaskRedirected int = 5
View Source
// TaskSuccess has value 3.
const TaskSuccess int = 3
View Source
// TaskTimeout has value 6.
const TaskTimeout int = 6
Variables ¶
This section is empty.
Functions ¶
func CreateHostConfig ¶ added in v0.10.0
func CreateHostConfig(config *HostConfig) error
func CreatePipelineConfig ¶ added in v0.10.0
func CreatePipelineConfig(cfg *PipelineConfig) error
func CreateProject ¶ added in v0.10.0
func CreateSnapshot ¶
func CreateTask ¶
func DeleteHostConfig ¶ added in v0.10.0
func DeletePipelineConfig ¶ added in v0.10.0
func DeleteProject ¶ added in v0.10.0
func DeleteSnapshot ¶ added in v0.9.0
func DeleteTask ¶
func GetAllRegisteredJoints ¶ added in v0.10.0
func GetAllRegisteredJoints() map[string]interface{}
func GetHostStatus ¶ added in v0.10.0
func GetPermissionsByRole ¶ added in v0.10.0
func GetTaskStatus ¶ added in v0.10.0
func GetTaskStatusText ¶ added in v0.9.0
func RegisterPipeJoint ¶ added in v0.10.0
func RegisterPipeJoint(joint Joint)
func RegisterPipeJointWithName ¶ added in v0.10.0
func UpdateHostConfig ¶ added in v0.10.0
func UpdateHostConfig(config *HostConfig) error
func UpdatePipelineConfig ¶ added in v0.10.0
func UpdatePipelineConfig(id string, cfg *PipelineConfig) error
func UpdateProject ¶ added in v0.10.0
func UpdateTask ¶
Types ¶
type Aggregation ¶ added in v0.9.0
// Aggregation wraps a list of Bucket values.
type Aggregation struct {
// Buckets is serialized under the JSON key "buckets" and left out when empty.
Buckets []Bucket `json:"buckets,omitempty"`
}
type Context ¶ added in v0.10.0
// Context embeds Parameters and adds a sequence number, two boolean flags and
// a payload. The End and Exit methods are declared on *Context.
type Context struct {
// Parameters is embedded, so its accessor methods (Get, Set, ...) are promoted.
Parameters
// SequenceID is serialized under the JSON key "sequence".
SequenceID int64 `json:"sequence"`
Simulate bool `json:"simulate"`
IgnoreBroken bool `json:"ignore_broken"`
// Payload is excluded from JSON output (tag "-").
Payload interface{} `json:"-"`
// contains filtered or unexported fields
}
func (*Context) End ¶ added in v0.10.0
func (context *Context) End(msg interface{})
End breaks all pipelines, but the end phase is not included.
func (*Context) Exit ¶ added in v0.10.0
func (context *Context) Exit(msg interface{})
Exit tells the pipeline to exit.
type Host ¶ added in v0.10.0
// Host is a host record whose Host field is tagged as the GORM primary key.
type Host struct {
// Host is tagged not-null, unique and primary key for GORM.
Host string `json:"host,omitempty" gorm:"not null;unique;primary_key" index:"id"`
Favicon string `json:"favicon,omitempty"`
// Enabled has no omitempty, so it is always written to JSON, including false.
Enabled bool `json:"enabled"`
// HostConfigs, Created and Updated are pointers; a nil value is omitted from JSON.
HostConfigs *[]HostConfig `json:"host_configs,omitempty"`
Created *time.Time `json:"created,omitempty"`
Updated *time.Time `json:"updated,omitempty"`
}
Host is the host struct.
func CreateHost ¶ added in v0.10.0
CreateHost creates a domain host.
type HostConfig ¶ added in v0.10.0
// HostConfig is a configuration record keyed by ID. Its host, URL pattern,
// runner, sort order, pipeline ID and timestamp fields are all tagged as GORM
// indexes.
type HostConfig struct {
// ID is tagged not-null, unique and primary key for GORM.
ID string `json:"id,omitempty" gorm:"not null;unique;primary_key" index:"id"`
Host string `gorm:"index" json:"host"`
UrlPattern string `gorm:"index" json:"url_pattern"`
Runner string `gorm:"index" json:"runner"`
SortOrder int `gorm:"index" json:"sort_order"`
// PipelineID presumably refers to a PipelineConfig.ID — TODO confirm.
PipelineID string `gorm:"index" json:"pipeline_id"`
Cookies string `json:"cookies,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a non-pointer
// time.Time, so Created and Updated are emitted even when zero.
Created time.Time `gorm:"index" json:"created,omitempty"`
Updated time.Time `gorm:"index" json:"updated,omitempty"`
}
func GetHostConfig ¶ added in v0.10.0
func GetHostConfig(runner, host string) []HostConfig
func GetHostConfigByHostAndUrl ¶ added in v0.10.0
func GetHostConfigByHostAndUrl(runner, host, url string) (*HostConfig, error)
func GetHostConfigByID ¶ added in v0.10.0
func GetHostConfigByID(id string) (HostConfig, error)
func GetHostConfigList ¶ added in v0.10.0
func GetHostConfigList(from, size int, host string) (int, []HostConfig, error)
type IndexDocument ¶
// IndexDocument carries an index name, a document ID, a source map and a
// highlight map. Every field is omitted from JSON when empty.
type IndexDocument struct {
// Index and ID map to the JSON keys "_index" and "_id".
Index string `json:"_index,omitempty"`
ID string `json:"_id,omitempty"`
// Source maps to the JSON key "_source".
Source map[string]interface{} `json:"_source,omitempty"`
// Highlight maps a string key to a list of values under the JSON key "highlight".
Highlight map[string][]interface{} `json:"highlight,omitempty"`
}
IndexDocument is used to construct an indexing document.
type Joint ¶ added in v0.10.0
func GetJointInstance ¶ added in v0.10.0
func GetJointInstance(cfg *JointConfig) Joint
type JointConfig ¶ added in v0.10.0
// JointConfig names a joint, supplies its parameters and says whether it is
// enabled. Each field carries both a json tag and a config tag.
type JointConfig struct {
JointName string `json:"joint" config:"joint"` // the joint name
Parameters map[string]interface{} `json:"parameters,omitempty" config:"parameters"` // key/value parameters for this joint
// Enabled has no omitempty, so it is always written to JSON, including false.
Enabled bool `json:"enabled" config:"enabled"`
}
JointConfig holds the configuration for each joint.
type ParaKey ¶ added in v0.10.0
// ParaKey is the string-based key type taken by the Parameters accessors
// (Get, Set, MustGetString, ...).
type ParaKey string

// Common pipeline context keys.
const (
	CONTEXT_TASK_ID               ParaKey = "GOPA_TASK_ID"
	CONTEXT_TASK_URL              ParaKey = "GOPA_TASK_URL"
	CONTEXT_TASK_Reference        ParaKey = "GOPA_TASK_Reference"
	CONTEXT_TASK_Depth            ParaKey = "GOPA_TASK_Depth"
	CONTEXT_TASK_Breadth          ParaKey = "GOPA_TASK_Breadth"
	CONTEXT_TASK_Host             ParaKey = "GOPA_TASK_Host"
	CONTEXT_TASK_Schema           ParaKey = "GOPA_TASK_Schema"
	CONTEXT_TASK_OriginalUrl      ParaKey = "GOPA_TASK_OriginalUrl"
	CONTEXT_TASK_Status           ParaKey = "GOPA_TASK_Status"
	CONTEXT_TASK_Message          ParaKey = "GOPA_TASK_Message"
	CONTEXT_TASK_Created          ParaKey = "GOPA_TASK_Created"
	CONTEXT_TASK_Updated          ParaKey = "GOPA_TASK_Updated"
	CONTEXT_TASK_LastFetch        ParaKey = "GOPA_TASK_LastFetch"
	CONTEXT_TASK_LastCheck        ParaKey = "GOPA_TASK_LastCheck"
	CONTEXT_TASK_NextCheck        ParaKey = "GOPA_TASK_NextCheck"
	CONTEXT_TASK_SnapshotID       ParaKey = "GOPA_TASK_SnapshotID"
	CONTEXT_TASK_SnapshotSimHash  ParaKey = "GOPA_TASK_SnapshotSimHash"
	CONTEXT_TASK_SnapshotHash     ParaKey = "GOPA_TASK_SnapshotHash"
	CONTEXT_TASK_SnapshotCreated  ParaKey = "GOPA_TASK_SnapshotCreated"
	CONTEXT_TASK_SnapshotVersion  ParaKey = "GOPA_TASK_SnapshotVersion"
	CONTEXT_TASK_LastScreenshotID ParaKey = "GOPA_TASK_LastScreenshotID"
	CONTEXT_TASK_PipelineConfigID ParaKey = "GOPA_TASK_PipelineConfigID"
	CONTEXT_TASK_Cookies          ParaKey = "GOPA_TASK_Cookies"
	CONTEXT_SNAPSHOT_ContentType  ParaKey = "GOPA_SNAPSHOT_ContentType"
)
type Parameters ¶ added in v0.10.0
// Parameters exposes accessors keyed by ParaKey (Get, GetInt, MustGetString,
// Set, ...) alongside an exported Data map.
// NOTE(review): whether the accessors read and write Data or an unexported
// field is not visible here — confirm before relying on Data directly.
type Parameters struct {
// Data is serialized under the JSON key "data" and left out when empty.
Data map[string]interface{} `json:"data,omitempty"`
// contains filtered or unexported fields
}
func (*Parameters) Get ¶ added in v0.10.0
func (para *Parameters) Get(key ParaKey) interface{}
func (*Parameters) GetBool ¶ added in v0.10.0
func (para *Parameters) GetBool(key ParaKey, defaultV bool) bool
func (*Parameters) GetInt ¶ added in v0.10.0
func (para *Parameters) GetInt(key ParaKey, defaultV int) (int, bool)
func (*Parameters) GetInt64 ¶ added in v0.10.0
func (para *Parameters) GetInt64(key ParaKey, defaultV int64) (int64, bool)
func (*Parameters) GetInt64OrDefault ¶ added in v0.10.0
func (para *Parameters) GetInt64OrDefault(key ParaKey, defaultV int64) int64
func (*Parameters) GetIntOrDefault ¶ added in v0.10.0
func (para *Parameters) GetIntOrDefault(key ParaKey, defaultV int) int
func (*Parameters) GetMap ¶ added in v0.10.0
func (para *Parameters) GetMap(key ParaKey) (map[string]interface{}, bool)
func (*Parameters) GetOrDefault ¶ added in v0.10.0
func (para *Parameters) GetOrDefault(key ParaKey, val interface{}) interface{}
func (*Parameters) GetString ¶ added in v0.10.0
func (para *Parameters) GetString(key ParaKey) (string, bool)
func (*Parameters) GetStringArray ¶ added in v0.10.0
func (para *Parameters) GetStringArray(key ParaKey) ([]string, bool)
GetStringArray returns an array whose items are of type string.
func (*Parameters) GetStringOrDefault ¶ added in v0.10.0
func (para *Parameters) GetStringOrDefault(key ParaKey, val string) string
func (*Parameters) GetTime ¶ added in v0.10.0
func (para *Parameters) GetTime(key ParaKey) (time.Time, bool)
func (*Parameters) Has ¶ added in v0.10.0
func (para *Parameters) Has(key ParaKey) bool
func (*Parameters) MustGet ¶ added in v0.10.0
func (para *Parameters) MustGet(key ParaKey) interface{}
func (*Parameters) MustGetBytes ¶ added in v0.10.0
func (para *Parameters) MustGetBytes(key ParaKey) []byte
func (*Parameters) MustGetInt ¶ added in v0.10.0
func (para *Parameters) MustGetInt(key ParaKey) int
MustGetInt returns 0 if the key was not found.
func (*Parameters) MustGetInt64 ¶ added in v0.10.0
func (para *Parameters) MustGetInt64(key ParaKey) int64
func (*Parameters) MustGetMap ¶ added in v0.10.0
func (para *Parameters) MustGetMap(key ParaKey) map[string]interface{}
func (*Parameters) MustGetString ¶ added in v0.10.0
func (para *Parameters) MustGetString(key ParaKey) string
func (*Parameters) MustGetTime ¶ added in v0.10.0
func (para *Parameters) MustGetTime(key ParaKey) time.Time
func (*Parameters) Set ¶ added in v0.10.0
func (para *Parameters) Set(key ParaKey, value interface{})
type Pipeline ¶ added in v0.10.0
// Pipeline has no exported fields; it is driven through its methods
// (Context, Start, Join, End, Run, GetID, GetContext, GetCurrentJoint).
type Pipeline struct {
// contains filtered or unexported fields
}
func NewPipeline ¶ added in v0.10.0
func NewPipelineFromConfig ¶ added in v0.10.0
func NewPipelineFromConfig(name string, config *PipelineConfig, context *Context) *Pipeline
func (*Pipeline) GetContext ¶ added in v0.10.0
func (*Pipeline) GetCurrentJoint ¶ added in v0.10.0
type PipelineConfig ¶ added in v0.10.0
// PipelineConfig describes a pipeline as an optional start joint, a list of
// process joints and an optional end joint, plus name, tags and timestamps.
type PipelineConfig struct {
ID string `json:"id,omitempty" index:"id"`
Name string `json:"name,omitempty" config:"name"`
// StartJoint, ProcessJoints, EndJoint and Tags are tagged gorm:"-", so GORM
// skips them.
StartJoint *JointConfig `gorm:"-" json:"start,omitempty" config:"start"`
ProcessJoints []*JointConfig `gorm:"-" json:"process,omitempty" config:"process"`
EndJoint *JointConfig `gorm:"-" json:"end,omitempty" config:"end"`
// Created and Updated are pointers; a nil value is omitted from JSON.
Created *time.Time `json:"created,omitempty"`
Updated *time.Time `json:"updated,omitempty"`
Tags []string `gorm:"-" json:"tags,omitempty" config:"tags"`
}
PipelineConfig is the config for each pipeline; a pipeline may have more than one joint.
func GetPipelineConfig ¶ added in v0.10.0
func GetPipelineConfig(id string) (*PipelineConfig, error)
func GetPipelineList ¶ added in v0.10.0
func GetPipelineList(from, size int) (int, []PipelineConfig, error)
type Project ¶ added in v0.10.0
// Project is a named, optionally described record with an enabled flag and
// creation/update timestamps.
type Project struct {
// ID carries both a storm tag (id, unique) and a GORM primary-key tag.
ID string `storm:"id,unique" json:"id,omitempty" gorm:"not null;unique;primary_key" index:"id"`
Name string `json:"name,omitempty"`
Description string `json:"description,omitempty"`
// Enabled has no omitempty, so it is always written to JSON, including false.
Enabled bool `json:"enabled"`
// NOTE(review): with encoding/json, omitempty never omits a non-pointer
// time.Time, so Created and Updated are emitted even when zero.
Created time.Time `json:"created,omitempty"`
Updated time.Time `json:"updated,omitempty"`
}
Project is a definition that includes a collection of Hosts.
func GetProject ¶ added in v0.10.0
type Snapshot ¶
// Snapshot holds one captured page: identity and version, file location, raw
// payload and headers, and extracted text features (title, headings, links,
// images, tags, hashes).
// Fields tagged gorm:"-" are skipped by GORM; fields tagged json:"-" are
// excluded from JSON output.
type Snapshot struct {
// ID is tagged not-null, unique and primary key for GORM.
ID string `json:"id,omitempty" gorm:"not null;unique;primary_key" index:"id"`
Version int `json:"version,omitempty"`
Url string `json:"url,omitempty"`
// TaskID presumably refers to the owning Task.ID — TODO confirm.
TaskID string `json:"task_id,omitempty"`
Path string `json:"path,omitempty" gorm:"-"` //path of this file
File string `json:"file,omitempty" gorm:"-"` //filename of this page
Ext string `json:"ext,omitempty" gorm:"-"` //extension of filename
// StatusCode, Payload, Headers, Metadata and Parameters are excluded from both
// JSON and GORM.
StatusCode int `json:"-" gorm:"-"`
Payload []byte `json:"-" gorm:"-"`
Size uint64 `json:"size,omitempty"`
ScreenshotID string `json:"screenshot_id,omitempty"`
Headers map[string][]string `json:"-" gorm:"-"`
Metadata *map[string]interface{} `json:"-" gorm:"-"`
Parameters []KV `json:"-" gorm:"-"`
// Language is serialized under the shortened JSON key "lang".
Language string `json:"lang,omitempty" gorm:"-"`
Title string `json:"title,omitempty"`
Summary string `json:"summary,omitempty" gorm:"-"`
Text string `json:"text,omitempty" gorm:"-"`
ContentType string `json:"content_type,omitempty"`
Tags []string `json:"tags,omitempty" gorm:"-"`
Links LinkGroup `json:"links,omitempty" gorm:"-"`
// Images splits page links into internal and external lists.
// NOTE(review): with encoding/json, omitempty has no effect on a struct value,
// so "images" is always emitted.
Images struct {
Internal []PageLink `json:"internal,omitempty"`
External []PageLink `json:"external,omitempty"`
} `json:"images,omitempty" gorm:"-"`
// H1 to H5, Bold and Italic are string lists keyed by markup kind.
H1 []string `json:"h1,omitempty" gorm:"-"`
H2 []string `json:"h2,omitempty" gorm:"-"`
H3 []string `json:"h3,omitempty" gorm:"-"`
H4 []string `json:"h4,omitempty" gorm:"-"`
H5 []string `json:"h5,omitempty" gorm:"-"`
Bold []string `json:"bold,omitempty" gorm:"-"`
Italic []string `json:"italic,omitempty" gorm:"-"`
Classifications []string `json:"classifications,omitempty" gorm:"-"`
EnrichedFeatures *map[string]interface{} `json:"enriched_features,omitempty" gorm:"-"`
Hash string `json:"hash,omitempty"`
SimHash string `json:"sim_hash,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a non-pointer
// time.Time, so Created is emitted even when zero.
Created time.Time `json:"created,omitempty"`
}
func GetSnapshot ¶ added in v0.9.0
func GetSnapshotByField ¶ added in v0.9.0
type Task ¶
// Task records a URL together with its reference URL, depth and breadth,
// status and message, check/fetch timestamps, and metadata about its latest
// snapshot.
// NOTE(review): Url, Depth and Breadth carry storm:"index" tags while the other
// indexed fields use gorm:"index" — confirm the mix is intentional.
type Task struct {
// ID is tagged not-null, unique and primary key for GORM.
ID string `gorm:"not null;unique;primary_key" json:"id" index:"id"`
// Url may not be cleaned and may be missing the host part; Reference is needed
// to provide the complete URL information.
Url string `storm:"index" json:"url,omitempty" gorm:"type:varchar(500)"`
// Reference is serialized under the JSON key "reference_url".
Reference string `json:"reference_url,omitempty"`
Depth int `storm:"index" json:"depth"`
Breadth int `storm:"index" json:"breadth"`
Host string `gorm:"index" json:"host"`
Schema string `json:"schema,omitempty"`
OriginalUrl string `json:"original_url,omitempty"`
// Status presumably holds one of the Task* status constants — TODO confirm.
Status int `gorm:"index" json:"status"`
Message string `json:"message,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a non-pointer
// time.Time, so every timestamp below is emitted even when zero.
Created time.Time `gorm:"index" json:"created,omitempty"`
Updated time.Time `gorm:"index" json:"updated,omitempty"`
LastFetch time.Time `gorm:"index" json:"last_fetch,omitempty"`
LastCheck time.Time `gorm:"index" json:"last_check,omitempty"`
NextCheck time.Time `gorm:"index" json:"next_check,omitempty"`
SnapshotVersion int `json:"snapshot_version,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotHash string `json:"snapshot_hash,omitempty"`
SnapshotSimHash string `json:"snapshot_simhash,omitempty"`
SnapshotCreated time.Time `json:"snapshot_created,omitempty"`
LastScreenshotID string `json:"last_screenshot_id,omitempty"`
// NOTE(review): the JSON key is spelled "pipline_config_id" (missing an "e").
// It is left as-is because renaming it would change the serialized format.
PipelineConfigID string `json:"pipline_config_id,omitempty"`
HostConfig *HostConfig `json:"host_config,omitempty"`
// transient properties: excluded from JSON output (tag "-")
Snapshots []Snapshot `json:"-"`
SnapshotCount int `json:"-"`
}
func GetTaskByField ¶
Click to show internal directories.
Click to hide internal directories.