Documentation
¶
Index ¶
- Constants
- func CreateHostConfig(config *HostConfig) error
- func CreateProject(project *Project) error
- func CreateSnapshot(snapshot *Snapshot) error
- func CreateTask(task *Task) error
- func DeleteHostConfig(id string) error
- func DeleteProject(id string) error
- func DeleteSnapshot(snapshot *Snapshot) error
- func DeleteTask(id string) error
- func GetHostStatus(status int) (error, map[string]interface{})
- func GetTaskStatus(host string) (error, map[string]interface{})
- func GetTaskStatusText(status int) string
- func UpdateHostConfig(config *HostConfig) error
- func UpdateProject(project *Project) error
- func UpdateTask(task *Task) error
- type Domain
- type FetchTask
- type Host
- type HostConfig
- type Index
- type KV
- type LinkGroup
- type PageLink
- type Project
- type Snapshot
- type Task
- func GetFailedTasks(offset int64) (int, []Task, error)
- func GetPendingNewFetchTasks(offset int64, size int) (int, []Task, error)
- func GetPendingUpdateFetchTasks(offset int64) (int, []Task, error)
- func GetTask(id string) (Task, error)
- func GetTaskByField(k, v string) ([]Task, error)
- func GetTaskList(from, size int, host string, status int) (int, []Task, error)
- func NewTask(url, ref string, depth int, breadth int) *Task
- type Url
Constants ¶
View Source
// Pipeline context keys of type pipeline.ParaKey for the snapshot and page-links entries.
// Their string values ("SNAPSHOT", "PAGE_LINKS") carry no "GOPA_" prefix.
const ( CONTEXT_SNAPSHOT pipeline.ParaKey = "SNAPSHOT" CONTEXT_PAGE_LINKS pipeline.ParaKey = "PAGE_LINKS" )
Common pipeline context keys
View Source
// Pipeline context keys of type pipeline.ParaKey for individual task attributes.
// Each value is "GOPA_TASK_" followed by the suffix of the constant's own name
// (e.g. CONTEXT_TASK_Depth = "GOPA_TASK_Depth"); the final key,
// CONTEXT_SNAPSHOT_ContentType, uses the "GOPA_SNAPSHOT_" prefix instead.
const ( CONTEXT_TASK_ID pipeline.ParaKey = "GOPA_TASK_ID" CONTEXT_TASK_URL pipeline.ParaKey = "GOPA_TASK_URL" CONTEXT_TASK_Reference pipeline.ParaKey = "GOPA_TASK_Reference" CONTEXT_TASK_Depth pipeline.ParaKey = "GOPA_TASK_Depth" CONTEXT_TASK_Breadth pipeline.ParaKey = "GOPA_TASK_Breadth" CONTEXT_TASK_Host pipeline.ParaKey = "GOPA_TASK_Host" CONTEXT_TASK_Schema pipeline.ParaKey = "GOPA_TASK_Schema" CONTEXT_TASK_OriginalUrl pipeline.ParaKey = "GOPA_TASK_OriginalUrl" CONTEXT_TASK_Status pipeline.ParaKey = "GOPA_TASK_Status" CONTEXT_TASK_Message pipeline.ParaKey = "GOPA_TASK_Message" CONTEXT_TASK_Created pipeline.ParaKey = "GOPA_TASK_Created" CONTEXT_TASK_Updated pipeline.ParaKey = "GOPA_TASK_Updated" CONTEXT_TASK_LastFetch pipeline.ParaKey = "GOPA_TASK_LastFetch" CONTEXT_TASK_LastCheck pipeline.ParaKey = "GOPA_TASK_LastCheck" CONTEXT_TASK_NextCheck pipeline.ParaKey = "GOPA_TASK_NextCheck" CONTEXT_TASK_SnapshotID pipeline.ParaKey = "GOPA_TASK_SnapshotID" CONTEXT_TASK_SnapshotSimHash pipeline.ParaKey = "GOPA_TASK_SnapshotSimHash" CONTEXT_TASK_SnapshotHash pipeline.ParaKey = "GOPA_TASK_SnapshotHash" CONTEXT_TASK_SnapshotCreated pipeline.ParaKey = "GOPA_TASK_SnapshotCreated" CONTEXT_TASK_SnapshotVersion pipeline.ParaKey = "GOPA_TASK_SnapshotVersion" CONTEXT_TASK_LastScreenshotID pipeline.ParaKey = "GOPA_TASK_LastScreenshotID" CONTEXT_TASK_PipelineConfigID pipeline.ParaKey = "GOPA_TASK_PipelineConfigID" CONTEXT_TASK_Cookies pipeline.ParaKey = "GOPA_TASK_Cookies" CONTEXT_SNAPSHOT_ContentType pipeline.ParaKey = "GOPA_SNAPSHOT_ContentType" )
View Source
const PreFetchCheck = 4
View Source
const PreFetchCheckError = 6
View Source
const PreFetchChecking = 5
View Source
const PreFetchPendingCheck = 3
View Source
const StageAfterFetch = 2
View Source
const StageFetch = 1
View Source
const StagePreFetch = 0
View Source
const Task404 int = 4
View Source
const TaskCreated int = 0
View Source
const TaskDuplicated int = 7
View Source
const TaskFailed int = 2
View Source
const TaskInterrupted int = 8
View Source
const TaskPendingFetch int = 9
View Source
const TaskRedirected int = 5
View Source
const TaskSuccess int = 3
View Source
const TaskTimeout int = 6
Variables ¶
This section is empty.
Functions ¶
func CreateHostConfig ¶
func CreateHostConfig(config *HostConfig) error
func CreateProject ¶
func CreateSnapshot ¶
func CreateTask ¶
func DeleteHostConfig ¶
func DeleteProject ¶
func DeleteSnapshot ¶
func DeleteTask ¶
func GetHostStatus ¶
func GetTaskStatus ¶
func GetTaskStatusText ¶
func UpdateHostConfig ¶
func UpdateHostConfig(config *HostConfig) error
func UpdateProject ¶
func UpdateTask ¶
Types ¶
type FetchTask ¶
func (*FetchTask) UpdateStatus ¶
type Host ¶
type Host struct {
// Host name. Tagged elastic_meta:"_id" and mapped as a keyword with ignore_above: 256;
// presumably the tag makes it the document ID — confirm against the ORM layer.
Host string `json:"host,omitempty" elastic_meta:"_id" elastic_mapping:"host: { type: keyword, ignore_above: 256 }"`
// Favicon of the host; omitted from JSON when empty.
Favicon string `json:"favicon,omitempty"`
// Enabled has no omitempty, so it is always serialized, including when false.
Enabled bool `json:"enabled"`
// HostConfigs is a pointer to a slice; a nil pointer is omitted from JSON.
HostConfigs *[]HostConfig `json:"host_configs,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a struct value such as
// time.Time, so zero timestamps are still serialized — confirm this is intended.
Created time.Time `json:"created,omitempty"`
Updated time.Time `json:"updated,omitempty"`
}
Host describes a single host: its name, favicon, enabled flag, optional host configs, and created/updated timestamps.
type HostConfig ¶
// HostConfig holds one per-host configuration entry: a URL pattern, a runner name,
// a sort order, a pipeline ID and optional cookies.
type HostConfig struct {
// ID is tagged elastic_meta:"_id"; omitted from JSON when empty.
ID string `json:"id,omitempty" elastic_meta:"_id"`
// Host, UrlPattern, Runner, SortOrder and PipelineID have no omitempty and are
// therefore always serialized, even when they hold their zero value.
Host string `json:"host"`
UrlPattern string `json:"url_pattern"`
Runner string `json:"runner"`
SortOrder int `json:"sort_order"`
PipelineID string `json:"pipeline_id"`
// Cookies is omitted from JSON when empty.
Cookies string `json:"cookies,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a struct value such as
// time.Time, so zero timestamps are still serialized — confirm this is intended.
Created time.Time `json:"created,omitempty"`
Updated time.Time `json:"updated,omitempty"`
}
func GetHostConfig ¶
func GetHostConfig(runner, host string) []HostConfig
func GetHostConfigByHostAndUrl ¶
func GetHostConfigByHostAndUrl(runner, host, url string) (*HostConfig, error)
func GetHostConfigByID ¶
func GetHostConfigByID(id string) (HostConfig, error)
func GetHostConfigList ¶
func GetHostConfigList(from, size int, host string) (int, []HostConfig, error)
type Project ¶
type Project struct {
// ID is tagged elastic_meta:"_id"; omitted from JSON when empty.
ID string `json:"id,omitempty" elastic_meta:"_id"`
Name string `json:"name,omitempty"`
Description string `json:"description,omitempty"`
// Enabled has no omitempty, so it is always serialized, including when false.
Enabled bool `json:"enabled"`
// NOTE(review): with encoding/json, omitempty never omits a struct value such as
// time.Time, so zero timestamps are still serialized — confirm this is intended.
Created time.Time `json:"created,omitempty"`
Updated time.Time `json:"updated,omitempty"`
Banner string `json:"banner,omitempty"`
Favicon string `json:"favicon,omitempty"`
// DomainRules and UrlRules use config.Rules, which is declared outside this package;
// whether omitempty has any effect depends on that type's kind — TODO confirm.
DomainRules config.Rules `json:"domain_rules,omitempty"`
UrlRules config.Rules `json:"url_rules,omitempty"`
}
Project is a definition that includes a collection of Hosts.
func GetProject ¶
type Snapshot ¶
// Snapshot is the record of one fetched page: identification (ID, Version, Url, TaskID),
// storage location (Path, File, Ext), extracted content fields, and content hashes.
// Fields tagged json:"-" are never written to or read from JSON.
type Snapshot struct {
// ID is tagged elastic_meta:"_id"; omitted from JSON when empty.
ID string `json:"id,omitempty" elastic_meta:"_id"`
Version int `json:"version,omitempty"`
Url string `json:"url,omitempty"`
TaskID string `json:"task_id,omitempty"`
Path string `json:"path,omitempty"` // path of this file
File string `json:"file,omitempty"` // filename of this page
Ext string `json:"ext,omitempty"` // extension of the filename
// StatusCode and Payload are excluded from JSON (json:"-").
StatusCode int `json:"-"`
Payload []byte `json:"-"`
Size uint64 `json:"size,omitempty"`
ScreenshotID string `json:"screenshot_id,omitempty"`
// Headers, Metadata and Parameters are excluded from JSON (json:"-").
Headers map[string][]string `json:"-"`
Metadata *map[string]interface{} `json:"-"`
Parameters []KV `json:"-"`
// Language is serialized under the shorter key "lang".
Language string `json:"lang,omitempty"`
// Title is mapped as text with an additional keyword sub-field.
Title string `json:"title,omitempty" elastic_mapping:"title: { type: text, fields: { keyword: { type: keyword } } }"`
Summary string `json:"summary,omitempty"`
Text string `json:"text,omitempty" elastic_mapping:"text: { type: text }"`
ContentType string `json:"content_type,omitempty"`
Tags []string `json:"tags,omitempty"`
Links LinkGroup `json:"links,omitempty" elastic_mapping:"links:{type:object}"`
// Images is an anonymous struct splitting page links into Internal and External lists.
// NOTE(review): with encoding/json, omitempty on a struct-typed field has no effect,
// so "images" is always emitted (possibly as an empty object) — confirm this is intended.
Images struct {
Internal []PageLink `json:"internal,omitempty" elastic_mapping:"internal:{type:object}"`
External []PageLink `json:"external,omitempty" elastic_mapping:"external:{type:object}"`
} `json:"images,omitempty" elastic_mapping:"images:{type:object}"`
// H1 through H5 and Bold are string lists mapped as text.
H1 []string `json:"h1,omitempty" elastic_mapping:"h1: { type: text }"`
H2 []string `json:"h2,omitempty" elastic_mapping:"h2: { type: text }"`
H3 []string `json:"h3,omitempty" elastic_mapping:"h3: { type: text }"`
H4 []string `json:"h4,omitempty" elastic_mapping:"h4: { type: text }"`
H5 []string `json:"h5,omitempty" elastic_mapping:"h5: { type: text }"`
Bold []string `json:"bold,omitempty" elastic_mapping:"bold: { type: text }"`
Italic []string `json:"italic,omitempty"`
Classifications []string `json:"classifications,omitempty"`
// EnrichedFeatures is a pointer to a free-form map; a nil pointer is omitted from JSON.
EnrichedFeatures *map[string]interface{} `json:"enriched_features,omitempty"`
Hash string `json:"hash,omitempty"`
SimHash string `json:"sim_hash,omitempty"`
// NOTE(review): with encoding/json, omitempty never omits a time.Time value,
// so a zero Created is still serialized — confirm this is intended.
Created time.Time `json:"created,omitempty"`
}
func GetSnapshot ¶
func GetSnapshotByField ¶
type Task ¶
// Task is one crawl task record: the target URL and its reference, crawl depth and
// breadth, status and message, check/fetch timestamps, and the latest snapshot's details.
type Task struct {
// ID is tagged elastic_meta:"_id" and, lacking omitempty, is always serialized.
ID string `json:"id" elastic_meta:"_id"`
// Url may not have been cleaned and may lack the host part; Reference is needed
// to reconstruct the complete URL.
Url string `json:"url,omitempty"`
// Reference is serialized under the key "reference_url".
Reference string `json:"reference_url,omitempty"`
// Depth, Breadth, Host and Status have no omitempty and are always serialized,
// even when they hold their zero value.
Depth int `json:"depth"`
Breadth int `json:"breadth"`
Host string `json:"host"`
Schema string `json:"schema,omitempty"`
OriginalUrl string `json:"original_url,omitempty"`
Status int `json:"status"`
Message string `json:"message,omitempty"`
// The timestamp fields below are mapped as date.
// NOTE(review): with encoding/json, omitempty never omits a time.Time value,
// so zero timestamps are still serialized — confirm this is intended.
Created time.Time `json:"created,omitempty" elastic_mapping:"created: { type: date }"`
Updated time.Time `json:"updated,omitempty" elastic_mapping:"updated: { type: date }"`
LastFetch time.Time `json:"last_fetch,omitempty" elastic_mapping:"last_fetch: { type: date }"`
LastCheck time.Time `json:"last_check,omitempty" elastic_mapping:"last_check: { type: date }"`
NextCheck time.Time `json:"next_check,omitempty" elastic_mapping:"next_check: { type: date }"`
SnapshotVersion int `json:"snapshot_version,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotHash string `json:"snapshot_hash,omitempty"`
SnapshotSimHash string `json:"snapshot_simhash,omitempty"`
SnapshotCreated time.Time `json:"snapshot_created,omitempty" elastic_mapping:"snapshot_created: { type: date }"`
LastScreenshotID string `json:"last_screenshot_id,omitempty"`
// NOTE(review): the JSON key is spelled "pipline_config_id" (missing "e"). It is part of
// the serialized format, so correcting it would break previously stored documents.
PipelineConfigID string `json:"pipline_config_id,omitempty"`
// HostConfig is a pointer; a nil pointer is omitted from JSON.
HostConfig *HostConfig `json:"host_config,omitempty"`
// Transient properties: tagged json:"-" and therefore never serialized.
Snapshots []Snapshot `json:"-"`
SnapshotCount int `json:"-"`
}
func GetPendingNewFetchTasks ¶
func GetTaskByField ¶
Click to show internal directories.
Click to hide internal directories.