Documentation
¶
Overview ¶
Package protofiles is a generated protocol buffer package.
It is generated from these files:
protofiles/ideacrawler.proto
It has these top-level messages:
Status KVP DomainOpt Subscription PageRequest PageHTML
Index ¶
- Variables
- func RegisterIdeaCrawlerServer(s *grpc.Server, srv IdeaCrawlerServer)
- type DomainOpt
- func (*DomainOpt) Descriptor() ([]byte, []int)
- func (m *DomainOpt) GetCallbackUrlRegexp() string
- func (m *DomainOpt) GetCallbackXpathMatch() []*KVP
- func (m *DomainOpt) GetCallbackXpathRegexp() []*KVP
- func (m *DomainOpt) GetCancelOnDisconnect() bool
- func (m *DomainOpt) GetCheckContent() bool
- func (m *DomainOpt) GetCheckLoginAfterEachPage() bool
- func (m *DomainOpt) GetChrome() bool
- func (m *DomainOpt) GetChromeBinary() string
- func (m *DomainOpt) GetDepth() int32
- func (m *DomainOpt) GetDomLoadTime() int32
- func (m *DomainOpt) GetDomainDropPriority() bool
- func (m *DomainOpt) GetDropDomains() []string
- func (m *DomainOpt) GetFirstrun() *google_protobuf.Timestamp
- func (m *DomainOpt) GetFollowOtherDomains() bool
- func (m *DomainOpt) GetFollowUrlRegexp() string
- func (m *DomainOpt) GetFrequency() *google_protobuf1.Duration
- func (m *DomainOpt) GetImpolite() bool
- func (m *DomainOpt) GetKeepDomains() []string
- func (m *DomainOpt) GetLogin() bool
- func (m *DomainOpt) GetLoginJS() string
- func (m *DomainOpt) GetLoginParseFields() bool
- func (m *DomainOpt) GetLoginParseXpath() []*KVP
- func (m *DomainOpt) GetLoginPayload() []*KVP
- func (m *DomainOpt) GetLoginSuccessCheck() *KVP
- func (m *DomainOpt) GetLoginUrl() string
- func (m *DomainOpt) GetLoginUsingSelenium() bool
- func (m *DomainOpt) GetMaxConcurrentRequests() int32
- func (m *DomainOpt) GetMaxDelay() int32
- func (m *DomainOpt) GetMaxIdleTime() int32
- func (m *DomainOpt) GetMinDelay() int32
- func (m *DomainOpt) GetNetworkIface() string
- func (m *DomainOpt) GetNoFollow() bool
- func (m *DomainOpt) GetPrefetch() bool
- func (m *DomainOpt) GetRepeat() bool
- func (m *DomainOpt) GetSeedUrl() string
- func (m *DomainOpt) GetUnsafeNormalizeURL() bool
- func (m *DomainOpt) GetUseragent() string
- func (*DomainOpt) ProtoMessage()
- func (m *DomainOpt) Reset()
- func (m *DomainOpt) String() string
- type IdeaCrawlerClient
- type IdeaCrawlerServer
- type IdeaCrawler_AddDomainAndListenClient
- type IdeaCrawler_AddDomainAndListenServer
- type IdeaCrawler_AddPagesClient
- type IdeaCrawler_AddPagesServer
- type KVP
- type PageHTML
- func (*PageHTML) Descriptor() ([]byte, []int)
- func (m *PageHTML) GetContent() []byte
- func (m *PageHTML) GetError() string
- func (m *PageHTML) GetHttpstatuscode() int32
- func (m *PageHTML) GetMetaStr() string
- func (m *PageHTML) GetSub() *Subscription
- func (m *PageHTML) GetSuccess() bool
- func (m *PageHTML) GetUrl() string
- func (*PageHTML) ProtoMessage()
- func (m *PageHTML) Reset()
- func (m *PageHTML) String() string
- type PageReqType
- type PageRequest
- func (*PageRequest) Descriptor() ([]byte, []int)
- func (m *PageRequest) GetJs() string
- func (m *PageRequest) GetMetaStr() string
- func (m *PageRequest) GetNoCallback() bool
- func (m *PageRequest) GetReqtype() PageReqType
- func (m *PageRequest) GetSub() *Subscription
- func (m *PageRequest) GetUrl() string
- func (*PageRequest) ProtoMessage()
- func (m *PageRequest) Reset()
- func (m *PageRequest) String() string
- type Status
- type SubType
- type Subscription
- func (*Subscription) Descriptor() ([]byte, []int)
- func (m *Subscription) GetDatetime() *google_protobuf.Timestamp
- func (m *Subscription) GetDomainname() string
- func (m *Subscription) GetSeqnum() int32
- func (m *Subscription) GetSubcode() string
- func (m *Subscription) GetSubtype() SubType
- func (*Subscription) ProtoMessage()
- func (m *Subscription) Reset()
- func (m *Subscription) String() string
Constants ¶
This section is empty.
Variables ¶
View Source
// PageReqType_name maps each PageReqType numeric value to its enum name.
var PageReqType_name = map[int32]string{
0: "GET",
1: "HEAD",
2: "BUILTINJS",
3: "JSCRIPT",
}
View Source
// PageReqType_value maps each PageReqType enum name to its numeric value.
var PageReqType_value = map[string]int32{
"GET": 0,
"HEAD": 1,
"BUILTINJS": 2,
"JSCRIPT": 3,
}
View Source
// SubType_name maps each SubType numeric value to its enum name.
var SubType_name = map[int32]string{
0: "SEQNUM",
1: "DATETIME",
}
View Source
// SubType_value maps each SubType enum name to its numeric value.
var SubType_value = map[string]int32{
"SEQNUM": 0,
"DATETIME": 1,
}
Functions ¶
func RegisterIdeaCrawlerServer ¶
func RegisterIdeaCrawlerServer(s *grpc.Server, srv IdeaCrawlerServer)
Types ¶
type DomainOpt ¶
// DomainOpt is the options message passed to AddDomainAndListen. It carries
// the seed URL plus the settings that control how that domain is crawled:
// delays, follow/callback filters, scheduling, login and chrome options.
type DomainOpt struct {
SeedUrl string `protobuf:"bytes,1,opt,name=seedUrl" json:"seedUrl,omitempty"`
// crawl delay in seconds
MinDelay int32 `protobuf:"varint,2,opt,name=minDelay" json:"minDelay,omitempty"`
MaxDelay int32 `protobuf:"varint,3,opt,name=maxDelay" json:"maxDelay,omitempty"`
// don't follow any pages, just send back responses for the received URLs.
NoFollow bool `protobuf:"varint,4,opt,name=noFollow" json:"noFollow,omitempty"`
// only pages matching callbackUrlRegexp will be shipped back to the client.
// only matching pages will be saved to s3 as well.
CallbackUrlRegexp string `protobuf:"bytes,5,opt,name=callbackUrlRegexp" json:"callbackUrlRegexp,omitempty"`
// only pages matching followUrlRegexp will be followed and sublinks added to fetcher.
FollowUrlRegexp string `protobuf:"bytes,6,opt,name=followUrlRegexp" json:"followUrlRegexp,omitempty"`
MaxConcurrentRequests int32 `protobuf:"varint,7,opt,name=maxConcurrentRequests" json:"maxConcurrentRequests,omitempty"`
// TODO
Useragent string `protobuf:"bytes,8,opt,name=useragent" json:"useragent,omitempty"`
Impolite bool `protobuf:"varint,9,opt,name=impolite" json:"impolite,omitempty"`
// TODO
Depth int32 `protobuf:"varint,10,opt,name=depth" json:"depth,omitempty"`
// TODO: maybe just remove all scheduling features, immediate jobs only
Repeat bool `protobuf:"varint,11,opt,name=repeat" json:"repeat,omitempty"`
// needs a minimum of 5 minutes, ideally 1 hour
Frequency *google_protobuf1.Duration `protobuf:"bytes,12,opt,name=frequency" json:"frequency,omitempty"`
// time of first run; ideally at least 10 minutes away.
// e.g. if this is saturday 10pm and frequency is 2 weeks,
// it will continue to run at that time every 2 weeks.
Firstrun *google_protobuf.Timestamp `protobuf:"bytes,13,opt,name=firstrun" json:"firstrun,omitempty"`
// Callback check order -
// (1) - callbackUrlRegexp
// (2) - callbackXpathMatch
// (3) - callbackXpathRegexp
// Any one has to match.
// provide multiple xpaths as keys and expected values as value. Pages are
// sent back to client only if all values are found in page.
CallbackXpathMatch []*KVP `protobuf:"bytes,14,rep,name=callbackXpathMatch" json:"callbackXpathMatch,omitempty"`
// TODO keepKeywords and followOtherDomains still need to be implemented
// keep page only if these keywords are present
// repeated string keepKeywords = 14;
// drop pages if these keywords are present
// NOTE(review): the keyword comments above do not appear to describe
// CallbackXpathRegexp; they look like leftovers from commented-out keyword
// fields in the .proto file. Confirm against protofiles/ideacrawler.proto.
CallbackXpathRegexp []*KVP `protobuf:"bytes,15,rep,name=callbackXpathRegexp" json:"callbackXpathRegexp,omitempty"`
// in seconds, it is the time to wait for a new
// page, before stopping the job; affects workerIdleTTL of fetchbot.
// min value is 600, it is also default.
MaxIdleTime int32 `protobuf:"varint,16,opt,name=maxIdleTime" json:"maxIdleTime,omitempty"`
FollowOtherDomains bool `protobuf:"varint,17,opt,name=followOtherDomains" json:"followOtherDomains,omitempty"`
KeepDomains []string `protobuf:"bytes,18,rep,name=keepDomains" json:"keepDomains,omitempty"`
DropDomains []string `protobuf:"bytes,19,rep,name=dropDomains" json:"dropDomains,omitempty"`
DomainDropPriority bool `protobuf:"varint,20,opt,name=domainDropPriority" json:"domainDropPriority,omitempty"`
// safe url normalizations happen by default. below is only for a few unsafe ones.
// for list of safe normalizations: https://github.com/PuerkitoBio/purell/blob/master/purell.go#L59
// remove index.php, etc, fragments #section, +FlagsUsuallySafeGreedy from above link
UnsafeNormalizeURL bool `protobuf:"varint,21,opt,name=unsafeNormalizeURL" json:"unsafeNormalizeURL,omitempty"`
Login bool `protobuf:"varint,22,opt,name=login" json:"login,omitempty"`
// currently not possible, assumes false
LoginUsingSelenium bool `protobuf:"varint,23,opt,name=loginUsingSelenium" json:"loginUsingSelenium,omitempty"`
LoginUrl string `protobuf:"bytes,24,opt,name=loginUrl" json:"loginUrl,omitempty"`
// for username, password fields, other form data to send on post request
LoginPayload []*KVP `protobuf:"bytes,25,rep,name=loginPayload" json:"loginPayload,omitempty"`
// if there are hidden fields in the page that need to be scraped before login
LoginParseFields bool `protobuf:"varint,26,opt,name=loginParseFields" json:"loginParseFields,omitempty"`
// key is key of hidden fields to parse from form, value is the xpath of field to scrape.
LoginParseXpath []*KVP `protobuf:"bytes,27,rep,name=loginParseXpath" json:"loginParseXpath,omitempty"`
// to check if login succeeded, provide xpath as key, and expected value as value.
// for example, after login, xpath of top right corner, and username as value.
// if the xpath is not there or if there is no value match, then we probably didn't login.
LoginSuccessCheck *KVP `protobuf:"bytes,28,opt,name=loginSuccessCheck" json:"loginSuccessCheck,omitempty"`
// checks login state after downloading each page, using check defined in 'loginSuccessCheck'
CheckLoginAfterEachPage bool `protobuf:"varint,29,opt,name=checkLoginAfterEachPage" json:"checkLoginAfterEachPage,omitempty"`
// javascript for login in chrome browser.
LoginJS string `protobuf:"bytes,30,opt,name=loginJS" json:"loginJS,omitempty"`
// whether to use chrome, location of chrome binary
Chrome bool `protobuf:"varint,31,opt,name=chrome" json:"chrome,omitempty"`
ChromeBinary string `protobuf:"bytes,32,opt,name=chromeBinary" json:"chromeBinary,omitempty"`
DomLoadTime int32 `protobuf:"varint,33,opt,name=domLoadTime" json:"domLoadTime,omitempty"`
// check if this network interface is still active before every request.
NetworkIface string `protobuf:"bytes,34,opt,name=networkIface" json:"networkIface,omitempty"`
// TODO
CancelOnDisconnect bool `protobuf:"varint,35,opt,name=cancelOnDisconnect" json:"cancelOnDisconnect,omitempty"`
// if true, sends a HEAD request first to ensure content is text/html before sending GET request.
CheckContent bool `protobuf:"varint,36,opt,name=checkContent" json:"checkContent,omitempty"`
// if prefetch flag is true, downloads resources like img, css, png, svg, js associated with the actual page to mimic browser behaviour.
Prefetch bool `protobuf:"varint,37,opt,name=prefetch" json:"prefetch,omitempty"`
}
func (*DomainOpt) Descriptor ¶
func (*DomainOpt) GetCallbackUrlRegexp ¶
func (*DomainOpt) GetCallbackXpathMatch ¶
func (*DomainOpt) GetCallbackXpathRegexp ¶
func (*DomainOpt) GetCancelOnDisconnect ¶
func (*DomainOpt) GetCheckContent ¶
func (*DomainOpt) GetCheckLoginAfterEachPage ¶
func (*DomainOpt) GetChromeBinary ¶
func (*DomainOpt) GetDomLoadTime ¶
func (*DomainOpt) GetDomainDropPriority ¶
func (*DomainOpt) GetDropDomains ¶
func (*DomainOpt) GetFirstrun ¶
func (m *DomainOpt) GetFirstrun() *google_protobuf.Timestamp
func (*DomainOpt) GetFollowOtherDomains ¶
func (*DomainOpt) GetFollowUrlRegexp ¶
func (*DomainOpt) GetFrequency ¶
func (m *DomainOpt) GetFrequency() *google_protobuf1.Duration
func (*DomainOpt) GetImpolite ¶
func (*DomainOpt) GetKeepDomains ¶
func (*DomainOpt) GetLoginJS ¶
func (*DomainOpt) GetLoginParseFields ¶
func (*DomainOpt) GetLoginParseXpath ¶
func (*DomainOpt) GetLoginPayload ¶
func (*DomainOpt) GetLoginSuccessCheck ¶
func (*DomainOpt) GetLoginUrl ¶
func (*DomainOpt) GetLoginUsingSelenium ¶
func (*DomainOpt) GetMaxConcurrentRequests ¶
func (*DomainOpt) GetMaxDelay ¶
func (*DomainOpt) GetMaxIdleTime ¶
func (*DomainOpt) GetMinDelay ¶
func (*DomainOpt) GetNetworkIface ¶
func (*DomainOpt) GetNoFollow ¶
func (*DomainOpt) GetPrefetch ¶
func (*DomainOpt) GetSeedUrl ¶
func (*DomainOpt) GetUnsafeNormalizeURL ¶
func (*DomainOpt) GetUseragent ¶
func (*DomainOpt) ProtoMessage ¶
func (*DomainOpt) ProtoMessage()
type IdeaCrawlerClient ¶
// IdeaCrawlerClient is the client-side interface of the IdeaCrawler gRPC
// service; every method accepts grpc.CallOption values.
type IdeaCrawlerClient interface {
// rpc AddDomain(DomainOpt) returns (Subscription) {}
// rpc AddDomains(stream DomainOpt) returns (stream Subscription) {}
// (the two rpc lines above are commented-out declarations, not methods of this interface)
// AddDomainAndListen sends one DomainOpt and returns a stream from which
// PageHTML messages are received.
AddDomainAndListen(ctx context.Context, in *DomainOpt, opts ...grpc.CallOption) (IdeaCrawler_AddDomainAndListenClient, error)
// AddPages opens a stream on which PageRequest messages are sent; closing
// it yields a single Status.
AddPages(ctx context.Context, opts ...grpc.CallOption) (IdeaCrawler_AddPagesClient, error)
// CancelJob sends a Subscription and returns a Status.
CancelJob(ctx context.Context, in *Subscription, opts ...grpc.CallOption) (*Status, error)
}
func NewIdeaCrawlerClient ¶
func NewIdeaCrawlerClient(cc *grpc.ClientConn) IdeaCrawlerClient
type IdeaCrawlerServer ¶
// IdeaCrawlerServer is the server-side interface of the IdeaCrawler gRPC
// service; an implementation is registered with RegisterIdeaCrawlerServer.
type IdeaCrawlerServer interface {
// rpc AddDomain(DomainOpt) returns (Subscription) {}
// rpc AddDomains(stream DomainOpt) returns (stream Subscription) {}
// (the two rpc lines above are commented-out declarations, not methods of this interface)
// AddDomainAndListen receives one DomainOpt and a stream on which PageHTML
// messages can be sent.
AddDomainAndListen(*DomainOpt, IdeaCrawler_AddDomainAndListenServer) error
// AddPages receives a stream of PageRequest messages that is closed by
// sending a single Status.
AddPages(IdeaCrawler_AddPagesServer) error
// CancelJob receives a Subscription and returns a Status.
CancelJob(context.Context, *Subscription) (*Status, error)
}
type IdeaCrawler_AddDomainAndListenClient ¶
// IdeaCrawler_AddDomainAndListenClient is the client end of the
// AddDomainAndListen stream.
type IdeaCrawler_AddDomainAndListenClient interface {
// Recv returns the next PageHTML from the stream, or an error.
Recv() (*PageHTML, error)
grpc.ClientStream
}
type IdeaCrawler_AddDomainAndListenServer ¶
// IdeaCrawler_AddDomainAndListenServer is the server end of the
// AddDomainAndListen stream.
type IdeaCrawler_AddDomainAndListenServer interface {
// Send writes one PageHTML to the stream.
Send(*PageHTML) error
grpc.ServerStream
}
type IdeaCrawler_AddPagesClient ¶
// IdeaCrawler_AddPagesClient is the client end of the AddPages stream.
type IdeaCrawler_AddPagesClient interface {
// Send writes one PageRequest to the stream.
Send(*PageRequest) error
// CloseAndRecv returns the Status for the stream, or an error.
CloseAndRecv() (*Status, error)
grpc.ClientStream
}
type IdeaCrawler_AddPagesServer ¶
// IdeaCrawler_AddPagesServer is the server end of the AddPages stream.
type IdeaCrawler_AddPagesServer interface {
// SendAndClose sends the single Status reply for the stream.
SendAndClose(*Status) error
// Recv returns the next PageRequest from the stream, or an error.
Recv() (*PageRequest, error)
grpc.ServerStream
}
type KVP ¶
// KVP is a string key/value pair. DomainOpt uses it for xpath matches,
// login payload fields and the login success check.
type KVP struct {
Key string `protobuf:"bytes,1,opt,name=key" json:"key,omitempty"`
Value string `protobuf:"bytes,2,opt,name=value" json:"value,omitempty"`
}
func (*KVP) Descriptor ¶
func (*KVP) ProtoMessage ¶
func (*KVP) ProtoMessage()
type PageHTML ¶
// PageHTML is the message carried on the AddDomainAndListen stream (sent by
// the server, received by the client).
type PageHTML struct {
Success bool `protobuf:"varint,1,opt,name=success" json:"success,omitempty"`
Error string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"`
Sub *Subscription `protobuf:"bytes,3,opt,name=sub" json:"sub,omitempty"`
Url string `protobuf:"bytes,4,opt,name=url" json:"url,omitempty"`
Httpstatuscode int32 `protobuf:"varint,5,opt,name=httpstatuscode" json:"httpstatuscode,omitempty"`
// raw bytes; presumably the fetched page body — TODO confirm
Content []byte `protobuf:"bytes,6,opt,name=content,proto3" json:"content,omitempty"`
// NOTE(review): PageRequest has a MetaStr field too; presumably this one
// echoes it back — confirm against the server implementation.
MetaStr string `protobuf:"bytes,7,opt,name=metaStr" json:"metaStr,omitempty"`
}
func (*PageHTML) Descriptor ¶
func (*PageHTML) GetContent ¶
func (*PageHTML) GetHttpstatuscode ¶
func (*PageHTML) GetMetaStr ¶
func (*PageHTML) GetSub ¶
func (m *PageHTML) GetSub() *Subscription
func (*PageHTML) GetSuccess ¶
func (*PageHTML) ProtoMessage ¶
func (*PageHTML) ProtoMessage()
type PageReqType ¶
// PageReqType is the request-type enum carried in PageRequest.Reqtype.
type PageReqType int32

// Enum values of PageReqType. The numeric values are part of the protobuf
// wire format and must not change.
const (
	PageReqType_GET PageReqType = 0
	// Sends a HEAD request first to identify whether the page is text/html
	// before downloading. Use it when we are unsure whether the link will send
	// back a large gzip file, etc., which we want to avoid.
	PageReqType_HEAD      PageReqType = 1
	PageReqType_BUILTINJS PageReqType = 2
	PageReqType_JSCRIPT   PageReqType = 3
)
func (PageReqType) EnumDescriptor ¶
func (PageReqType) EnumDescriptor() ([]byte, []int)
func (PageReqType) String ¶
func (x PageReqType) String() string
type PageRequest ¶
// PageRequest is the message carried on the AddPages stream (sent by the
// client, received by the server).
type PageRequest struct {
Sub *Subscription `protobuf:"bytes,1,opt,name=sub" json:"sub,omitempty"`
// one of the PageReqType values (GET, HEAD, BUILTINJS, JSCRIPT)
Reqtype PageReqType `protobuf:"varint,2,opt,name=reqtype,enum=protofiles.PageReqType" json:"reqtype,omitempty"`
Url string `protobuf:"bytes,3,opt,name=url" json:"url,omitempty"`
// presumably the script used with the JSCRIPT request type — TODO confirm
Js string `protobuf:"bytes,4,opt,name=js" json:"js,omitempty"`
NoCallback bool `protobuf:"varint,5,opt,name=noCallback" json:"noCallback,omitempty"`
MetaStr string `protobuf:"bytes,6,opt,name=metaStr" json:"metaStr,omitempty"`
}
func (*PageRequest) Descriptor ¶
func (*PageRequest) Descriptor() ([]byte, []int)
func (*PageRequest) GetJs ¶
func (m *PageRequest) GetJs() string
func (*PageRequest) GetMetaStr ¶
func (m *PageRequest) GetMetaStr() string
func (*PageRequest) GetNoCallback ¶
func (m *PageRequest) GetNoCallback() bool
func (*PageRequest) GetReqtype ¶
func (m *PageRequest) GetReqtype() PageReqType
func (*PageRequest) GetSub ¶
func (m *PageRequest) GetSub() *Subscription
func (*PageRequest) GetUrl ¶
func (m *PageRequest) GetUrl() string
func (*PageRequest) ProtoMessage ¶
func (*PageRequest) ProtoMessage()
func (*PageRequest) Reset ¶
func (m *PageRequest) Reset()
func (*PageRequest) String ¶
func (m *PageRequest) String() string
type Status ¶
// Status is the reply message of CancelJob and of the AddPages stream: a
// success flag plus an error string.
type Status struct {
Success bool `protobuf:"varint,1,opt,name=success" json:"success,omitempty"`
Error string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"`
}
func (*Status) Descriptor ¶
func (*Status) GetSuccess ¶
func (*Status) ProtoMessage ¶
func (*Status) ProtoMessage()
type Subscription ¶
// Subscription is the message passed to CancelJob and embedded as the Sub
// field of PageRequest and PageHTML.
// NOTE(review): it appears to identify a crawl job — confirm against the
// server implementation.
type Subscription struct {
Subcode string `protobuf:"bytes,1,opt,name=subcode" json:"subcode,omitempty"`
Domainname string `protobuf:"bytes,2,opt,name=domainname" json:"domainname,omitempty"`
// one of the SubType values (SEQNUM, DATETIME); presumably selects whether
// Seqnum or Datetime is the meaningful field — TODO confirm
Subtype SubType `protobuf:"varint,3,opt,name=subtype,enum=protofiles.SubType" json:"subtype,omitempty"`
Seqnum int32 `protobuf:"varint,4,opt,name=seqnum" json:"seqnum,omitempty"`
Datetime *google_protobuf.Timestamp `protobuf:"bytes,5,opt,name=datetime" json:"datetime,omitempty"`
}
func (*Subscription) Descriptor ¶
func (*Subscription) Descriptor() ([]byte, []int)
func (*Subscription) GetDatetime ¶
func (m *Subscription) GetDatetime() *google_protobuf.Timestamp
func (*Subscription) GetDomainname ¶
func (m *Subscription) GetDomainname() string
func (*Subscription) GetSeqnum ¶
func (m *Subscription) GetSeqnum() int32
func (*Subscription) GetSubcode ¶
func (m *Subscription) GetSubcode() string
func (*Subscription) GetSubtype ¶
func (m *Subscription) GetSubtype() SubType
func (*Subscription) ProtoMessage ¶
func (*Subscription) ProtoMessage()
func (*Subscription) Reset ¶
func (m *Subscription) Reset()
func (*Subscription) String ¶
func (m *Subscription) String() string
Click to show internal directories.
Click to hide internal directories.