Documentation ¶
Overview ¶
Archive holds all common model definitions for archivers 2.0.
TODO - turn "Metadata" into github.com/datatogether/metablocks.Metablock
Index ¶
- Variables
- func CalcHash(data []byte) (string, error)
- func ContentUrlsCount(db sqlutil.Queryable) (count int, err error)
- func CountPrimers(db sqlutil.Queryable) (count int64, err error)
- func CountSources(db sqlutil.Queryable) (count int, err error)
- func FileUrl(url *Url) string
- func MetadataCountByKey(db sqlutil.Queryable, keyId string) (count int, err error)
- func NormalizeURL(u *url.URL) *url.URL
- func NormalizeURLString(url string) (string, error)
- func ValidArchivingUrl(db sqlutil.Queryable, url string) error
- func WriteSnapshot(store datastore.Datastore, u *Url) error
- type Collection
- func (c Collection) DatastoreType() string
- func (c *Collection) Delete(store datastore.Datastore) error
- func (c *Collection) DeleteItems(store datastore.Datastore, items []*CollectionItem) error
- func (c Collection) GetId() string
- func (c *Collection) ItemCount(store datastore.Datastore) (count int, err error)
- func (c Collection) Key() datastore.Key
- func (c *Collection) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *Collection) Read(store datastore.Datastore) error
- func (c *Collection) ReadItems(store datastore.Datastore, orderby string, limit, offset int) (items []*CollectionItem, err error)
- func (c *Collection) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c Collection) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *Collection) Save(store datastore.Datastore) (err error)
- func (c *Collection) SaveItems(store datastore.Datastore, items []*CollectionItem) error
- func (c *Collection) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type CollectionItem
- func (c CollectionItem) DatastoreType() string
- func (c *CollectionItem) Delete(store datastore.Datastore) error
- func (c CollectionItem) GetId() string
- func (c CollectionItem) Key() datastore.Key
- func (c *CollectionItem) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *CollectionItem) Read(store datastore.Datastore) error
- func (c *CollectionItem) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c CollectionItem) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *CollectionItem) Save(store datastore.Datastore) (err error)
- func (c *CollectionItem) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type Consensus
- type CustomCrawl
- func (CustomCrawl) DatastoreType() string
- func (c *CustomCrawl) Delete(store datastore.Datastore) error
- func (c CustomCrawl) GetId() string
- func (u CustomCrawl) Key() datastore.Key
- func (c *CustomCrawl) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (c *CustomCrawl) Read(store datastore.Datastore) error
- func (c *CustomCrawl) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (c *CustomCrawl) SQLQuery(cmd sql_datastore.Cmd) string
- func (c *CustomCrawl) Save(store datastore.Datastore) (err error)
- func (c *CustomCrawl) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type DataRepo
- func (d *DataRepo) DatastoreType() string
- func (d *DataRepo) Delete(store datastore.Datastore) error
- func (d *DataRepo) GetId() string
- func (d *DataRepo) Key() datastore.Key
- func (d *DataRepo) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (d *DataRepo) Read(store datastore.Datastore) error
- func (d DataRepo) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (d DataRepo) SQLQuery(cmd sql_datastore.Cmd) string
- func (d *DataRepo) Save(store datastore.Datastore) (err error)
- func (d *DataRepo) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type File
- type Link
- func (l *Link) DatastoreType() string
- func (l *Link) Delete(store datastore.Datastore) error
- func (l *Link) GetId() string
- func (l *Link) Insert(store datastore.Datastore) error
- func (l *Link) Key() datastore.Key
- func (l *Link) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (l *Link) Read(store datastore.Datastore) (err error)
- func (l *Link) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (l *Link) SQLQuery(cmd sql_datastore.Cmd) string
- func (l *Link) UnmarshalSQL(row sqlutil.Scannable) error
- func (l *Link) Update(store datastore.Datastore) error
- type Meta
- type Metadata
- func LatestMetadata(db sqlutil.Queryable, keyId, subject string) (m *Metadata, err error)
- func MetadataByKey(db sqlutil.Queryable, keyId string, limit, offset int) ([]*Metadata, error)
- func MetadataBySubject(db sqlutil.Queryable, subject string) ([]*Metadata, error)
- func NextMetadata(db sqlutil.Queryable, keyId, subject string) (*Metadata, error)
- func (m Metadata) DatastoreType() string
- func (m Metadata) GetId() string
- func (m *Metadata) HashMaps() (keyMap map[string]string, valueMap map[string]interface{}, err error)
- func (m *Metadata) HashableBytes() ([]byte, error)
- func (m Metadata) Key() datastore.Key
- func (m Metadata) String() string
- func (m *Metadata) UnmarshalSQL(row sqlutil.Scannable) error
- func (m *Metadata) Write(store datastore.Datastore) error
- type Primer
- func (p *Primer) CalcStats(db *sql.DB) error
- func (p Primer) DatastoreType() string
- func (p *Primer) Delete(store datastore.Datastore) error
- func (p Primer) GetId() string
- func (p Primer) Key() datastore.Key
- func (p *Primer) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (p *Primer) Read(store datastore.Datastore) error
- func (p *Primer) ReadSources(db sqlutil.Queryable) error
- func (p *Primer) ReadSubPrimers(db sqlutil.Queryable) error
- func (p *Primer) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (p *Primer) SQLQuery(cmd sql_datastore.Cmd) string
- func (p *Primer) Save(store datastore.Datastore) (err error)
- func (p *Primer) UnmarshalSQL(row sqlutil.Scannable) error
- type PrimerStats
- type Snapshot
- type Source
- func (c *Source) AsUrl(db *sql.DB) (*Url, error)
- func (s *Source) CalcStats(db *sql.DB) error
- func (s Source) DatastoreType() string
- func (s *Source) Delete(store datastore.Datastore) error
- func (s *Source) DescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func (s Source) GetId() string
- func (s Source) Key() datastore.Key
- func (s *Source) MatchesUrl(rawurl string) bool
- func (s *Source) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (s *Source) Read(store datastore.Datastore) error
- func (s *Source) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (s *Source) SQLQuery(cmd sql_datastore.Cmd) string
- func (s *Source) Save(store datastore.Datastore) (err error)
- func (s *Source) UndescribedContent(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func (c *Source) UnmarshalSQL(row sqlutil.Scannable) error
- type SourceStats
- type Uncrawlable
- func (u Uncrawlable) DatastoreType() string
- func (u *Uncrawlable) Delete(store datastore.Datastore) error
- func (u Uncrawlable) GetId() string
- func (u Uncrawlable) Key() datastore.Key
- func (u *Uncrawlable) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (u *Uncrawlable) Read(store datastore.Datastore) error
- func (u *Uncrawlable) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (u *Uncrawlable) SQLQuery(cmd sql_datastore.Cmd) string
- func (u *Uncrawlable) Save(store datastore.Datastore) (err error)
- func (u *Uncrawlable) UnmarshalSQL(row sqlutil.Scannable) (err error)
- type Url
- func ContentUrls(db sqlutil.Queryable, limit, skip int) ([]*Url, error)
- func FetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func ListUrls(store datastore.Datastore, limit, offset int) ([]*Url, error)
- func Search(db sqlutil.Queryable, q string, limit, offset int) ([]*Url, error)
- func UnfetchedUrls(db sqlutil.Queryable, limit, offset int) ([]*Url, error)
- func UnmarshalBoundedUrls(rows *sql.Rows, limit int) ([]*Url, error)
- func UnmarshalUrls(rows *sql.Rows) ([]*Url, error)
- func UrlsForHash(db sqlutil.Queryable, hash string) ([]*Url, error)
- func (u Url) DatastoreType() string
- func (u *Url) Delete(store datastore.Datastore) error
- func (u *Url) ExtractDocLinks(store datastore.Datastore, doc *goquery.Document) ([]*Link, error)
- func (u *Url) File() (*File, error)
- func (u *Url) Get(store datastore.Datastore) (body []byte, links []*Link, err error)
- func (u Url) GetId() string
- func (u *Url) HandleGetResponse(store datastore.Datastore, res *http.Response) (body []byte, links []*Link, err error)
- func (u *Url) HeadersMap() (headers map[string]string)
- func (u *Url) InboundLinks(db sqlutil.Queryable) ([]string, error)
- func (u Url) Key() datastore.Key
- func (u *Url) NewSQLModel(key datastore.Key) sql_datastore.Model
- func (u *Url) OutboundLinks(db sqlutil.Queryable) ([]string, error)
- func (u *Url) ParsedUrl() (*url.URL, error)
- func (u *Url) Read(store datastore.Datastore) error
- func (u *Url) SQLParams(cmd sql_datastore.Cmd) []interface{}
- func (u *Url) SQLQuery(cmd sql_datastore.Cmd) string
- func (u *Url) Save(store datastore.Datastore) (err error)
- func (u *Url) ShouldEnqueueGet() bool
- func (u *Url) ShouldEnqueueHead() bool
- func (u *Url) ShouldPutS3() bool
- func (u *Url) SuspectedContentUrl() bool
- func (u *Url) UnmarshalSQL(row sqlutil.Scannable) (err error)
- func (u *Url) WarcRequest() *warc.Request
Constants ¶
This section is empty.
Variables ¶
var (
    // how long before a url is considered stale. default is 72 hours.
    StaleDuration = time.Hour * 72

    // all these need to be set for file saving to work
    AwsRegion          string
    AwsAccessKeyId     string
    AwsSecretAccessKey string
    AwsS3BucketName    string
    AwsS3BucketPath    string
)
var (
    ErrNotFound        = fmt.Errorf("Not Found")
    ErrInvalidResponse = fmt.Errorf("Datastore returned an invalid response")
)
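All of the Aws* variables above must be set before file saving will work. A minimal configuration sketch (values and environment variable names are placeholders, not part of this package):

func configureS3() {
    archive.AwsRegion = os.Getenv("AWS_REGION")
    archive.AwsAccessKeyId = os.Getenv("AWS_ACCESS_KEY_ID")
    archive.AwsSecretAccessKey = os.Getenv("AWS_SECRET_ACCESS_KEY")
    archive.AwsS3BucketName = "my-archive-bucket" // placeholder
    archive.AwsS3BucketPath = "collections/"      // placeholder
}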
Functions ¶
func CalcHash ¶
CalcHash calculates the multihash key for a given slice of bytes. TODO - find a proper home for this
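A minimal usage sketch (godoc example style, imports elided; the exact multihash encoding is whatever CalcHash produces):

func hashExample() error {
    hash, err := archive.CalcHash([]byte("hello, archive"))
    if err != nil {
        return err
    }
    // hash is a multihash string, usable as a fixed content identifier
    fmt.Println(hash)
    return nil
}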
func CountPrimers ¶
CountPrimers returns the total number of primers
func CountSources ¶
CountSources grabs the total number of sources
func MetadataCountByKey ¶
func NormalizeURL ¶
NormalizeURL removes inconsistencies from a given url
func NormalizeURLString ¶
NormalizeURLString removes inconsistencies from a given url string
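A usage sketch (the input url is a placeholder; the exact normalizations applied are whatever NormalizeURLString implements):

func normalizeExample() error {
    s, err := archive.NormalizeURLString("HTTP://Example.COM/some/path")
    if err != nil {
        return err
    }
    // s is the canonicalized form of the input url
    fmt.Println(s)
    return nil
}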
Types ¶
type Collection ¶
type Collection struct {
// version 4 uuid
Id string `json:"id"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// sha256 multihash of the public key that created this collection
Creator string `json:"creator"`
// human-readable title of the collection
Title string `json:"title"`
// description of the collection
Description string `json:"description"`
// url this collection originates from
Url string `json:"url,omitempty"`
}
Collections are generic groupings of content. A collection can be thought of as a csv file listing content hashes as the first column, and whatever other information is necessary in subsequent columns
func CollectionsByCreator ¶
func ListCollections ¶
func ListCollections(store datastore.Datastore, limit, offset int) ([]*Collection, error)
func (Collection) DatastoreType ¶
func (c Collection) DatastoreType() string
func (*Collection) Delete ¶
func (c *Collection) Delete(store datastore.Datastore) error
Delete a collection, should only be done for erroneous additions
func (*Collection) DeleteItems ¶
func (c *Collection) DeleteItems(store datastore.Datastore, items []*CollectionItem) error
DeleteItems removes a given list of items from the collection
func (Collection) GetId ¶
func (c Collection) GetId() string
func (*Collection) ItemCount ¶
func (c *Collection) ItemCount(store datastore.Datastore) (count int, err error)
ItemCount gets the number of items in the collection
func (Collection) Key ¶
func (c Collection) Key() datastore.Key
func (*Collection) NewSQLModel ¶
func (c *Collection) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Collection) Read ¶
func (c *Collection) Read(store datastore.Datastore) error
Read collection from db
func (*Collection) ReadItems ¶
func (c *Collection) ReadItems(store datastore.Datastore, orderby string, limit, offset int) (items []*CollectionItem, err error)
ReadItems reads a bounded set of items from the collection. The orderby param currently only supports SQL-style input of a single property, eg: "index" or "index DESC"
func (*Collection) SQLParams ¶
func (c *Collection) SQLParams(cmd sql_datastore.Cmd) []interface{}
func (Collection) SQLQuery ¶
func (c Collection) SQLQuery(cmd sql_datastore.Cmd) string
func (*Collection) Save ¶
func (c *Collection) Save(store datastore.Datastore) (err error)
Save a collection
func (*Collection) SaveItems ¶
func (c *Collection) SaveItems(store datastore.Datastore, items []*CollectionItem) error
SaveItems saves a slice of items to the collection. It's up to you to ensure that the "index" param doesn't get all messed up. TODO - validate / automate the Index param?
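A sketch of assigning Index values explicitly and reading a page back, assuming a configured datastore.Datastore (the datastore interface is assumed to be github.com/ipfs/go-datastore, per the signatures above):

func fillCollection(store datastore.Datastore, c *archive.Collection, urls []archive.Url) ([]*archive.CollectionItem, error) {
    items := make([]*archive.CollectionItem, len(urls))
    for i, u := range urls {
        // the caller is responsible for keeping Index values consistent
        items[i] = &archive.CollectionItem{Url: u, Index: i}
    }
    if err := c.SaveItems(store, items); err != nil {
        return nil, err
    }
    // read the first 20 items back, ordered by index
    return c.ReadItems(store, "index", 20, 0)
}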
func (*Collection) UnmarshalSQL ¶
func (c *Collection) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the collection receiver. It expects the request to have used collectionCols() for selection
type CollectionItem ¶
type CollectionItem struct {
// Collection Items are Url's at heart
Url
// this item's index in the collection
Index int `json:"index"`
// unique description of this item
Description string `json:"description"`
// contains filtered or unexported fields
}
CollectionItem is an item in a collection. They are urls with added collection-specific information. This has the effect of storing all of the "main properties" of a collection item in the common list of urls
func (CollectionItem) DatastoreType ¶
func (c CollectionItem) DatastoreType() string
DatastoreType is to satisfy sql_datastore.Model interface
func (*CollectionItem) Delete ¶
func (c *CollectionItem) Delete(store datastore.Datastore) error
Delete a collection item
func (CollectionItem) GetId ¶
func (c CollectionItem) GetId() string
GetId returns the Id of the collectionItem, which is the id of the underlying Url
func (CollectionItem) Key ¶
func (c CollectionItem) Key() datastore.Key
Key is somewhat special as CollectionItems always have a Collection as their parent. This relationship is represented in directory-form: /Collection:[collection-id]/CollectionItem:[item-id]
func (*CollectionItem) NewSQLModel ¶
func (c *CollectionItem) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*CollectionItem) Read ¶
func (c *CollectionItem) Read(store datastore.Datastore) error
Read collection item from db
func (*CollectionItem) SQLParams ¶
func (c *CollectionItem) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams is to satisfy the sql_datastore.Model interface, it returns this CollectionItem's parameters for a given type of SQL command
func (CollectionItem) SQLQuery ¶
func (c CollectionItem) SQLQuery(cmd sql_datastore.Cmd) string
SQLQuery is to satisfy the sql_datastore.Model interface, it returns the concrete query for a given type of SQL command
func (*CollectionItem) Save ¶
func (c *CollectionItem) Save(store datastore.Datastore) (err error)
Save a collection item to a store
func (*CollectionItem) UnmarshalSQL ¶
func (c *CollectionItem) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the collection item receiver. It expects the request to have used collectionCols() for selection
type Consensus ¶
Consensus is an enumeration of Meta graph values arranged by key
func SumConsensus ¶
func SumConsensus(subject string, blocks []*Metadata) (c Consensus, values map[string]interface{}, err error)
SumConsensus tallies the consensus around a given subject hash from a provided Metadata slice
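A sketch of tallying consensus for a subject hash, assuming sqlutil is github.com/datatogether/sqlutil as the signatures suggest:

func consensusFor(db sqlutil.Queryable, subject string) (map[string]interface{}, error) {
    // gather all metadata blocks that describe this subject hash
    blocks, err := archive.MetadataBySubject(db, subject)
    if err != nil {
        return nil, err
    }
    // tally agreement across blocks into a single value map
    _, values, err := archive.SumConsensus(subject, blocks)
    return values, err
}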
type CustomCrawl ¶
type CustomCrawl struct {
// version 4 uuid
Id string `json:"id"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// Json Web token that created this request
Jwt string `json:"jwt"`
// MorphRunId
MorphRunId string `json:"morphRunId"`
// timestamp this run was completed
DateCompleted time.Time
// repository for code that ran the crawl
GithubRepo string `json:"githubRepo"`
// OriginalUrl
OriginalUrl string `json:"originalUrl"`
// SqliteChecksum
SqliteChecksum string `json:"sqliteChecksum"`
}
CustomCrawls are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the custom crawl, setting the stage for writing custom scripts to extract the underlying content.
func ListCustomCrawls ¶
func ListCustomCrawls(store datastore.Datastore, limit, offset int) ([]*CustomCrawl, error)
func (CustomCrawl) DatastoreType ¶
func (CustomCrawl) DatastoreType() string
func (*CustomCrawl) Delete ¶
func (c *CustomCrawl) Delete(store datastore.Datastore) error
Delete a custom crawl, should only be done for erroneous additions
func (CustomCrawl) GetId ¶
func (c CustomCrawl) GetId() string
func (CustomCrawl) Key ¶
func (u CustomCrawl) Key() datastore.Key
func (*CustomCrawl) NewSQLModel ¶
func (c *CustomCrawl) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*CustomCrawl) Read ¶
func (c *CustomCrawl) Read(store datastore.Datastore) error
Read custom crawl from db
func (*CustomCrawl) SQLParams ¶
func (c *CustomCrawl) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats a custom crawl struct for inserting / updating into postgres
func (*CustomCrawl) SQLQuery ¶
func (c *CustomCrawl) SQLQuery(cmd sql_datastore.Cmd) string
func (*CustomCrawl) Save ¶
func (c *CustomCrawl) Save(store datastore.Datastore) (err error)
Save a custom crawl
func (*CustomCrawl) UnmarshalSQL ¶
func (c *CustomCrawl) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the custom crawl receiver. It expects the request to have used customCrawlCols() for selection
type DataRepo ¶
type DataRepo struct {
// version 4 uuid
Id string
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// Title of this data repository
Title string `json:"title"`
// Human-readable description
Description string `json:"description"`
// Main url link to the DataRepository
Url string `json:"url"`
}
DataRepo is a place that holds data in a structured format
func (*DataRepo) DatastoreType ¶
func (*DataRepo) NewSQLModel ¶
func (d *DataRepo) NewSQLModel(key datastore.Key) sql_datastore.Model
func (DataRepo) SQLParams ¶
func (d DataRepo) SQLParams(cmd sql_datastore.Cmd) []interface{}
type File ¶
File is a buffered byte slice often made from a GET response body. It provides easy hash-calculation & storage to S3. TODO - deprecate, use s3-datastore, or, uh... the distributed web
func NewFileFromRes ¶
NewFileFromRes generates a new file by consuming & closing a given response body
type Link ¶
type Link struct {
// Calculated Hash for fixed ID purposes
Hash string
// created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// origin url of the linking document
Src *Url `json:"src"`
// absolute url of the <a> href property
Dst *Url `json:"dst"`
}
A Link represents an <a> tag in an html document src whose href attribute points to the url that resolves to dst. Both src & dst must be stored as urls
func ReadDstContentLinks ¶
ReadDstContentLinks returns a list of links that specify a given url as src that are content urls
func ReadDstLinks ¶
ReadDstLinks returns all links that specify a given url as src
func ReadSrcLinks ¶
ReadSrcLinks returns all links that specify a given url as dst
func (*Link) DatastoreType ¶
func (*Link) NewSQLModel ¶
func (l *Link) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Link) SQLParams ¶
func (l *Link) SQLParams(cmd sql_datastore.Cmd) []interface{}
type Meta ¶
type Meta struct {
Url string `json:"url"`
Date *time.Time `json:"date,omitempty"`
HeadersTook int `json:"headersTook,omitempty"`
Id string `json:"id"`
Status int `json:"status"`
ContentSniff string `json:"contentSniff,omitempty"`
RawHeaders []string `json:"rawHeaders"`
Headers map[string]string `json:"headers"`
DownloadTook int `json:"downloadTook,omitempty"`
Sha256 string `json:"sha256"`
Multihash string `json:"multihash"`
Consensus *Consensus `json:"consensus"`
InboundLinks []string `json:"inboundLinks,omitempty"`
OutboundLinks []string `json:"outboundLinks,omitempty"`
}
Meta is a struct for sharing our knowledge of a url with other services
type Metadata ¶
type Metadata struct {
// Hash is the sha256 multihash of all other fields in metadata
// as expressed by Metadata.HashableBytes()
Hash string `json:"hash"`
// Creation timestamp
Timestamp time.Time `json:"timestamp"`
// Sha256 multihash of the public key that signed this metadata
KeyId string `json:"keyId"`
// Sha256 multihash of the content this metadata is describing
Subject string `json:"subject"`
// Hash value of the metadata that came before this, if any
Prev string `json:"prev"`
// Actual metadata, a valid json Object
Meta map[string]interface{} `json:"meta"`
}
Metadata is a signed block of key-value information describing a subject hash. There can be many metadata entries for a given subject
func LatestMetadata ¶
LatestMetadata gives the metadata with the most recent timestamp for a given keyId & subject combination, if one exists
func MetadataByKey ¶
func MetadataBySubject ¶
MetadataBySubject returns all metadata for a given subject hash
func NextMetadata ¶
NextMetadata returns the next metadata block for a given subject. If no metablock exists a new one is created
func (Metadata) DatastoreType ¶
func (*Metadata) HashMaps ¶
func (m *Metadata) HashMaps() (keyMap map[string]string, valueMap map[string]interface{}, err error)
TODO - this is ripped from metablocks
func (*Metadata) HashableBytes ¶
HashableBytes returns the exact structure to be used for hash generation
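Per the Hash field's comment above, Hash is the multihash of HashableBytes(); a sketch of verifying a block under that assumption:

func verifyMetadataHash(m *archive.Metadata) (bool, error) {
    data, err := m.HashableBytes()
    if err != nil {
        return false, err
    }
    // recompute the multihash & compare against the stored value
    hash, err := archive.CalcHash(data)
    if err != nil {
        return false, err
    }
    return hash == m.Hash, nil
}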
func (*Metadata) UnmarshalSQL ¶
UnmarshalSQL reads an SQL result into the metadata receiver
type Primer ¶
type Primer struct {
// version 4 uuid
Id string `json:"id"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// shortest possible expression of this primer's name, usually an acronym
// called shortTitle b/c acronyms collide often & users should feel free to
// expand on acronyms
ShortTitle string `json:"shortTitle"`
// human-readable title of this primer.
Title string `json:"title"`
// long-form description of this primer.
// TODO - Maybe we should store this in markdown format?
Description string `json:"description"`
// parent primer (if any)
Parent *Primer `json:"parent"`
// child-primers list
SubPrimers []*Primer `json:"subPrimers,omitempty"`
// metadata to associate with this primer
Meta map[string]interface{} `json:"meta"`
// statistics about this primer
Stats *PrimerStats `json:"stats"`
// collection of child sources
Sources []*Source `json:"sources,omitempty"`
}
Primer is tracking information about an abstract group of content. For example, a government agency is a primer
func BasePrimers ¶
BasePrimers lists primers that have no parent
func ListPrimers ¶
ListPrimers
func UnmarshalBoundedPrimers ¶
UnmarshalBoundedPrimers turns sql.Rows into primers, expecting len(rows) <= limit
func (Primer) DatastoreType ¶
func (*Primer) NewSQLModel ¶
func (p *Primer) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Primer) ReadSources ¶
ReadSources reads child sources of this primer
func (*Primer) ReadSubPrimers ¶
ReadSubPrimers reads child primers of this primer
func (*Primer) SQLParams ¶
func (p *Primer) SQLParams(cmd sql_datastore.Cmd) []interface{}
type PrimerStats ¶
type PrimerStats struct {
UrlCount int `json:"urlCount"`
ArchivedUrlCount int `json:"archivedUrlCount"`
ContentUrlCount int `json:"contentUrlCount"`
ContentMetadataCount int `json:"contentMetadataCount"`
SourcesUrlCount int `json:"sourcesUrlCount"`
SourcesArchivedUrlCount int `json:"sourcesArchivedUrlCount"`
}
TODO - finish
type Snapshot ¶
type Snapshot struct {
// The url that was requested
Url string `json:"url"`
// Time this request was issued
Created time.Time `json:"date"`
// Returned Status
Status int `json:"status,omitempty"`
// Time to complete response in milliseconds
Duration int64 `json:"downloadTook,omitempty"`
// Record of all returned headers in [key,value,key,value...]
Headers []string `json:"headers,omitempty"`
// Multihash of response body (if any)
Hash string `json:"hash,omitempty"`
}
A snapshot is a record of a GET request to a url. There can be many snapshots of a given url
func SnapshotsForUrl ¶
SnapshotsForUrl returns all snapshots for a given url string
type Source ¶
type Source struct {
// version 4 uuid
Id string `json:"id"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// human-readable title for this source
Title string `json:"title"`
// description of the source, ideally one paragraph
Description string `json:"description"`
// absolute url to serve as the root of the
Url string `json:"url"`
// primer this source is connected to
Primer *Primer `json:"primer"`
// weather or not this url should be crawled be a web crawler
Crawl bool `json:"crawl"`
// amount of time before a link within this tree is considered in need
// of re-checking for changes. currently not in use, but planned.
StaleDuration time.Duration `json:"staleDuration"`
// yeah this'll probably get depricated. Part of a half-baked alerts feature idea.
LastAlertSent *time.Time `json:"lastAlertSent"`
// Metadata associated with this source that should be added to all
// child urls, currently not in use, but planned
Meta map[string]interface{} `json:"meta"`
// Stats about this source
Stats *SourceStats `json:"stats"`
}
Source is a concrete handle for archiving. Crawlers use a source's url as the base of a link tree. Sources are connected to a parent Primer to provide context & organization.
func CrawlingSources ¶
CrawlingSources lists sources with crawling = true, paginated
func ListSources ¶
ListSources lists all sources from most to least recent, paginated
func UnmarshalBoundedSources ¶
UnmarshalBoundedSources turns a standard sql.Rows of Source results into a *Source slice
func (*Source) AsUrl ¶
AsUrl retrieves the url that corresponds to the crawlUrl. If one doesn't exist, a new url is created & saved
func (Source) DatastoreType ¶
func (*Source) DescribedContent ¶
DescribedContent returns a list of content-urls from this subprimer that need work. TODO - this currently doesn't check the status of metadata, gonna need to do that
func (*Source) MatchesUrl ¶
MatchesUrl checks to see if the url pattern of Source is contained within the passed-in url string. TODO - make this more sophisticated, checking against the beginning of the url to avoid things like accidental matches, or urls in query params matching within rawurl
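A sketch of the containment behavior, and the accidental match the TODO warns about, assuming a plain substring check (urls are placeholders):

func matchExamples() {
    s := &archive.Source{Url: "https://example.gov/data"}
    // true: within the source's link tree
    fmt.Println(s.MatchesUrl("https://example.gov/data/report.csv"))
    // also true under a substring check - the query-param match the TODO describes
    fmt.Println(s.MatchesUrl("https://elsewhere.org/?ref=https://example.gov/data"))
}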
func (*Source) NewSQLModel ¶
func (s *Source) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Source) SQLParams ¶
func (s *Source) SQLParams(cmd sql_datastore.Cmd) []interface{}
func (*Source) UndescribedContent ¶
UndescribedContent returns a list of content-urls from this subprimer that need work. TODO - this currently doesn't check the status of metadata, gonna need to do that
type SourceStats ¶
type Uncrawlable ¶
type Uncrawlable struct {
// version 4 uuid
Id string `json:"id"`
// url from urls table, must be unique
Url string `json:"url"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated"`
// sha256 multihash of the public key that created this uncrawlable
Creator string `json:"creator"`
// name of person making submission
Name string `json:"name"`
// email address of person making submission
Email string `json:"email"`
// name of data rescue event where uncrawlable was added
EventName string `json:"eventName"`
// agency name
Agency string `json:"agency"`
// EDGI agency Id
AgencyId string `json:"agencyId"`
// EDGI subagency Id
SubagencyId string `json:"subagencyId"`
// EDGI organization Id
OrgId string `json:"orgId"`
// EDGI Suborganization Id
SuborgId string `json:"suborgId"`
// EDGI subprimer Id
SubprimerId string `json:"subprimerId"`
// flag for ftp content
Ftp bool `json:"ftp"`
// flag for 'database'
// TODO - refine this?
Database bool `json:"database"`
// flag for visualization / interactive content
// obfuscating data
Interactive bool `json:"interactive"`
// flag for a page that links to many files
ManyFiles bool `json:"manyFiles"`
// uncrawlable comments
Comments string `json:"comments"`
}
Uncrawlables are urls that contain content that cannot be extracted with traditional web crawling / scraping methods. This model classifies the nature of the uncrawlable, setting the stage for writing custom scripts to extract the underlying content.
func ListUncrawlables ¶
func ListUncrawlables(store datastore.Datastore, limit, offset int) ([]*Uncrawlable, error)
func (Uncrawlable) DatastoreType ¶
func (u Uncrawlable) DatastoreType() string
func (*Uncrawlable) Delete ¶
func (u *Uncrawlable) Delete(store datastore.Datastore) error
Delete an uncrawlable, should only be done for erroneous additions
func (Uncrawlable) GetId ¶
func (u Uncrawlable) GetId() string
func (Uncrawlable) Key ¶
func (u Uncrawlable) Key() datastore.Key
func (*Uncrawlable) NewSQLModel ¶
func (u *Uncrawlable) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Uncrawlable) Read ¶
func (u *Uncrawlable) Read(store datastore.Datastore) error
Read uncrawlable from db
func (*Uncrawlable) SQLParams ¶
func (u *Uncrawlable) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats an uncrawlable struct for inserting / updating into postgres
func (*Uncrawlable) SQLQuery ¶
func (u *Uncrawlable) SQLQuery(cmd sql_datastore.Cmd) string
func (*Uncrawlable) Save ¶
func (u *Uncrawlable) Save(store datastore.Datastore) (err error)
Save an uncrawlable
func (*Uncrawlable) UnmarshalSQL ¶
func (u *Uncrawlable) UnmarshalSQL(row sqlutil.Scannable) (err error)
UnmarshalSQL reads an sql response into the uncrawlable receiver. It expects the request to have used uncrawlableCols() for selection
type Url ¶
type Url struct {
// version 4 uuid
// urls can/should/must also be uniquely identified by Url
Id string `json:"id,omitempty"`
// A Url is uniquely identified by URI string without
// any normalization. Url strings must always be absolute.
Url string `json:"url"`
// Created timestamp rounded to seconds in UTC
Created time.Time `json:"created,omitempty"`
// Updated timestamp rounded to seconds in UTC
Updated time.Time `json:"updated,omitempty"`
// Timestamp for most recent GET request
LastGet *time.Time `json:"lastGet,omitempty"`
// Timestamp for most recent HEAD request
LastHead *time.Time `json:"lastHead,omitempty"`
// Returned HTTP status code
Status int `json:"status,omitempty"`
// Returned HTTP 'Content-Type' header
ContentType string `json:"contentType,omitempty"`
// Result of mime sniffing to GET response body, as detailed at https://mimesniff.spec.whatwg.org
ContentSniff string `json:"contentSniff,omitempty"`
// ContentLength in bytes, will be the header value if only a HEAD request has been issued
// After a valid GET response, it will be set to the length of the returned response
ContentLength int64 `json:"contentLength,omitempty"`
// best guess at a filename based on url string analysis
// if you just want to know what type of file this is, this is the field to use.
FileName string `json:"fileName,omitempty"`
// HTML Title tag attribute
Title string `json:"title,omitempty"`
// Time remote server took to transfer content in milliseconds.
// TODO - currently not implemented
DownloadTook int `json:"downloadTook,omitempty"`
// Time taken to fetch headers, in milliseconds. currently not implemented
HeadersTook int `json:"headersTook,omitempty"`
// key-value slice of returned headers from most recent HEAD or GET request
// stored in the form [key,value,key,value...]
Headers []string `json:"headers,omitempty"`
// any associative metadata
Meta map[string]interface{} `json:"meta,omitempty"`
// Hash is a multihash sha-256 of res.Body
Hash string `json:"hash,omitempty"`
// Url to saved content
ContentUrl string `json:"contentUrl,omitempty"`
// Uncrawlable information
Uncrawlable *Uncrawlable `json:"uncrawlable,omitempty"`
}
URL represents... a url. TODO - consider renaming to Resource
func UnmarshalUrls ¶
UnmarshalUrls takes an sql cursor & returns a slice of url pointers. It expects columns to match urlCols()
func (Url) DatastoreType ¶
func (*Url) ExtractDocLinks ¶
ExtractDocLinks extracts & stores a page's linked documents by selecting all a[href] links from a given goquery document, using the receiver *Url as the base
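A sketch of feeding a fetched page through goquery (github.com/PuerkitoBio/goquery) and into ExtractDocLinks, assuming a configured datastore.Datastore:

func collectLinks(store datastore.Datastore, u *archive.Url) ([]*archive.Link, error) {
    res, err := http.Get(u.Url)
    if err != nil {
        return nil, err
    }
    defer res.Body.Close()

    doc, err := goquery.NewDocumentFromReader(res.Body)
    if err != nil {
        return nil, err
    }
    // store each a[href] as a Link with u as the src
    return u.ExtractDocLinks(store, doc)
}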
func (*Url) File ¶
File leverages a url's hash to generate a file that can have its bytes read back
func (*Url) HandleGetResponse ¶
func (u *Url) HandleGetResponse(store datastore.Datastore, res *http.Response) (body []byte, links []*Link, err error)
HandleGetResponse performs all necessary actions in response to a GET request, regardless of whether it came from a crawl or archive request
func (*Url) HeadersMap ¶
HeadersMap formats u.Headers (a string slice) as a map[header]value
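Since Headers is stored as a flat [key,value,key,value...] slice (see the Url struct above), HeadersMap pairs entries up. For example:

func headersExample() {
    u := &archive.Url{
        Headers: []string{"Content-Type", "text/html", "Server", "nginx"},
    }
    m := u.HeadersMap()
    fmt.Println(m["Content-Type"]) // "text/html"
}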
func (*Url) InboundLinks ¶
InboundLinks returns a slice of url strings that link to this url
func (*Url) NewSQLModel ¶
func (u *Url) NewSQLModel(key datastore.Key) sql_datastore.Model
func (*Url) OutboundLinks ¶
OutboundLinks returns a slice of url strings that this url links to
func (*Url) SQLParams ¶
func (u *Url) SQLParams(cmd sql_datastore.Cmd) []interface{}
SQLParams formats a url struct for inserting / updating into postgres
func (*Url) ShouldEnqueueGet ¶
ShouldEnqueueGet returns whether the url can be added to the queue for a GET request. Keep in mind only urls whose domain is marked crawl: true in the domains list will be candidates for GET requests. It should return true if:
- the url is of http / https scheme
- it has never been GET'd, or hasn't been GET'd for a period longer than the stale duration
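A sketch of the staleness rule against the package-level StaleDuration (72 hours by default); the url value is a placeholder:

func staleExample() {
    lastGet := time.Now().Add(-100 * time.Hour) // older than the 72h StaleDuration
    u := &archive.Url{
        Url:     "https://example.gov/data.csv",
        LastGet: &lastGet,
    }
    // http(s) scheme & a stale last GET, so this should report true
    fmt.Println(u.ShouldEnqueueGet())
}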
func (*Url) ShouldEnqueueHead ¶
ShouldEnqueueHead returns whether the url can be added to the queue for a HEAD request. It should return true if:
- the url is of http / https scheme
- it has never been GET'd, or hasn't been GET'd for a period longer than the stale duration
func (*Url) ShouldPutS3 ¶
ShouldPutS3 is a chance to override whether the content should be stored
func (*Url) SuspectedContentUrl ¶
SuspectedContentUrl examines the url string, returns true if there's a reasonable chance the url leads to content
func (*Url) UnmarshalSQL ¶
UnmarshalSQL reads an sql response into the url receiver. It expects the request to have used urlCols() for selection