Documentation
¶
Index ¶
- type APICollector
- type Article
- type ArticleFilter
- type AtomAuthor
- type AtomCategory
- type AtomContent
- type AtomEntry
- type AtomFeed
- type AtomLink
- type BatchCollector
- type Channel
- type CollectConfig
- type CollectResult
- type CollectorManager
- type CollectorManagerImpl
- func (cm *CollectorManagerImpl) CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult
- func (cm *CollectorManagerImpl) CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)
- func (cm *CollectorManagerImpl) GetCollector(sourceType string) (DataCollector, bool)
- func (cm *CollectorManagerImpl) RegisterCollector(sourceType string, collector DataCollector)
- type DataCollector
- type DevToArticle
- type DevToOrg
- type DevToUser
- type GitHubIssue
- type GitHubRepo
- type HTMLCollector
- type HTMLSelector
- type Item
- type Link
- type RSSCollector
- type RSSFeed
- type RetryConfig
- type SortBy
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type APICollector ¶
type APICollector struct {
// contains filtered or unexported fields
}
APICollector API数据采集器
func (*APICollector) Collect ¶
func (a *APICollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集API数据
func (*APICollector) GetSourceType ¶
func (a *APICollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*APICollector) Validate ¶
func (a *APICollector) Validate(config CollectConfig) error
Validate 验证配置
type Article ¶
type Article struct {
ID string `json:"id"`
Title string `json:"title"`
Content string `json:"content"`
Summary string `json:"summary"`
Author string `json:"author"`
URL string `json:"url"`
PublishedAt time.Time `json:"published_at"`
Tags []string `json:"tags"`
Source string `json:"source"`
SourceType string `json:"source_type"`
Language string `json:"language"`
Metadata map[string]string `json:"metadata"`
}
Article 表示采集到的文章数据
func AggregateResults ¶
func AggregateResults(results []CollectResult) ([]Article, []error)
AggregateResults 聚合多个采集结果
func DeduplicateArticles ¶
DeduplicateArticles 去重文章
type ArticleFilter ¶
type ArticleFilter struct {
Keywords []string // 关键词过滤
Authors []string // 作者过滤
Tags []string // 标签过滤
Languages []string // 语言过滤
DateFrom time.Time // 起始日期
DateTo time.Time // 结束日期
MinLength int // 最小内容长度
MaxLength int // 最大内容长度
}
ArticleFilter 根据条件过滤文章
func (*ArticleFilter) FilterArticles ¶
func (af *ArticleFilter) FilterArticles(articles []Article) []Article
FilterArticles 过滤文章
type AtomAuthor ¶
type AtomCategory ¶
type AtomCategory struct {
Term string `xml:"term,attr"`
}
type AtomContent ¶
type AtomEntry ¶
type AtomEntry struct {
Title string `xml:"title"`
Content AtomContent `xml:"content"`
Summary string `xml:"summary"`
Link []AtomLink `xml:"link"`
ID string `xml:"id"`
Published string `xml:"published"`
Updated string `xml:"updated"`
Author AtomAuthor `xml:"author"`
Category []AtomCategory `xml:"category"`
}
type AtomFeed ¶
type AtomFeed struct {
XMLName xml.Name `xml:"feed"`
Title string `xml:"title"`
Link []AtomLink `xml:"link"`
Entries []AtomEntry `xml:"entry"`
}
AtomFeed Atom feed 结构定义
type BatchCollector ¶
type BatchCollector struct {
// contains filtered or unexported fields
}
BatchCollector 批量采集器
func NewBatchCollector ¶
func NewBatchCollector(manager CollectorManager, maxConcurrent int, timeout time.Duration) *BatchCollector
NewBatchCollector 创建批量采集器
func (*BatchCollector) CollectBatch ¶
func (bc *BatchCollector) CollectBatch(ctx context.Context, configs []CollectConfig) []CollectResult
CollectBatch 批量采集,支持并发控制
type CollectConfig ¶
type CollectConfig struct {
URL string `json:"url"`
Headers map[string]string `json:"headers,omitempty"`
Timeout time.Duration `json:"timeout,omitempty"`
MaxArticles int `json:"max_articles,omitempty"`
Language string `json:"language,omitempty"`
Tags []string `json:"tags,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
CollectConfig 表示采集配置
type CollectResult ¶
type CollectResult struct {
Articles []Article `json:"articles"`
Source string `json:"source"`
Error error `json:"error,omitempty"`
}
CollectResult 表示采集结果
type CollectorManager ¶
type CollectorManager interface {
// RegisterCollector 注册采集器
RegisterCollector(sourceType string, collector DataCollector)
// CollectAll 并发采集多个数据源
CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult
// CollectWithRetry 带重试机制的采集
CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)
// GetCollector 根据类型获取采集器
GetCollector(sourceType string) (DataCollector, bool)
}
CollectorManager 管理所有采集器并提供并发采集能力
type CollectorManagerImpl ¶
type CollectorManagerImpl struct {
// contains filtered or unexported fields
}
CollectorManagerImpl 采集器管理器实现
func (*CollectorManagerImpl) CollectAll ¶
func (cm *CollectorManagerImpl) CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult
CollectAll 并发采集多个数据源
func (*CollectorManagerImpl) CollectWithRetry ¶
func (cm *CollectorManagerImpl) CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)
CollectWithRetry 带重试机制的采集
func (*CollectorManagerImpl) GetCollector ¶
func (cm *CollectorManagerImpl) GetCollector(sourceType string) (DataCollector, bool)
GetCollector 根据类型获取采集器
func (*CollectorManagerImpl) RegisterCollector ¶
func (cm *CollectorManagerImpl) RegisterCollector(sourceType string, collector DataCollector)
RegisterCollector 注册采集器
type DataCollector ¶
type DataCollector interface {
// Collect 采集数据,返回文章列表
Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
// GetSourceType 返回采集器类型
GetSourceType() string
// Validate 验证配置是否有效
Validate(config CollectConfig) error
}
DataCollector 定义数据采集器的统一接口
type DevToArticle ¶
type DevToArticle struct {
ID int `json:"id"`
Title string `json:"title"`
Description string `json:"description"`
BodyMarkdown string `json:"body_markdown"`
URL string `json:"url"`
PublishedAt string `json:"published_at"`
CreatedAt string `json:"created_at"`
TagList []string `json:"tag_list"`
User DevToUser `json:"user"`
Organization *DevToOrg `json:"organization"`
ReadingTimeMinutes int `json:"reading_time_minutes"`
}
DevToArticle Dev.to API 响应结构
type GitHubIssue ¶
type GitHubIssue struct {
ID int `json:"id"`
Number int `json:"number"`
Title string `json:"title"`
Body string `json:"body"`
State string `json:"state"`
HTMLURL string `json:"html_url"`
CreatedAt string `json:"created_at"`
UpdatedAt string `json:"updated_at"`
User struct {
Login string `json:"login"`
} `json:"user"`
Labels []struct {
Name string `json:"name"`
} `json:"labels"`
}
type GitHubRepo ¶
type GitHubRepo struct {
ID int `json:"id"`
Name string `json:"name"`
FullName string `json:"full_name"`
Description string `json:"description"`
HTMLURL string `json:"html_url"`
Language string `json:"language"`
CreatedAt string `json:"created_at"`
UpdatedAt string `json:"updated_at"`
PushedAt string `json:"pushed_at"`
StargazersCount int `json:"stargazers_count"`
ForksCount int `json:"forks_count"`
WatchersCount int `json:"watchers_count"`
OpenIssuesCount int `json:"open_issues_count"`
Fork bool `json:"fork"`
Owner struct {
Login string `json:"login"`
} `json:"owner"`
Topics []string `json:"topics"`
}
GitHubRepo GitHub API 响应结构
type HTMLCollector ¶
type HTMLCollector struct {
// contains filtered or unexported fields
}
HTMLCollector HTML网页采集器
func (*HTMLCollector) Collect ¶
func (h *HTMLCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集HTML页面数据
func (*HTMLCollector) GetSourceType ¶
func (h *HTMLCollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*HTMLCollector) Validate ¶
func (h *HTMLCollector) Validate(config CollectConfig) error
Validate 验证配置
type HTMLSelector ¶
type HTMLSelector struct {
Title string `json:"title"` // 标题选择器
Content string `json:"content"` // 内容选择器
Summary string `json:"summary"` // 摘要选择器
Author string `json:"author"` // 作者选择器
PublishedAt string `json:"published_at"` // 发布时间选择器
Tags string `json:"tags"` // 标签选择器
Links string `json:"links"` // 链接选择器
}
HTMLSelector 定义HTML选择器配置
type RSSCollector ¶
type RSSCollector struct {
// contains filtered or unexported fields
}
RSSCollector RSS数据采集器
func (*RSSCollector) Collect ¶
func (r *RSSCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集RSS数据
func (*RSSCollector) GetSourceType ¶
func (r *RSSCollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*RSSCollector) Validate ¶
func (r *RSSCollector) Validate(config CollectConfig) error
Validate 验证配置
type RetryConfig ¶
type RetryConfig struct {
MaxRetries int `json:"max_retries"`
RetryDelay time.Duration `json:"retry_delay"`
}
RetryConfig 重试配置