Documentation
¶
Overview ¶
Package processor provides unified data processing capabilities
Index ¶
- func ValidatePaginationConfig(config PaginationConfig) error
- func ValidateSortConfig(config SortConfig) error
- type ArticleSorter
- func (as *ArticleSorter) SetSortConfig(config SortConfig)
- func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)
- func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)
- func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article
- type Config
- type Converter
- func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)
- func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)
- func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)
- func (c *Converter) GenerateHash(title, url string) string
- func (c *Converter) GetConfig() ConverterConfig
- func (c *Converter) UpdateConfig(config ConverterConfig)
- type ConverterConfig
- type DocumentFrequency
- type KeywordMatcher
- type PaginationConfig
- type PaginationResult
- type ProcessOptions
- type Processor
- func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64
- func (p *Processor) GetStats() ProcessorStats
- func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)
- func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)
- type ProcessorStats
- type RelevanceScorer
- func (rs *RelevanceScorer) AddToCorpus(article *models.Article)
- func (rs *RelevanceScorer) ClearCache()
- func (rs *RelevanceScorer) GetKeywords() []string
- func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency
- func (rs *RelevanceScorer) GetWeightConfig() WeightConfig
- func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64
- func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)
- func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)
- func (rs *RelevanceScorer) UpdateKeywords(keywords []string)
- type RepositoryPaginationResult
- type RepositorySorter
- func (rs *RepositorySorter) SetSortConfig(config SortConfig)
- func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)
- func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)
- func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository
- type SentenceScore
- type SimpleStemmer
- type SortBy
- type SortConfig
- type SortOrder
- type Summarizer
- type TFIDFScore
- type TermFrequency
- type UserPreferences
- type WeightConfig
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ValidatePaginationConfig ¶
func ValidatePaginationConfig(config PaginationConfig) error
ValidatePaginationConfig validates pagination configuration
func ValidateSortConfig ¶
func ValidateSortConfig(config SortConfig) error
ValidateSortConfig validates a sort configuration
Types ¶
type ArticleSorter ¶
type ArticleSorter struct {
// contains filtered or unexported fields
}
ArticleSorter handles multi-dimensional sorting and pagination of articles
func NewArticleSorter ¶
func NewArticleSorter(relevanceScorer *RelevanceScorer) *ArticleSorter
NewArticleSorter creates a new ArticleSorter instance
func (*ArticleSorter) SetSortConfig ¶
func (as *ArticleSorter) SetSortConfig(config SortConfig)
SetSortConfig updates the sorting configuration
func (*ArticleSorter) SetUserPreferences ¶
func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)
SetUserPreferences sets user-specific preferences for personalized ranking
func (*ArticleSorter) SortAndPaginate ¶
func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)
SortAndPaginate sorts articles and applies pagination
func (*ArticleSorter) SortArticles ¶
func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article
SortArticles sorts articles according to the configured criteria
type Config ¶
type Config struct {
	EnableSummarization bool `json:"enableSummarization"`
	EnableSorting bool `json:"enableSorting"`
	MaxSummaryLength int `json:"maxSummaryLength"`
	ProcessingTimeout time.Duration `json:"processingTimeout"`
	MaxConcurrency int `json:"maxConcurrency"`
}
Config holds processor configuration
func DefaultConfig ¶
func DefaultConfig() *Config
DefaultConfig returns default processor configuration
type Converter ¶
type Converter struct {
// contains filtered or unexported fields
}
Converter handles conversion from raw collector data to unified Article/Repository models. It provides thread-safe operations for concurrent processing and comprehensive data normalization.
func NewConverter ¶
func NewConverter(config ConverterConfig) *Converter
NewConverter creates a new converter with the given configuration
func NewDefaultConverter ¶
func NewDefaultConverter() *Converter
NewDefaultConverter creates a new converter with default configuration
func (*Converter) BatchConvertArticles ¶
func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)
BatchConvertArticles converts multiple collector articles concurrently
func (*Converter) ConvertToArticle ¶
func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)
ConvertToArticle converts a collector.Article to models.Article with full data normalization
func (*Converter) ConvertToRepository ¶
func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)
ConvertToRepository converts API data to models.Repository for repository-type content
func (*Converter) GenerateHash ¶
func (c *Converter) GenerateHash(title, url string) string
GenerateHash creates a hash for deduplication purposes
func (*Converter) GetConfig ¶
func (c *Converter) GetConfig() ConverterConfig
GetConfig returns a copy of the current configuration
func (*Converter) UpdateConfig ¶
func (c *Converter) UpdateConfig(config ConverterConfig)
UpdateConfig updates the converter configuration in a thread-safe manner
type ConverterConfig ¶
type ConverterConfig struct {
	// Maximum length for summary text (default: 1000)
	MaxSummaryLength int
	// Maximum length for title text (default: 500)
	MaxTitleLength int
	// Maximum length for content text (default: 50000)
	MaxContentLength int
	// Default quality score for articles without explicit quality indicators
	DefaultQuality float64
	// Default relevance score for articles without relevance calculation
	DefaultRelevance float64
	// Enable aggressive HTML cleaning (removes more tags and attributes)
	AggressiveHTMLCleaning bool
	// Normalize URLs to canonical form (removes tracking parameters, etc.)
	NormalizeURLs bool
	// Time zone for date normalization (default: UTC)
	TimeZone *time.Location
}
ConverterConfig contains configuration options for data conversion
func DefaultConverterConfig ¶
func DefaultConverterConfig() ConverterConfig
DefaultConverterConfig returns a configuration with sensible defaults
type DocumentFrequency ¶
DocumentFrequency represents how many documents contain a specific term
type KeywordMatcher ¶
type KeywordMatcher struct {
	Keywords []string `json:"keywords"`
	WeightConfig WeightConfig `json:"weightConfig"`
	// contains filtered or unexported fields
}
KeywordMatcher handles keyword matching with weighted scoring
func (*KeywordMatcher) ScoreKeywordMatch ¶
func (km *KeywordMatcher) ScoreKeywordMatch(article *models.Article) float64
ScoreKeywordMatch calculates keyword matching score with weights
type PaginationConfig ¶
type PaginationConfig struct {
	Page int `json:"page"` // Current page (1-based)
	PageSize int `json:"pageSize"` // Items per page
}
PaginationConfig defines pagination settings
func GetDefaultPaginationConfig ¶
func GetDefaultPaginationConfig() PaginationConfig
GetDefaultPaginationConfig returns default pagination configuration
type PaginationResult ¶
type PaginationResult struct {
	Items []*models.Article `json:"items"`
	CurrentPage int `json:"currentPage"`
	PageSize int `json:"pageSize"`
	TotalItems int `json:"totalItems"`
	TotalPages int `json:"totalPages"`
	HasNext bool `json:"hasNext"`
	HasPrev bool `json:"hasPrev"`
}
PaginationResult contains paginated results with metadata
type ProcessOptions ¶
type ProcessOptions struct {
	Query string `json:"query"`
	SortBy string `json:"sortBy"`
	SortOrder string `json:"sortOrder"`
	Limit int `json:"limit"`
}
ProcessOptions defines options for processing
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
Processor provides unified data processing functionality
func NewProcessor ¶
NewProcessor creates a new processor instance
func (*Processor) CalculateFrontendRelevance ¶
func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64
CalculateFrontendRelevance calculates how relevant an article is to frontend development
func (*Processor) GetStats ¶
func (p *Processor) GetStats() ProcessorStats
GetStats returns processor statistics
func (*Processor) ProcessArticles ¶
func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)
ProcessArticles processes a slice of articles with various enhancements
func (*Processor) ProcessRepositories ¶
func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)
ProcessRepositories processes a slice of repositories
type ProcessorStats ¶
type ProcessorStats struct {
	ProcessedArticles int `json:"processedArticles"`
	ProcessedRepositories int `json:"processedRepositories"`
	AverageProcessingTime time.Duration `json:"averageProcessingTime"`
	CacheHitRate float64 `json:"cacheHitRate"`
}
ProcessorStats holds processor performance statistics
type RelevanceScorer ¶
type RelevanceScorer struct {
// contains filtered or unexported fields
}
RelevanceScorer handles content relevance scoring using TF-IDF and keyword matching
func NewRelevanceScorer ¶
func NewRelevanceScorer(keywords []string) *RelevanceScorer
NewRelevanceScorer creates a new instance of RelevanceScorer
func (*RelevanceScorer) AddToCorpus ¶
func (rs *RelevanceScorer) AddToCorpus(article *models.Article)
AddToCorpus adds an article to the corpus
func (*RelevanceScorer) ClearCache ¶
func (rs *RelevanceScorer) ClearCache()
ClearCache clears all internal caches
func (*RelevanceScorer) GetKeywords ¶
func (rs *RelevanceScorer) GetKeywords() []string
GetKeywords returns the current keywords
func (*RelevanceScorer) GetTopTerms ¶
func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency
GetTopTerms returns the top N terms for an article based on TF-IDF
func (*RelevanceScorer) GetWeightConfig ¶
func (rs *RelevanceScorer) GetWeightConfig() WeightConfig
GetWeightConfig returns the current weight configuration
func (*RelevanceScorer) ScoreRelevance ¶
func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64
ScoreRelevance calculates the relevance score for an article
func (*RelevanceScorer) SetCorpus ¶
func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)
SetCorpus sets the corpus for IDF calculation
func (*RelevanceScorer) SetWeightConfig ¶
func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)
SetWeightConfig updates the weight configuration
func (*RelevanceScorer) UpdateKeywords ¶
func (rs *RelevanceScorer) UpdateKeywords(keywords []string)
UpdateKeywords updates the keywords used for scoring
type RepositoryPaginationResult ¶
type RepositoryPaginationResult struct {
	Items []*models.Repository `json:"items"`
	CurrentPage int `json:"currentPage"`
	PageSize int `json:"pageSize"`
	TotalItems int `json:"totalItems"`
	TotalPages int `json:"totalPages"`
	HasNext bool `json:"hasNext"`
	HasPrev bool `json:"hasPrev"`
}
RepositoryPaginationResult contains paginated repository results
type RepositorySorter ¶
type RepositorySorter struct {
// contains filtered or unexported fields
}
RepositorySorter handles sorting of repositories
func NewRepositorySorter ¶
func NewRepositorySorter() *RepositorySorter
NewRepositorySorter creates a new RepositorySorter instance
func (*RepositorySorter) SetSortConfig ¶
func (rs *RepositorySorter) SetSortConfig(config SortConfig)
SetSortConfig updates the sorting configuration for repositories
func (*RepositorySorter) SetUserPreferences ¶
func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)
SetUserPreferences sets user preferences for repository sorting
func (*RepositorySorter) SortAndPaginate ¶
func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)
SortAndPaginate sorts repositories and applies pagination
func (*RepositorySorter) SortRepositories ¶
func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository
SortRepositories sorts repositories according to the configured criteria
type SentenceScore ¶
SentenceScore represents a sentence with its calculated importance score
type SimpleStemmer ¶
type SimpleStemmer struct {
// contains filtered or unexported fields
}
SimpleStemmer provides basic word stemming functionality
func NewSimpleStemmer ¶
func NewSimpleStemmer() *SimpleStemmer
NewSimpleStemmer creates a new stemmer instance
func (*SimpleStemmer) Stem ¶
func (ss *SimpleStemmer) Stem(word string) string
Stem applies basic stemming to a word
type SortBy ¶
type SortBy string
SortBy defines the primary sorting criteria
const (
	SortByRelevance SortBy = "relevance" // Sort by relevance score
	SortByTime SortBy = "time" // Sort by publication time
	SortByPopularity SortBy = "popularity" // Sort by quality/popularity metrics
	SortByTrend SortBy = "trend" // Sort by trending score
	SortByComposite SortBy = "composite" // Weighted combination of multiple factors
)
type SortConfig ¶
type SortConfig struct {
	Primary SortBy `json:"primary"` // Primary sorting criterion
	Secondary SortBy `json:"secondary"` // Secondary sorting criterion (for tie-breaking)
	Order SortOrder `json:"order"` // Sort order

	// Weights for composite sorting
	RelevanceWeight float64 `json:"relevanceWeight"` // Weight for relevance (default: 0.4)
	TimeWeight float64 `json:"timeWeight"` // Weight for recency (default: 0.3)
	PopularityWeight float64 `json:"popularityWeight"` // Weight for popularity (default: 0.2)
	TrendWeight float64 `json:"trendWeight"` // Weight for trending (default: 0.1)
}
SortConfig defines the sorting configuration
func GetDefaultSortConfig ¶
func GetDefaultSortConfig() SortConfig
GetDefaultSortConfig returns default sorting configuration
type Summarizer ¶
type Summarizer struct {
	// Configuration for summary generation
	MinSummaryLength int // Minimum summary length in characters
	MaxSummaryLength int // Maximum summary length in characters
	SentenceCount int // Target number of sentences in summary
	PositionWeight float64 // Weight for sentence position (earlier sentences score higher)
	LengthWeight float64 // Weight for sentence length
	KeywordWeight float64 // Weight for keyword density
}
Summarizer handles intelligent content summarization and processing
func NewSummarizer ¶
func NewSummarizer() *Summarizer
NewSummarizer creates a new Summarizer with default configuration
func (*Summarizer) AssessQuality ¶
func (s *Summarizer) AssessQuality(article *models.Article) float64
AssessQuality evaluates content quality based on multiple factors
func (*Summarizer) CleanContent ¶
func (s *Summarizer) CleanContent(article *models.Article) error
CleanContent performs content cleaning and preprocessing
func (*Summarizer) GenerateSummary ¶
func (s *Summarizer) GenerateSummary(text string) (string, error)
GenerateSummary creates an intelligent summary of the given text. Uses sentence extraction based on position, length, and keyword density.
func (*Summarizer) ProcessArticle ¶
func (s *Summarizer) ProcessArticle(article *models.Article) error
ProcessArticle performs complete content processing on an article. This includes cleaning, summarization, and quality assessment.
type TFIDFScore ¶
type TFIDFScore struct {
	Term string `json:"term"`
	TF float64 `json:"tf"` // Term Frequency
	IDF float64 `json:"idf"` // Inverse Document Frequency
	TFIDF float64 `json:"tfidf"` // TF-IDF Score
	Weight float64 `json:"weight"` // Additional weight based on position/importance
}
TFIDFScore represents the TF-IDF score for a term in a document
type TermFrequency ¶
type TermFrequency struct {
	Term string `json:"term"`
	Count int `json:"count"`
	Frequency float64 `json:"frequency"`
}
TermFrequency represents the frequency of a term in a document
type UserPreferences ¶
type UserPreferences struct {
	FavoriteTopics []string `json:"favoriteTopics"` // Preferred topic keywords
	PreferredSources []string `json:"preferredSources"` // Preferred news sources
	ReadingHistory []string `json:"readingHistory"` // Article IDs user has read
	TopicWeights map[string]float64 `json:"topicWeights"` // Custom weights for topics
	RecencyPreference float64 `json:"recencyPreference"` // How much user prefers recent articles (0-1)
	LanguagePrefs []string `json:"languagePrefs"` // Preferred programming languages for repos
}
UserPreferences defines user-specific ranking preferences
type WeightConfig ¶
type WeightConfig struct {
	TitleWeight float64 `json:"titleWeight"` // Weight for title matches (default: 3.0)
	SummaryWeight float64 `json:"summaryWeight"` // Weight for summary matches (default: 2.0)
	ContentWeight float64 `json:"contentWeight"` // Weight for content matches (default: 1.0)
	TagWeight float64 `json:"tagWeight"` // Weight for tag matches (default: 4.0)
	ExactMatch float64 `json:"exactMatch"` // Bonus for exact keyword match (default: 1.5)
	PartialMatch float64 `json:"partialMatch"` // Score for partial match (default: 0.8)
}
WeightConfig defines scoring weights for different text sections