processor

package
v0.0.0-...-812ebae Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 7, 2025 License: MIT Imports: 16 Imported by: 0

Documentation

Overview

Package processor provides unified data processing capabilities

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ValidatePaginationConfig

func ValidatePaginationConfig(config PaginationConfig) error

ValidatePaginationConfig validates pagination configuration

func ValidateSortConfig

func ValidateSortConfig(config SortConfig) error

ValidateSortConfig validates a sort configuration

Types

type ArticleSorter

type ArticleSorter struct {
	// contains filtered or unexported fields
}

ArticleSorter handles multi-dimensional sorting and pagination of articles

func NewArticleSorter

func NewArticleSorter(relevanceScorer *RelevanceScorer) *ArticleSorter

NewArticleSorter creates a new ArticleSorter instance

func (*ArticleSorter) SetSortConfig

func (as *ArticleSorter) SetSortConfig(config SortConfig)

SetSortConfig updates the sorting configuration

func (*ArticleSorter) SetUserPreferences

func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)

SetUserPreferences sets user-specific preferences for personalized ranking

func (*ArticleSorter) SortAndPaginate

func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)

SortAndPaginate sorts articles and applies pagination

func (*ArticleSorter) SortArticles

func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article

SortArticles sorts articles according to the configured criteria

type Config

type Config struct {
	EnableSummarization bool          `json:"enableSummarization"`
	EnableSorting       bool          `json:"enableSorting"`
	MaxSummaryLength    int           `json:"maxSummaryLength"`
	ProcessingTimeout   time.Duration `json:"processingTimeout"`
	MaxConcurrency      int           `json:"maxConcurrency"`
}

Config holds processor configuration

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns default processor configuration

type Converter

type Converter struct {
	// contains filtered or unexported fields
}

Converter handles conversion from raw collector data to unified Article/Repository models. It provides thread-safe operations for concurrent processing and comprehensive data normalization.

func NewConverter

func NewConverter(config ConverterConfig) *Converter

NewConverter creates a new converter with the given configuration

func NewDefaultConverter

func NewDefaultConverter() *Converter

NewDefaultConverter creates a new converter with default configuration

func (*Converter) BatchConvertArticles

func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)

BatchConvertArticles converts multiple collector articles concurrently

func (*Converter) ConvertToArticle

func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)

ConvertToArticle converts a collector.Article to models.Article with full data normalization

func (*Converter) ConvertToRepository

func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)

ConvertToRepository converts API data to models.Repository for repository-type content

func (*Converter) GenerateHash

func (c *Converter) GenerateHash(title, url string) string

GenerateHash creates a hash for deduplication purposes

func (*Converter) GetConfig

func (c *Converter) GetConfig() ConverterConfig

GetConfig returns a copy of the current configuration

func (*Converter) UpdateConfig

func (c *Converter) UpdateConfig(config ConverterConfig)

UpdateConfig updates the converter configuration in a thread-safe manner

type ConverterConfig

type ConverterConfig struct {
	// Maximum length for summary text (default: 1000)
	MaxSummaryLength int

	// Maximum length for title text (default: 500)
	MaxTitleLength int

	// Maximum length for content text (default: 50000)
	MaxContentLength int

	// Default quality score for articles without explicit quality indicators
	DefaultQuality float64

	// Default relevance score for articles without relevance calculation
	DefaultRelevance float64

	// Enable aggressive HTML cleaning (removes more tags and attributes)
	AggressiveHTMLCleaning bool

	// Normalize URLs to canonical form (removes tracking parameters, etc.)
	NormalizeURLs bool

	// Time zone for date normalization (default: UTC)
	TimeZone *time.Location
}

ConverterConfig contains configuration options for data conversion

func DefaultConverterConfig

func DefaultConverterConfig() ConverterConfig

DefaultConverterConfig returns a configuration with sensible defaults

type DocumentFrequency

type DocumentFrequency struct {
	Term  string `json:"term"`
	Count int    `json:"count"`
}

DocumentFrequency represents how many documents contain a specific term

type KeywordMatcher

type KeywordMatcher struct {
	Keywords     []string     `json:"keywords"`
	WeightConfig WeightConfig `json:"weightConfig"`
	// contains filtered or unexported fields
}

KeywordMatcher handles keyword matching with weighted scoring

func (*KeywordMatcher) ScoreKeywordMatch

func (km *KeywordMatcher) ScoreKeywordMatch(article *models.Article) float64

ScoreKeywordMatch calculates keyword matching score with weights

type PaginationConfig

type PaginationConfig struct {
	Page     int `json:"page"`     // Current page (1-based)
	PageSize int `json:"pageSize"` // Items per page
}

PaginationConfig defines pagination settings

func GetDefaultPaginationConfig

func GetDefaultPaginationConfig() PaginationConfig

GetDefaultPaginationConfig returns default pagination configuration

type PaginationResult

type PaginationResult struct {
	Items       []*models.Article `json:"items"`
	CurrentPage int               `json:"currentPage"`
	PageSize    int               `json:"pageSize"`
	TotalItems  int               `json:"totalItems"`
	TotalPages  int               `json:"totalPages"`
	HasNext     bool              `json:"hasNext"`
	HasPrev     bool              `json:"hasPrev"`
}

PaginationResult contains paginated results with metadata

type ProcessOptions

type ProcessOptions struct {
	Query     string `json:"query"`
	SortBy    string `json:"sortBy"`
	SortOrder string `json:"sortOrder"`
	Limit     int    `json:"limit"`
}

ProcessOptions defines options for processing

type Processor

type Processor struct {
	// contains filtered or unexported fields
}

Processor provides unified data processing functionality

func NewProcessor

func NewProcessor(config *Config) *Processor

NewProcessor creates a new processor instance

func (*Processor) CalculateFrontendRelevance

func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64

CalculateFrontendRelevance calculates how relevant an article is to frontend development

func (*Processor) GetStats

func (p *Processor) GetStats() ProcessorStats

GetStats returns processor statistics

func (*Processor) ProcessArticles

func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)

ProcessArticles processes a slice of articles with various enhancements

func (*Processor) ProcessRepositories

func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)

ProcessRepositories processes a slice of repositories

type ProcessorStats

type ProcessorStats struct {
	ProcessedArticles     int           `json:"processedArticles"`
	ProcessedRepositories int           `json:"processedRepositories"`
	AverageProcessingTime time.Duration `json:"averageProcessingTime"`
	CacheHitRate          float64       `json:"cacheHitRate"`
}

ProcessorStats holds processor performance statistics

type RelevanceScorer

type RelevanceScorer struct {
	// contains filtered or unexported fields
}

RelevanceScorer handles content relevance scoring using TF-IDF and keyword matching

func NewRelevanceScorer

func NewRelevanceScorer(keywords []string) *RelevanceScorer

NewRelevanceScorer creates a new instance of RelevanceScorer

func (*RelevanceScorer) AddToCorpus

func (rs *RelevanceScorer) AddToCorpus(article *models.Article)

AddToCorpus adds an article to the corpus

func (*RelevanceScorer) ClearCache

func (rs *RelevanceScorer) ClearCache()

ClearCache clears all internal caches

func (*RelevanceScorer) GetKeywords

func (rs *RelevanceScorer) GetKeywords() []string

GetKeywords returns the current keywords

func (*RelevanceScorer) GetTopTerms

func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency

GetTopTerms returns the top N terms for an article based on TF-IDF

func (*RelevanceScorer) GetWeightConfig

func (rs *RelevanceScorer) GetWeightConfig() WeightConfig

GetWeightConfig returns the current weight configuration

func (*RelevanceScorer) ScoreRelevance

func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64

ScoreRelevance calculates the relevance score for an article

func (*RelevanceScorer) SetCorpus

func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)

SetCorpus sets the corpus for IDF calculation

func (*RelevanceScorer) SetWeightConfig

func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)

SetWeightConfig updates the weight configuration

func (*RelevanceScorer) UpdateKeywords

func (rs *RelevanceScorer) UpdateKeywords(keywords []string)

UpdateKeywords updates the keywords used for scoring

type RepositoryPaginationResult

type RepositoryPaginationResult struct {
	Items       []*models.Repository `json:"items"`
	CurrentPage int                  `json:"currentPage"`
	PageSize    int                  `json:"pageSize"`
	TotalItems  int                  `json:"totalItems"`
	TotalPages  int                  `json:"totalPages"`
	HasNext     bool                 `json:"hasNext"`
	HasPrev     bool                 `json:"hasPrev"`
}

RepositoryPaginationResult contains paginated repository results

type RepositorySorter

type RepositorySorter struct {
	// contains filtered or unexported fields
}

RepositorySorter handles sorting of repositories

func NewRepositorySorter

func NewRepositorySorter() *RepositorySorter

NewRepositorySorter creates a new RepositorySorter instance

func (*RepositorySorter) SetSortConfig

func (rs *RepositorySorter) SetSortConfig(config SortConfig)

SetSortConfig updates the sorting configuration for repositories

func (*RepositorySorter) SetUserPreferences

func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)

SetUserPreferences sets user preferences for repository sorting

func (*RepositorySorter) SortAndPaginate

func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)

SortAndPaginate sorts repositories and applies pagination

func (*RepositorySorter) SortRepositories

func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository

SortRepositories sorts repositories according to the configured criteria

type SentenceScore

type SentenceScore struct {
	Text     string
	Score    float64
	Position int
	Length   int
}

SentenceScore represents a sentence with its calculated importance score

type SimpleStemmer

type SimpleStemmer struct {
	// contains filtered or unexported fields
}

SimpleStemmer provides basic word stemming functionality

func NewSimpleStemmer

func NewSimpleStemmer() *SimpleStemmer

NewSimpleStemmer creates a new stemmer instance

func (*SimpleStemmer) Stem

func (ss *SimpleStemmer) Stem(word string) string

Stem applies basic stemming to a word

type SortBy

type SortBy string

SortBy defines the primary sorting criteria

const (
	SortByRelevance  SortBy = "relevance"  // Sort by relevance score
	SortByTime       SortBy = "time"       // Sort by publication time
	SortByPopularity SortBy = "popularity" // Sort by quality/popularity metrics
	SortByTrend      SortBy = "trend"      // Sort by trending score
	SortByComposite  SortBy = "composite"  // Weighted combination of multiple factors
)

type SortConfig

type SortConfig struct {
	Primary   SortBy    `json:"primary"`   // Primary sorting criterion
	Secondary SortBy    `json:"secondary"` // Secondary sorting criterion (for tie-breaking)
	Order     SortOrder `json:"order"`     // Sort order

	// Weights for composite sorting
	RelevanceWeight  float64 `json:"relevanceWeight"`  // Weight for relevance (default: 0.4)
	TimeWeight       float64 `json:"timeWeight"`       // Weight for recency (default: 0.3)
	PopularityWeight float64 `json:"popularityWeight"` // Weight for popularity (default: 0.2)
	TrendWeight      float64 `json:"trendWeight"`      // Weight for trending (default: 0.1)
}

SortConfig defines the sorting configuration

func GetDefaultSortConfig

func GetDefaultSortConfig() SortConfig

GetDefaultSortConfig returns default sorting configuration

type SortOrder

type SortOrder string

SortOrder defines ascending or descending order

const (
	SortAsc  SortOrder = "asc"  // Ascending order
	SortDesc SortOrder = "desc" // Descending order
)

type Summarizer

type Summarizer struct {
	// Configuration for summary generation
	MinSummaryLength int     // Minimum summary length in characters
	MaxSummaryLength int     // Maximum summary length in characters
	SentenceCount    int     // Target number of sentences in summary
	PositionWeight   float64 // Weight for sentence position (earlier sentences score higher)
	LengthWeight     float64 // Weight for sentence length
	KeywordWeight    float64 // Weight for keyword density
}

Summarizer handles intelligent content summarization and processing

func NewSummarizer

func NewSummarizer() *Summarizer

NewSummarizer creates a new Summarizer with default configuration

func (*Summarizer) AssessQuality

func (s *Summarizer) AssessQuality(article *models.Article) float64

AssessQuality evaluates content quality based on multiple factors

func (*Summarizer) CleanContent

func (s *Summarizer) CleanContent(article *models.Article) error

CleanContent performs content cleaning and preprocessing

func (*Summarizer) GenerateSummary

func (s *Summarizer) GenerateSummary(text string) (string, error)

GenerateSummary creates an intelligent summary of the given text. It uses sentence extraction based on position, length, and keyword density.

func (*Summarizer) ProcessArticle

func (s *Summarizer) ProcessArticle(article *models.Article) error

ProcessArticle performs complete content processing on an article. This includes cleaning, summarization, and quality assessment.

type TFIDFScore

type TFIDFScore struct {
	Term   string  `json:"term"`
	TF     float64 `json:"tf"`     // Term Frequency
	IDF    float64 `json:"idf"`    // Inverse Document Frequency
	TFIDF  float64 `json:"tfidf"`  // TF-IDF Score
	Weight float64 `json:"weight"` // Additional weight based on position/importance
}

TFIDFScore represents the TF-IDF score for a term in a document

type TermFrequency

type TermFrequency struct {
	Term      string  `json:"term"`
	Count     int     `json:"count"`
	Frequency float64 `json:"frequency"`
}

TermFrequency represents the frequency of a term in a document

type UserPreferences

type UserPreferences struct {
	FavoriteTopics    []string           `json:"favoriteTopics"`    // Preferred topic keywords
	PreferredSources  []string           `json:"preferredSources"`  // Preferred news sources
	ReadingHistory    []string           `json:"readingHistory"`    // Article IDs user has read
	TopicWeights      map[string]float64 `json:"topicWeights"`      // Custom weights for topics
	RecencyPreference float64            `json:"recencyPreference"` // How much user prefers recent articles (0-1)
	LanguagePrefs     []string           `json:"languagePrefs"`     // Preferred programming languages for repos
}

UserPreferences defines user-specific ranking preferences

type WeightConfig

type WeightConfig struct {
	TitleWeight   float64 `json:"titleWeight"`   // Weight for title matches (default: 3.0)
	SummaryWeight float64 `json:"summaryWeight"` // Weight for summary matches (default: 2.0)
	ContentWeight float64 `json:"contentWeight"` // Weight for content matches (default: 1.0)
	TagWeight     float64 `json:"tagWeight"`     // Weight for tag matches (default: 4.0)
	ExactMatch    float64 `json:"exactMatch"`    // Bonus for exact keyword match (default: 1.5)
	PartialMatch  float64 `json:"partialMatch"`  // Score for partial match (default: 0.8)
}

WeightConfig defines scoring weights for different text sections

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL