Package processor

Version: v0.0.0-...-812ebae (latest) | Published: Sep 7, 2025 | License: MIT | Imports: 16 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/ZephyrDeng/dev-context

Links

Open Source Insights

Documentation ¶

Overview ¶

Package processor provides unified data processing capabilities

Index ¶

func ValidatePaginationConfig(config PaginationConfig) error
func ValidateSortConfig(config SortConfig) error
type ArticleSorter
- func NewArticleSorter(relevanceScorer *RelevanceScorer) *ArticleSorter
- func (as *ArticleSorter) SetSortConfig(config SortConfig)
- func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)
- func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)
- func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article
type Config
- func DefaultConfig() *Config
type Converter
- func NewConverter(config ConverterConfig) *Converter
- func NewDefaultConverter() *Converter
- func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)
- func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)
- func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)
- func (c *Converter) GenerateHash(title, url string) string
- func (c *Converter) GetConfig() ConverterConfig
- func (c *Converter) UpdateConfig(config ConverterConfig)
type ConverterConfig
- func DefaultConverterConfig() ConverterConfig
type DocumentFrequency
type KeywordMatcher
- func (km *KeywordMatcher) ScoreKeywordMatch(article *models.Article) float64
type PaginationConfig
- func GetDefaultPaginationConfig() PaginationConfig
type PaginationResult
type ProcessOptions
type Processor
- func NewProcessor(config *Config) *Processor
- func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64
- func (p *Processor) GetStats() ProcessorStats
- func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)
- func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)
type ProcessorStats
type RelevanceScorer
- func NewRelevanceScorer(keywords []string) *RelevanceScorer
- func (rs *RelevanceScorer) AddToCorpus(article *models.Article)
- func (rs *RelevanceScorer) ClearCache()
- func (rs *RelevanceScorer) GetKeywords() []string
- func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency
- func (rs *RelevanceScorer) GetWeightConfig() WeightConfig
- func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64
- func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)
- func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)
- func (rs *RelevanceScorer) UpdateKeywords(keywords []string)
type RepositoryPaginationResult
type RepositorySorter
- func NewRepositorySorter() *RepositorySorter
- func (rs *RepositorySorter) SetSortConfig(config SortConfig)
- func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)
- func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)
- func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository
type SentenceScore
type SimpleStemmer
- func NewSimpleStemmer() *SimpleStemmer
- func (ss *SimpleStemmer) Stem(word string) string
type SortBy
type SortConfig
- func GetDefaultSortConfig() SortConfig
type SortOrder
type Summarizer
- func NewSummarizer() *Summarizer
- func (s *Summarizer) AssessQuality(article *models.Article) float64
- func (s *Summarizer) CleanContent(article *models.Article) error
- func (s *Summarizer) GenerateSummary(text string) (string, error)
- func (s *Summarizer) ProcessArticle(article *models.Article) error
type TFIDFScore
type TermFrequency
type UserPreferences
type WeightConfig

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func ValidatePaginationConfig ¶

func ValidatePaginationConfig(config PaginationConfig) error

ValidatePaginationConfig validates pagination configuration

func ValidateSortConfig ¶

func ValidateSortConfig(config SortConfig) error

ValidateSortConfig validates a sort configuration

Types ¶

type ArticleSorter ¶

type ArticleSorter struct {
	// contains filtered or unexported fields
}

ArticleSorter handles multi-dimensional sorting and pagination of articles

func NewArticleSorter ¶

func NewArticleSorter(relevanceScorer *RelevanceScorer) *ArticleSorter

NewArticleSorter creates a new ArticleSorter instance

func (*ArticleSorter) SetSortConfig ¶

func (as *ArticleSorter) SetSortConfig(config SortConfig)

SetSortConfig updates the sorting configuration

func (*ArticleSorter) SetUserPreferences ¶

func (as *ArticleSorter) SetUserPreferences(prefs *UserPreferences)

SetUserPreferences sets user-specific preferences for personalized ranking

func (*ArticleSorter) SortAndPaginate ¶

func (as *ArticleSorter) SortAndPaginate(articles []*models.Article, paginationConfig PaginationConfig) (*PaginationResult, error)

SortAndPaginate sorts articles and applies pagination

func (*ArticleSorter) SortArticles ¶

func (as *ArticleSorter) SortArticles(articles []*models.Article) []*models.Article

SortArticles sorts articles according to the configured criteria

type Config ¶

type Config struct {
	EnableSummarization bool          `json:"enableSummarization"`
	EnableSorting       bool          `json:"enableSorting"`
	MaxSummaryLength    int           `json:"maxSummaryLength"`
	ProcessingTimeout   time.Duration `json:"processingTimeout"`
	MaxConcurrency      int           `json:"maxConcurrency"`
}

Config holds processor configuration

func DefaultConfig ¶

func DefaultConfig() *Config

DefaultConfig returns default processor configuration

type Converter ¶

type Converter struct {
	// contains filtered or unexported fields
}

Converter handles conversion from raw collector data to unified Article/Repository models. It provides thread-safe operations for concurrent processing and comprehensive data normalization.

func NewConverter ¶

func NewConverter(config ConverterConfig) *Converter

NewConverter creates a new converter with the given configuration

func NewDefaultConverter ¶

func NewDefaultConverter() *Converter

NewDefaultConverter creates a new converter with default configuration

func (*Converter) BatchConvertArticles ¶

func (c *Converter) BatchConvertArticles(collectorArticles []collector.Article) ([]*models.Article, []error)

BatchConvertArticles converts multiple collector articles concurrently

func (*Converter) ConvertToArticle ¶

func (c *Converter) ConvertToArticle(collectorArticle collector.Article) (*models.Article, error)

ConvertToArticle converts a collector.Article to models.Article with full data normalization

func (*Converter) ConvertToRepository ¶

func (c *Converter) ConvertToRepository(name, fullName, url string, metadata map[string]string) (*models.Repository, error)

ConvertToRepository converts API data to models.Repository for repository-type content

func (*Converter) GenerateHash ¶

func (c *Converter) GenerateHash(title, url string) string

GenerateHash creates a hash for deduplication purposes

func (*Converter) GetConfig ¶

func (c *Converter) GetConfig() ConverterConfig

GetConfig returns a copy of the current configuration

func (*Converter) UpdateConfig ¶

func (c *Converter) UpdateConfig(config ConverterConfig)

UpdateConfig updates the converter configuration in a thread-safe manner

type ConverterConfig ¶

type ConverterConfig struct {
	// Maximum length for summary text (default: 1000)
	MaxSummaryLength int

	// Maximum length for title text (default: 500)
	MaxTitleLength int

	// Maximum length for content text (default: 50000)
	MaxContentLength int

	// Default quality score for articles without explicit quality indicators
	DefaultQuality float64

	// Default relevance score for articles without relevance calculation
	DefaultRelevance float64

	// Enable aggressive HTML cleaning (removes more tags and attributes)
	AggressiveHTMLCleaning bool

	// Normalize URLs to canonical form (removes tracking parameters, etc.)
	NormalizeURLs bool

	// Time zone for date normalization (default: UTC)
	TimeZone *time.Location
}

ConverterConfig contains configuration options for data conversion

func DefaultConverterConfig ¶

func DefaultConverterConfig() ConverterConfig

DefaultConverterConfig returns a configuration with sensible defaults

type DocumentFrequency ¶

type DocumentFrequency struct {
	Term  string `json:"term"`
	Count int    `json:"count"`
}

DocumentFrequency represents how many documents contain a specific term

type KeywordMatcher ¶

type KeywordMatcher struct {
	Keywords     []string     `json:"keywords"`
	WeightConfig WeightConfig `json:"weightConfig"`
	// contains filtered or unexported fields
}

KeywordMatcher handles keyword matching with weighted scoring

func (*KeywordMatcher) ScoreKeywordMatch ¶

func (km *KeywordMatcher) ScoreKeywordMatch(article *models.Article) float64

ScoreKeywordMatch calculates keyword matching score with weights

type PaginationConfig ¶

type PaginationConfig struct {
	Page     int `json:"page"`     // Current page (1-based)
	PageSize int `json:"pageSize"` // Items per page
}

PaginationConfig defines pagination settings

func GetDefaultPaginationConfig ¶

func GetDefaultPaginationConfig() PaginationConfig

GetDefaultPaginationConfig returns default pagination configuration

type PaginationResult ¶

type PaginationResult struct {
	Items       []*models.Article `json:"items"`
	CurrentPage int               `json:"currentPage"`
	PageSize    int               `json:"pageSize"`
	TotalItems  int               `json:"totalItems"`
	TotalPages  int               `json:"totalPages"`
	HasNext     bool              `json:"hasNext"`
	HasPrev     bool              `json:"hasPrev"`
}

PaginationResult contains paginated results with metadata

type ProcessOptions ¶

type ProcessOptions struct {
	Query     string `json:"query"`
	SortBy    string `json:"sortBy"`
	SortOrder string `json:"sortOrder"`
	Limit     int    `json:"limit"`
}

ProcessOptions defines options for processing

type Processor ¶

type Processor struct {
	// contains filtered or unexported fields
}

Processor provides unified data processing functionality

func NewProcessor ¶

func NewProcessor(config *Config) *Processor

NewProcessor creates a new processor instance

func (*Processor) CalculateFrontendRelevance ¶

func (p *Processor) CalculateFrontendRelevance(article models.Article, query string) float64

CalculateFrontendRelevance calculates how relevant an article is to frontend development

func (*Processor) GetStats ¶

func (p *Processor) GetStats() ProcessorStats

GetStats returns processor statistics

func (*Processor) ProcessArticles ¶

func (p *Processor) ProcessArticles(ctx context.Context, articles []models.Article, options ProcessOptions) ([]models.Article, error)

ProcessArticles processes a slice of articles with various enhancements

func (*Processor) ProcessRepositories ¶

func (p *Processor) ProcessRepositories(ctx context.Context, repos []models.Repository, options ProcessOptions) ([]models.Repository, error)

ProcessRepositories processes a slice of repositories

type ProcessorStats ¶

type ProcessorStats struct {
	ProcessedArticles     int           `json:"processedArticles"`
	ProcessedRepositories int           `json:"processedRepositories"`
	AverageProcessingTime time.Duration `json:"averageProcessingTime"`
	CacheHitRate          float64       `json:"cacheHitRate"`
}

ProcessorStats holds processor performance statistics

type RelevanceScorer ¶

type RelevanceScorer struct {
	// contains filtered or unexported fields
}

RelevanceScorer handles content relevance scoring using TF-IDF and keyword matching

func NewRelevanceScorer ¶

func NewRelevanceScorer(keywords []string) *RelevanceScorer

NewRelevanceScorer creates a new instance of RelevanceScorer

func (*RelevanceScorer) AddToCorpus ¶

func (rs *RelevanceScorer) AddToCorpus(article *models.Article)

AddToCorpus adds an article to the corpus

func (*RelevanceScorer) ClearCache ¶

func (rs *RelevanceScorer) ClearCache()

ClearCache clears all internal caches

func (*RelevanceScorer) GetKeywords ¶

func (rs *RelevanceScorer) GetKeywords() []string

GetKeywords returns the current keywords

func (*RelevanceScorer) GetTopTerms ¶

func (rs *RelevanceScorer) GetTopTerms(article *models.Article, n int) []TermFrequency

GetTopTerms returns the top N terms for an article based on TF-IDF

func (*RelevanceScorer) GetWeightConfig ¶

func (rs *RelevanceScorer) GetWeightConfig() WeightConfig

GetWeightConfig returns the current weight configuration

func (*RelevanceScorer) ScoreRelevance ¶

func (rs *RelevanceScorer) ScoreRelevance(article *models.Article) float64

ScoreRelevance calculates the relevance score for an article

func (*RelevanceScorer) SetCorpus ¶

func (rs *RelevanceScorer) SetCorpus(articles []*models.Article)

SetCorpus sets the corpus for IDF calculation

func (*RelevanceScorer) SetWeightConfig ¶

func (rs *RelevanceScorer) SetWeightConfig(config WeightConfig)

SetWeightConfig updates the weight configuration

func (*RelevanceScorer) UpdateKeywords ¶

func (rs *RelevanceScorer) UpdateKeywords(keywords []string)

UpdateKeywords updates the keywords used for scoring

type RepositoryPaginationResult ¶

type RepositoryPaginationResult struct {
	Items       []*models.Repository `json:"items"`
	CurrentPage int                  `json:"currentPage"`
	PageSize    int                  `json:"pageSize"`
	TotalItems  int                  `json:"totalItems"`
	TotalPages  int                  `json:"totalPages"`
	HasNext     bool                 `json:"hasNext"`
	HasPrev     bool                 `json:"hasPrev"`
}

RepositoryPaginationResult contains paginated repository results

type RepositorySorter ¶

type RepositorySorter struct {
	// contains filtered or unexported fields
}

RepositorySorter handles sorting of repositories

func NewRepositorySorter ¶

func NewRepositorySorter() *RepositorySorter

NewRepositorySorter creates a new RepositorySorter instance

func (*RepositorySorter) SetSortConfig ¶

func (rs *RepositorySorter) SetSortConfig(config SortConfig)

SetSortConfig updates the sorting configuration for repositories

func (*RepositorySorter) SetUserPreferences ¶

func (rs *RepositorySorter) SetUserPreferences(prefs *UserPreferences)

SetUserPreferences sets user preferences for repository sorting

func (*RepositorySorter) SortAndPaginate ¶

func (rs *RepositorySorter) SortAndPaginate(repos []*models.Repository, paginationConfig PaginationConfig) (*RepositoryPaginationResult, error)

SortAndPaginate sorts repositories and applies pagination

func (*RepositorySorter) SortRepositories ¶

func (rs *RepositorySorter) SortRepositories(repos []*models.Repository) []*models.Repository

SortRepositories sorts repositories according to the configured criteria

type SentenceScore ¶

type SentenceScore struct {
	Text     string
	Score    float64
	Position int
	Length   int
}

SentenceScore represents a sentence with its calculated importance score

type SimpleStemmer ¶

type SimpleStemmer struct {
	// contains filtered or unexported fields
}

SimpleStemmer provides basic word stemming functionality

func NewSimpleStemmer ¶

func NewSimpleStemmer() *SimpleStemmer

NewSimpleStemmer creates a new stemmer instance

func (*SimpleStemmer) Stem ¶

func (ss *SimpleStemmer) Stem(word string) string

Stem applies basic stemming to a word

type SortBy ¶

type SortBy string

SortBy defines the primary sorting criteria

const (
	SortByRelevance  SortBy = "relevance"  // Sort by relevance score
	SortByTime       SortBy = "time"       // Sort by publication time
	SortByPopularity SortBy = "popularity" // Sort by quality/popularity metrics
	SortByTrend      SortBy = "trend"      // Sort by trending score
	SortByComposite  SortBy = "composite"  // Weighted combination of multiple factors
)

type SortConfig ¶

type SortConfig struct {
	Primary   SortBy    `json:"primary"`   // Primary sorting criterion
	Secondary SortBy    `json:"secondary"` // Secondary sorting criterion (for tie-breaking)
	Order     SortOrder `json:"order"`     // Sort order

	// Weights for composite sorting
	RelevanceWeight  float64 `json:"relevanceWeight"`  // Weight for relevance (default: 0.4)
	TimeWeight       float64 `json:"timeWeight"`       // Weight for recency (default: 0.3)
	PopularityWeight float64 `json:"popularityWeight"` // Weight for popularity (default: 0.2)
	TrendWeight      float64 `json:"trendWeight"`      // Weight for trending (default: 0.1)
}

SortConfig defines the sorting configuration

func GetDefaultSortConfig ¶

func GetDefaultSortConfig() SortConfig

GetDefaultSortConfig returns default sorting configuration

type SortOrder ¶

type SortOrder string

SortOrder defines ascending or descending order

const (
	SortAsc  SortOrder = "asc"  // Ascending order
	SortDesc SortOrder = "desc" // Descending order
)

type Summarizer ¶

type Summarizer struct {
	// Configuration for summary generation
	MinSummaryLength int     // Minimum summary length in characters
	MaxSummaryLength int     // Maximum summary length in characters
	SentenceCount    int     // Target number of sentences in summary
	PositionWeight   float64 // Weight for sentence position (earlier sentences score higher)
	LengthWeight     float64 // Weight for sentence length
	KeywordWeight    float64 // Weight for keyword density
}

Summarizer handles intelligent content summarization and processing

func NewSummarizer ¶

func NewSummarizer() *Summarizer

NewSummarizer creates a new Summarizer with default configuration

func (*Summarizer) AssessQuality ¶

func (s *Summarizer) AssessQuality(article *models.Article) float64

AssessQuality evaluates content quality based on multiple factors

func (*Summarizer) CleanContent ¶

func (s *Summarizer) CleanContent(article *models.Article) error

CleanContent performs content cleaning and preprocessing

func (*Summarizer) GenerateSummary ¶

func (s *Summarizer) GenerateSummary(text string) (string, error)

GenerateSummary creates an intelligent summary of the given text. Uses sentence extraction based on position, length, and keyword density.

func (*Summarizer) ProcessArticle ¶

func (s *Summarizer) ProcessArticle(article *models.Article) error

ProcessArticle performs complete content processing on an article. This includes cleaning, summarization, and quality assessment.

type TFIDFScore ¶

type TFIDFScore struct {
	Term   string  `json:"term"`
	TF     float64 `json:"tf"`     // Term Frequency
	IDF    float64 `json:"idf"`    // Inverse Document Frequency
	TFIDF  float64 `json:"tfidf"`  // TF-IDF Score
	Weight float64 `json:"weight"` // Additional weight based on position/importance
}

TFIDFScore represents the TF-IDF score for a term in a document

type TermFrequency ¶

type TermFrequency struct {
	Term      string  `json:"term"`
	Count     int     `json:"count"`
	Frequency float64 `json:"frequency"`
}

TermFrequency represents the frequency of a term in a document

type UserPreferences ¶

type UserPreferences struct {
	FavoriteTopics    []string           `json:"favoriteTopics"`    // Preferred topic keywords
	PreferredSources  []string           `json:"preferredSources"`  // Preferred news sources
	ReadingHistory    []string           `json:"readingHistory"`    // Article IDs user has read
	TopicWeights      map[string]float64 `json:"topicWeights"`      // Custom weights for topics
	RecencyPreference float64            `json:"recencyPreference"` // How much user prefers recent articles (0-1)
	LanguagePrefs     []string           `json:"languagePrefs"`     // Preferred programming languages for repos
}

UserPreferences defines user-specific ranking preferences

type WeightConfig ¶

type WeightConfig struct {
	TitleWeight   float64 `json:"titleWeight"`   // Weight for title matches (default: 3.0)
	SummaryWeight float64 `json:"summaryWeight"` // Weight for summary matches (default: 2.0)
	ContentWeight float64 `json:"contentWeight"` // Weight for content matches (default: 1.0)
	TagWeight     float64 `json:"tagWeight"`     // Weight for tag matches (default: 4.0)
	ExactMatch    float64 `json:"exactMatch"`    // Bonus for exact keyword match (default: 1.5)
	PartialMatch  float64 `json:"partialMatch"`  // Score for partial match (default: 0.8)
}

WeightConfig defines scoring weights for different text sections

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL