Documentation
¶
Overview ¶
Package lm provides a library for storing large n-gram language models in memory. Mostly inspired by https://code.google.com/archive/p/berkeleylm/
Index ¶
- Constants
- Variables
- func NewTokenizer(alphabet alphabet.Alphabet) analysis.Tokenizer
- func StoreBinaryLMFromGoogleFormat(directory store.Directory, config *Config) error
- type Config
- type ContextOffset
- type CountTrie
- type Indexer
- type Key
- type LanguageModel
- type NGramBuilder
- type NGramModel
- type NGramNode
- type NGramReader
- type NGramVector
- type NGramVectorBuilder
- type NGramVectorFactory
- type NGramWriter
- type NGrams
- type ScorerNext
- type Sentence
- type SentenceRetriever
- type Token
- type TrieIterator
- type WordCount
- type WordID
Constants ¶
const ( // UnknownWordID is an index of an unregistered word UnknownWordID = uint32(0xffffffff) // UnknownWordSymbol is a symbol that is returned on an unregistered word UnknownWordSymbol = "<UNK>" )
const (
// InvalidContextOffset is context id that represents invalid context offset
InvalidContextOffset = maxContextOffset - 1
)
const (
// UnknownWordScore is the score for unknown phrases
UnknownWordScore = -100.0
)
Variables ¶
var ( // ErrContextOverflow tells that there was an attempt to exceed maxContextOffset ErrContextOverflow = errors.New("out of maxContextOffset") )
var ErrInvalidIndex = errors.New("index is not exists")
ErrInvalidIndex tells that there is no data for the provided index
var ErrNGramOrderIsOutOfRange = errors.New("nGrams order is out of range")
ErrNGramOrderIsOutOfRange informs that the given NGrams sequence is out of range for the given nGram order
Functions ¶
func NewTokenizer ¶
NewTokenizer creates a new instance of Tokenizer
Types ¶
type Config ¶
type Config struct {
Name string `json:"name"`
NGramOrder uint8 `json:"nGramOrder"`
SourcePath string `json:"source"`
OutputPath string `json:"output"`
Alphabet []string `json:"alphabet"`
Separators []string `json:"separators"`
StartSymbol string `json:"startSymbol"`
EndSymbol string `json:"endSymbol"`
// contains filtered or unexported fields
}
Config represents a configuration of a language model
func ReadConfig ¶
ReadConfig reads a language model config from the given reader
func (*Config) GetBinaryPath ¶
GetBinaryPath returns a stored path for the binary lm
func (*Config) GetDictionaryPath ¶
GetDictionaryPath returns a stored path for the dictionary
func (*Config) GetOutputPath ¶
GetOutputPath returns the output path of the built index
func (*Config) GetSeparatorsAlphabet ¶
GetSeparatorsAlphabet returns a separators alphabet corresponding to the declaration
func (*Config) GetSourcePath ¶
GetSourcePath returns a source path of the index description
func (*Config) GetWordsAlphabet ¶
GetWordsAlphabet returns a word alphabet corresponding to the declaration
type ContextOffset ¶
type ContextOffset = uint32
ContextOffset represents the id of parent nGram path
type CountTrie ¶
type CountTrie interface {
// Put increments WordCount for last element of given sequence.
Put(sentence Sentence, count WordCount)
// Walk iterates through trie and calls walker function on each element.
Walk(walker TrieIterator) error
}
CountTrie represents a data structure for counting ngrams.
type Indexer ¶
type Indexer interface {
// Returns the index for the word, otherwise returns UnknownWordID
Get(token Token) (WordID, error)
// Find a token by the given index
Find(id WordID) (Token, error)
}
Indexer enumerates words in the vocabulary of a language model. Stores a two-way mapping between uint32 and strings.
func BuildIndexer ¶
func BuildIndexer(dict dictionary.Dictionary) (Indexer, error)
BuildIndexer builds an indexer from the given dictionary
func NewIndexer ¶
func NewIndexer(dict dictionary.Dictionary, table mph.MPH) Indexer
NewIndexer creates a new instance of indexer
type Key ¶
type Key uint64
Key represents a NGramNode key as a composition of a NGram context and wordID
func MakeKey ¶
func MakeKey(word WordID, context ContextOffset) Key
MakeKey creates uint64 key for the given pair (word, context)
func (Key) GetContext ¶
func (k Key) GetContext() ContextOffset
GetContext returns the context for the given key
type LanguageModel ¶
type LanguageModel interface {
// ScoreSentence scores and returns a lm weight for the given sentence
ScoreSentence(sentence Sentence) (float64, error)
// ScoreWordIDs scores and returns a lm weight for the given sequence of nGrams
ScoreWordIDs(sequence []WordID) float64
// GetWordID returns id for the given token
GetWordID(token Token) (WordID, error)
// Next returns the list of candidates for the given sequence
Next(sequence []WordID) (ScorerNext, error)
}
LanguageModel is an interface for an n-gram language model
func NewLanguageModel ¶
func NewLanguageModel( model NGramModel, indexer Indexer, config *Config, ) (LanguageModel, error)
NewLanguageModel creates a new instance of a LanguageModel
func RetrieveLMFromBinary ¶
func RetrieveLMFromBinary(directory store.Directory, config *Config) (LanguageModel, error)
RetrieveLMFromBinary retrieves a language model from the binary format
type NGramBuilder ¶
type NGramBuilder struct {
// contains filtered or unexported fields
}
NGramBuilder is an entity that is responsible for creating a CountTrie
func NewNGramBuilder ¶
func NewNGramBuilder( startSymbol, endSymbol string, ) *NGramBuilder
NewNGramBuilder returns new instance of NGramBuilder
func (*NGramBuilder) Build ¶
func (nb *NGramBuilder) Build(retriever SentenceRetriever, nGramOrder uint8) CountTrie
Build builds CountTrie with nGrams
type NGramModel ¶
type NGramModel interface {
store.Marshaler
store.Unmarshaler
// Score returns a lm value of the given sequence of WordID
Score(nGrams []WordID) float64
// Next returns a list of WordID which follow after the given sequence of nGrams
Next(nGrams []WordID) (ScorerNext, error)
}
NGramModel is an entity that is responsible for scoring the given nGrams
func NewNGramModel ¶
func NewNGramModel(indices []NGramVector) NGramModel
NewNGramModel creates a NGramModel from the given indices.
type NGramReader ¶
type NGramReader interface {
// Read builds NGramModel from the given list of readers
Read() (NGramModel, error)
}
NGramReader is responsible for creating NGramModel from the files
func NewGoogleNGramReader ¶
func NewGoogleNGramReader(nGramOrder uint8, indexer Indexer, directory store.Directory) NGramReader
NewGoogleNGramReader creates new instance of NGramReader
type NGramVector ¶
type NGramVector interface {
store.Marshaler
store.Unmarshaler
// GetCount returns WordCount and Node ContextOffset for the given pair (word, context)
GetCount(word WordID, context ContextOffset) (WordCount, ContextOffset)
// GetContextOffset returns the given node context offset
GetContextOffset(word WordID, context ContextOffset) ContextOffset
// CorpusCount returns size of all counts in the collection
CorpusCount() WordCount
// SubVector returns NGramVector for the given context
SubVector(context ContextOffset) NGramVector
}
NGramVector represents one level of nGram trie
func CreatePackedArray ¶
func CreatePackedArray(ch <-chan NGramNode) NGramVector
CreatePackedArray creates an NGramVector from the channel of NGramNodes.
func NewNGramVector ¶
func NewNGramVector() NGramVector
NewNGramVector creates a new instance of NGramVector.
type NGramVectorBuilder ¶
type NGramVectorBuilder interface {
// Put adds the given sequence of nGrams and count to model
Put(nGrams []WordID, count WordCount) error
// Build creates new instance of NGramVector
Build() NGramVector
}
NGramVectorBuilder is an entity that is responsible for building an NGramVector
func NewNGramVectorBuilder ¶
func NewNGramVectorBuilder(parents []NGramVector, factory NGramVectorFactory) NGramVectorBuilder
NewNGramVectorBuilder creates new instance of NGramVectorBuilder
type NGramVectorFactory ¶
type NGramVectorFactory func(ch <-chan NGramNode) NGramVector
NGramVectorFactory represents a factory method for creating a NGramVector instance.
type NGramWriter ¶
type NGramWriter interface {
// Write persists the given trie to a storage
Write(trie CountTrie) error
}
NGramWriter is the interface that persists the NGram Count Trie to a storage
func NewGoogleNGramWriter ¶
func NewGoogleNGramWriter(nGramOrder uint8, directory store.Directory) NGramWriter
NewGoogleNGramWriter creates a new instance of NGramWriter that persists the given NGram Count Trie following Google NGram Format conventions
type NGrams ¶
type NGrams = [][]WordID
NGrams is the result of splitting the given sequence of words into nGrams
type ScorerNext ¶
type ScorerNext interface {
// ScoreNext calculates the score for the given nGram built on the parent context
ScoreNext(nGram WordID) float64
}
ScorerNext represents the entity that is responsible for scoring the word using the parent context
type SentenceRetriever ¶
type SentenceRetriever interface {
// Retrieves and returns the next sentence from the source
Retrieve() Sentence
}
SentenceRetriever is an entity that is responsible for retrieving sentences from the given source
func NewSentenceRetriever ¶
func NewSentenceRetriever(tokenizer analysis.Tokenizer, reader io.Reader, alphabet alphabet.Alphabet) SentenceRetriever
NewSentenceRetriever creates new instance of sentence retriever
type TrieIterator ¶
TrieIterator is a callback that is called for each path of the given trie
type WordID ¶
type WordID = uint32
WordID is an index of the corresponding word
func MapIntoListOfWordIDs ¶
func MapIntoListOfWordIDs(lm LanguageModel, sentence Sentence) ([]WordID, error)
MapIntoListOfWordIDs maps the given sentence into a list of WordIDs