lm

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 24, 2024 License: MIT Imports: 24 Imported by: 0

Documentation

Overview

Package lm provides a library for storing large n-gram language models in memory. Mostly inspired by https://code.google.com/archive/p/berkeleylm/

Index

Constants

View Source
const (
	// UnknownWordID is an index of an unregistered word
	UnknownWordID = uint32(0xffffffff)
	// UnknownWordSymbol is a symbol that is returned on an unregistered word
	UnknownWordSymbol = "<UNK>"
)
View Source
const (
	// InvalidContextOffset is context id that represents invalid context offset
	InvalidContextOffset = maxContextOffset - 1
)
View Source
const (
	// UnknownWordScore is the score for unknown phrases
	UnknownWordScore = -100.0
)

Variables

View Source
var (
	// ErrContextOverflow tells that there was an attempt to go beyond maxContextOffset
	ErrContextOverflow = errors.New("out of maxContextOffset")
)
View Source
var ErrInvalidIndex = errors.New("index is not exists")

ErrInvalidIndex tells that there is no data for the provided index

View Source
var ErrNGramOrderIsOutOfRange = errors.New("nGrams order is out of range")

ErrNGramOrderIsOutOfRange informs that the given nGram order is out of range for the given model

Functions

func NewTokenizer

func NewTokenizer(alphabet alphabet.Alphabet) analysis.Tokenizer

NewTokenizer creates a new instance of Tokenizer

func StoreBinaryLMFromGoogleFormat

func StoreBinaryLMFromGoogleFormat(directory store.Directory, config *Config) error

StoreBinaryLMFromGoogleFormat creates a ngram language model from the google ngram format

Types

type Config

type Config struct {
	Name        string   `json:"name"`
	NGramOrder  uint8    `json:"nGramOrder"`
	SourcePath  string   `json:"source"`
	OutputPath  string   `json:"output"`
	Alphabet    []string `json:"alphabet"`
	Separators  []string `json:"separators"`
	StartSymbol string   `json:"startSymbol"`
	EndSymbol   string   `json:"endSymbol"`
	// contains filtered or unexported fields
}

Config represents a configuration of a language model

func ReadConfig

func ReadConfig(configPath string) (*Config, error)

ReadConfig reads a language model config from the given config path

func (*Config) GetBinaryPath

func (c *Config) GetBinaryPath() string

GetBinaryPath returns a stored path for the binary lm

func (*Config) GetDictionaryPath

func (c *Config) GetDictionaryPath() string

GetDictionaryPath returns a stored path for the dictionary

func (*Config) GetOutputPath

func (c *Config) GetOutputPath() string

GetOutputPath returns an output path of the built index

func (*Config) GetSeparatorsAlphabet

func (c *Config) GetSeparatorsAlphabet() alphabet.Alphabet

GetSeparatorsAlphabet returns a separators alphabet corresponding to the declaration

func (*Config) GetSourcePath

func (c *Config) GetSourcePath() string

GetSourcePath returns a source path of the index description

func (*Config) GetWordsAlphabet

func (c *Config) GetWordsAlphabet() alphabet.Alphabet

GetWordsAlphabet returns a word alphabet corresponding to the declaration

type ContextOffset

type ContextOffset = uint32

ContextOffset represents the id of parent nGram path

type CountTrie

type CountTrie interface {
	// Put increments WordCount for last element of given sequence.
	Put(sentence Sentence, count WordCount)
	// Walk iterates through trie and calls walker function on each element.
	Walk(walker TrieIterator) error
}

CountTrie represents a data structure for counting ngrams.

func NewCountTrie

func NewCountTrie() CountTrie

NewCountTrie creates a new instance of CountTrie

type Indexer

type Indexer interface {
	// Returns the index for the word, otherwise returns UnknownWordID
	Get(token Token) (WordID, error)
	// Find a token by the given index
	Find(id WordID) (Token, error)
}

Indexer enumerates words in the vocabulary of a language model. Stores a two-way mapping between uint32 and strings.

func BuildIndexer

func BuildIndexer(dict dictionary.Dictionary) (Indexer, error)

BuildIndexer builds an indexer from the given dictionary

func NewIndexer

func NewIndexer(dict dictionary.Dictionary, table mph.MPH) Indexer

NewIndexer creates a new instance of indexer

type Key

type Key uint64

Key represents a NGramNode key as a composition of a NGram context and wordID

func MakeKey

func MakeKey(word WordID, context ContextOffset) Key

MakeKey creates uint64 key for the given pair (word, context)

func (Key) GetContext

func (k Key) GetContext() ContextOffset

GetContext returns the context for the given key

func (Key) GetWordID

func (k Key) GetWordID() WordID

GetWordID returns the wordID for the given key

type LanguageModel

type LanguageModel interface {
	// ScoreSentence scores and returns a lm weight for the given sentence
	ScoreSentence(sentence Sentence) (float64, error)
	// ScoreWordIDs scores and returns a lm weight for the given sequence of nGrams
	ScoreWordIDs(sequence []WordID) float64
	// GetWordID returns id for the given token
	GetWordID(token Token) (WordID, error)
	// Next returns the list of candidates for the given sequence
	Next(sequence []WordID) (ScorerNext, error)
}

LanguageModel is an interface for an n-gram language model

func NewLanguageModel

func NewLanguageModel(
	model NGramModel,
	indexer Indexer,
	config *Config,
) (LanguageModel, error)

NewLanguageModel creates a new instance of a LanguageModel

func RetrieveLMFromBinary

func RetrieveLMFromBinary(directory store.Directory, config *Config) (LanguageModel, error)

RetrieveLMFromBinary retrieves a language model from the binary format

type NGramBuilder

type NGramBuilder struct {
	// contains filtered or unexported fields
}

NGramBuilder is an entity that is responsible for creating CountTrie

func NewNGramBuilder

func NewNGramBuilder(
	startSymbol, endSymbol string,
) *NGramBuilder

NewNGramBuilder returns new instance of NGramBuilder

func (*NGramBuilder) Build

func (nb *NGramBuilder) Build(retriever SentenceRetriever, nGramOrder uint8) CountTrie

Build builds CountTrie with nGrams

type NGramModel

type NGramModel interface {
	store.Marshaler
	store.Unmarshaler

	// Score returns a lm value of the given sequence of WordID
	Score(nGrams []WordID) float64
	// Next returns a list of WordID which follow after the given sequence of nGrams
	Next(nGrams []WordID) (ScorerNext, error)
}

NGramModel is an entity that is responsible for scoring the given nGrams

func NewNGramModel

func NewNGramModel(indices []NGramVector) NGramModel

NewNGramModel creates a NGramModel from the given indices.

type NGramNode

type NGramNode struct {
	Key   Key
	Count WordCount
}

NGramNode represents tree node for the given nGram

func (*NGramNode) Less

func (n *NGramNode) Less(other rbtree.Item) bool

Less tells whether the current element is bigger than the other

type NGramReader

type NGramReader interface {
	// Read builds NGramModel from the given list of readers
	Read() (NGramModel, error)
}

NGramReader is responsible for creating NGramModel from the files

func NewGoogleNGramReader

func NewGoogleNGramReader(nGramOrder uint8, indexer Indexer, directory store.Directory) NGramReader

NewGoogleNGramReader creates new instance of NGramReader

type NGramVector

type NGramVector interface {
	store.Marshaler
	store.Unmarshaler

	// GetCount returns WordCount and Node ContextOffset for the given pair (word, context)
	GetCount(word WordID, context ContextOffset) (WordCount, ContextOffset)
	// GetContextOffset returns the given node context offset
	GetContextOffset(word WordID, context ContextOffset) ContextOffset
	// CorpusCount returns size of all counts in the collection
	CorpusCount() WordCount
	// SubVector returns NGramVector for the given context
	SubVector(context ContextOffset) NGramVector
}

NGramVector represents one level of nGram trie

func CreatePackedArray

func CreatePackedArray(ch <-chan NGramNode) NGramVector

CreatePackedArray creates a NGramVector from the channel of NGramNodes.

func NewNGramVector

func NewNGramVector() NGramVector

NewNGramVector creates a new instance of NGramVector.

type NGramVectorBuilder

type NGramVectorBuilder interface {
	// Put adds the given sequence of nGrams and count to model
	Put(nGrams []WordID, count WordCount) error
	// Build creates new instance of NGramVector
	Build() NGramVector
}

NGramVectorBuilder is an entity that is responsible for building NGramVector

func NewNGramVectorBuilder

func NewNGramVectorBuilder(parents []NGramVector, factory NGramVectorFactory) NGramVectorBuilder

NewNGramVectorBuilder creates new instance of NGramVectorBuilder

type NGramVectorFactory

type NGramVectorFactory func(ch <-chan NGramNode) NGramVector

NGramVectorFactory represents a factory method for creating a NGramVector instance.

type NGramWriter

type NGramWriter interface {
	// Write persists the given trie to a storage
	Write(trie CountTrie) error
}

NGramWriter is the interface that persists the NGram Count Trie to a storage

func NewGoogleNGramWriter

func NewGoogleNGramWriter(nGramOrder uint8, directory store.Directory) NGramWriter

NewGoogleNGramWriter creates new instance of NGramWriter that persists the given NGram Count Trie with Google NGram Format negotiations

type NGrams

type NGrams = [][]WordID

NGrams is the result of splitting the given sequence of words into nGrams

type ScorerNext

type ScorerNext interface {
	// ScoreNext calculates the score for the given nGram built on the parent context
	ScoreNext(nGram WordID) float64
}

ScorerNext represents the entity that is responsible for scoring the word using the parent context

type Sentence

type Sentence = []Token

Sentence is a sequence of tokens

type SentenceRetriever

type SentenceRetriever interface {
	// Retrieves and returns the next sentence from the source
	Retrieve() Sentence
}

SentenceRetriever is an entity that is responsible for retrieving sentences from the given source

func NewSentenceRetriever

func NewSentenceRetriever(tokenizer analysis.Tokenizer, reader io.Reader, alphabet alphabet.Alphabet) SentenceRetriever

NewSentenceRetriever creates new instance of sentence retriever

type Token

type Token = analysis.Token

Token is a string with an assigned and thus identified meaning

type TrieIterator

type TrieIterator = func(path Sentence, count WordCount) error

TrieIterator is a callback that is called for each path of the given trie

type WordCount

type WordCount = uint32

WordCount is a count of a corresponding path

type WordID

type WordID = uint32

WordID is an index of the corresponding word

func MapIntoListOfWordIDs

func MapIntoListOfWordIDs(lm LanguageModel, sentence Sentence) ([]WordID, error)

MapIntoListOfWordIDs maps the given sentence into a list of WordIDs

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL