Documentation ¶
Overview ¶
Package lex provides all the lexing functions that transform text into lexical tokens, using token types defined in the pi/token package. It also has the basic file source and position / region management functionality.
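As a brief, hedged orientation (the import path is assumed from the parent pi repository), source is set on a File as lines of runes, which then drive per-line lexing:

package main

import (
	"fmt"

	"github.com/goki/pi/lex" // assumed import path for this package
)

func main() {
	// source is kept as lines of runes so file positions map directly to indexes
	src := [][]rune{
		[]rune("// a comment"),
		[]rune("x := 1"),
	}
	fl := &lex.File{}
	fl.SetSrc(&src, "example.go") // signature as listed in the Index below
	fmt.Println(fl.NLines())      // prints: 2
}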
Index ¶
- Variables
- func DigitVal(ch rune) int
- func IsDigit(ch rune) bool
- func IsLetter(ch rune) bool
- func IsWhiteSpace(ch rune) bool
- func PrintError(w io.Writer, err error)
- type Actions
- type Error
- type ErrorList
- func (p *ErrorList) Add(pos Pos, fname, msg string)
- func (p ErrorList) AllString() string
- func (p ErrorList) Err() error
- func (p ErrorList) Error() string
- func (p ErrorList) Len() int
- func (p ErrorList) Less(i, j int) bool
- func (p *ErrorList) RemoveMultiples()
- func (p *ErrorList) Reset()
- func (p ErrorList) Sort()
- func (p ErrorList) Swap(i, j int)
- type File
- func (fl *File) AllocLines()
- func (fl *File) IsLexPosValid(pos Pos) bool
- func (fl *File) LexAt(cp Pos) *Lex
- func (fl *File) LexAtSafe(cp Pos) Lex
- func (fl *File) LexLine(ln int) Line
- func (fl *File) LexTagSrc() string
- func (fl *File) LexTagSrcLn(ln int) string
- func (fl *File) LinesDeleted(stln, edln int)
- func (fl *File) LinesInserted(stln, nsz int)
- func (fl *File) NLines() int
- func (fl *File) NTokens(ln int) int
- func (fl *File) NextTokenPos(pos Pos) (Pos, bool)
- func (fl *File) PrevDepth(ln int) int
- func (fl *File) PrevStack(ln int) Stack
- func (fl *File) PrevTokenPos(pos Pos) (Pos, bool)
- func (fl *File) RegSrc(reg Reg) string
- func (fl *File) SetLine(ln int, lexs, comments Line, stack Stack)
- func (fl *File) SetSrc(src *[][]rune, fname string)
- func (fl *File) Token(pos Pos) token.Tokens
- func (fl *File) TokenMapReg(reg Reg) TokenMap
- func (fl *File) TokenRegSrc(reg Reg) string
- func (fl *File) TokenSrc(pos Pos) []rune
- func (fl *File) TokenSrcPos(pos Pos) Reg
- func (fl *File) TokenSrcReg(reg Reg) Reg
- func (fl *File) ValidTokenPos(pos Pos) (Pos, bool)
- type LangLexer
- type Lex
- type Lexer
- type Line
- type MatchPos
- type Matches
- type PassTwo
- func (pt *PassTwo) EosDetect(ts *TwoState)
- func (pt *PassTwo) ErrString(ts *TwoState) string
- func (pt *PassTwo) Error(ts *TwoState, msg string)
- func (pt *PassTwo) HasErrs(ts *TwoState) bool
- func (pt *PassTwo) MismatchError(ts *TwoState, tok token.Tokens)
- func (pt *PassTwo) NestDepth(ts *TwoState)
- func (pt *PassTwo) NestDepthLine(line Line, initDepth int)
- func (pt *PassTwo) PopNest(ts *TwoState, tok token.Tokens)
- func (pt *PassTwo) PushNest(ts *TwoState, tok token.Tokens)
- type Pos
- type Reg
- type Rule
- func (lr *Rule) AsLexRule() *Rule
- func (lr *Rule) BaseIface() reflect.Type
- func (lr *Rule) DoAct(ls *State, act Actions)
- func (lr *Rule) Find(find string) []*Rule
- func (lr *Rule) IsMatch(ls *State) bool
- func (lr *Rule) IsMatchPos(ls *State) bool
- func (lr *Rule) Lex(ls *State) *Rule
- func (lr *Rule) LexStart(ls *State) *Rule
- func (lr *Rule) TargetLen(ls *State) int
- func (lr *Rule) Validate(ls *State) bool
- func (lr *Rule) WriteGrammar(writer io.Writer, depth int)
- type Stack
- type State
- func (ls *State) Add(tok token.Tokens, st, ed int)
- func (ls *State) AtEol() bool
- func (ls *State) CurRune() bool
- func (ls *State) CurState() string
- func (ls *State) Error(pos int, msg string)
- func (ls *State) Init()
- func (ls *State) LineString() string
- func (ls *State) Next(inc int) bool
- func (ls *State) NextRune() bool
- func (ls *State) NextSrcLine() string
- func (ls *State) PopState() string
- func (ls *State) PushState(st string)
- func (ls *State) ReadEscape(quote rune) bool
- func (ls *State) ReadName()
- func (ls *State) ReadNumber() token.Tokens
- func (ls *State) ReadQuoted()
- func (ls *State) Rune(off int) (rune, bool)
- func (ls *State) ScanMantissa(base int)
- func (ls *State) SetLine(src []rune)
- func (ls *State) String(off, sz int) (string, bool)
- type TokenMap
- type TwoState
Constants ¶
This section is empty.
Variables ¶
var KiT_Rule = kit.Types.AddType(&Rule{}, RuleProps)
var PosErr = Pos{-1, -1}
PosErr represents an error text position (-1 for both line and char), used as a return value for cases where a valid position cannot be returned
var PosZero = Pos{}
PosZero is the uninitialized zero text position (which is still a valid position)
var RegZero = Reg{}
RegZero is the zero region
var RuleProps = ki.Props{}
Functions ¶
func IsWhiteSpace ¶
func PrintError ¶
PrintError is a utility function that prints a list of errors to w, one error per line, if the err parameter is an ErrorList. Otherwise it prints the err string.
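A minimal sketch of accumulating and printing errors, using only the signatures listed above (positions and messages are illustrative; os.Stderr assumes the standard os import):

var errs lex.ErrorList
errs.Add(lex.Pos{3, 0}, "example.go", "unterminated string")
errs.Add(lex.Pos{7, 2}, "example.go", "invalid escape sequence")
if err := errs.Err(); err != nil { // nil when the list is empty
	lex.PrintError(os.Stderr, err) // one error per line for an ErrorList
}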
Types ¶
type Actions ¶
type Actions int
Actions are lexing actions to perform
const (
	// Next means advance input position to the next character(s) after the matched characters
	Next Actions = iota

	// Name means read in an entire name, which is letters, _ and digits after first letter
	// position will be advanced to just after
	Name

	// Number means read in an entire number -- the token type will automatically be
	// set to the actual type of number that was read in, and position advanced to just after
	Number

	// Quoted means read in an entire string enclosed in quote delimiter
	// that is present at current position, with proper skipping of escapes.
	// Position advanced to just after
	Quoted

	// QuotedRaw means read in an entire string enclosed in quote delimiter
	// that is present at start position, with proper skipping of escapes.
	// Position advanced to just after.
	// Raw version supports multi-line and includes CR etc at end of lines (e.g., back-tick
	// in various languages)
	QuotedRaw

	// EOL means read till the end of the line (e.g., for single-line comments)
	EOL

	// PushState means push the given state value onto the state stack
	PushState

	// PopState means pop given state value off the state stack
	PopState

	// SetGuestLex means install the Name (must be a prior action) as the guest
	// lexer -- it will take over lexing until PopGuestLex is called
	SetGuestLex

	// PopGuestLex removes the current guest lexer and returns to the original
	// language lexer
	PopGuestLex

	ActionsN
)
The lexical acts
func (*Actions) FromString ¶
func (Actions) MarshalJSON ¶
func (*Actions) UnmarshalJSON ¶
type Error ¶
In an ErrorList, an error is represented by an *Error. The position Pos, if valid, points to the beginning of the offending token, and the error condition is described by Msg.
type ErrorList ¶
type ErrorList []*Error
ErrorList is a list of *Errors. The zero value for an ErrorList is an empty ErrorList ready to use.
func (ErrorList) Err ¶
Err returns an error equivalent to this error list. If the list is empty, Err returns nil.
func (*ErrorList) RemoveMultiples ¶
func (p *ErrorList) RemoveMultiples()
RemoveMultiples sorts an ErrorList and removes all but the first error per line.
type File ¶
type File struct {
Filename string `desc:"the current file being lex'd"`
Lines *[][]rune `desc:"contents of the file as lines of runes"`
Lexs []Line `desc:"lex'd version of the lines -- allocated to size of Lines"`
Comments []Line `` /* 148-byte string literal not displayed */
LastStacks []Stack `desc:"stack present at the end of each line -- needed for contextualizing line-at-time lexing while editing"`
}
File contains the contents of the file being parsed -- all kept in memory, and represented as lines of runes, so that positions in the file are directly convertible to indexes in the Lines structure
func (*File) AllocLines ¶
func (fl *File) AllocLines()
AllocLines allocates the data per line: lex outputs and stack. We reset state so stale state is not hanging around.
func (*File) IsLexPosValid ¶
IsLexPosValid returns true if given lexical token position is valid
func (*File) LexAtSafe ¶
LexAtSafe returns the Lex item at given position, or last lex item if beyond end
func (*File) LexLine ¶
LexLine returns the lexing output for given line, combining comments and all other tokens and allocating new memory using clone
func (*File) LexTagSrcLn ¶
LexTagSrcLn returns the lex'd tagged source line for given line
func (*File) LinesDeleted ¶
LinesDeleted deletes lines -- called e.g., by giv.TextBuf to sync the markup with ongoing edits
func (*File) LinesInserted ¶
LinesInserted inserts new lines -- called e.g., by giv.TextBuf to sync the markup with ongoing edits
func (*File) NextTokenPos ¶
NextTokenPos returns the next token position, false if at end of tokens
func (*File) PrevTokenPos ¶
PrevTokenPos returns the previous token position, false if at beginning of tokens
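Together with ValidTokenPos and Token, these support walking every token in a file -- a sketch, assuming fl is a *File that has already been lex'd:

pos, ok := fl.ValidTokenPos(lex.PosZero) // move to a valid token position
for ok {
	fmt.Println(pos, fl.Token(pos), string(fl.TokenSrc(pos)))
	pos, ok = fl.NextTokenPos(pos) // false at end of tokens
}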
func (*File) TokenMapReg ¶
TokenMapReg creates a TokenMap of tokens in region, including their Cat and SubCat levels -- errs on the side of inclusiveness -- used for optimizing token matching
func (*File) TokenRegSrc ¶
TokenRegSrc returns the source code associated with the given token region
func (*File) TokenSrcPos ¶
TokenSrcPos returns the source region associated with the lex token at the given token position
func (*File) TokenSrcReg ¶
TokenSrcReg translates a region of tokens into a region of source
type LangLexer ¶
type LangLexer interface {
// Lexer returns the top-level lex.Rule for given language (case invariant lookup)
Lexer(lang string) *Rule
}
LangLexer looks up the lexer for a given language -- implemented in the parent pi package, so we need the interface here
var TheLangLexer LangLexer
TheLangLexer is the instance of LangLexer interface used to lookup lexers for languages -- is set in pi/langs.go
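A hedged usage sketch, assuming the pi package has registered a lexer under the given name:

goRule := lex.TheLangLexer.Lexer("go") // case-invariant lookup
if goRule != nil {
	// goRule is the top-level lex.Rule for the language
}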
type Lex ¶
type Lex struct {
Tok token.Tokens `desc:"token"`
Depth int `` /* 295-byte string literal not displayed */
St int `desc:"start rune index within original source line for this token"`
Ed int `desc:"end rune index within original source line for this token (exclusive -- ends one before this)"`
Time nptime.Time `` /* 129-byte string literal not displayed */
}
Lex represents a single lexical element, with a token, and start and end rune positions within a line of a file. Critically it also contains the nesting depth computed from all the parens, brackets, braces. Todo: also support XML < > </ > tag depth.
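A hedged sketch of a Lex element for a 5-rune name token (token.Name is assumed to exist in pi/token):

lx := lex.Lex{Tok: token.Name, Depth: 1, St: 4, Ed: 9} // spans runes [4,9) in the line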
func (*Lex) ContainsPos ¶
ContainsPos returns true if the Lex element contains given character position
func (*Lex) OverlapsReg ¶
OverlapsReg returns true if the regions of the two Lex elements overlap
type Lexer ¶
type Lexer interface {
ki.Ki
// Validate checks for any errors in the rules and issues warnings,
// returns true if valid (no err) and false if invalid (errs)
Validate(ls *State) bool
// Lex tries to apply rule to given input state, returns true if matched, false if not
Lex(ls *State) *Rule
// AsLexRule returns object as a lex.Rule
AsLexRule() *Rule
}
Lexer is the interface type for lexers -- likely not necessary, except that it is essential for defining the BaseIface for the gui in making new nodes
type Line ¶
type Line []Lex
Line is one line of Lex'd text
func MergeLines ¶
MergeLines merges the two lines of lex regions into a combined list properly ordered by sequence of tags within the line.
func (*Line) AddLex ¶
AddLex adds one element to the lex line with given params, returns pointer to that new lex
func (*Line) AddSort ¶
AddSort adds a new lex element in sorted order to list, sorted by start position, and if at the same start position, then sorted by end position
type MatchPos ¶
type MatchPos int
MatchPos are special positions for a match to occur
const (
	// AnyPos matches at any position
	AnyPos MatchPos = iota

	// StartOfLine matches at start of line
	StartOfLine

	// EndOfLine matches at end of line
	EndOfLine

	// MiddleOfLine matches not at the start or end
	MiddleOfLine

	MatchPosN
)
Matching rules
func (*MatchPos) FromString ¶
func (MatchPos) MarshalJSON ¶
func (*MatchPos) UnmarshalJSON ¶
type Matches ¶
type Matches int
Matches are what kind of lexing matches to make
const (
	// String means match a specific string as given in the rule
	// Note: this only looks for the string with no constraints on
	// what happens after this string -- use StrName to match entire names
	String Matches = iota

	// StrName means match a specific string that is a complete alpha-numeric
	// string (including underbar _) with some other char at the end
	// must use this for all keyword matches to ensure that it isn't just
	// the start of a longer name
	StrName

	// Match any letter, including underscore
	Letter

	// Match digit 0-9
	Digit

	// Match any white space (space, tab) -- input is already broken into lines
	WhiteSpace

	// CurState means match current state value set by a PushState action, using String value in rule
	// all CurState cases must generally be first in list of rules so they can preempt
	// other rules when the state is active
	CurState

	// AnyRune means match any rune -- use this as the last condition where other terminators
	// come first!
	AnyRune

	MatchesN
)
Matching rules
func (*Matches) FromString ¶
func (Matches) MarshalJSON ¶
func (*Matches) UnmarshalJSON ¶
type PassTwo ¶
type PassTwo struct {
DoEos bool `desc:"should we perform EOS detection on this type of file?"`
Eol bool `desc:"use end-of-line as a default EOS, if nesting depth is same as start of line (python) -- see also EolToks"`
Semi bool `desc:"replace all semicolons with EOS to keep it consistent (C, Go..)"`
Backslash bool `desc:"use backslash as a line continuer (python)"`
EolToks token.KeyTokenList `desc:"specific tokens to recognize at the end of a line that trigger an EOS (Go)"`
}
PassTwo performs second pass(es) through the lexicalized version of the source, computing nesting depth for every token once and for all -- this is essential for properly matching tokens and also for colorization in syntax highlighting. Optionally, a subsequent pass finds end-of-statement (EOS) tokens, which are essential for parsing to first break the source down into statement-sized chunks. A separate list of EOS token positions is maintained for very fast access.
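A hedged sketch of configuring and running PassTwo for a C/Go-style language, using only the documented fields and method signatures (fl is assumed to be a lex'd *File):

pt := &lex.PassTwo{DoEos: true, Semi: true} // semicolons become EOS tokens
ts := &lex.TwoState{Src: fl}
pt.NestDepth(ts) // calls ts.Init() at start; computes depth for every token
pt.EosDetect(ts) // then mark end-of-statement positions
if pt.HasErrs(ts) {
	fmt.Println(pt.ErrString(ts))
}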
func (*PassTwo) MismatchError ¶
MismatchError reports a mismatch for given type of parentheses / bracket
func (*PassTwo) NestDepthLine ¶
NestDepthLine performs nesting depth computation on only one line, starting at given initial depth -- updates the given line
type Pos ¶
Pos is a position within the source file -- it is recorded always in 0, 0 offset positions, but is converted into 1,1 offset for public consumption. Ch positions are always in runes, not bytes. Also used for lex token indexes.
type Reg ¶
type Reg struct {
St Pos `desc:"starting position of region"`
Ed Pos `desc:"ending position of region"`
}
Reg is a contiguous region within the source file
type Rule ¶
type Rule struct {
ki.Node
Desc string `desc:"description / comments about this rule"`
Token token.Tokens `desc:"the token value that this rule generates -- use None for non-terminals"`
Match Matches `desc:"the lexical match that we look for to engage this rule"`
Pos MatchPos `desc:"position where match can occur"`
String string `desc:"if Match is String, StrName, or CurState, this is the string we match"`
Off int `desc:"offset into the input to look for a match: 0 = current char, 1 = next one, etc"`
Acts []Actions `desc:"the action(s) to perform, in order, if there is a match -- these are performed prior to iterating over child nodes"`
PushState string `desc:"the state to push if our action is PushState -- note that State matching is on String, not this value"`
TokEff token.Tokens `view:"-" json:"-" desc:"effective token based on input -- e.g., for number is the type of number"`
MatchLen int `view:"-" json:"-" desc:"length of source that matched -- if Next is called, this is what will be skipped to"`
}
lex.Rule operates on the text input to produce the lexical tokens. It is assembled into a lexical grammar structure to perform lexing.
Lexing is done line-by-line -- you must push and pop states to coordinate across multiple lines, e.g., for multi-line comments
In general it is best to keep lexing as simple as possible and leave the more complex things for the parsing step.
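A hedged sketch of two individual rules (in a real grammar these are assembled as children of a top-level Rule via the ki tree; token.Keyword and token.Comment are assumed from pi/token):

// match the complete name "func" as a keyword, then advance past it
kw := &lex.Rule{
	Token:  token.Keyword,
	Match:  lex.StrName, // StrName, not String, so "function" does not match
	String: "func",
	Acts:   []lex.Actions{lex.Next},
}

// single-line comment: match "//", then consume to end of line
cmt := &lex.Rule{
	Token:  token.Comment,
	Match:  lex.String,
	String: "//",
	Acts:   []lex.Actions{lex.EOL},
}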
func (*Rule) Find ¶
Find looks for rules in the tree that contain given string in String or Name fields
func (*Rule) IsMatch ¶
IsMatch tests if the rule matches for current input state, returns true if so, false if not
func (*Rule) IsMatchPos ¶
IsMatchPos tests if the rule matches position
func (*Rule) Lex ¶
Lex tries to apply rule to given input state, returns lowest-level rule that matched, nil if none
func (*Rule) LexStart ¶
LexStart is called on the top-level lex node to start lexing process for one step
type State ¶
type State struct {
Filename string `desc:"the current file being lex'd"`
KeepWS bool `desc:"if true, record whitespace tokens -- else ignore"`
Src []rune `desc:"the current line of source being processed"`
Lex Line `desc:"the lex output for this line"`
Comments Line `desc:"the comments output for this line -- kept separately"`
Pos int `desc:"the current rune char position within the line"`
Ln int `desc:"the line within overall source that we're operating on (0 indexed)"`
Ch rune `desc:"the current rune read by NextRune"`
Stack Stack `desc:"state stack"`
LastName string `desc:"the last name that was read"`
GuestLex *Rule `desc:"a guest lexer that can be installed for managing a different language type, e.g., quoted text in markdown files"`
SaveStack Stack `desc:"copy of stack at point when guest lexer was installed -- restore when popped"`
Time nptime.Time `desc:"time stamp for lexing -- set at start of new lex process"`
Errs ErrorList `desc:"any error messages accumulated during lexing specifically"`
}
lex.State is the state maintained for lexing
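A hedged sketch of lexing one line with a State, assuming topRule is a validated top-level Rule (e.g., from TheLangLexer above):

ls := &lex.State{Filename: "example.go"}
ls.Init()                         // reset lexing state
ls.SetLine([]rune("x := 1 // y")) // lexing is done line-by-line
for !ls.AtEol() {
	if topRule.LexStart(ls) == nil {
		break // no rule matched -- avoid an infinite loop
	}
}
fmt.Println(ls.LineString()) // the lex output as tagged source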
func (*State) LineString ¶
LineString returns the current lex output as tagged source
func (*State) Next ¶
Next moves to next position using given increment in source line -- returns false if at end
func (*State) NextSrcLine ¶
NextSrcLine returns the next line of text
func (*State) ReadEscape ¶
ReadEscape parses an escape sequence where quote is the accepted escaped quote. In case of a syntax error, it stops at the offending character (without consuming it) and returns false. Otherwise it returns true.
func (*State) ReadNumber ¶
func (*State) ReadQuoted ¶
func (ls *State) ReadQuoted()
func (*State) Rune ¶
Rune gets the rune at given offset from current position, returns false if out of range
func (*State) ScanMantissa ¶
type TokenMap ¶
TokenMap is a token map, for optimizing token exclusion
type TwoState ¶
type TwoState struct {
Pos Pos `desc:"position in lex tokens we're on"`
Src *File `desc:"file that we're operating on"`
NestStack []token.Tokens `desc:"stack of nesting tokens"`
EosPos []Pos `desc:"positions *in token coordinates* of the EOS markers generated"`
Errs ErrorList `desc:"any error messages accumulated during lexing specifically"`
}
TwoState is the state maintained for the PassTwo process
func (*TwoState) Init ¶
func (ts *TwoState) Init()
Init initializes state for a new pass -- called at start of NestDepth
func (*TwoState) InsertEOS ¶
InsertEOS inserts an EOS just after the given token position (e.g., cp = last token in line)
func (*TwoState) NestStackStr ¶
NestStackStr returns the token stack as strings
func (*TwoState) ReplaceEOS ¶
ReplaceEOS replaces given token with an EOS