Documentation
¶
Index ¶
- Constants
- Variables
- func CharDescription(ch rune) string
- func Escape(input string) string
- func IsECMAIdentifierChar(r rune) bool
- func IsECMAIdentifierStartChar(r rune) bool
- func IsECMAWordChar(r rune) bool
- func IsWordChar(r rune) bool
- func Unescape(input string) (string, error)
- type AnchorLoc
- type BmPrefix
- type Category
- type CharClassAnalysisResults
- type CharSet
- func (set *CharSet) Analyze() CharClassAnalysisResults
- func (c CharSet) CharIn(ch rune) bool
- func (c CharSet) Copy() CharSet
- func (c *CharSet) Equals(c2 *CharSet) bool
- func (c CharSet) GetIfNRanges(n int) []SingleRange
- func (c *CharSet) GetIfOnlyUnicodeCategories() (cats []Category, negate bool)
- func (c *CharSet) GetSetChars(maxChars int) []rune
- func (c CharSet) HasSubtraction() bool
- func (c *CharSet) Hash() []byte
- func (c CharSet) IsAnything() bool
- func (c CharSet) IsEmpty() bool
- func (c CharSet) IsMergeable() bool
- func (c CharSet) IsNegated() bool
- func (c CharSet) IsSingleton() bool
- func (c CharSet) IsSingletonInverse() bool
- func (c *CharSet) IsUnicodeCategoryOfSmallCharCount() (isSmall bool, chars []rune, negated bool, desc string)
- func (set1 *CharSet) MayOverlap(set2 *CharSet) bool
- func (c CharSet) SingletonChar() rune
- func (c CharSet) String() string
- type Code
- type Error
- type ErrorCode
- type FindNextStartingPositionMode
- type FindOptimizations
- type FixedDistanceLiteral
- type FixedDistanceSet
- type InstOp
- type LiteralAfterLoop
- type NodeType
- type ParseOptions
- type Prefix
- type RegexNode
- func (n *RegexNode) ComputeMinLength() int
- func (n *RegexNode) Description() string
- func (n *RegexNode) FindLastExpressionInLoopForAutoAtomic() *RegexNode
- func (n *RegexNode) FindStartingLiteral() *StartingLiteral
- func (n *RegexNode) FindStartingLiteralNode(allowZeroWidth bool) *RegexNode
- func (n *RegexNode) FirstCharOfOneOrMulti() rune
- func (n *RegexNode) IsAtomicloopFamily() bool
- func (n *RegexNode) IsNotoneFamily() bool
- func (n *RegexNode) IsNotoneloopFamily() bool
- func (n *RegexNode) IsOneFamily() bool
- func (n *RegexNode) IsOneloopFamily() bool
- func (n *RegexNode) IsSetFamily() bool
- func (n *RegexNode) IsSetloopFamily() bool
- func (n *RegexNode) ReplaceChild(index int, newChild *RegexNode)
- func (n *RegexNode) TryGetJoinableLengthCheckChildRange(childIndex int, requiredLength *int, exclusiveEnd *int) bool
- func (n *RegexNode) TryGetOrdinalCaseInsensitiveString(childIndex int, exclusiveChildBound int, consumeZeroWidthNodes bool) (success bool, nodesConsumed int, caseInsensitiveString string)
- type RegexOptions
- type RegexTree
- type ReplacerData
- type RequiredLandmark
- type RequiredLandmarkAlternative
- type RequiredLandmarkChain
- type SingleRange
- type StartingLiteral
Constants ¶
const ( SpaceCategoryText = " " WordCategoryText = "W" )
const ( LowercaseSet = 0 // Set to arg. LowercaseAdd = 1 // Add arg. LowercaseBor = 2 // Bitwise or with 1. LowercaseBad = 3 // Bitwise and with 1 and add original. )
const ( // internal issue ErrInternalError ErrorCode = "regexp/syntax: internal error" // Parser errors ErrUnterminatedComment = "unterminated comment" ErrInvalidCharRange = "invalid character class range" ErrInvalidRepeatSize = "invalid repeat count" ErrInvalidUTF8 = "invalid UTF-8" ErrCaptureGroupOutOfRange = "capture group number out of range" ErrUnexpectedParen = "unexpected )" ErrMissingParen = "missing closing )" ErrMissingBrace = "missing closing }" ErrInvalidRepeatOp = "invalid nested repetition operator" ErrMissingRepeatArgument = "missing argument to repetition operator" ErrConditionalExpression = "illegal conditional (?(...)) expression" ErrTooManyAlternates = "too many | in (?()|)" ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" ErrInvalidECMAGroupName = "invalid capture group name" ErrDuplicateGroupName = "duplicate capture group name" ErrCapNumNotZero = "capture number cannot be zero" ErrUndefinedBackRef = "reference to undefined group number %v" ErrUndefinedNameRef = "reference to undefined group name %v" ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named" ErrAlternationCantHaveComment = "alternation conditions cannot be comments" ErrMalformedReference = "(?(%v) ) malformed" ErrUndefinedReference = "(?(%v) ) reference to undefined group" ErrIllegalEndEscape = "illegal \\ at end of pattern" ErrMalformedSlashP = "malformed \\p{X} character escape" ErrIncompleteSlashP = "incomplete \\p{X} character escape" ErrUnknownSlashP = "unknown unicode category, script, or property '%v'" ErrUnrecognizedEscape = "unrecognized escape sequence \\%v" ErrMissingControl = "missing control character" ErrUnrecognizedControl = "unrecognized control character" ErrTooFewHex = "insufficient hexadecimal digits" ErrInvalidHex = "hex values may not be larger than 0x10FFFF" ErrMalformedNameRef = "malformed 
\\k<...> named back reference" ErrBadClassInCharRange = "cannot include class \\%v in character range" ErrShorthandClassInCharRange = "cannot create range with shorthand escape sequence \\%v" ErrUnterminatedBracket = "unterminated [] set" ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class" ErrReversedCharRange = "[%c-%c] range in reverse order" )
const ( Q byte = 5 // quantifier S byte = 4 // ordinary stopper Z byte = 3 // ScanBlank stopper X byte = 2 // whitespace E byte = 1 // should be escaped )
const MultiVsRepeaterLimit = 64
Arbitrary number of repetitions of the same character when we'd prefer to represent that as a repeater of that character rather than a string.
Variables ¶
var ( AnyClass = getCharSetFromOldString([]rune{0}, false) ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) NoneClass = getCharSetFromOldString(nil, false) ECMAWordClass = getCharSetFromOldString(ecmaWord, false) NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) WordClass = getCharSetFromCategoryString(false, false, WordCategoryText) NotWordClass = getCharSetFromCategoryString(true, false, WordCategoryText) SpaceClass = getCharSetFromCategoryString(false, false, SpaceCategoryText) NotSpaceClass = getCharSetFromCategoryString(true, false, SpaceCategoryText) DigitClass = getCharSetFromCategoryString(false, false, "Nd") NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") RE2SpaceClass = getCharSetFromOldString(re2Space, false) NotRE2SpaceClass = getCharSetFromOldString(re2Space, true) NotNewLineClass = getCharSetFromOldString([]rune{0x0a, 0x0b}, true) )
var ErrReplacementError = errors.New("replacement pattern error")
ErrReplacementError is a general error during parsing the replacement text
Functions ¶
func CharDescription ¶
CharDescription produces a human-readable description for a single character.
func IsECMAIdentifierChar ¶
func IsECMAWordChar ¶
func IsWordChar ¶
According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
Types ¶
type AnchorLoc ¶
type AnchorLoc int16
type BmPrefix ¶
type BmPrefix struct {
// contains filtered or unexported fields
}
BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.
func (*BmPrefix) IsMatch ¶
When a regex is anchored, we can do a quick IsMatch test instead of a Scan
func (*BmPrefix) Scan ¶
Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.
The direction and case-sensitivity of the match are determined by the arguments to the RegexBoyerMoore constructor.
type CharClassAnalysisResults ¶
type CharClassAnalysisResults struct {
// true if the set contains only ranges; false if it contains Unicode categories and/or subtraction.
OnlyRanges bool
// true if we know for sure that the set contains only ASCII values; otherwise, false.
// This can only be true if OnlyRanges is true.
ContainsOnlyAscii bool
// true if we know for sure that the set doesn't contain any ASCII values; otherwise, false.
// This can only be true if OnlyRanges is true.
ContainsNoAscii bool
// true if we know for sure that all ASCII values are in the set; otherwise, false.
// This can only be true if OnlyRanges is true.
AllAsciiContained bool
// true if we know for sure that all non-ASCII values are in the set; otherwise, false.
// This can only be true if OnlyRanges is true.
AllNonAsciiContained bool
// The inclusive lower bound.
// This is only valid if OnlyRanges is true.
LowerBoundInclusiveIfOnlyRanges rune
// The exclusive upper bound.
// This is only valid if OnlyRanges is true.
UpperBoundExclusiveIfOnlyRanges rune
}
type CharSet ¶
type CharSet struct {
// contains filtered or unexported fields
}
CharSet combines start-end rune ranges and unicode categories representing a set of characters
func NewCharSetRuntime ¶
func (*CharSet) Analyze ¶
func (set *CharSet) Analyze() CharClassAnalysisResults
Analyzes the set to determine some basic properties that can be used to optimize usage.
func (CharSet) CharIn ¶
CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.
func (CharSet) GetIfNRanges ¶
func (c CharSet) GetIfNRanges(n int) []SingleRange
func (*CharSet) GetIfOnlyUnicodeCategories ¶
func (*CharSet) GetSetChars ¶
Gets all of the characters in the specified set and returns them as a slice of runes.
Only considers character classes that only contain sets (no categories), just simple sets containing starting/ending pairs (subtraction from those pairs is factored in, however). The returned characters may be negated: if IsNegated(set) is false, then the returned characters are the only ones that match; if it returns true, then the returned characters are the only ones that don't match.
func (CharSet) HasSubtraction ¶
func (CharSet) IsAnything ¶
func (CharSet) IsMergeable ¶
func (CharSet) IsSingleton ¶
func (CharSet) IsSingletonInverse ¶
func (*CharSet) IsUnicodeCategoryOfSmallCharCount ¶
func (c *CharSet) IsUnicodeCategoryOfSmallCharCount() (isSmall bool, chars []rune, negated bool, desc string)
Gets whether the specified set is a named set with a reasonably small count of Unicode characters. Designed to help the regexp code generator choose a better search algorithm for finding chars. The description (desc) is a short name that can be used as part of a variable name in code generation.
func (*CharSet) MayOverlap ¶
Determines whether two sets could overlap.
func (CharSet) SingletonChar ¶
SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
type Code ¶
type Code struct {
Codes []int // the code
Strings [][]rune // string table
Sets []*CharSet //character set table
TrackCount int // how many instructions use backtracking
Caps map[int]int // mapping of user group numbers -> impl group slots
Capsize int // number of impl group slots
FcPrefix *Prefix // the set of candidate first characters (may be nil)
BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be nil)
Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
RightToLeft bool // true if right to left
FindOptimizations *FindOptimizations // analyzed candidate search strategy
}
func (*Code) OpcodeDescription ¶
OpcodeDescription is a human-readable string of the specific offset.
func (*Code) PrepareCharSetASCIIBitmaps ¶
func (c *Code) PrepareCharSetASCIIBitmaps()
PrepareCharSetASCIIBitmaps builds bounded ASCII lookup tables for compiled character classes before the regexp is shared across goroutines.
type Error ¶
An Error describes a failure to parse a regular expression and gives the offending expression.
type ErrorCode ¶
type ErrorCode string
An ErrorCode describes a failure to parse a regular expression.
type FindNextStartingPositionMode ¶
type FindNextStartingPositionMode int
const ( NoSearch FindNextStartingPositionMode = iota // A "beginning" anchor at the beginning of the pattern. LeadingAnchor_LeftToRight_Beginning // A "start" anchor at the beginning of the pattern. LeadingAnchor_LeftToRight_Start // An "endz" anchor at the beginning of the pattern. This is rare. LeadingAnchor_LeftToRight_EndZ // An "end" anchor at the beginning of the pattern. This is rare. LeadingAnchor_LeftToRight_End // A "beginning" anchor at the beginning of the right-to-left pattern. LeadingAnchor_RightToLeft_Beginning // A "start" anchor at the beginning of the right-to-left pattern. LeadingAnchor_RightToLeft_Start // An "endz" anchor at the beginning of the right-to-left pattern. This is rare. LeadingAnchor_RightToLeft_EndZ // An "end" anchor at the beginning of the right-to-left pattern. This is rare. LeadingAnchor_RightToLeft_End // An "end" anchor at the end of the pattern, with the pattern always matching a fixed-length expression. TrailingAnchor_FixedLength_LeftToRight_End // An "endz" anchor at the end of the pattern, with the pattern always matching a fixed-length expression. TrailingAnchor_FixedLength_LeftToRight_EndZ // A multi-character substring at the beginning of the pattern. LeadingString_LeftToRight // A multi-character substring at the beginning of the right-to-left pattern. LeadingString_RightToLeft // A multi-character ordinal case-insensitive substring at the beginning of the pattern. LeadingString_OrdinalIgnoreCase_LeftToRight // Multiple leading prefix strings LeadingStrings_LeftToRight // Multiple leading ordinal case-insensitive prefix strings LeadingStrings_OrdinalIgnoreCase_LeftToRight // A set starting the pattern. LeadingSet_LeftToRight // A set starting the right-to-left pattern. LeadingSet_RightToLeft // A single character at the start of the right-to-left pattern. LeadingChar_RightToLeft // A single character at a fixed distance from the start of the pattern. 
FixedDistanceChar_LeftToRight // A multi-character case-sensitive string at a fixed distance from the start of the pattern. FixedDistanceString_LeftToRight // One or more sets at a fixed distance from the start of the pattern. FixedDistanceSets_LeftToRight // A literal (single character, multi-char string, or set with small number of characters) after a non-overlapping set loop at the start of the pattern. LiteralAfterLoop_LeftToRight // A sequence of required landmarks after a leading loop. RequiredLandmarkChain_LeftToRight )
type FindOptimizations ¶
type FindOptimizations struct {
FindMode FindNextStartingPositionMode
LeadingAnchor NodeType
TrailingAnchor NodeType
MinRequiredLength int
MaxPossibleLength int
LeadingPrefix string
LeadingPrefixes []string
FixedDistanceLiteral FixedDistanceLiteral
FixedDistanceSets []FixedDistanceSet
LiteralAfterLoop *LiteralAfterLoop
LandmarkChain *RequiredLandmarkChain
// contains filtered or unexported fields
}
type FixedDistanceLiteral ¶
type FixedDistanceSet ¶
type FixedDistanceSet struct {
Set *CharSet
Chars []rune
Negated bool
Range *SingleRange
Distance int
}
type InstOp ¶
type InstOp int
const ( Onerep InstOp = 0 // lef,back char,min,max a {n} Notonerep InstOp = 1 // lef,back char,min,max .{n} Setrep InstOp = 2 // lef,back set,min,max [\d]{n} Oneloop InstOp = 3 // lef,back char,min,max a {,n} Notoneloop InstOp = 4 // lef,back char,min,max .{,n} Setloop InstOp = 5 // lef,back set,min,max [\d]{,n} Onelazy InstOp = 6 // lef,back char,min,max a {,n}? Notonelazy InstOp = 7 // lef,back char,min,max .{,n}? Setlazy InstOp = 8 // lef,back set,min,max [\d]{,n}? One InstOp = 9 // lef char a Notone InstOp = 10 // lef char [^a] Set InstOp = 11 // lef set [a-z\s] \w \s \d Multi InstOp = 12 // lef string abcd Ref InstOp = 13 // lef group \# Bol InstOp = 14 // ^ Eol InstOp = 15 // $ Boundary InstOp = 16 // \b Nonboundary InstOp = 17 // \B Beginning InstOp = 18 // \A Start InstOp = 19 // \G EndZ InstOp = 20 // \Z End InstOp = 21 // \z Nothing InstOp = 22 // Reject! Lazybranch InstOp = 23 // back jump straight first Branchmark InstOp = 24 // back jump branch first for loop Lazybranchmark InstOp = 25 // back jump straight first for loop Nullcount InstOp = 26 // back val set counter, null mark Setcount InstOp = 27 // back val set counter, make mark Branchcount InstOp = 28 // back jump,limit branch++ if zero<=c<limit Lazybranchcount InstOp = 29 // back jump,limit same, but straight first Nullmark InstOp = 30 // back save position Setmark InstOp = 31 // back save position Capturemark InstOp = 32 // back group define group Getmark InstOp = 33 // back recall position Setjump InstOp = 34 // back save backtrack state Backjump InstOp = 35 // zap back to saved state Forejump InstOp = 36 // zap backtracking state Testref InstOp = 37 // backtrack if ref undefined Goto InstOp = 38 // jump just go Prune InstOp = 39 // prune it baby Stop InstOp = 40 // done! ECMABoundary InstOp = 41 // \b NonECMABoundary InstOp = 42 // \B // Atomic loop of the specified character. // Operand 0 is the character. Operand 1 is the max iteration count. 
Oneloopatomic InstOp = 43 // Atomic loop of a single character other than the one specified. // Operand 0 is the character. Operand 1 is the max iteration count. Notoneloopatomic InstOp = 44 // Atomic loop of a single character matching the specified set // Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count. Setloopatomic InstOp = 45 // Updates the bumpalong position to the current position. UpdateBumpalong InstOp = 46 Mask InstOp = 63 // Mask to get unmodified ordinary operator Rtl InstOp = 64 // bit to indicate that we're reverse scanning. Back InstOp = 128 // bit to indicate that we're backtracking. Back2 InstOp = 256 // bit to indicate that we're backtracking on a second branch. Ci InstOp = 512 // bit to indicate that we're case-insensitive. )
type LiteralAfterLoop ¶
type NodeType ¶
type NodeType int32
const ( // The following are leaves, and correspond to primitive operations NtUnknown NodeType = -1 //NtOnerep NodeType = 0 // lef,back char,min,max a {n} //NtNotonerep NodeType = 1 // lef,back char,min,max .{n} //NtSetrep NodeType = 2 // lef,back set,min,max [\d]{n} NtOneloop NodeType = 3 // lef,back char,min,max a {,n} NtNotoneloop NodeType = 4 // lef,back char,min,max .{,n} NtSetloop NodeType = 5 // lef,back set,min,max [\d]{,n} NtOnelazy NodeType = 6 // lef,back char,min,max a {,n}? NtNotonelazy NodeType = 7 // lef,back char,min,max .{,n}? NtSetlazy NodeType = 8 // lef,back set,min,max [\d]{,n}? NtOne NodeType = 9 // lef char a NtNotone NodeType = 10 // lef char [^a] NtSet NodeType = 11 // lef set [a-z\s] \w \s \d NtMulti NodeType = 12 // lef string abcd NtRef NodeType = 13 // lef group \# NtBol NodeType = 14 // ^ NtEol NodeType = 15 // $ NtBoundary NodeType = 16 // \b NtNonboundary NodeType = 17 // \B NtBeginning NodeType = 18 // \A NtStart NodeType = 19 // \G NtEndZ NodeType = 20 // \Z NtEnd NodeType = 21 // \z NtNothing NodeType = 22 // [] NtEmpty NodeType = 23 // () NtAlternate NodeType = 24 // a|b NtConcatenate NodeType = 25 // ab NtLoop NodeType = 26 // m,x * + ? {,} NtLazyloop NodeType = 27 // m,x *? +? ?? {,}? NtCapture NodeType = 28 // n () NtGroup NodeType = 29 // (?:) NtPosLook NodeType = 30 // (?=) (?<=) NtNegLook NodeType = 31 // (?!) (?<!) NtAtomic NodeType = 32 // (?>) (?<) NtBackRefCond NodeType = 33 // (?(n) | ) NtExprCond NodeType = 34 // (?(...) | ) NtECMABoundary NodeType = 41 // \b NtNonECMABoundary NodeType = 42 // \B // Atomic loop of the specified character. // Operand 0 is the character. Operand 1 is the max iteration count. NtOneloopatomic NodeType = 43 // Atomic loop of a single character other than the one specified. // Operand 0 is the character. Operand 1 is the max iteration count. 
NtNotoneloopatomic NodeType = 44 // Atomic loop of a single character matching the specified set // Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count. NtSetloopatomic NodeType = 45 // Updates the bumpalong position to the current position. NtUpdateBumpalong NodeType = 46 )
type ParseOptions ¶
type ParseOptions struct {
RegexOptions RegexOptions
MaintainCaptureOrder bool
CodeGen bool
}
type RegexNode ¶
type RegexNode struct {
T NodeType
Children []*RegexNode
Str []rune
Set *CharSet
Ch rune
M int
N int
Options RegexOptions
Parent *RegexNode
}
Implementation notes:
Since the node tree is a temporary data structure only used during compilation of the regexp to integer codes, it's designed for clarity and convenience rather than space efficiency.
RegexNodes are built into a tree, linked by the n.children list. Each node also has a n.parent and n.ichild member indicating its parent and which child # it is in its parent's list.
RegexNodes come in as many types as there are constructs in a regular expression, for example, "concatenate", "alternate", "one", "rept", "group". There are also node types for basic peephole optimizations, e.g., "onerep", "notsetrep", etc.
Because perl 5 allows "lookback" groups that scan backwards, each node also gets a "direction". Normally the value of boolean n.backward = false.
On the parse stack, each tree has a "role" - basically, the nonterminal in the grammar that the parser has currently assigned to the tree. That code is stored in n.role.
Finally, some of the different kinds of nodes have data. Two integers (for the looping constructs) are stored in n.operands, and an object (either a string or a set) is stored in n.data.
func (*RegexNode) ComputeMinLength ¶
Computes a min bound on the required length of any string that could possibly match. If the result is 0, there is no minimum we can enforce.
func (*RegexNode) Description ¶
func (*RegexNode) FindLastExpressionInLoopForAutoAtomic ¶
Recurs into the last expression of a loop node, looking to see if it can find a node that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. Returns the found node that should be explored further for auto-atomicity, or nil if it doesn't exist.
func (*RegexNode) FindStartingLiteral ¶
func (n *RegexNode) FindStartingLiteral() *StartingLiteral
func (*RegexNode) FindStartingLiteralNode ¶
Finds the guaranteed beginning literal(s) of the node, or nil if none exists. allowZeroWidth = true
func (*RegexNode) FirstCharOfOneOrMulti ¶
Gets the character that begins a One or Multi.
func (*RegexNode) IsAtomicloopFamily ¶
func (*RegexNode) IsNotoneFamily ¶
func (*RegexNode) IsNotoneloopFamily ¶
func (*RegexNode) IsOneFamily ¶
func (*RegexNode) IsOneloopFamily ¶
func (*RegexNode) IsSetFamily ¶
func (*RegexNode) IsSetloopFamily ¶
func (*RegexNode) ReplaceChild ¶
func (*RegexNode) TryGetJoinableLengthCheckChildRange ¶
func (n *RegexNode) TryGetJoinableLengthCheckChildRange(childIndex int, requiredLength *int, exclusiveEnd *int) bool
Determines whether the specified child node is the beginning of a sequence that can trivially have length checks combined in order to avoid bounds checks. requiredLength is the sum of all the fixed lengths for the nodes in the sequence. exclusiveEnd is the index of the node just after the last one in the sequence. Returns true if more than one node can have their length checks combined; otherwise, false.
There are additional node types for which we can prove a fixed length, e.g. examining all branches of an alternation and returning true if all their lengths are equal. However, the primary purpose of this method is to avoid bounds checks by consolidating length checks that guard accesses to strings/spans for which the JIT can see a fixed index within bounds, and alternations employ patterns that defeat that (e.g. reassigning the span in question). As such, the implementation remains focused on only a core subset of nodes that are a) likely to be used in concatenations and b) employ simple patterns of checks.
func (*RegexNode) TryGetOrdinalCaseInsensitiveString ¶
func (n *RegexNode) TryGetOrdinalCaseInsensitiveString(childIndex int, exclusiveChildBound int, consumeZeroWidthNodes bool) (success bool, nodesConsumed int, caseInsensitiveString string)
Determines whether the specified child index of a concatenation begins a sequence whose values should be used to perform an ordinal case-insensitive comparison.
When consumeZeroWidthNodes is false, the consumer needs the semantics of matching the produced string to fully represent the semantics of all the consumed nodes, which means nodes can be consumed iff they produce text that's represented by the resulting string. When true, the resulting string needs to fully represent all valid matches at that position, but it can have false positives, which means the resulting string doesn't need to fully represent all zero-width nodes consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of actual matching logic. consumeZeroWidthNodes = false
type RegexOptions ¶
type RegexOptions int32
const ( IgnoreCase RegexOptions = 0x0001 // "i" Multiline RegexOptions = 0x0002 // "m" ExplicitCapture RegexOptions = 0x0004 // "n" Singleline RegexOptions = 0x0010 // "s" IgnorePatternWhitespace RegexOptions = 0x0020 // "x" RightToLeft RegexOptions = 0x0040 // "r" ECMAScript RegexOptions = 0x0100 // "e" RE2 RegexOptions = 0x0200 // RE2 compat mode Unicode RegexOptions = 0x0400 // "u" )
type RegexTree ¶
type RegexTree struct {
Root *RegexNode
Caps map[int]int
Capnumlist []int
Captop int
Capnames map[string]int
Caplist []string
Options RegexOptions
FindOptimizations *FindOptimizations
}
type ReplacerData ¶
func NewReplacerData ¶
func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)
NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp
type RequiredLandmark ¶ added in v2.1.0
type RequiredLandmark struct {
Alternatives []RequiredLandmarkAlternative
}
type RequiredLandmarkAlternative ¶ added in v2.1.0
type RequiredLandmarkChain ¶ added in v2.1.0
type RequiredLandmarkChain struct {
LeadingLoopSet *CharSet
Landmarks []RequiredLandmark
}
type SingleRange ¶
type StartingLiteral ¶
type StartingLiteral struct {
Range SingleRange
String []rune
SetChars []rune
Negated bool
}