syntax

package

v2.2.2 Latest Latest Go to latest Published: Jun 15, 2026 License: MIT Imports: 13 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/dlclark/regexp2

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func CharDescription(ch rune) string
func Escape(input string) string
func IsECMAIdentifierChar(r rune) bool
func IsECMAIdentifierStartChar(r rune) bool
func IsECMAWordChar(r rune) bool
func IsWordChar(r rune) bool
func Unescape(input string) (string, error)
type AnchorLoc
- func (anchors AnchorLoc) String() string
type BmPrefix
- func (b *BmPrefix) Dump(indent string) string
- func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool
- func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int
- func (b *BmPrefix) String() string
type Category
- func (c Category) String() string
type CharClassAnalysisResults
type CharSet
- func NewCharSetRuntime(buf string) CharSet
- func (set *CharSet) Analyze() CharClassAnalysisResults
- func (c CharSet) CharIn(ch rune) bool
- func (c CharSet) Copy() CharSet
- func (c *CharSet) Equals(c2 *CharSet) bool
- func (c CharSet) GetIfNRanges(n int) []SingleRange
- func (c *CharSet) GetIfOnlyUnicodeCategories() (cats []Category, negate bool)
- func (c *CharSet) GetSetChars(maxChars int) []rune
- func (c CharSet) HasSubtraction() bool
- func (c *CharSet) Hash() []byte
- func (c CharSet) IsAnything() bool
- func (c CharSet) IsEmpty() bool
- func (c CharSet) IsMergeable() bool
- func (c CharSet) IsNegated() bool
- func (c CharSet) IsSingleton() bool
- func (c CharSet) IsSingletonInverse() bool
- func (c *CharSet) IsUnicodeCategoryOfSmallCharCount() (isSmall bool, chars []rune, negated bool, desc string)
- func (set1 *CharSet) MayOverlap(set2 *CharSet) bool
- func (c CharSet) SingletonChar() rune
- func (c CharSet) String() string
type Code
- func Write(tree *RegexTree) (*Code, error)
- func (c *Code) Dump() string
- func (c *Code) OpcodeDescription(offset int) string
- func (c *Code) PrepareCharSetASCIIBitmaps()
type Error
- func (e *Error) Error() string
type ErrorCode
- func (e ErrorCode) String() string
type FindNextStartingPositionMode
- func (m FindNextStartingPositionMode) String() string
type FindOptimizations
- func (f *FindOptimizations) Dump() string
type FixedDistanceLiteral
type FixedDistanceSet
type InstOp
type LiteralAfterLoop
type NodeType
type ParseOptions
type Prefix
type RegexNode
- func (n *RegexNode) ComputeMinLength() int
- func (n *RegexNode) Description() string
- func (n *RegexNode) FindLastExpressionInLoopForAutoAtomic() *RegexNode
- func (n *RegexNode) FindStartingLiteral() *StartingLiteral
- func (n *RegexNode) FindStartingLiteralNode(allowZeroWidth bool) *RegexNode
- func (n *RegexNode) FirstCharOfOneOrMulti() rune
- func (n *RegexNode) IsAtomicloopFamily() bool
- func (n *RegexNode) IsNotoneFamily() bool
- func (n *RegexNode) IsNotoneloopFamily() bool
- func (n *RegexNode) IsOneFamily() bool
- func (n *RegexNode) IsOneloopFamily() bool
- func (n *RegexNode) IsSetFamily() bool
- func (n *RegexNode) IsSetloopFamily() bool
- func (n *RegexNode) ReplaceChild(index int, newChild *RegexNode)
- func (n *RegexNode) TryGetJoinableLengthCheckChildRange(childIndex int, requiredLength *int, exclusiveEnd *int) bool
- func (n *RegexNode) TryGetOrdinalCaseInsensitiveString(childIndex int, exclusiveChildBound int, consumeZeroWidthNodes bool) (success bool, nodesConsumed int, caseInsensitiveString string)
type RegexOptions
type RegexTree
- func Parse(re string, op ParseOptions) (*RegexTree, error)
- func (t *RegexTree) Dump() string
type ReplacerData
- func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, ...) (*ReplacerData, error)
type RequiredLandmark
type RequiredLandmarkAlternative
type RequiredLandmarkChain
type SingleRange
type StartingLiteral

Constants ¶

View Source

const (
	SpaceCategoryText = " "
	WordCategoryText  = "W"
)

View Source

const (
	LowercaseSet = 0 // Set to arg.
	LowercaseAdd = 1 // Add arg.
	LowercaseBor = 2 // Bitwise or with 1.
	LowercaseBad = 3 // Bitwise and with 1 and add original.
)

View Source

const (
	// internal issue
	ErrInternalError ErrorCode = "regexp/syntax: internal error"
	// Parser errors
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrInvalidECMAGroupName       = "invalid capture group name"
	ErrDuplicateGroupName         = "duplicate capture group name"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrShorthandClassInCharRange  = "cannot create range with shorthand escape sequence \\%v"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[%c-%c] range in reverse order"
)

View Source

const (
	Q byte = 5 // quantifier
	S byte = 4 // ordinary stopper
	Z byte = 3 // ScanBlank stopper
	X byte = 2 // whitespace
	E byte = 1 // should be escaped
)

View Source

const MultiVsRepeaterLimit = 64

Arbitrary number of repetitions of the same character when we'd prefer to represent that as a repeater of that character rather than a string.

Variables ¶

View Source

var (
	AnyClass          = getCharSetFromOldString([]rune{0}, false)
	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
	NoneClass         = getCharSetFromOldString(nil, false)
	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

	WordClass     = getCharSetFromCategoryString(false, false, WordCategoryText)
	NotWordClass  = getCharSetFromCategoryString(true, false, WordCategoryText)
	SpaceClass    = getCharSetFromCategoryString(false, false, SpaceCategoryText)
	NotSpaceClass = getCharSetFromCategoryString(true, false, SpaceCategoryText)
	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")

	RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
	NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)

	NotNewLineClass = getCharSetFromOldString([]rune{0x0a, 0x0b}, true)
)

View Source

var ErrReplacementError = errors.New("replacement pattern error")

ErrReplacementError is a general error that occurs while parsing the replacement text.

Functions ¶

func CharDescription ¶

func CharDescription(ch rune) string

CharDescription produces a human-readable description of a single character.

func Escape ¶

func Escape(input string) string

func IsECMAIdentifierChar ¶

func IsECMAIdentifierChar(r rune) bool

func IsECMAIdentifierStartChar ¶

func IsECMAIdentifierStartChar(r rune) bool

func IsECMAWordChar ¶

func IsECMAWordChar(r rune) bool

func IsWordChar ¶

func IsWordChar(r rune) bool

According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries: the class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

func Unescape ¶

func Unescape(input string) (string, error)

Types ¶

type AnchorLoc ¶

type AnchorLoc int16

const (
	AnchorBeginning    AnchorLoc = 0x0001
	AnchorBol          AnchorLoc = 0x0002
	AnchorStart        AnchorLoc = 0x0004
	AnchorEol          AnchorLoc = 0x0008
	AnchorEndZ         AnchorLoc = 0x0010
	AnchorEnd          AnchorLoc = 0x0020
	AnchorBoundary     AnchorLoc = 0x0040
	AnchorECMABoundary AnchorLoc = 0x0080
)

AnchorLoc describes where the regex can be pegged (anchored).

func (AnchorLoc) String ¶

func (anchors AnchorLoc) String() string

String returns a human-readable description of the anchors.

type BmPrefix ¶

type BmPrefix struct {
	// contains filtered or unexported fields
}

BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.

func (*BmPrefix) Dump ¶

func (b *BmPrefix) Dump(indent string) string

Dump returns the contents of the filter as a human-readable string.

func (*BmPrefix) IsMatch ¶

func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool

When a regex is anchored, we can do a quick IsMatch test instead of a Scan.

func (*BmPrefix) Scan ¶

func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int

Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.

The direction and case-sensitivity of the match are determined by the arguments that were passed when the BmPrefix was constructed.

func (*BmPrefix) String ¶

func (b *BmPrefix) String() string

type Category ¶

type Category struct {
	Negate bool
	Cat    string
}

func (Category) String ¶

func (c Category) String() string

type CharClassAnalysisResults ¶

type CharClassAnalysisResults struct {
	// true if the set contains only ranges; false if it contains Unicode categories and/or subtraction.
	OnlyRanges bool
	// true if we know for sure that the set contains only ASCII values; otherwise, false.
	// This can only be true if OnlyRanges is true.
	ContainsOnlyAscii bool
	// true if we know for sure that the set doesn't contain any ASCII values; otherwise, false.
	// This can only be true if OnlyRanges is true.
	ContainsNoAscii bool
	// true if we know for sure that all ASCII values are in the set; otherwise, false.
	// This can only be true if OnlyRanges is true.
	AllAsciiContained bool
	// true if we know for sure that all non-ASCII values are in the set; otherwise, false.
	// This can only be true if OnlyRanges is true.
	AllNonAsciiContained bool
	// The inclusive lower bound.
	// This is only valid if OnlyRanges is true.
	LowerBoundInclusiveIfOnlyRanges rune
	// The exclusive upper bound.
	// This is only valid if OnlyRanges is true.
	UpperBoundExclusiveIfOnlyRanges rune
}

type CharSet ¶

type CharSet struct {
	// contains filtered or unexported fields
}

CharSet combines start-end rune ranges and unicode categories representing a set of characters

func NewCharSetRuntime ¶

func NewCharSetRuntime(buf string) CharSet

func (*CharSet) Analyze ¶

func (set *CharSet) Analyze() CharClassAnalysisResults

Analyze analyzes the set to determine some basic properties that can be used to optimize usage.

func (CharSet) CharIn ¶

func (c CharSet) CharIn(ch rune) bool

CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.

func (CharSet) Copy ¶

func (c CharSet) Copy() CharSet

Copy makes a deep copy to prevent accidental mutation of a set

func (*CharSet) Equals ¶

func (c *CharSet) Equals(c2 *CharSet) bool

func (CharSet) GetIfNRanges ¶

func (c CharSet) GetIfNRanges(n int) []SingleRange

func (*CharSet) GetIfOnlyUnicodeCategories ¶

func (c *CharSet) GetIfOnlyUnicodeCategories() (cats []Category, negate bool)

func (*CharSet) GetSetChars ¶

func (c *CharSet) GetSetChars(maxChars int) []rune

GetSetChars gets all of the characters in the set and returns them as a slice of runes.

Only considers character classes that only contain sets (no categories), just simple sets containing starting/ending pairs (subtraction from those pairs is factored in, however). The returned characters may be negated: if c.IsNegated() is false, then the returned characters are the only ones that match; if it returns true, then the returned characters are the only ones that don't match.

func (CharSet) HasSubtraction ¶

func (c CharSet) HasSubtraction() bool

func (*CharSet) Hash ¶

func (c *CharSet) Hash() []byte

func (CharSet) IsAnything ¶

func (c CharSet) IsAnything() bool

func (CharSet) IsEmpty ¶

func (c CharSet) IsEmpty() bool

func (CharSet) IsMergeable ¶

func (c CharSet) IsMergeable() bool

func (CharSet) IsNegated ¶

func (c CharSet) IsNegated() bool

func (CharSet) IsSingleton ¶

func (c CharSet) IsSingleton() bool

func (CharSet) IsSingletonInverse ¶

func (c CharSet) IsSingletonInverse() bool

func (*CharSet) IsUnicodeCategoryOfSmallCharCount ¶

func (c *CharSet) IsUnicodeCategoryOfSmallCharCount() (isSmall bool, chars []rune, negated bool, desc string)

IsUnicodeCategoryOfSmallCharCount gets whether the set is a named set with a reasonably small count of Unicode characters. It is designed to help the regexp code generator choose a better search algorithm for finding chars. The returned desc is a short name that can be used as part of a variable name in code generation.

func (*CharSet) MayOverlap ¶

func (set1 *CharSet) MayOverlap(set2 *CharSet) bool

Determines whether two sets could overlap.

func (CharSet) SingletonChar ¶

func (c CharSet) SingletonChar() rune

SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input

func (CharSet) String ¶

func (c CharSet) String() string

String gets a human-readable description of the set.

type Code ¶

type Code struct {
	Codes             []int              // the code
	Strings           [][]rune           // string table
	Sets              []*CharSet         //character set table
	TrackCount        int                // how many instructions use backtracking
	Caps              map[int]int        // mapping of user group numbers -> impl group slots
	Capsize           int                // number of impl group slots
	FcPrefix          *Prefix            // the set of candidate first characters (may be null)
	BmPrefix          *BmPrefix          // the fixed prefix string as a Boyer-Moore machine (may be null)
	Anchors           AnchorLoc          // the set of zero-length start anchors (RegexFCD.Bol, etc)
	RightToLeft       bool               // true if right to left
	FindOptimizations *FindOptimizations // analyzed candidate search strategy
}

func Write ¶

func Write(tree *RegexTree) (*Code, error)

func (*Code) Dump ¶

func (c *Code) Dump() string

func (*Code) OpcodeDescription ¶

func (c *Code) OpcodeDescription(offset int) string

OpcodeDescription returns a human-readable description of the opcode at the specified offset.

func (*Code) PrepareCharSetASCIIBitmaps ¶

func (c *Code) PrepareCharSetASCIIBitmaps()

PrepareCharSetASCIIBitmaps builds bounded ASCII lookup tables for compiled character classes before the regexp is shared across goroutines.

type Error ¶

type Error struct {
	Code ErrorCode
	Expr string
	Args []interface{}
}

An Error describes a failure to parse a regular expression and gives the offending expression.

func (*Error) Error ¶

func (e *Error) Error() string

type ErrorCode ¶

type ErrorCode string

An ErrorCode describes a failure to parse a regular expression.

func (ErrorCode) String ¶

func (e ErrorCode) String() string

type FindNextStartingPositionMode ¶

type FindNextStartingPositionMode int

const (
	NoSearch FindNextStartingPositionMode = iota
	// A "beginning" anchor at the beginning of the pattern.
	LeadingAnchor_LeftToRight_Beginning
	// A "start" anchor at the beginning of the pattern.
	LeadingAnchor_LeftToRight_Start
	// An "endz" anchor at the beginning of the pattern.  This is rare.
	LeadingAnchor_LeftToRight_EndZ
	// An "end" anchor at the beginning of the pattern.  This is rare.
	LeadingAnchor_LeftToRight_End
	// A "beginning" anchor at the beginning of the right-to-left pattern.
	LeadingAnchor_RightToLeft_Beginning
	// A "start" anchor at the beginning of the right-to-left pattern.
	LeadingAnchor_RightToLeft_Start
	// An "endz" anchor at the beginning of the right-to-left pattern.  This is rare.
	LeadingAnchor_RightToLeft_EndZ
	// An "end" anchor at the beginning of the right-to-left pattern.  This is rare.
	LeadingAnchor_RightToLeft_End
	// An "end" anchor at the end of the pattern, with the pattern always matching a fixed-length expression.
	TrailingAnchor_FixedLength_LeftToRight_End
	// An "endz" anchor at the end of the pattern, with the pattern always matching a fixed-length expression.
	TrailingAnchor_FixedLength_LeftToRight_EndZ
	// A multi-character substring at the beginning of the pattern.
	LeadingString_LeftToRight
	// A multi-character substring at the beginning of the right-to-left pattern.
	LeadingString_RightToLeft
	// A multi-character ordinal case-insensitive substring at the beginning of the pattern.
	LeadingString_OrdinalIgnoreCase_LeftToRight
	// Multiple leading prefix strings
	LeadingStrings_LeftToRight
	// Multiple leading ordinal case-insensitive prefix strings
	LeadingStrings_OrdinalIgnoreCase_LeftToRight

	// A set starting the pattern.
	LeadingSet_LeftToRight
	// A set starting the right-to-left pattern.
	LeadingSet_RightToLeft

	// A single character at the start of the right-to-left pattern.
	LeadingChar_RightToLeft

	// A single character at a fixed distance from the start of the pattern.
	FixedDistanceChar_LeftToRight
	// A multi-character case-sensitive string at a fixed distance from the start of the pattern.
	FixedDistanceString_LeftToRight

	// One or more sets at a fixed distance from the start of the pattern.
	FixedDistanceSets_LeftToRight

	// A literal (single character, multi-char string, or set with small number of characters) after a non-overlapping set loop at the start of the pattern.
	LiteralAfterLoop_LeftToRight

	// A sequence of required landmarks after a leading loop.
	RequiredLandmarkChain_LeftToRight
)

func (FindNextStartingPositionMode) String ¶ added in v2.1.2

func (m FindNextStartingPositionMode) String() string

type FindOptimizations ¶

type FindOptimizations struct {
	FindMode             FindNextStartingPositionMode
	LeadingAnchor        NodeType
	TrailingAnchor       NodeType
	MinRequiredLength    int
	MaxPossibleLength    int
	LeadingPrefix        string
	LeadingPrefixes      []string
	LeadingPrefixesRunes [][]rune

	FixedDistanceLiteral FixedDistanceLiteral
	FixedDistanceSets    []FixedDistanceSet
	LiteralAfterLoop     *LiteralAfterLoop
	LandmarkChain        *RequiredLandmarkChain
	// contains filtered or unexported fields
}

func (*FindOptimizations) Dump ¶ added in v2.1.2

func (f *FindOptimizations) Dump() string

type FixedDistanceLiteral ¶

type FixedDistanceLiteral struct {
	S        string
	C        rune
	Distance int
}

type FixedDistanceSet ¶

type FixedDistanceSet struct {
	Set      *CharSet
	Chars    []rune
	Negated  bool
	Range    *SingleRange
	Distance int
}

type InstOp ¶

type InstOp int

const (
	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
	Notonerep InstOp = 1 // lef,back char,min,max    .{n}
	Setrep    InstOp = 2 // lef,back set,min,max     [\d]{n}

	Oneloop    InstOp = 3 // lef,back char,min,max    a {,n}
	Notoneloop InstOp = 4 // lef,back char,min,max    .{,n}
	Setloop    InstOp = 5 // lef,back set,min,max     [\d]{,n}

	Onelazy    InstOp = 6 // lef,back char,min,max    a {,n}?
	Notonelazy InstOp = 7 // lef,back char,min,max    .{,n}?
	Setlazy    InstOp = 8 // lef,back set,min,max     [\d]{,n}?

	One    InstOp = 9  // lef      char            a
	Notone InstOp = 10 // lef      char            [^a]
	Set    InstOp = 11 // lef      set             [a-z\s]  \w \s \d

	Multi InstOp = 12 // lef      string          abcd
	Ref   InstOp = 13 // lef      group           \#

	Bol         InstOp = 14 //                          ^
	Eol         InstOp = 15 //                          $
	Boundary    InstOp = 16 //                          \b
	Nonboundary InstOp = 17 //                          \B
	Beginning   InstOp = 18 //                          \A
	Start       InstOp = 19 //                          \G
	EndZ        InstOp = 20 //                          \Z
	End         InstOp = 21 //                          \z

	Nothing InstOp = 22 //                          Reject!

	Lazybranch      InstOp = 23 // back     jump            straight first
	Branchmark      InstOp = 24 // back     jump            branch first for loop
	Lazybranchmark  InstOp = 25 // back     jump            straight first for loop
	Nullcount       InstOp = 26 // back     val             set counter, null mark
	Setcount        InstOp = 27 // back     val             set counter, make mark
	Branchcount     InstOp = 28 // back     jump,limit      branch++ if zero<=c<limit
	Lazybranchcount InstOp = 29 // back     jump,limit      same, but straight first
	Nullmark        InstOp = 30 // back                     save position
	Setmark         InstOp = 31 // back                     save position
	Capturemark     InstOp = 32 // back     group           define group
	Getmark         InstOp = 33 // back                     recall position
	Setjump         InstOp = 34 // back                     save backtrack state
	Backjump        InstOp = 35 //                          zap back to saved state
	Forejump        InstOp = 36 //                          zap backtracking state
	Testref         InstOp = 37 //                          backtrack if ref undefined
	Goto            InstOp = 38 //          jump            just go

	Prune InstOp = 39 //                          prune it baby
	Stop  InstOp = 40 //                          done!

	ECMABoundary    InstOp = 41 //                          \b
	NonECMABoundary InstOp = 42 //                          \B

	// Atomic loop of the specified character.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	Oneloopatomic InstOp = 43
	// Atomic loop of a single character other than the one specified.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	Notoneloopatomic InstOp = 44
	// Atomic loop of a single character matching the specified set
	// Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count.
	Setloopatomic InstOp = 45
	// Updates the bumpalong position to the current position.
	UpdateBumpalong InstOp = 46

	Mask  InstOp = 63  // Mask to get unmodified ordinary operator
	Rtl   InstOp = 64  // bit to indicate that we're reverse scanning.
	Back  InstOp = 128 // bit to indicate that we're backtracking.
	Back2 InstOp = 256 // bit to indicate that we're backtracking on a second branch.
	Ci    InstOp = 512 // bit to indicate that we're case-insensitive.
)

type LiteralAfterLoop ¶

type LiteralAfterLoop struct {
	String           string
	StringIgnoreCase bool
	Char             rune
	Chars            []rune

	LoopNode *RegexNode
}

type NodeType ¶

type NodeType int32

const (
	// The following are leaves, and correspond to primitive operations
	NtUnknown NodeType = -1
	//NtOnerep      NodeType = 0  // lef,back char,min,max    a {n}
	//NtNotonerep   NodeType = 1  // lef,back char,min,max    .{n}
	//NtSetrep      NodeType = 2  // lef,back set,min,max     [\d]{n}
	NtOneloop     NodeType = 3  // lef,back char,min,max    a {,n}
	NtNotoneloop  NodeType = 4  // lef,back char,min,max    .{,n}
	NtSetloop     NodeType = 5  // lef,back set,min,max     [\d]{,n}
	NtOnelazy     NodeType = 6  // lef,back char,min,max    a {,n}?
	NtNotonelazy  NodeType = 7  // lef,back char,min,max    .{,n}?
	NtSetlazy     NodeType = 8  // lef,back set,min,max     [\d]{,n}?
	NtOne         NodeType = 9  // lef      char            a
	NtNotone      NodeType = 10 // lef      char            [^a]
	NtSet         NodeType = 11 // lef      set             [a-z\s]  \w \s \d
	NtMulti       NodeType = 12 // lef      string          abcd
	NtRef         NodeType = 13 // lef      group           \#
	NtBol         NodeType = 14 //                          ^
	NtEol         NodeType = 15 //                          $
	NtBoundary    NodeType = 16 //                          \b
	NtNonboundary NodeType = 17 //                          \B
	NtBeginning   NodeType = 18 //                          \A
	NtStart       NodeType = 19 //                          \G
	NtEndZ        NodeType = 20 //                          \Z
	NtEnd         NodeType = 21 //                          \z

	NtNothing     NodeType = 22 //          []
	NtEmpty       NodeType = 23 //          ()
	NtAlternate   NodeType = 24 //          a|b
	NtConcatenate NodeType = 25 //          ab
	NtLoop        NodeType = 26 // m,x      * + ? {,}
	NtLazyloop    NodeType = 27 // m,x      *? +? ?? {,}?
	NtCapture     NodeType = 28 // n        ()
	NtGroup       NodeType = 29 //          (?:)
	NtPosLook     NodeType = 30 //          (?=) (?<=)
	NtNegLook     NodeType = 31 //          (?!) (?<!)
	NtAtomic      NodeType = 32 //          (?>) (?<)
	NtBackRefCond NodeType = 33 //          (?(n) | )
	NtExprCond    NodeType = 34 //          (?(...) | )

	NtECMABoundary    NodeType = 41 //                          \b
	NtNonECMABoundary NodeType = 42 //                          \B

	// Atomic loop of the specified character.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	NtOneloopatomic NodeType = 43
	// Atomic loop of a single character other than the one specified.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	NtNotoneloopatomic NodeType = 44
	// Atomic loop of a single character matching the specified set
	// Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count.
	NtSetloopatomic NodeType = 45
	// Updates the bumpalong position to the current position.
	NtUpdateBumpalong NodeType = 46
)

const (
	BeforeChild NodeType = 64
	AfterChild  NodeType = 128
	// MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix
	MaxPrefixSize = 50
)

type ParseOptions ¶

type ParseOptions struct {
	RegexOptions         RegexOptions
	MaintainCaptureOrder bool
	CodeGen              bool
}

type Prefix ¶

type Prefix struct {
	PrefixStr       []rune
	PrefixSet       CharSet
	CaseInsensitive bool
}

type RegexNode ¶

type RegexNode struct {
	T        NodeType
	Children []*RegexNode
	Str      []rune
	Set      *CharSet
	Ch       rune
	M        int
	N        int
	Options  RegexOptions
	Parent   *RegexNode
}

Implementation notes:

Since the node tree is a temporary data structure only used during compilation of the regexp to integer codes, it's designed for clarity and convenience rather than space efficiency.

RegexNodes are built into a tree, linked by the n.children list. Each node also has a n.parent and n.ichild member indicating its parent and which child # it is in its parent's list.

RegexNodes come in as many types as there are constructs in a regular expression, for example, "concatenate", "alternate", "one", "rept", "group". There are also node types for basic peephole optimizations, e.g., "onerep", "notsetrep", etc.

Because perl 5 allows "lookback" groups that scan backwards, each node also gets a "direction". Normally the value of boolean n.backward = false.

On the parse stack, each tree has a "role" - basically, the nonterminal in the grammar that the parser has currently assigned to the tree. That code is stored in n.role.

Finally, some of the different kinds of nodes have data. Two integers (for the looping constructs) are stored in n.operands, and an object (either a string or a set) is stored in n.data.

func (*RegexNode) ComputeMinLength ¶

func (n *RegexNode) ComputeMinLength() int

Computes a min bound on the required length of any string that could possibly match. If the result is 0, there is no minimum we can enforce.

func (*RegexNode) Description ¶

func (n *RegexNode) Description() string

func (*RegexNode) FindLastExpressionInLoopForAutoAtomic ¶

func (n *RegexNode) FindLastExpressionInLoopForAutoAtomic() *RegexNode

Recurses into the last expression of a loop node, looking to see if it can find a node that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. Returns the found node that should be explored further for auto-atomicity, or nil if it doesn't exist.

func (*RegexNode) FindStartingLiteral ¶

func (n *RegexNode) FindStartingLiteral() *StartingLiteral

func (*RegexNode) FindStartingLiteralNode ¶

func (n *RegexNode) FindStartingLiteralNode(allowZeroWidth bool) *RegexNode

Finds the guaranteed beginning literal(s) of the node, or nil if none exists. allowZeroWidth = true

func (*RegexNode) FirstCharOfOneOrMulti ¶

func (n *RegexNode) FirstCharOfOneOrMulti() rune

Gets the character that begins a One or Multi.

func (*RegexNode) IsAtomicloopFamily ¶

func (n *RegexNode) IsAtomicloopFamily() bool

func (*RegexNode) IsNotoneFamily ¶

func (n *RegexNode) IsNotoneFamily() bool

func (*RegexNode) IsNotoneloopFamily ¶

func (n *RegexNode) IsNotoneloopFamily() bool

func (*RegexNode) IsOneFamily ¶

func (n *RegexNode) IsOneFamily() bool

func (*RegexNode) IsOneloopFamily ¶

func (n *RegexNode) IsOneloopFamily() bool

func (*RegexNode) IsSetFamily ¶

func (n *RegexNode) IsSetFamily() bool

func (*RegexNode) IsSetloopFamily ¶

func (n *RegexNode) IsSetloopFamily() bool

func (*RegexNode) ReplaceChild ¶

func (n *RegexNode) ReplaceChild(index int, newChild *RegexNode)

func (*RegexNode) TryGetJoinableLengthCheckChildRange ¶

func (n *RegexNode) TryGetJoinableLengthCheckChildRange(childIndex int, requiredLength *int, exclusiveEnd *int) bool

Determines whether the specified child node is the beginning of a sequence that can trivially have length checks combined in order to avoid bounds checks. requiredLength is the sum of all the fixed lengths for the nodes in the sequence. exclusiveEnd is the index of the node just after the last one in the sequence. Returns true if more than one node can have their length checks combined; otherwise, false.

There are additional node types for which we can prove a fixed length, e.g. examining all branches of an alternation and returning true if all their lengths are equal. However, the primary purpose of this method is to avoid bounds checks by consolidating length checks that guard accesses to strings/spans for which the JIT can see a fixed index within bounds, and alternations employ patterns that defeat that (e.g. reassigning the span in question). As such, the implementation remains focused on only a core subset of nodes that are a) likely to be used in concatenations and b) employ simple patterns of checks.

func (*RegexNode) TryGetOrdinalCaseInsensitiveString ¶

func (n *RegexNode) TryGetOrdinalCaseInsensitiveString(childIndex int, exclusiveChildBound int, consumeZeroWidthNodes bool) (success bool, nodesConsumed int, caseInsensitiveString string)

Determines whether the specified child index of a concatenation begins a sequence whose values should be used to perform an ordinal case-insensitive comparison.

When consumeZeroWidthNodes is false, the consumer needs the semantics of matching the produced string to fully represent the semantics of all the consumed nodes, which means nodes can be consumed iff they produce text that's represented by the resulting string. When it is true, the resulting string needs to fully represent all valid matches at that position, but it can have false positives, which means the resulting string doesn't need to fully represent all zero-width nodes consumed. Passing true is only valid when used as part of a search to determine where to try a full match, not as part of actual matching logic. (Default noted in the original documentation: consumeZeroWidthNodes = false.)

type RegexOptions ¶

type RegexOptions int32

const (
	IgnoreCase              RegexOptions = 0x0001 // "i"
	Multiline               RegexOptions = 0x0002 // "m"
	ExplicitCapture         RegexOptions = 0x0004 // "n"
	Singleline              RegexOptions = 0x0010 // "s"
	IgnorePatternWhitespace RegexOptions = 0x0020 // "x"
	RightToLeft             RegexOptions = 0x0040 // "r"
	ECMAScript              RegexOptions = 0x0100 // "e"
	RE2                     RegexOptions = 0x0200 // RE2 compat mode
	Unicode                 RegexOptions = 0x0400 // "u"
)

type RegexTree ¶

type RegexTree struct {
	Root              *RegexNode
	Caps              map[int]int
	Capnumlist        []int
	Captop            int
	Capnames          map[string]int
	Caplist           []string
	Options           RegexOptions
	FindOptimizations *FindOptimizations
}

func Parse ¶

func Parse(re string, op ParseOptions) (*RegexTree, error)

Parse converts a regex string into a parse tree

func (*RegexTree) Dump ¶

func (t *RegexTree) Dump() string

type ReplacerData ¶

type ReplacerData struct {
	Rep     string
	Strings []string
	Rules   []int
}

func NewReplacerData ¶

func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp

type RequiredLandmark ¶ added in v2.1.0

type RequiredLandmark struct {
	// Alternatives describes the mutually exclusive shapes that can satisfy
	// this single required landmark. A landmark matches when any one alternative
	// matches at a position in the input.
	Alternatives []RequiredLandmarkAlternative
}

type RequiredLandmarkAlternative ¶ added in v2.1.0

type RequiredLandmarkAlternative struct {
	// Literal is the core token for literal alternatives. When non-empty, it
	// must match exactly at the candidate core position, and Set must be nil.
	Literal []rune

	// Set is the core token for character-class alternatives. When non-nil, it
	// must match between MinRepeat and MaxRepeat runes at the candidate core
	// position, and Literal must be empty. The analyzer only builds set
	// alternatives for non-negated sets that can be cheaply enumerated, but the
	// runner uses Set for membership checks.
	Set *CharSet

	// LeadingWhitespaceSet is the optional or required whitespace immediately
	// before the core token. If RequireWhitespaceBefore is true, at least one
	// rune from this set must precede the core. When this alternative is the
	// first matched landmark, the runner may rewind over additional contiguous
	// leading whitespace from this set before rewinding over LeadingLoopSet.
	LeadingWhitespaceSet *CharSet

	// TrailingWhitespaceSet is the optional or required whitespace immediately
	// after the core token. If RequireWhitespaceAfter is true, at least one rune
	// from this set must follow the core. The runner validates the requirement,
	// but does not consume optional trailing whitespace into the landmark end.
	TrailingWhitespaceSet *CharSet

	// MinRepeat and MaxRepeat describe the core token width. Literal alternatives
	// use 1..1 regardless of literal length because the literal is matched as one
	// fixed core token; Set alternatives use the source set repetition.
	MinRepeat int
	MaxRepeat int

	RequireWhitespaceBefore bool
	RequireWhitespaceAfter  bool
}

type RequiredLandmarkChain ¶ added in v2.1.0

type RequiredLandmarkChain struct {
	// LeadingLoopSet is the unbounded leading set loop that precedes every
	// landmark in the original concatenation. At run time, after the first
	// landmark alternative is found, the scanner walks backward over this set
	// to recover the earliest plausible regex start position.
	LeadingLoopSet *CharSet

	// Landmarks must all be found, in slice order, for the chain prefilter to
	// produce a candidate. Each landmark is satisfied by exactly one matching
	// alternative; alternatives are tried independently by the runner.
	Landmarks []RequiredLandmark
}

type SingleRange ¶

type SingleRange struct {
	First rune
	Last  rune
}

type StartingLiteral ¶

type StartingLiteral struct {
	Range    SingleRange
	String   []rune
	SetChars []rune
	Negated  bool
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL