syntax

package
v2.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 22, 2026 License: MIT Imports: 13 Imported by: 0

Documentation

Index

Constants

View Source
// Category identifiers handed to getCharSetFromCategoryString when building
// the SpaceClass/NotSpaceClass and WordClass/NotWordClass sets.
const (
	SpaceCategoryText = " "
	WordCategoryText  = "W"
)
View Source
// Lowercase* are operation codes; the comment on each constant states the
// operation it stands for.
// NOTE(review): presumably consumed by the lowercase-mapping tables — confirm
// against the code that reads these values.
const (
	LowercaseSet = 0 // Set to arg.
	LowercaseAdd = 1 // Add arg.
	LowercaseBor = 2 // Bitwise or with 1.
	LowercaseBad = 3 // Bitwise and with 1 and add original.
)
View Source
// Error messages reported for a pattern that fails to parse.
//
// Only ErrInternalError is declared with the explicit type ErrorCode. Every
// other constant in this block has its own initializer and no type, so it is
// an untyped string constant that converts implicitly wherever an ErrorCode
// (or a plain string) is expected.
//
// Several messages contain fmt verbs (%v, %c).
// NOTE(review): presumably these are filled in from Error.Args when the
// message is rendered — confirm against Error.Error.
const (
	// Internal failure (not caused by the pattern text itself).
	ErrInternalError ErrorCode = "regexp/syntax: internal error"
	// Parser errors.
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrInvalidECMAGroupName       = "invalid capture group name"
	ErrDuplicateGroupName         = "duplicate capture group name"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrShorthandClassInCharRange  = "cannot create range with shorthand escape sequence \\%v"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[%c-%c] range in reverse order"
)
View Source
// Single-byte category codes; the comment on each constant gives the kind of
// character it marks.
// NOTE(review): presumably used to classify pattern characters while scanning
// and escaping — confirm against the table that stores these codes.
const (
	Q byte = 5 // quantifier
	S byte = 4 // ordinary stopper
	Z byte = 3 // ScanBlank stopper
	X byte = 2 // whitespace
	E byte = 1 // should be escaped
)
View Source
// MultiVsRepeaterLimit is the (arbitrary) number of repetitions of the same
// character at which a repeater of that character is preferred over a string.
const MultiVsRepeaterLimit = 64

Arbitrary number of repetitions of the same character when we'd prefer to represent that as a repeater of that character rather than a string.

Variables

View Source
// Predefined character classes, built once at package initialization.
var (
	// Classes built from explicit rune data via getCharSetFromOldString.
	// Every Not* variant passes true as the second argument; its positive
	// counterpart passes false.
	AnyClass          = getCharSetFromOldString([]rune{0}, false)
	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
	NoneClass         = getCharSetFromOldString(nil, false)
	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

	// Classes built from category names via getCharSetFromCategoryString.
	// NOTE(review): NotWordClass and NotSpaceClass pass true as the first
	// argument, whereas NotDigitClass passes true as the second. The parameter
	// meanings are not visible here — confirm the difference is intentional.
	WordClass     = getCharSetFromCategoryString(false, false, WordCategoryText)
	NotWordClass  = getCharSetFromCategoryString(true, false, WordCategoryText)
	SpaceClass    = getCharSetFromCategoryString(false, false, SpaceCategoryText)
	NotSpaceClass = getCharSetFromCategoryString(true, false, SpaceCategoryText)
	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")

	// RE2 whitespace classes, built from the re2Space rune data.
	RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
	NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)

	// Built from the runes 0x0a and 0x0b with the second argument set to true.
	NotNewLineClass = getCharSetFromOldString([]rune{0x0a, 0x0b}, true)
)
View Source
// ErrReplacementError is a general error raised while parsing the replacement text.
var ErrReplacementError = errors.New("replacement pattern error")

ErrReplacementError is a general error during parsing the replacement text

Functions

func CharDescription

func CharDescription(ch rune) string

CharDescription produces a human-readable description for a single character.

func Escape

func Escape(input string) string

func IsECMAIdentifierChar

func IsECMAIdentifierChar(r rune) bool

func IsECMAIdentifierStartChar

func IsECMAIdentifierStartChar(r rune) bool

func IsECMAWordChar

func IsECMAWordChar(r rune) bool

func IsWordChar

func IsWordChar(r rune) bool

According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

func Unescape

func Unescape(input string) (string, error)

Types

type AnchorLoc

// AnchorLoc describes where the regex can be pegged. Each constant below
// occupies its own bit, so an AnchorLoc value can carry several anchors at
// once.
type AnchorLoc int16
const (
	AnchorBeginning    AnchorLoc = 0x0001
	AnchorBol          AnchorLoc = 0x0002
	AnchorStart        AnchorLoc = 0x0004
	AnchorEol          AnchorLoc = 0x0008
	AnchorEndZ         AnchorLoc = 0x0010
	AnchorEnd          AnchorLoc = 0x0020
	AnchorBoundary     AnchorLoc = 0x0040
	AnchorECMABoundary AnchorLoc = 0x0080
)

where the regex can be pegged

func (AnchorLoc) String

func (anchors AnchorLoc) String() string

String returns a human-readable description of the anchors.

type BmPrefix

// BmPrefix holds the precomputed Boyer-Moore tables used by its Scan and
// IsMatch methods.
type BmPrefix struct {
	// contains filtered or unexported fields
}

BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.

func (*BmPrefix) Dump

func (b *BmPrefix) Dump(indent string) string

Dump returns the contents of the filter as a human readable string

func (*BmPrefix) IsMatch

func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool

When a regex is anchored, we can do a quick IsMatch test instead of a Scan

func (*BmPrefix) Scan

func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int

Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.

The direction and case-sensitivity of the match are determined by the arguments to the RegexBoyerMoore constructor.

func (*BmPrefix) String

func (b *BmPrefix) String() string

type Category

// Category pairs a category name with a negation flag.
type Category struct {
	Negate bool   // NOTE(review): presumably true when the category is matched in negated form — confirm
	Cat    string // category name
}

func (Category) String

func (c Category) String() string

type CharClassAnalysisResults

// CharClassAnalysisResults is the value returned by CharSet.Analyze. Most
// fields are only meaningful when OnlyRanges is true, as noted per field.
type CharClassAnalysisResults struct {
	// true if the set contains only ranges; false if it contains Unicode categories and/or subtraction.
	OnlyRanges bool
	// true if we know for sure that the set contains only ASCII values; otherwise, false.
	// This can only be true if OnlyRanges is true.
	ContainsOnlyAscii bool
	// true if we know for sure that the set doesn't contain any ASCII values; otherwise, false.
	// This can only be true if OnlyRanges is true.
	ContainsNoAscii bool
	// true if we know for sure that all ASCII values are in the set; otherwise, false.
	// This can only be true if OnlyRanges is true.
	AllAsciiContained bool
	// true if we know for sure that all non-ASCII values are in the set; otherwise, false.
	// This can only be true if OnlyRanges is true.
	AllNonAsciiContained bool
	// The inclusive lower bound.
	// This is only valid if OnlyRanges is true.
	LowerBoundInclusiveIfOnlyRanges rune
	// The exclusive upper bound.
	// This is only valid if OnlyRanges is true.
	UpperBoundExclusiveIfOnlyRanges rune
}

type CharSet

// CharSet represents a set of characters as a combination of start-end rune
// ranges and Unicode categories.
type CharSet struct {
	// contains filtered or unexported fields
}

CharSet combines start-end rune ranges and unicode categories representing a set of characters

func NewCharSetRuntime

func NewCharSetRuntime(buf string) CharSet

func (*CharSet) Analyze

func (set *CharSet) Analyze() CharClassAnalysisResults

Analyze examines the set to determine some basic properties that can be used to optimize usage.

func (CharSet) CharIn

func (c CharSet) CharIn(ch rune) bool

CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.

func (CharSet) Copy

func (c CharSet) Copy() CharSet

Copy makes a deep copy to prevent accidental mutation of a set

func (*CharSet) Equals

func (c *CharSet) Equals(c2 *CharSet) bool

func (CharSet) GetIfNRanges

func (c CharSet) GetIfNRanges(n int) []SingleRange

func (*CharSet) GetIfOnlyUnicodeCategories

func (c *CharSet) GetIfOnlyUnicodeCategories() (cats []Category, negate bool)

func (*CharSet) GetSetChars

func (c *CharSet) GetSetChars(maxChars int) []rune

Gets all of the characters in the specified set, storing them into the provided span.

Only considers character classes that only contain sets (no categories), just simple sets containing starting/ending pairs (subtraction from those pairs is factored in, however). The returned characters may be negated: if IsNegated(set) is false, then the returned characters are the only ones that match; if it returns true, then the returned characters are the only ones that don't match.

func (CharSet) HasSubtraction

func (c CharSet) HasSubtraction() bool

func (*CharSet) Hash

func (c *CharSet) Hash() []byte

func (CharSet) IsAnything

func (c CharSet) IsAnything() bool

func (CharSet) IsEmpty

func (c CharSet) IsEmpty() bool

func (CharSet) IsMergeable

func (c CharSet) IsMergeable() bool

func (CharSet) IsNegated

func (c CharSet) IsNegated() bool

func (CharSet) IsSingleton

func (c CharSet) IsSingleton() bool

func (CharSet) IsSingletonInverse

func (c CharSet) IsSingletonInverse() bool

func (*CharSet) IsUnicodeCategoryOfSmallCharCount

func (c *CharSet) IsUnicodeCategoryOfSmallCharCount() (isSmall bool, chars []rune, negated bool, desc string)

Gets whether the specified set is a named set with a reasonably small count of Unicode characters. Designed to help the regexp code generator choose a better search algorithm for finding chars. The returned desc is a short name that can be used as part of a variable name in code generation.

func (*CharSet) MayOverlap

func (set1 *CharSet) MayOverlap(set2 *CharSet) bool

Determines whether two sets could overlap.

func (CharSet) SingletonChar

func (c CharSet) SingletonChar() rune

SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input

func (CharSet) String

func (c CharSet) String() string

String gets a human-readable description of the set.

type Code

// Code is the compiled form of a pattern, as returned by Write for a RegexTree.
type Code struct {
	Codes             []int              // the code
	Strings           [][]rune           // string table
	Sets              []*CharSet         // character set table
	TrackCount        int                // how many instructions use backtracking
	Caps              map[int]int        // mapping of user group numbers -> impl group slots
	Capsize           int                // number of impl group slots
	FcPrefix          *Prefix            // the set of candidate first characters (may be nil)
	BmPrefix          *BmPrefix          // the fixed prefix string as a Boyer-Moore machine (may be nil)
	Anchors           AnchorLoc          // the set of zero-length start anchors (AnchorBol, etc.)
	RightToLeft       bool               // true if right to left
	FindOptimizations *FindOptimizations // analyzed candidate search strategy
}

func Write

func Write(tree *RegexTree) (*Code, error)

func (*Code) Dump

func (c *Code) Dump() string

func (*Code) OpcodeDescription

func (c *Code) OpcodeDescription(offset int) string

OpcodeDescription returns a human-readable string describing the instruction at the specified offset.

func (*Code) PrepareCharSetASCIIBitmaps

func (c *Code) PrepareCharSetASCIIBitmaps()

PrepareCharSetASCIIBitmaps builds bounded ASCII lookup tables for compiled character classes before the regexp is shared across goroutines.

type Error

// Error describes a failure to parse a regular expression and carries the
// offending expression.
type Error struct {
	Code ErrorCode     // which parse failure occurred
	Expr string        // the offending expression
	Args []interface{} // NOTE(review): presumably the values for the fmt verbs in Code — confirm
}

An Error describes a failure to parse a regular expression and gives the offending expression.

func (*Error) Error

func (e *Error) Error() string

type ErrorCode

// ErrorCode describes a failure to parse a regular expression. Its underlying
// type is string, so the message text is the code's value.
type ErrorCode string

An ErrorCode describes a failure to parse a regular expression.

func (ErrorCode) String

func (e ErrorCode) String() string

type FindNextStartingPositionMode

// FindNextStartingPositionMode enumerates the search strategies listed below.
// NOTE(review): presumably selects how the matcher locates the next candidate
// start position (see FindOptimizations.FindMode) — confirm against the matcher.
type FindNextStartingPositionMode int
const (
	// NoSearch is the zero value of the enumeration.
	NoSearch FindNextStartingPositionMode = iota
	// A "beginning" anchor at the beginning of the pattern.
	LeadingAnchor_LeftToRight_Beginning
	// A "start" anchor at the beginning of the pattern.
	LeadingAnchor_LeftToRight_Start
	// An "endz" anchor at the beginning of the pattern.  This is rare.
	LeadingAnchor_LeftToRight_EndZ
	// An "end" anchor at the beginning of the pattern.  This is rare.
	LeadingAnchor_LeftToRight_End
	// A "beginning" anchor at the beginning of the right-to-left pattern.
	LeadingAnchor_RightToLeft_Beginning
	// A "start" anchor at the beginning of the right-to-left pattern.
	LeadingAnchor_RightToLeft_Start
	// An "endz" anchor at the beginning of the right-to-left pattern.  This is rare.
	LeadingAnchor_RightToLeft_EndZ
	// An "end" anchor at the beginning of the right-to-left pattern.  This is rare.
	LeadingAnchor_RightToLeft_End
	// An "end" anchor at the end of the pattern, with the pattern always matching a fixed-length expression.
	TrailingAnchor_FixedLength_LeftToRight_End
	// An "endz" anchor at the end of the pattern, with the pattern always matching a fixed-length expression.
	TrailingAnchor_FixedLength_LeftToRight_EndZ
	// A multi-character substring at the beginning of the pattern.
	LeadingString_LeftToRight
	// A multi-character substring at the beginning of the right-to-left pattern.
	LeadingString_RightToLeft
	// A multi-character ordinal case-insensitive substring at the beginning of the pattern.
	LeadingString_OrdinalIgnoreCase_LeftToRight
	// Multiple leading prefix strings
	LeadingStrings_LeftToRight
	// Multiple leading ordinal case-insensitive prefix strings
	LeadingStrings_OrdinalIgnoreCase_LeftToRight

	// A set starting the pattern.
	LeadingSet_LeftToRight
	// A set starting the right-to-left pattern.
	LeadingSet_RightToLeft

	// A single character at the start of the right-to-left pattern.
	LeadingChar_RightToLeft

	// A single character at a fixed distance from the start of the pattern.
	FixedDistanceChar_LeftToRight
	// A multi-character case-sensitive string at a fixed distance from the start of the pattern.
	FixedDistanceString_LeftToRight

	// One or more sets at a fixed distance from the start of the pattern.
	FixedDistanceSets_LeftToRight

	// A literal (single character, multi-char string, or set with small number of characters) after a non-overlapping set loop at the start of the pattern.
	LiteralAfterLoop_LeftToRight

	// A sequence of required landmarks after a leading loop.
	RequiredLandmarkChain_LeftToRight
)

type FindOptimizations

// FindOptimizations groups the data attached to a search strategy.
// NOTE(review): presumably only the fields relevant to FindMode are populated
// for a given pattern — confirm against the code that builds this struct.
type FindOptimizations struct {
	FindMode          FindNextStartingPositionMode // the strategy this data belongs to
	LeadingAnchor     NodeType
	TrailingAnchor    NodeType
	MinRequiredLength int
	MaxPossibleLength int
	LeadingPrefix     string
	LeadingPrefixes   []string

	FixedDistanceLiteral FixedDistanceLiteral
	FixedDistanceSets    []FixedDistanceSet
	LiteralAfterLoop     *LiteralAfterLoop
	LandmarkChain        *RequiredLandmarkChain
	// contains filtered or unexported fields
}

type FixedDistanceLiteral

// FixedDistanceLiteral is the type of FindOptimizations.FixedDistanceLiteral.
// NOTE(review): presumably S or C holds the literal (string or single
// character) and Distance its offset from the start of the pattern — confirm.
type FixedDistanceLiteral struct {
	S        string
	C        rune
	Distance int
}

type FixedDistanceSet

// FixedDistanceSet is the element type of FindOptimizations.FixedDistanceSets.
// NOTE(review): presumably Chars/Negated/Range are alternative, cheaper
// descriptions of Set, and Distance is the set's offset from the start of the
// pattern — confirm.
type FixedDistanceSet struct {
	Set      *CharSet
	Chars    []rune
	Negated  bool
	Range    *SingleRange
	Distance int
}

type InstOp

// InstOp is an instruction opcode. Values 0 through 46 name the operators;
// Rtl, Back, Back2 and Ci are flag bits, and Mask recovers the unmodified
// operator from a value that carries flags.
type InstOp int
const (
	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
	Notonerep InstOp = 1 // lef,back char,min,max    .{n}
	Setrep    InstOp = 2 // lef,back set,min,max     [\d]{n}

	Oneloop    InstOp = 3 // lef,back char,min,max    a {,n}
	Notoneloop InstOp = 4 // lef,back char,min,max    .{,n}
	Setloop    InstOp = 5 // lef,back set,min,max     [\d]{,n}

	Onelazy    InstOp = 6 // lef,back char,min,max    a {,n}?
	Notonelazy InstOp = 7 // lef,back char,min,max    .{,n}?
	Setlazy    InstOp = 8 // lef,back set,min,max     [\d]{,n}?

	One    InstOp = 9  // lef      char            a
	Notone InstOp = 10 // lef      char            [^a]
	Set    InstOp = 11 // lef      set             [a-z\s]  \w \s \d

	Multi InstOp = 12 // lef      string          abcd
	Ref   InstOp = 13 // lef      group           \#

	Bol         InstOp = 14 //                          ^
	Eol         InstOp = 15 //                          $
	Boundary    InstOp = 16 //                          \b
	Nonboundary InstOp = 17 //                          \B
	Beginning   InstOp = 18 //                          \A
	Start       InstOp = 19 //                          \G
	EndZ        InstOp = 20 //                          \Z
	End         InstOp = 21 //                          \z

	Nothing InstOp = 22 //                          Reject!

	Lazybranch      InstOp = 23 // back     jump            straight first
	Branchmark      InstOp = 24 // back     jump            branch first for loop
	Lazybranchmark  InstOp = 25 // back     jump            straight first for loop
	Nullcount       InstOp = 26 // back     val             set counter, null mark
	Setcount        InstOp = 27 // back     val             set counter, make mark
	Branchcount     InstOp = 28 // back     jump,limit      branch++ if zero<=c<limit
	Lazybranchcount InstOp = 29 // back     jump,limit      same, but straight first
	Nullmark        InstOp = 30 // back                     save position
	Setmark         InstOp = 31 // back                     save position
	Capturemark     InstOp = 32 // back     group           define group
	Getmark         InstOp = 33 // back                     recall position
	Setjump         InstOp = 34 // back                     save backtrack state
	Backjump        InstOp = 35 //                          zap back to saved state
	Forejump        InstOp = 36 //                          zap backtracking state
	Testref         InstOp = 37 //                          backtrack if ref undefined
	Goto            InstOp = 38 //          jump            just go

	Prune InstOp = 39 //                          prune it baby
	Stop  InstOp = 40 //                          done!

	ECMABoundary    InstOp = 41 //                          \b
	NonECMABoundary InstOp = 42 //                          \B

	// Atomic loop of the specified character.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	Oneloopatomic InstOp = 43
	// Atomic loop of a single character other than the one specified.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	Notoneloopatomic InstOp = 44
	// Atomic loop of a single character matching the specified set
	// Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count.
	Setloopatomic InstOp = 45
	// Updates the bumpalong position to the current position.
	UpdateBumpalong InstOp = 46

	Mask  InstOp = 63  // Mask to get unmodified ordinary operator
	Rtl   InstOp = 64  // bit to indicate that we're reverse scanning.
	Back  InstOp = 128 // bit to indicate that we're backtracking.
	Back2 InstOp = 256 // bit to indicate that we're backtracking on a second branch.
	Ci    InstOp = 512 // bit to indicate that we're case-insensitive.
)

type LiteralAfterLoop

// LiteralAfterLoop is the type of FindOptimizations.LiteralAfterLoop.
// NOTE(review): presumably String, Char or Chars holds the literal that
// follows the loop described by LoopNode — confirm which fields are set
// together.
type LiteralAfterLoop struct {
	String           string
	StringIgnoreCase bool
	Char             rune
	Chars            []rune

	LoopNode *RegexNode
}

type NodeType

// NodeType identifies the kind of a RegexNode. Where a node type shares its
// name with an InstOp (NtOneloop/Oneloop, NtSet/Set, ...), the numeric values
// declared here are the same as the InstOp values.
type NodeType int32
const (
	// The following are leaves, and correspond to primitive operations
	NtUnknown NodeType = -1
	//NtOnerep      NodeType = 0  // lef,back char,min,max    a {n}
	//NtNotonerep   NodeType = 1  // lef,back char,min,max    .{n}
	//NtSetrep      NodeType = 2  // lef,back set,min,max     [\d]{n}
	NtOneloop     NodeType = 3  // lef,back char,min,max    a {,n}
	NtNotoneloop  NodeType = 4  // lef,back char,min,max    .{,n}
	NtSetloop     NodeType = 5  // lef,back set,min,max     [\d]{,n}
	NtOnelazy     NodeType = 6  // lef,back char,min,max    a {,n}?
	NtNotonelazy  NodeType = 7  // lef,back char,min,max    .{,n}?
	NtSetlazy     NodeType = 8  // lef,back set,min,max     [\d]{,n}?
	NtOne         NodeType = 9  // lef      char            a
	NtNotone      NodeType = 10 // lef      char            [^a]
	NtSet         NodeType = 11 // lef      set             [a-z\s]  \w \s \d
	NtMulti       NodeType = 12 // lef      string          abcd
	NtRef         NodeType = 13 // lef      group           \#
	NtBol         NodeType = 14 //                          ^
	NtEol         NodeType = 15 //                          $
	NtBoundary    NodeType = 16 //                          \b
	NtNonboundary NodeType = 17 //                          \B
	NtBeginning   NodeType = 18 //                          \A
	NtStart       NodeType = 19 //                          \G
	NtEndZ        NodeType = 20 //                          \Z
	NtEnd         NodeType = 21 //                          \z

	NtNothing     NodeType = 22 //          []
	NtEmpty       NodeType = 23 //          ()
	NtAlternate   NodeType = 24 //          a|b
	NtConcatenate NodeType = 25 //          ab
	NtLoop        NodeType = 26 // m,x      * + ? {,}
	NtLazyloop    NodeType = 27 // m,x      *? +? ?? {,}?
	NtCapture     NodeType = 28 // n        ()
	NtGroup       NodeType = 29 //          (?:)
	NtPosLook     NodeType = 30 //          (?=) (?<=)
	NtNegLook     NodeType = 31 //          (?!) (?<!)
	NtAtomic      NodeType = 32 //          (?>) (?<)
	NtBackRefCond NodeType = 33 //          (?(n) | )
	NtExprCond    NodeType = 34 //          (?(...) | )

	NtECMABoundary    NodeType = 41 //                          \b
	NtNonECMABoundary NodeType = 42 //                          \B

	// Atomic loop of the specified character.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	NtOneloopatomic NodeType = 43
	// Atomic loop of a single character other than the one specified.
	// Operand 0 is the character. Operand 1 is the max iteration count.
	NtNotoneloopatomic NodeType = 44
	// Atomic loop of a single character matching the specified set
	// Operand 0 is index into the strings table of the character class description. Operand 1 is the repetition count.
	NtSetloopatomic NodeType = 45
	// Updates the bumpalong position to the current position.
	NtUpdateBumpalong NodeType = 46
)
const (
	BeforeChild NodeType = 64
	AfterChild  NodeType = 128
	// MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix.
	// It has no declared type, so unlike its neighbours it is an untyped constant.
	MaxPrefixSize = 50
)

type ParseOptions

// ParseOptions bundles the settings accepted by Parse.
type ParseOptions struct {
	RegexOptions         RegexOptions // option flags for the pattern
	MaintainCaptureOrder bool         // NOTE(review): presumably keeps capture groups in pattern order — confirm
	CodeGen              bool         // NOTE(review): presumably set when parsing on behalf of a code generator — confirm
}

type Prefix

// Prefix is the type of Code.FcPrefix, which Code describes as the set of
// candidate first characters.
type Prefix struct {
	PrefixStr       []rune
	PrefixSet       CharSet
	CaseInsensitive bool // NOTE(review): presumably true when the prefix must be compared ignoring case — confirm
}

type RegexNode

// RegexNode is one node of the tree built while parsing a pattern; nodes are
// linked downward through Children and upward through Parent.
type RegexNode struct {
	T        NodeType     // kind of node
	Children []*RegexNode // child nodes
	Str      []rune       // NOTE(review): presumably the string payload (e.g. for NtMulti) — confirm
	Set      *CharSet     // NOTE(review): presumably the set payload (e.g. for NtSet) — confirm
	Ch       rune         // NOTE(review): presumably the character payload (e.g. for NtOne) — confirm
	M        int          // NOTE(review): presumably the first loop operand (min) — confirm
	N        int          // NOTE(review): presumably the second loop operand (max) — confirm
	Options  RegexOptions // option flags attached to this node
	Parent   *RegexNode   // parent node
}

Implementation notes:

Since the node tree is a temporary data structure only used during compilation of the regexp to integer codes, it's designed for clarity and convenience rather than space efficiency.

RegexNodes are built into a tree, linked by the n.children list. Each node also has a n.parent and n.ichild member indicating its parent and which child # it is in its parent's list.

RegexNodes come in as many types as there are constructs in a regular expression, for example, "concatenate", "alternate", "one", "rept", "group". There are also node types for basic peephole optimizations, e.g., "onerep", "notsetrep", etc.

Because perl 5 allows "lookback" groups that scan backwards, each node also gets a "direction". Normally the value of boolean n.backward = false.

On the parse stack, each tree has a "role" - basically, the nonterminal in the grammar that the parser has currently assigned to the tree. That code is stored in n.role.

Finally, some of the different kinds of nodes have data. Two integers (for the looping constructs) are stored in n.operands, and an object (either a string or a set) is stored in n.data.

func (*RegexNode) ComputeMinLength

func (n *RegexNode) ComputeMinLength() int

Computes a min bound on the required length of any string that could possibly match. If the result is 0, there is no minimum we can enforce.

func (*RegexNode) Description

func (n *RegexNode) Description() string

func (*RegexNode) FindLastExpressionInLoopForAutoAtomic

func (n *RegexNode) FindLastExpressionInLoopForAutoAtomic() *RegexNode

Recurses into the last expression of a loop node, looking to see if it can find a node that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. Returns the found node that should be explored further for auto-atomicity, or nil if it doesn't exist.

func (*RegexNode) FindStartingLiteral

func (n *RegexNode) FindStartingLiteral() *StartingLiteral

func (*RegexNode) FindStartingLiteralNode

func (n *RegexNode) FindStartingLiteralNode(allowZeroWidth bool) *RegexNode

Finds the guaranteed beginning literal(s) of the node, or null if none exists. allowZeroWidth = true

func (*RegexNode) FirstCharOfOneOrMulti

func (n *RegexNode) FirstCharOfOneOrMulti() rune

Gets the character that begins a One or Multi.

func (*RegexNode) IsAtomicloopFamily

func (n *RegexNode) IsAtomicloopFamily() bool

func (*RegexNode) IsNotoneFamily

func (n *RegexNode) IsNotoneFamily() bool

func (*RegexNode) IsNotoneloopFamily

func (n *RegexNode) IsNotoneloopFamily() bool

func (*RegexNode) IsOneFamily

func (n *RegexNode) IsOneFamily() bool

func (*RegexNode) IsOneloopFamily

func (n *RegexNode) IsOneloopFamily() bool

func (*RegexNode) IsSetFamily

func (n *RegexNode) IsSetFamily() bool

func (*RegexNode) IsSetloopFamily

func (n *RegexNode) IsSetloopFamily() bool

func (*RegexNode) ReplaceChild

func (n *RegexNode) ReplaceChild(index int, newChild *RegexNode)

func (*RegexNode) TryGetJoinableLengthCheckChildRange

func (n *RegexNode) TryGetJoinableLengthCheckChildRange(childIndex int, requiredLength *int, exclusiveEnd *int) bool

Determines whether the specified child node is the beginning of a sequence that can trivially have length checks combined in order to avoid bounds checks. requiredLength is the sum of all the fixed lengths for the nodes in the sequence. exclusiveEnd is the index of the node just after the last one in the sequence. Returns true if more than one node can have their length checks combined; otherwise, false.

There are additional node types for which we can prove a fixed length, e.g. examining all branches of an alternation and returning true if all their lengths are equal. However, the primary purpose of this method is to avoid bounds checks by consolidating length checks that guard accesses to strings/spans for which the JIT can see a fixed index within bounds, and alternations employ patterns that defeat that (e.g. reassigning the span in question). As such, the implementation remains focused on only a core subset of nodes that are a) likely to be used in concatenations and b) employ simple patterns of checks.

func (*RegexNode) TryGetOrdinalCaseInsensitiveString

func (n *RegexNode) TryGetOrdinalCaseInsensitiveString(childIndex int, exclusiveChildBound int, consumeZeroWidthNodes bool) (success bool, nodesConsumed int, caseInsensitiveString string)

Determines whether the specified child index of a concatenation begins a sequence whose values should be used to perform an ordinal case-insensitive comparison.

When consumeZeroWidthNodes is false, the consumer needs the semantics of matching the produced string to fully represent the semantics of all the consumed nodes, which means nodes can be consumed iff they produce text that's represented by the resulting string. When true, the resulting string needs to fully represent all valid matches at that position, but it can have false positives, which means the resulting string doesn't need to fully represent all zero-width nodes consumed. true is only valid when used as part of a search to determine where to try a full match, not as part of actual matching logic. consumeZeroWidthNodes = false

type RegexOptions

// RegexOptions is a set of option flags. Each constant occupies its own bit;
// bits 0x0008 and 0x0080 are not assigned in this block.
// NOTE(review): the quoted letter after each flag is presumably its inline
// option character — confirm against the parser.
type RegexOptions int32
const (
	IgnoreCase              RegexOptions = 0x0001 // "i"
	Multiline               RegexOptions = 0x0002 // "m"
	ExplicitCapture         RegexOptions = 0x0004 // "n"
	Singleline              RegexOptions = 0x0010 // "s"
	IgnorePatternWhitespace RegexOptions = 0x0020 // "x"
	RightToLeft             RegexOptions = 0x0040 // "r"
	ECMAScript              RegexOptions = 0x0100 // "e"
	RE2                     RegexOptions = 0x0200 // RE2 compat mode
	Unicode                 RegexOptions = 0x0400 // "u"
)

type RegexTree

// RegexTree is the parse tree returned by Parse and accepted by Write.
type RegexTree struct {
	Root              *RegexNode         // root node of the tree
	Caps              map[int]int        // NOTE(review): presumably the same group-number mapping as Code.Caps — confirm
	Capnumlist        []int
	Captop            int
	Capnames          map[string]int     // NOTE(review): presumably group name -> group number — confirm
	Caplist           []string
	Options           RegexOptions       // option flags for the tree
	FindOptimizations *FindOptimizations // search strategy data
}

func Parse

func Parse(re string, op ParseOptions) (*RegexTree, error)

Parse converts a regex string into a parse tree

func (*RegexTree) Dump

func (t *RegexTree) Dump() string

type ReplacerData

// ReplacerData is the reusable replacement data that NewReplacerData builds
// from a replacement string and a regexp's capture group data.
type ReplacerData struct {
	Rep     string   // NOTE(review): presumably the original replacement text — confirm
	Strings []string // NOTE(review): presumably literal segments of the replacement — confirm
	Rules   []int    // NOTE(review): encoding not visible here — confirm against NewReplacerData
}

func NewReplacerData

func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp

type RequiredLandmark added in v2.1.0

// RequiredLandmark is one element of RequiredLandmarkChain.Landmarks and
// carries a list of alternatives.
// NOTE(review): presumably any single alternative satisfies the landmark — confirm.
type RequiredLandmark struct {
	Alternatives []RequiredLandmarkAlternative
}

type RequiredLandmarkAlternative added in v2.1.0

// RequiredLandmarkAlternative is one alternative of a RequiredLandmark.
// NOTE(review): presumably exactly one of Literal, Chars or Set describes what
// must be found, with MinRepeat/MaxRepeat bounding repetitions — confirm.
type RequiredLandmarkAlternative struct {
	Literal                 string
	Chars                   []rune
	Set                     *CharSet
	WhitespaceSet           *CharSet
	MinRepeat               int
	MaxRepeat               int
	RequireWhitespaceBefore bool
	RequireWhitespaceAfter  bool
}

type RequiredLandmarkChain added in v2.1.0

// RequiredLandmarkChain is the type of FindOptimizations.LandmarkChain: a
// leading loop set together with a sequence of landmarks.
type RequiredLandmarkChain struct {
	LeadingLoopSet *CharSet
	Landmarks      []RequiredLandmark
}

type SingleRange

// SingleRange is a range of runes given by its first and last rune.
type SingleRange struct {
	First rune
	Last  rune
}

type StartingLiteral

// StartingLiteral is the result type of RegexNode.FindStartingLiteral.
// NOTE(review): presumably only one of Range, String or SetChars is populated
// per result, and Negated applies to the range/set forms — confirm.
type StartingLiteral struct {
	Range    SingleRange
	String   []rune
	SetChars []rune
	Negated  bool
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL