Imports #
"strconv"
"strings"
"unicode"
"unicode/utf8"
"slices"
"strconv"
"strings"
"unicode"
"unicode"
"strconv"
"sort"
"strings"
"unicode"
"unicode/utf8"
"strconv"
"strings"
"unicode"
"unicode/utf8"
"slices"
"strconv"
"strings"
"unicode"
"unicode"
"strconv"
"sort"
"strings"
"unicode"
"unicode/utf8"
const ClassNL
const DotNL
const EmptyBeginLine EmptyOp = *ast.BinaryExpr
const EmptyBeginText
const EmptyEndLine
const EmptyEndText
const EmptyNoWordBoundary
const EmptyWordBoundary
Unexpected error
const ErrInternalError ErrorCode = "regexp/syntax: internal error"
Parse errors
const ErrInvalidCharClass ErrorCode = "invalid character class"
const ErrInvalidCharRange ErrorCode = "invalid character class range"
const ErrInvalidEscape ErrorCode = "invalid escape sequence"
const ErrInvalidNamedCapture ErrorCode = "invalid named capture"
const ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
const ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
const ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
const ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
const ErrLarge ErrorCode = "expression too large"
const ErrMissingBracket ErrorCode = "missing closing ]"
const ErrMissingParen ErrorCode = "missing closing )"
const ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
const ErrNestingDepth ErrorCode = "expression nests too deeply"
const ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
const ErrUnexpectedParen ErrorCode = "unexpected )"
const FoldCase Flags = *ast.BinaryExpr
const InstAlt InstOp = iota
const InstAltMatch
const InstCapture
const InstEmptyWidth
const InstFail
const InstMatch
const InstNop
const InstRune
const InstRune1
const InstRuneAny
const InstRuneAnyNotNL
const Literal
const MatchNL = *ast.BinaryExpr
const NonGreedy
const OneLine
const OpAlternate
const OpAnyChar
const OpAnyCharNotNL
const OpBeginLine
const OpBeginText
const OpCapture
const OpCharClass
const OpConcat
const OpEmptyMatch
const OpEndLine
const OpEndText
const OpLiteral
const OpNoMatch Op = *ast.BinaryExpr
const OpNoWordBoundary
const OpPlus
const OpQuest
const OpRepeat
const OpStar
const OpWordBoundary
const POSIX Flags = 0
const Perl = *ast.BinaryExpr
const PerlX
const Simple
const UnicodeGroups
const WasDollar
var _Op_index_0 = [...]uint8{...}
const _Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
const _Op_name_1 = "opPseudo"
var anyRune = []rune{...}
var anyRuneNotNL = []rune{...}
var anyTable = *ast.UnaryExpr
var code1 = []rune{...}
var code10 = []rune{...}
var code11 = []rune{...}
var code12 = []rune{...}
var code13 = []rune{...}
var code14 = []rune{...}
var code15 = []rune{...}
var code16 = []rune{...}
var code17 = []rune{...}
var code2 = []rune{...}
var code3 = []rune{...}
var code4 = []rune{...}
var code5 = []rune{...}
var code6 = []rune{...}
var code7 = []rune{...}
var code8 = []rune{...}
var code9 = []rune{...}
const flagI printFlags = *ast.BinaryExpr
const flagM
const flagOff
const flagPrec
const flagS
var instOpNames = []string{...}
maxSize is the maximum size of a compiled regexp in Insts. It too is somewhat arbitrarily chosen, but the idea is to be large enough to allow significant regexps while at the same time small enough that the compiled form will not take up too much memory. 128 MB is enough for a 3.3 million Inst structures, which roughly corresponds to a 3.3 MB regexp.
const instSize = *ast.BinaryExpr
const maxFold = 0x1e943
maxHeight is the maximum height of a regexp parse tree. It is somewhat arbitrarily chosen, but the idea is to be large enough that no one will actually hit in real use but at the same time small enough that recursion on the Regexp tree will not hit the 1GB Go stack limit. The maximum amount of stack for a single recursive frame is probably closer to 1kB, so this could potentially be raised, but it seems unlikely that people have regexps nested even this deeply. We ran a test on Google's C++ code base and turned up only a single use case with depth > 100; it had depth 128. Using depth 1000 should be plenty of margin. As an optimization, we don't even bother calculating heights until we've allocated at least maxHeight Regexp structures.
const maxHeight = 1000
maxRunes is the maximum number of runes allowed in a regexp tree counting the runes in all the nodes. Ignoring character classes p.numRunes is always less than the length of the regexp. Character classes can make it much larger: each \pL adds 1292 runes. 128 MB is enough for 32M runes, which is over 26k \pL instances. Note that repetitions do not make copies of the rune slices, so \pL{1000} is only one rune slice, not 1000. We could keep a cache of character classes we've seen, so that all the \pL we see use the same rune list, but that doesn't remove the problem entirely: consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. And because the Rune slice is exposed directly in the Regexp, there is not an opportunity to change the representation to allow partial sharing between different character classes. So the limit is the best we can do.
const maxRunes = *ast.BinaryExpr
maxSize is the maximum size of a compiled regexp in Insts. It too is somewhat arbitrarily chosen, but the idea is to be large enough to allow significant regexps while at the same time small enough that the compiled form will not take up too much memory. 128 MB is enough for a 3.3 million Inst structures, which roughly corresponds to a 3.3 MB regexp.
const maxSize = *ast.BinaryExpr
const meta = `\.+*?()|[]{}^$`
minimum and maximum runes involved in folding. checked during test.
const minFold = 0x0041
const negShift = 5
const noMatch = *ast.UnaryExpr
Pseudo-ops for parsing stack.
const opLeftParen = *ast.BinaryExpr
const opPseudo Op = 128
Pseudo-ops for parsing stack.
const opVerticalBar
var perlGroup = map[string]charGroup{...}
var posixGroup = map[string]charGroup{...}
maxRunes is the maximum number of runes allowed in a regexp tree counting the runes in all the nodes. Ignoring character classes p.numRunes is always less than the length of the regexp. Character classes can make it much larger: each \pL adds 1292 runes. 128 MB is enough for 32M runes, which is over 26k \pL instances. Note that repetitions do not make copies of the rune slices, so \pL{1000} is only one rune slice, not 1000. We could keep a cache of character classes we've seen, so that all the \pL we see use the same rune list, but that doesn't remove the problem entirely: consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. And because the Rune slice is exposed directly in the Regexp, there is not an opportunity to change the representation to allow partial sharing between different character classes. So the limit is the best we can do.
const runeSize = 4
An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8
An ErrorCode describes a failure to parse a regular expression.
type ErrorCode string
Flags control the behavior of the parser and record information about regexp context.
type Flags uint16
An InstOp is an instruction opcode.
type InstOp uint8
An Op is a single regular expression operator.
type Op uint8
printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
type printFlags uint8
An Error describes a failure to parse a regular expression and gives the offending expression.
type Error struct {
Code ErrorCode
Expr string
}
An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp
Out uint32
Arg uint32
Rune []rune
}
A Prog is a compiled regular expression program.
type Prog struct {
Inst []Inst
Start int
NumCap int
}
A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op
Flags Flags
Sub []*Regexp
Sub0 [1]*Regexp
Rune []rune
Rune0 [2]rune
Min int
Max int
Cap int
Name string
}
type charGroup struct {
sign int
class []rune
}
type compiler struct {
p *Prog
}
A frag represents a compiled program fragment.
type frag struct {
i uint32
out patchList
nullable bool
}
type parser struct {
flags Flags
stack []*Regexp
free *Regexp
numCap int
wholeRegexp string
tmpClass []rune
numRegexp int
numRunes int
repeats int64
height map[*Regexp]int
size map[*Regexp]int64
}
A patchList is a list of instruction pointers that need to be filled in (patched). Because the pointers haven't been filled in yet, we can reuse their storage to hold the list. It's kind of sleazy, but works well in practice. See https://swtch.com/~rsc/regexp/regexp1.html for inspiration. These aren't really pointers: they're integers, so we can reinterpret them this way without using package unsafe. A value l.head denotes p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1). head == 0 denotes the empty list, okay because we start every program with a fail instruction, so we'll never want to point at its output link.
type patchList struct {
head uint32
tail uint32
}
ranges implements sort.Interface on a []rune. The choice of receiver type definition is strange but avoids an allocation since we already have a *[]rune.
type ranges struct {
p *[]rune
}
CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string
Compile compiles the regexp into a program to be executed. The regexp should have been simplified already (returned from re.Simplify).
func Compile(re *Regexp) (*Prog, error)
EmptyOpContext returns the zero-width assertions satisfied at the position between the runes r1 and r2. Passing r1 == -1 indicates that the position is at the beginning of the text. Passing r2 == -1 indicates that the position is at the end of the text.
func EmptyOpContext(r1 rune, r2 rune) EmptyOp
Equal reports whether x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool
func (e *Error) Error() string
IsWordChar reports whether r is considered a “word character” during the evaluation of the \b and \B zero-width assertions. These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool
func (ra ranges) Len() int
func (ra ranges) Less(i int, j int) bool
MatchEmptyWidth reports whether the instruction matches an empty string between the runes before and after. It should only be called when i.Op == [InstEmptyWidth].
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool
MatchRune reports whether the instruction matches (and consumes) r. It should only be called when i.Op == [InstRune].
func (i *Inst) MatchRune(r rune) bool
MatchRunePos checks whether the instruction matches (and consumes) r. If so, MatchRunePos returns the index of the matching rune pair (or, when len(i.Rune) == 1, rune singleton). If not, MatchRunePos returns -1. MatchRunePos should only be called when i.Op == [InstRune].
func (i *Inst) MatchRunePos(r rune) int
MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int
Parse parses a regular expression string s, controlled by the specified Flags, and returns a regular expression parse tree. The syntax is described in the top-level comment.
func Parse(s string, flags Flags) (*Regexp, error)
Prefix returns a literal string that all matches for the regexp must start with. Complete is true if the prefix is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool)
Simplify returns a regexp equivalent to re but without counted repetitions and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. The resulting regexp will execute correctly but its string representation will not produce the same parse tree, because capturing parentheses may have been duplicated or removed. For example, the simplified form for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. The returned regexp may share structure with or be the original.
func (re *Regexp) Simplify() *Regexp
StartCond returns the leading empty-width conditions that must be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp
func (i InstOp) String() string
func (i *Inst) String() string
func (i Op) String() string
func (re *Regexp) String() string
func (e ErrorCode) String() string
func (p *Prog) String() string
func (ra ranges) Swap(i int, j int)
func _()
addSpan enables the flags f around start..last, by setting flags[start] = f and flags[last] = flagOff.
func addSpan(start *Regexp, last *Regexp, f printFlags, flags *map[*Regexp]printFlags)
func (c *compiler) alt(f1 frag, f2 frag) frag
alternate replaces the top of the stack (above the topmost '(') with its alternation.
func (p *parser) alternate() *Regexp
func (l1 patchList) append(p *Prog, l2 patchList) patchList
appendClass returns the result of appending the class x to the class r. It assume x is clean.
func appendClass(r []rune, x []rune) []rune
appendFoldedClass returns the result of appending the case folding of the class x to the class r.
func appendFoldedClass(r []rune, x []rune) []rune
appendFoldedRange returns the result of appending the range lo-hi and its case folding-equivalent runes to the class r.
func appendFoldedRange(r []rune, lo rune, hi rune) []rune
func (p *parser) appendGroup(r []rune, g charGroup) []rune
appendLiteral returns the result of appending the literal x to the class r.
func appendLiteral(r []rune, x rune, flags Flags) []rune
appendNegatedClass returns the result of appending the negation of the class x to the class r. It assumes x is clean.
func appendNegatedClass(r []rune, x []rune) []rune
appendNegatedTable returns the result of appending the negation of x to the class r.
func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune
appendRange returns the result of appending the range lo-hi to the class r.
func appendRange(r []rune, lo rune, hi rune) []rune
appendTable returns the result of appending x to the class r.
func appendTable(r []rune, x *unicode.RangeTable) []rune
func bw(b *strings.Builder, args ...string)
calcFlags calculates the flags to print around each subexpression in re, storing that information in (*flags)[sub] for each affected subexpression. The first time an entry needs to be written to *flags, calcFlags allocates the map. calcFlags also calculates the flags that must be active or can't be active around re and returns those flags.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must printFlags, cant printFlags)
func (p *parser) calcHeight(re *Regexp, force bool) int
func (p *parser) calcSize(re *Regexp, force bool) int64
func (c *compiler) cap(arg uint32) frag
func (re *Regexp) capNames(names []string)
func (c *compiler) cat(f1 frag, f2 frag) frag
func (p *parser) checkHeight(re *Regexp)
func (p *parser) checkLimits(re *Regexp)
func (p *parser) checkSize(re *Regexp)
func checkUTF8(s string) error
cleanAlt cleans re for eventual inclusion in an alternation.
func cleanAlt(re *Regexp)
cleanClass sorts the ranges (pairs of elements of r), merges them, and eliminates duplicates.
func cleanClass(rp *[]rune) []rune
collapse returns the result of applying op to sub. If sub contains op nodes, they all get hoisted up so that there is never a concat of a concat or an alternate of an alternate.
func (p *parser) collapse(subs []*Regexp, op Op) *Regexp
func (c *compiler) compile(re *Regexp) frag
concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
func (p *parser) concat() *Regexp
func dumpInst(b *strings.Builder, i *Inst)
func dumpProg(b *strings.Builder, p *Prog)
func (c *compiler) empty(op EmptyOp) frag
func escape(b *strings.Builder, r rune, force bool)
factor factors common prefixes from the alternation list sub. It returns a replacement list that reuses the same storage and frees (passes to p.reuse) any removed *Regexps. For example, ABC|ABD|AEF|BCX|BCY simplifies by literal prefix extraction to A(B(C|D)|EF)|BC(X|Y) which simplifies by character class introduction to A(B[CD]|EF)|BC[XY]
func (p *parser) factor(sub []*Regexp) []*Regexp
func (c *compiler) fail() frag
inCharClass reports whether r is in the class. It assumes the class has been cleaned by cleanClass.
func inCharClass(r rune, class []rune) bool
func (c *compiler) init()
func (c *compiler) inst(op InstOp) frag
can this be represented as a character class? single-rune literal string, char class, ., and .|\n.
func isCharClass(re *Regexp) bool
isValidCaptureName reports whether name is a valid capture name: [A-Za-z0-9_]+. PCRE limits names to 32 bytes. Python rejects names starting with digits. We don't enforce either of those.
func isValidCaptureName(name string) bool
func isalnum(c rune) bool
leadingRegexp returns the leading regexp that re begins with. The regexp refers to storage in re or its children.
func (p *parser) leadingRegexp(re *Regexp) *Regexp
leadingString returns the leading literal string that re begins with. The string refers to storage in re or its children.
func (p *parser) leadingString(re *Regexp) ([]rune, Flags)
literal pushes a literal regexp for the rune r on the stack.
func (p *parser) literal(r rune)
func literalRegexp(s string, flags Flags) *Regexp
loop returns the fragment for the main loop of a plus or star. For plus, it can be used after changing the entry to f1.i. For star, it can be used directly when f1 can't match an empty string. (When f1 can match an empty string, f1* must be implemented as (f1+)? to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) frag
func makePatchList(n uint32) patchList
does re match r?
func matchRune(re *Regexp, r rune) bool
maybeConcat implements incremental concatenation of literal runes into string nodes. The parser calls this before each push, so only the top fragment of the stack might need processing. Since this is called before a push, the topmost literal is no longer subject to operators like * (Otherwise ab* would turn into (ab)*.) If r >= 0 and there's a node left over, maybeConcat uses it to push r with the given flags. maybeConcat reports whether r was pushed.
func (p *parser) maybeConcat(r rune, flags Flags) bool
mergeCharClass makes dst = dst|src. The caller must ensure that dst.Op >= src.Op, to reduce the amount of copying.
func mergeCharClass(dst *Regexp, src *Regexp)
minFoldRune returns the minimum rune fold-equivalent to r.
func minFoldRune(r rune) rune
negateClass overwrites r and returns r's negation. It assumes the class r is already clean.
func negateClass(r []rune) []rune
func (p *parser) newRegexp(op Op) *Regexp
func nextRune(s string) (c rune, t string, err error)
func (c *compiler) nop() frag
op pushes a regexp with the given op onto the stack and returns that regexp.
func (p *parser) op(op Op) *Regexp
op returns i.Op but merges all the Rune special cases into InstRune
func (i *Inst) op() InstOp
func parse(s string, flags Flags) (_ *Regexp, err error)
parseClass parses a character class at the beginning of s and pushes it onto the parse stack.
func (p *parser) parseClass(s string) (rest string, err error)
parseClassChar parses a character class character at the beginning of s and returns it.
func (p *parser) parseClassChar(s string, wholeClass string) (r rune, rest string, err error)
parseEscape parses an escape sequence at the beginning of s and returns the rune.
func (p *parser) parseEscape(s string) (r rune, rest string, err error)
parseInt parses a decimal integer.
func (p *parser) parseInt(s string) (n int, rest string, ok bool)
parseNamedClass parses a leading POSIX named character class like [:alnum:] from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error)
parsePerlClassEscape parses a leading Perl character class escape like \d from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string)
parsePerlFlags parses a Perl flag setting or non-capturing group or both, like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. The caller must have ensured that s begins with "(?".
func (p *parser) parsePerlFlags(s string) (rest string, err error)
parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. If s is not of that form, it returns ok == false. If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min int, max int, rest string, ok bool)
parseRightParen handles a ) in the input.
func (p *parser) parseRightParen() error
parseUnicodeClass parses a leading Unicode character class like \p{Han} from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error)
parseVerticalBar handles a | in the input.
func (p *parser) parseVerticalBar()
func (l patchList) patch(p *Prog, val uint32)
func (c *compiler) plus(f1 frag, nongreedy bool) frag
push pushes the regexp re onto the parse stack and returns the regexp.
func (p *parser) push(re *Regexp) *Regexp
func (c *compiler) quest(f1 frag, nongreedy bool) frag
removeLeadingRegexp removes the leading regexp in re. It returns the replacement for re. If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp
removeLeadingString removes the first n leading runes from the beginning of re. It returns the replacement for re.
func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp
repeat replaces the top stack element with itself repeated according to op, min, max. before is the regexp suffix starting at the repetition operator. after is the regexp suffix following after the repetition operator. repeat returns an updated 'after' and an error, if any.
func (p *parser) repeat(op Op, min int, max int, before string, after string, lastRepeat string) (string, error)
repeatIsValid reports whether the repetition re is valid. Valid means that the combination of the top-level repetition and any inner repetitions does not exceed n copies of the innermost thing. This function rewalks the regexp tree and is called for every repetition, so we have to worry about inducing quadratic behavior in the parser. We avoid this by only calling repeatIsValid when min or max >= 2. In that case the depth of any >= 2 nesting can only get to 9 without triggering a parse error, so each subtree can only be rewalked 9 times.
func repeatIsValid(re *Regexp, n int) bool
func (p *parser) reuse(re *Regexp)
func (c *compiler) rune(r []rune, flags Flags) frag
simplify1 implements Simplify for the unary OpStar, OpPlus, and OpQuest operators. It returns the simple regexp equivalent to Regexp{Op: op, Flags: flags, Sub: {sub}} under the assumption that sub is already simple, and without first allocating that structure. If the regexp to be returned turns out to be equivalent to re, simplify1 returns re instead. simplify1 is factored out of Simplify because the implementation for other operators generates these unary expressions. Letting them call simplify1 makes sure the expressions they generate are simple.
func simplify1(op Op, flags Flags, sub *Regexp, re *Regexp) *Regexp
skipNop follows any no-op or capturing instructions.
func (p *Prog) skipNop(pc uint32) *Inst
func (c *compiler) star(f1 frag, nongreedy bool) frag
If the top of the stack is an element followed by an opVerticalBar swapVerticalBar swaps the two and returns true. Otherwise it returns false.
func (p *parser) swapVerticalBar() bool
func u32(i uint32) string
func unhex(c rune) rune
unicodeTable returns the unicode.RangeTable identified by name and the table of additional fold-equivalent code points.
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable)
writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags)
Generated with Arrow