Imports #
"strconv"
"strings"
"unicode"
"unicode/utf8"
"slices"
"strconv"
"strings"
"unicode"
"unicode"
"strconv"
"sort"
"strings"
"unicode"
"unicode/utf8"
"strconv"
"strings"
"unicode"
"unicode/utf8"
"slices"
"strconv"
"strings"
"unicode"
"unicode"
"strconv"
"sort"
"strings"
"unicode"
"unicode/utf8"
const ClassNLconst DotNLconst EmptyBeginLine EmptyOp = *ast.BinaryExprconst EmptyBeginTextconst EmptyEndLineconst EmptyEndTextconst EmptyNoWordBoundaryconst EmptyWordBoundaryUnexpected error
const ErrInternalError ErrorCode = "regexp/syntax: internal error"Parse errors
const ErrInvalidCharClass ErrorCode = "invalid character class"const ErrInvalidCharRange ErrorCode = "invalid character class range"const ErrInvalidEscape ErrorCode = "invalid escape sequence"const ErrInvalidNamedCapture ErrorCode = "invalid named capture"const ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"const ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"const ErrInvalidRepeatSize ErrorCode = "invalid repeat count"const ErrInvalidUTF8 ErrorCode = "invalid UTF-8"const ErrLarge ErrorCode = "expression too large"const ErrMissingBracket ErrorCode = "missing closing ]"const ErrMissingParen ErrorCode = "missing closing )"const ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"const ErrNestingDepth ErrorCode = "expression nests too deeply"const ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"const ErrUnexpectedParen ErrorCode = "unexpected )"const FoldCase Flags = *ast.BinaryExprconst InstAlt InstOp = iotaconst InstAltMatchconst InstCaptureconst InstEmptyWidthconst InstFailconst InstMatchconst InstNopconst InstRuneconst InstRune1const InstRuneAnyconst InstRuneAnyNotNLconst Literalconst MatchNL = *ast.BinaryExprconst NonGreedyconst OneLineconst OpAlternateconst OpAnyCharconst OpAnyCharNotNLconst OpBeginLineconst OpBeginTextconst OpCaptureconst OpCharClassconst OpConcatconst OpEmptyMatchconst OpEndLineconst OpEndTextconst OpLiteralconst OpNoMatch Op = *ast.BinaryExprconst OpNoWordBoundaryconst OpPlusconst OpQuestconst OpRepeatconst OpStarconst OpWordBoundaryconst POSIX Flags = 0const Perl = *ast.BinaryExprconst PerlXconst Simpleconst UnicodeGroupsconst WasDollarvar _Op_index_0 = [...]uint8{...}const _Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"const _Op_name_1 = "opPseudo"var anyRune = []rune{...}var anyRuneNotNL = []rune{...}var anyTable = *ast.UnaryExprvar code1 = []rune{...}var code10 = []rune{...}var code11 = []rune{...}var code12 = []rune{...}var code13 = []rune{...}var code14 = []rune{...}var code15 = []rune{...}var code16 = []rune{...}var code17 = []rune{...}var code2 = []rune{...}var code3 = []rune{...}var code4 = []rune{...}var code5 = []rune{...}var code6 = []rune{...}var code7 = []rune{...}var code8 = []rune{...}var code9 = []rune{...}const flagI printFlags = *ast.BinaryExprconst flagMconst flagOffconst flagPrecconst flagSvar instOpNames = []string{...}maxSize is the maximum size of a compiled regexp in Insts. It too is somewhat arbitrarily chosen, but the idea is to be large enough to allow significant regexps while at the same time small enough that the compiled form will not take up too much memory. 128 MB is enough for a 3.3 million Inst structures, which roughly corresponds to a 3.3 MB regexp.
const instSize = *ast.BinaryExprconst maxFold = 0x1e943maxHeight is the maximum height of a regexp parse tree. It is somewhat arbitrarily chosen, but the idea is to be large enough that no one will actually hit in real use but at the same time small enough that recursion on the Regexp tree will not hit the 1GB Go stack limit. The maximum amount of stack for a single recursive frame is probably closer to 1kB, so this could potentially be raised, but it seems unlikely that people have regexps nested even this deeply. We ran a test on Google's C++ code base and turned up only a single use case with depth > 100; it had depth 128. Using depth 1000 should be plenty of margin. As an optimization, we don't even bother calculating heights until we've allocated at least maxHeight Regexp structures.
const maxHeight = 1000maxRunes is the maximum number of runes allowed in a regexp tree counting the runes in all the nodes. Ignoring character classes p.numRunes is always less than the length of the regexp. Character classes can make it much larger: each \pL adds 1292 runes. 128 MB is enough for 32M runes, which is over 26k \pL instances. Note that repetitions do not make copies of the rune slices, so \pL{1000} is only one rune slice, not 1000. We could keep a cache of character classes we've seen, so that all the \pL we see use the same rune list, but that doesn't remove the problem entirely: consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. And because the Rune slice is exposed directly in the Regexp, there is not an opportunity to change the representation to allow partial sharing between different character classes. So the limit is the best we can do.
const maxRunes = *ast.BinaryExprmaxSize is the maximum size of a compiled regexp in Insts. It too is somewhat arbitrarily chosen, but the idea is to be large enough to allow significant regexps while at the same time small enough that the compiled form will not take up too much memory. 128 MB is enough for a 3.3 million Inst structures, which roughly corresponds to a 3.3 MB regexp.
const maxSize = *ast.BinaryExprconst meta = `\.+*?()|[]{}^$`minimum and maximum runes involved in folding. checked during test.
const minFold = 0x0041const negShift = 5const noMatch = *ast.UnaryExprPseudo-ops for parsing stack.
const opLeftParen = *ast.BinaryExprconst opPseudo Op = 128Pseudo-ops for parsing stack.
const opVerticalBarvar perlGroup = map[string]charGroup{...}var posixGroup = map[string]charGroup{...}maxRunes is the maximum number of runes allowed in a regexp tree counting the runes in all the nodes. Ignoring character classes p.numRunes is always less than the length of the regexp. Character classes can make it much larger: each \pL adds 1292 runes. 128 MB is enough for 32M runes, which is over 26k \pL instances. Note that repetitions do not make copies of the rune slices, so \pL{1000} is only one rune slice, not 1000. We could keep a cache of character classes we've seen, so that all the \pL we see use the same rune list, but that doesn't remove the problem entirely: consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. And because the Rune slice is exposed directly in the Regexp, there is not an opportunity to change the representation to allow partial sharing between different character classes. So the limit is the best we can do.
const runeSize = 4An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8An ErrorCode describes a failure to parse a regular expression.
type ErrorCode stringFlags control the behavior of the parser and record information about regexp context.
type Flags uint16An InstOp is an instruction opcode.
type InstOp uint8An Op is a single regular expression operator.
type Op uint8printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
type printFlags uint8An Error describes a failure to parse a regular expression and gives the offending expression.
type Error struct {
Code ErrorCode
Expr string
}An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp
Out uint32
Arg uint32
Rune []rune
}A Prog is a compiled regular expression program.
type Prog struct {
Inst []Inst
Start int
NumCap int
}A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op
Flags Flags
Sub []*Regexp
Sub0 [1]*Regexp
Rune []rune
Rune0 [2]rune
Min int
Max int
Cap int
Name string
}type charGroup struct {
sign int
class []rune
}type compiler struct {
p *Prog
}A frag represents a compiled program fragment.
type frag struct {
i uint32
out patchList
nullable bool
}type parser struct {
flags Flags
stack []*Regexp
free *Regexp
numCap int
wholeRegexp string
tmpClass []rune
numRegexp int
numRunes int
repeats int64
height map[*Regexp]int
size map[*Regexp]int64
}A patchList is a list of instruction pointers that need to be filled in (patched). Because the pointers haven't been filled in yet, we can reuse their storage to hold the list. It's kind of sleazy, but works well in practice. See https://swtch.com/~rsc/regexp/regexp1.html for inspiration. These aren't really pointers: they're integers, so we can reinterpret them this way without using package unsafe. A value l.head denotes p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1). head == 0 denotes the empty list, okay because we start every program with a fail instruction, so we'll never want to point at its output link.
type patchList struct {
head uint32
tail uint32
}ranges implements sort.Interface on a []rune. The choice of receiver type definition is strange but avoids an allocation since we already have a *[]rune.
type ranges struct {
p *[]rune
}CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []stringCompile compiles the regexp into a program to be executed. The regexp should have been simplified already (returned from re.Simplify).
func Compile(re *Regexp) (*Prog, error)EmptyOpContext returns the zero-width assertions satisfied at the position between the runes r1 and r2. Passing r1 == -1 indicates that the position is at the beginning of the text. Passing r2 == -1 indicates that the position is at the end of the text.
func EmptyOpContext(r1 rune, r2 rune) EmptyOpEqual reports whether x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) boolfunc (e *Error) Error() stringIsWordChar reports whether r is considered a “word character” during the evaluation of the \b and \B zero-width assertions. These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) boolfunc (ra ranges) Len() intfunc (ra ranges) Less(i int, j int) boolMatchEmptyWidth reports whether the instruction matches an empty string between the runes before and after. It should only be called when i.Op == [InstEmptyWidth].
func (i *Inst) MatchEmptyWidth(before rune, after rune) boolMatchRune reports whether the instruction matches (and consumes) r. It should only be called when i.Op == [InstRune].
func (i *Inst) MatchRune(r rune) boolMatchRunePos checks whether the instruction matches (and consumes) r. If so, MatchRunePos returns the index of the matching rune pair (or, when len(i.Rune) == 1, rune singleton). If not, MatchRunePos returns -1. MatchRunePos should only be called when i.Op == [InstRune].
func (i *Inst) MatchRunePos(r rune) intMaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() intParse parses a regular expression string s, controlled by the specified Flags, and returns a regular expression parse tree. The syntax is described in the top-level comment.
func Parse(s string, flags Flags) (*Regexp, error)Prefix returns a literal string that all matches for the regexp must start with. Complete is true if the prefix is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool)Simplify returns a regexp equivalent to re but without counted repetitions and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/. The resulting regexp will execute correctly but its string representation will not produce the same parse tree, because capturing parentheses may have been duplicated or removed. For example, the simplified form for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1. The returned regexp may share structure with or be the original.
func (re *Regexp) Simplify() *RegexpStartCond returns the leading empty-width conditions that must be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOpfunc (i InstOp) String() stringfunc (i *Inst) String() stringfunc (i Op) String() stringfunc (re *Regexp) String() stringfunc (e ErrorCode) String() stringfunc (p *Prog) String() stringfunc (ra ranges) Swap(i int, j int)func _()addSpan enables the flags f around start..last, by setting flags[start] = f and flags[last] = flagOff.
func addSpan(start *Regexp, last *Regexp, f printFlags, flags *map[*Regexp]printFlags)func (c *compiler) alt(f1 frag, f2 frag) fragalternate replaces the top of the stack (above the topmost '(') with its alternation.
func (p *parser) alternate() *Regexpfunc (l1 patchList) append(p *Prog, l2 patchList) patchListappendClass returns the result of appending the class x to the class r. It assume x is clean.
func appendClass(r []rune, x []rune) []runeappendFoldedClass returns the result of appending the case folding of the class x to the class r.
func appendFoldedClass(r []rune, x []rune) []runeappendFoldedRange returns the result of appending the range lo-hi and its case folding-equivalent runes to the class r.
func appendFoldedRange(r []rune, lo rune, hi rune) []runefunc (p *parser) appendGroup(r []rune, g charGroup) []runeappendLiteral returns the result of appending the literal x to the class r.
func appendLiteral(r []rune, x rune, flags Flags) []runeappendNegatedClass returns the result of appending the negation of the class x to the class r. It assumes x is clean.
func appendNegatedClass(r []rune, x []rune) []runeappendNegatedTable returns the result of appending the negation of x to the class r.
func appendNegatedTable(r []rune, x *unicode.RangeTable) []runeappendRange returns the result of appending the range lo-hi to the class r.
func appendRange(r []rune, lo rune, hi rune) []runeappendTable returns the result of appending x to the class r.
func appendTable(r []rune, x *unicode.RangeTable) []runefunc bw(b *strings.Builder, args ...string)calcFlags calculates the flags to print around each subexpression in re, storing that information in (*flags)[sub] for each affected subexpression. The first time an entry needs to be written to *flags, calcFlags allocates the map. calcFlags also calculates the flags that must be active or can't be active around re and returns those flags.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must printFlags, cant printFlags)func (p *parser) calcHeight(re *Regexp, force bool) intfunc (p *parser) calcSize(re *Regexp, force bool) int64func (c *compiler) cap(arg uint32) fragfunc (re *Regexp) capNames(names []string)func (c *compiler) cat(f1 frag, f2 frag) fragfunc (p *parser) checkHeight(re *Regexp)func (p *parser) checkLimits(re *Regexp)func (p *parser) checkSize(re *Regexp)func checkUTF8(s string) errorcleanAlt cleans re for eventual inclusion in an alternation.
func cleanAlt(re *Regexp)cleanClass sorts the ranges (pairs of elements of r), merges them, and eliminates duplicates.
func cleanClass(rp *[]rune) []runecollapse returns the result of applying op to sub. If sub contains op nodes, they all get hoisted up so that there is never a concat of a concat or an alternate of an alternate.
func (p *parser) collapse(subs []*Regexp, op Op) *Regexpfunc (c *compiler) compile(re *Regexp) fragconcat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
func (p *parser) concat() *Regexpfunc dumpInst(b *strings.Builder, i *Inst)func dumpProg(b *strings.Builder, p *Prog)func (c *compiler) empty(op EmptyOp) fragfunc escape(b *strings.Builder, r rune, force bool)factor factors common prefixes from the alternation list sub. It returns a replacement list that reuses the same storage and frees (passes to p.reuse) any removed *Regexps. For example, ABC|ABD|AEF|BCX|BCY simplifies by literal prefix extraction to A(B(C|D)|EF)|BC(X|Y) which simplifies by character class introduction to A(B[CD]|EF)|BC[XY]
func (p *parser) factor(sub []*Regexp) []*Regexpfunc (c *compiler) fail() fraginCharClass reports whether r is in the class. It assumes the class has been cleaned by cleanClass.
func inCharClass(r rune, class []rune) boolfunc (c *compiler) init()func (c *compiler) inst(op InstOp) fragcan this be represented as a character class? single-rune literal string, char class, ., and .|\n.
func isCharClass(re *Regexp) boolisValidCaptureName reports whether name is a valid capture name: [A-Za-z0-9_]+. PCRE limits names to 32 bytes. Python rejects names starting with digits. We don't enforce either of those.
func isValidCaptureName(name string) boolfunc isalnum(c rune) boolleadingRegexp returns the leading regexp that re begins with. The regexp refers to storage in re or its children.
func (p *parser) leadingRegexp(re *Regexp) *RegexpleadingString returns the leading literal string that re begins with. The string refers to storage in re or its children.
func (p *parser) leadingString(re *Regexp) ([]rune, Flags)literal pushes a literal regexp for the rune r on the stack.
func (p *parser) literal(r rune)func literalRegexp(s string, flags Flags) *Regexploop returns the fragment for the main loop of a plus or star. For plus, it can be used after changing the entry to f1.i. For star, it can be used directly when f1 can't match an empty string. (When f1 can match an empty string, f1* must be implemented as (f1+)? to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) fragfunc makePatchList(n uint32) patchListdoes re match r?
func matchRune(re *Regexp, r rune) boolmaybeConcat implements incremental concatenation of literal runes into string nodes. The parser calls this before each push, so only the top fragment of the stack might need processing. Since this is called before a push, the topmost literal is no longer subject to operators like * (Otherwise ab* would turn into (ab)*.) If r >= 0 and there's a node left over, maybeConcat uses it to push r with the given flags. maybeConcat reports whether r was pushed.
func (p *parser) maybeConcat(r rune, flags Flags) boolmergeCharClass makes dst = dst|src. The caller must ensure that dst.Op >= src.Op, to reduce the amount of copying.
func mergeCharClass(dst *Regexp, src *Regexp)minFoldRune returns the minimum rune fold-equivalent to r.
func minFoldRune(r rune) runenegateClass overwrites r and returns r's negation. It assumes the class r is already clean.
func negateClass(r []rune) []runefunc (p *parser) newRegexp(op Op) *Regexpfunc nextRune(s string) (c rune, t string, err error)func (c *compiler) nop() fragop pushes a regexp with the given op onto the stack and returns that regexp.
func (p *parser) op(op Op) *Regexpop returns i.Op but merges all the Rune special cases into InstRune
func (i *Inst) op() InstOpfunc parse(s string, flags Flags) (_ *Regexp, err error)parseClass parses a character class at the beginning of s and pushes it onto the parse stack.
func (p *parser) parseClass(s string) (rest string, err error)parseClassChar parses a character class character at the beginning of s and returns it.
func (p *parser) parseClassChar(s string, wholeClass string) (r rune, rest string, err error)parseEscape parses an escape sequence at the beginning of s and returns the rune.
func (p *parser) parseEscape(s string) (r rune, rest string, err error)parseInt parses a decimal integer.
func (p *parser) parseInt(s string) (n int, rest string, ok bool)parseNamedClass parses a leading POSIX named character class like [:alnum:] from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error)parsePerlClassEscape parses a leading Perl character class escape like \d from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string)parsePerlFlags parses a Perl flag setting or non-capturing group or both, like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. The caller must have ensured that s begins with "(?".
func (p *parser) parsePerlFlags(s string) (rest string, err error)parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. If s is not of that form, it returns ok == false. If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min int, max int, rest string, ok bool)parseRightParen handles a ) in the input.
func (p *parser) parseRightParen() errorparseUnicodeClass parses a leading Unicode character class like \p{Han} from the beginning of s. If one is present, it appends the characters to r and returns the new slice r and the remainder of the string.
func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error)parseVerticalBar handles a | in the input.
func (p *parser) parseVerticalBar()func (l patchList) patch(p *Prog, val uint32)func (c *compiler) plus(f1 frag, nongreedy bool) fragpush pushes the regexp re onto the parse stack and returns the regexp.
func (p *parser) push(re *Regexp) *Regexpfunc (c *compiler) quest(f1 frag, nongreedy bool) fragremoveLeadingRegexp removes the leading regexp in re. It returns the replacement for re. If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *RegexpremoveLeadingString removes the first n leading runes from the beginning of re. It returns the replacement for re.
func (p *parser) removeLeadingString(re *Regexp, n int) *Regexprepeat replaces the top stack element with itself repeated according to op, min, max. before is the regexp suffix starting at the repetition operator. after is the regexp suffix following after the repetition operator. repeat returns an updated 'after' and an error, if any.
func (p *parser) repeat(op Op, min int, max int, before string, after string, lastRepeat string) (string, error)repeatIsValid reports whether the repetition re is valid. Valid means that the combination of the top-level repetition and any inner repetitions does not exceed n copies of the innermost thing. This function rewalks the regexp tree and is called for every repetition, so we have to worry about inducing quadratic behavior in the parser. We avoid this by only calling repeatIsValid when min or max >= 2. In that case the depth of any >= 2 nesting can only get to 9 without triggering a parse error, so each subtree can only be rewalked 9 times.
func repeatIsValid(re *Regexp, n int) boolfunc (p *parser) reuse(re *Regexp)func (c *compiler) rune(r []rune, flags Flags) fragsimplify1 implements Simplify for the unary OpStar, OpPlus, and OpQuest operators. It returns the simple regexp equivalent to Regexp{Op: op, Flags: flags, Sub: {sub}} under the assumption that sub is already simple, and without first allocating that structure. If the regexp to be returned turns out to be equivalent to re, simplify1 returns re instead. simplify1 is factored out of Simplify because the implementation for other operators generates these unary expressions. Letting them call simplify1 makes sure the expressions they generate are simple.
func simplify1(op Op, flags Flags, sub *Regexp, re *Regexp) *RegexpskipNop follows any no-op or capturing instructions.
func (p *Prog) skipNop(pc uint32) *Instfunc (c *compiler) star(f1 frag, nongreedy bool) fragIf the top of the stack is an element followed by an opVerticalBar swapVerticalBar swaps the two and returns true. Otherwise it returns false.
func (p *parser) swapVerticalBar() boolfunc u32(i uint32) stringfunc unhex(c rune) runeunicodeTable returns the unicode.RangeTable identified by name and the table of additional fold-equivalent code points.
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable)writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags)Generated with Arrow