mirror of https://github.com/grafana/loki
filters: use faster regexp package (#5315)
* regexp-filter: improve benchmark Add a few more cases based on real-world usage. Also simplify the loop to just run the number of times requested. Go benchmarks run 'N' times to reach a duration, default 1 second. Previously the benchmark was running N*1000000 times, which took 7 seconds minimum for some cases. * regexp filter: use modified package with optimisations See https://github.com/grafana/regexp/tree/speedup#readme Includes the following changes proposed upstream: * [regexp: allow patterns with no alternates to be one-pass](https://go-review.googlesource.com/c/go/+/353711) * [regexp: speed up onepass prefix check](https://go-review.googlesource.com/c/go/+/354909) * [regexp: handle prefix string with fold-case](https://go-review.googlesource.com/c/go/+/358756) * [regexp: avoid copying each instruction executed](https://go-review.googlesource.com/c/go/+/355789) * [regexp: allow prefix string anchored at beginning](https://go-review.googlesource.com/c/go/+/377294) * Add grafana/regexp to vendor directorypull/5337/head
parent
f598484a94
commit
a50cac7674
@ -0,0 +1,15 @@ |
||||
# Binaries for programs and plugins |
||||
*.exe |
||||
*.exe~ |
||||
*.dll |
||||
*.so |
||||
*.dylib |
||||
|
||||
# Test binary, built with `go test -c` |
||||
*.test |
||||
|
||||
# Output of the go coverage tool, specifically when used with LiteIDE |
||||
*.out |
||||
|
||||
# Dependency directories (remove the comment below to include it) |
||||
# vendor/ |
@ -0,0 +1,27 @@ |
||||
Copyright (c) 2009 The Go Authors. All rights reserved. |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions are |
||||
met: |
||||
|
||||
* Redistributions of source code must retain the above copyright |
||||
notice, this list of conditions and the following disclaimer. |
||||
* Redistributions in binary form must reproduce the above |
||||
copyright notice, this list of conditions and the following disclaimer |
||||
in the documentation and/or other materials provided with the |
||||
distribution. |
||||
* Neither the name of Google Inc. nor the names of its |
||||
contributors may be used to endorse or promote products derived from |
||||
this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,19 @@ |
||||
# Grafana Go regexp package |
||||
This repo is a fork of the upstream Go `regexp` package, with some code optimisations to make it run faster. |
||||
|
||||
All the optimisations have been submitted upstream, but not yet merged. |
||||
|
||||
All semantics are the same, and the optimised code passes all tests from upstream. |
||||
|
||||
The `main` branch is non-optimised: switch over to the [`speedup`](https://github.com/grafana/regexp/tree/speedup) branch for the improved code. |
||||
|
||||
## Benchmarks: |
||||
|
||||
 |
||||
|
||||
## Links to upstream changes: |
||||
* [regexp: allow patterns with no alternates to be one-pass](https://go-review.googlesource.com/c/go/+/353711) |
||||
* [regexp: speed up onepass prefix check](https://go-review.googlesource.com/c/go/+/354909) |
||||
* [regexp: handle prefix string with fold-case](https://go-review.googlesource.com/c/go/+/358756) |
||||
* [regexp: avoid copying each instruction executed](https://go-review.googlesource.com/c/go/+/355789) |
||||
* [regexp: allow prefix string anchored at beginning](https://go-review.googlesource.com/c/go/+/377294) |
@ -0,0 +1,369 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// backtrack is a regular expression search with submatch
|
||||
// tracking for small regular expressions and texts. It allocates
|
||||
// a bit vector with (length of input) * (length of prog) bits,
|
||||
// to make sure it never explores the same (character position, instruction)
|
||||
// state multiple times. This limits the search to run in time linear in
|
||||
// the length of the text.
|
||||
//
|
||||
// backtrack is a fast replacement for the NFA code on small
|
||||
// regexps when onepass cannot be used.
|
||||
|
||||
package regexp |
||||
|
||||
import ( |
||||
"sync" |
||||
|
||||
"github.com/grafana/regexp/syntax" |
||||
) |
||||
|
||||
// job is one entry on the backtracker's job stack.
//
// pc is the instruction to execute and pos is the input position to
// execute it at. arg marks a job that resumes an instruction which has
// already been visited once (see bitState.push); for a resumed capture
// instruction, pos carries the capture value to restore.
type job struct {
	pc  uint32
	arg bool
	pos int
}
||||
|
||||
const (
	// visitedBits is the number of bits stored in each element of
	// bitState.visited.
	visitedBits = 32

	// maxBacktrackProg bounds the program size the backtracker accepts:
	// len(prog.Inst) <= maxBacktrackProg.
	maxBacktrackProg = 500

	// maxBacktrackVector bounds the visited bit vector, measured in bits
	// (256 Ki bits).
	maxBacktrackVector = 1 << 18
)
||||
|
||||
// bitState holds state for the backtracker.
|
||||
type bitState struct { |
||||
end int |
||||
cap []int |
||||
matchcap []int |
||||
jobs []job |
||||
visited []uint32 |
||||
|
||||
inputs inputs |
||||
} |
||||
|
||||
var bitStatePool sync.Pool |
||||
|
||||
func newBitState() *bitState { |
||||
b, ok := bitStatePool.Get().(*bitState) |
||||
if !ok { |
||||
b = new(bitState) |
||||
} |
||||
return b |
||||
} |
||||
|
||||
func freeBitState(b *bitState) { |
||||
b.inputs.clear() |
||||
bitStatePool.Put(b) |
||||
} |
||||
|
||||
// maxBitStateLen returns the maximum length of a string to search with
|
||||
// the backtracker using prog.
|
||||
func maxBitStateLen(prog *syntax.Prog) int { |
||||
if !shouldBacktrack(prog) { |
||||
return 0 |
||||
} |
||||
return maxBacktrackVector / len(prog.Inst) |
||||
} |
||||
|
||||
// shouldBacktrack reports whether the program is too
|
||||
// long for the backtracker to run.
|
||||
func shouldBacktrack(prog *syntax.Prog) bool { |
||||
return len(prog.Inst) <= maxBacktrackProg |
||||
} |
||||
|
||||
// reset resets the state of the backtracker.
|
||||
// end is the end position in the input.
|
||||
// ncap is the number of captures.
|
||||
func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) { |
||||
b.end = end |
||||
|
||||
if cap(b.jobs) == 0 { |
||||
b.jobs = make([]job, 0, 256) |
||||
} else { |
||||
b.jobs = b.jobs[:0] |
||||
} |
||||
|
||||
visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits |
||||
if cap(b.visited) < visitedSize { |
||||
b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) |
||||
} else { |
||||
b.visited = b.visited[:visitedSize] |
||||
for i := range b.visited { |
||||
b.visited[i] = 0 |
||||
} |
||||
} |
||||
|
||||
if cap(b.cap) < ncap { |
||||
b.cap = make([]int, ncap) |
||||
} else { |
||||
b.cap = b.cap[:ncap] |
||||
} |
||||
for i := range b.cap { |
||||
b.cap[i] = -1 |
||||
} |
||||
|
||||
if cap(b.matchcap) < ncap { |
||||
b.matchcap = make([]int, ncap) |
||||
} else { |
||||
b.matchcap = b.matchcap[:ncap] |
||||
} |
||||
for i := range b.matchcap { |
||||
b.matchcap[i] = -1 |
||||
} |
||||
} |
||||
|
||||
// shouldVisit reports whether the combination of (pc, pos) has not
|
||||
// been visited yet.
|
||||
func (b *bitState) shouldVisit(pc uint32, pos int) bool { |
||||
n := uint(int(pc)*(b.end+1) + pos) |
||||
if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { |
||||
return false |
||||
} |
||||
b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) |
||||
return true |
||||
} |
||||
|
||||
// push pushes (pc, pos, arg) onto the job stack if it should be
|
||||
// visited.
|
||||
func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) { |
||||
// Only check shouldVisit when arg is false.
|
||||
// When arg is true, we are continuing a previous visit.
|
||||
if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) { |
||||
b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) |
||||
} |
||||
} |
||||
|
||||
// tryBacktrack runs a backtracking search starting at pos.
|
||||
func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { |
||||
longest := re.longest |
||||
|
||||
b.push(re, pc, pos, false) |
||||
for len(b.jobs) > 0 { |
||||
l := len(b.jobs) - 1 |
||||
// Pop job off the stack.
|
||||
pc := b.jobs[l].pc |
||||
pos := b.jobs[l].pos |
||||
arg := b.jobs[l].arg |
||||
b.jobs = b.jobs[:l] |
||||
|
||||
// Optimization: rather than push and pop,
|
||||
// code that is going to Push and continue
|
||||
// the loop simply updates ip, p, and arg
|
||||
// and jumps to CheckAndLoop. We have to
|
||||
// do the ShouldVisit check that Push
|
||||
// would have, but we avoid the stack
|
||||
// manipulation.
|
||||
goto Skip |
||||
CheckAndLoop: |
||||
if !b.shouldVisit(pc, pos) { |
||||
continue |
||||
} |
||||
Skip: |
||||
|
||||
inst := &re.prog.Inst[pc] |
||||
|
||||
switch inst.Op { |
||||
default: |
||||
panic("bad inst") |
||||
case syntax.InstFail: |
||||
panic("unexpected InstFail") |
||||
case syntax.InstAlt: |
||||
// Cannot just
|
||||
// b.push(inst.Out, pos, false)
|
||||
// b.push(inst.Arg, pos, false)
|
||||
// If during the processing of inst.Out, we encounter
|
||||
// inst.Arg via another path, we want to process it then.
|
||||
// Pushing it here will inhibit that. Instead, re-push
|
||||
// inst with arg==true as a reminder to push inst.Arg out
|
||||
// later.
|
||||
if arg { |
||||
// Finished inst.Out; try inst.Arg.
|
||||
arg = false |
||||
pc = inst.Arg |
||||
goto CheckAndLoop |
||||
} else { |
||||
b.push(re, pc, pos, true) |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
} |
||||
|
||||
case syntax.InstAltMatch: |
||||
// One opcode consumes runes; the other leads to match.
|
||||
switch re.prog.Inst[inst.Out].Op { |
||||
case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: |
||||
// inst.Arg is the match.
|
||||
b.push(re, inst.Arg, pos, false) |
||||
pc = inst.Arg |
||||
pos = b.end |
||||
goto CheckAndLoop |
||||
} |
||||
// inst.Out is the match - non-greedy
|
||||
b.push(re, inst.Out, b.end, false) |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstRune: |
||||
r, width := i.step(pos) |
||||
if !inst.MatchRune(r) { |
||||
continue |
||||
} |
||||
pos += width |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstRune1: |
||||
r, width := i.step(pos) |
||||
if r != inst.Rune[0] { |
||||
continue |
||||
} |
||||
pos += width |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstRuneAnyNotNL: |
||||
r, width := i.step(pos) |
||||
if r == '\n' || r == endOfText { |
||||
continue |
||||
} |
||||
pos += width |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstRuneAny: |
||||
r, width := i.step(pos) |
||||
if r == endOfText { |
||||
continue |
||||
} |
||||
pos += width |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstCapture: |
||||
if arg { |
||||
// Finished inst.Out; restore the old value.
|
||||
b.cap[inst.Arg] = pos |
||||
continue |
||||
} else { |
||||
if inst.Arg < uint32(len(b.cap)) { |
||||
// Capture pos to register, but save old value.
|
||||
b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
|
||||
b.cap[inst.Arg] = pos |
||||
} |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
} |
||||
|
||||
case syntax.InstEmptyWidth: |
||||
flag := i.context(pos) |
||||
if !flag.match(syntax.EmptyOp(inst.Arg)) { |
||||
continue |
||||
} |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstNop: |
||||
pc = inst.Out |
||||
goto CheckAndLoop |
||||
|
||||
case syntax.InstMatch: |
||||
// We found a match. If the caller doesn't care
|
||||
// where the match is, no point going further.
|
||||
if len(b.cap) == 0 { |
||||
return true |
||||
} |
||||
|
||||
// Record best match so far.
|
||||
// Only need to check end point, because this entire
|
||||
// call is only considering one start position.
|
||||
if len(b.cap) > 1 { |
||||
b.cap[1] = pos |
||||
} |
||||
if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) { |
||||
copy(b.matchcap, b.cap) |
||||
} |
||||
|
||||
// If going for first match, we're done.
|
||||
if !longest { |
||||
return true |
||||
} |
||||
|
||||
// If we used the entire text, no longer match is possible.
|
||||
if pos == b.end { |
||||
return true |
||||
} |
||||
|
||||
// Otherwise, continue on in hope of a longer match.
|
||||
continue |
||||
} |
||||
} |
||||
|
||||
return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0 |
||||
} |
||||
|
||||
// backtrack runs a backtracking search of prog on the input starting at pos.
|
||||
func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int { |
||||
startCond := re.cond |
||||
if startCond == ^syntax.EmptyOp(0) { // impossible
|
||||
return nil |
||||
} |
||||
if startCond&syntax.EmptyBeginText != 0 && pos != 0 { |
||||
// Anchored match, past beginning of text.
|
||||
return nil |
||||
} |
||||
|
||||
b := newBitState() |
||||
i, end := b.inputs.init(nil, ib, is) |
||||
b.reset(re.prog, end, ncap) |
||||
|
||||
// Anchored search must start at the beginning of the input
|
||||
if startCond&syntax.EmptyBeginText != 0 { |
||||
if len(b.cap) > 0 { |
||||
b.cap[0] = pos |
||||
} |
||||
if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { |
||||
freeBitState(b) |
||||
return nil |
||||
} |
||||
} else { |
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is pos <= end, not pos < end.
|
||||
// This looks like it's quadratic in the size of the text,
|
||||
// but we are not clearing visited between calls to TrySearch,
|
||||
// so no work is duplicated and it ends up still being linear.
|
||||
width := -1 |
||||
for pos <= end && width != 0 { |
||||
if len(b.cap) > 0 { |
||||
b.cap[0] = pos |
||||
} |
||||
if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { |
||||
// Match must be leftmost; done.
|
||||
goto Match |
||||
} |
||||
_, width = i.step(pos) |
||||
pos += width |
||||
|
||||
if len(re.prefix) > 0 { |
||||
// Match requires literal prefix; fast search for next occurrence.
|
||||
advance := i.index(re, pos) |
||||
if advance < 0 { |
||||
freeBitState(b) |
||||
return nil |
||||
} |
||||
pos += advance |
||||
} |
||||
} |
||||
freeBitState(b) |
||||
return nil |
||||
} |
||||
|
||||
Match: |
||||
dstCap = append(dstCap, b.matchcap...) |
||||
freeBitState(b) |
||||
return dstCap |
||||
} |
@ -0,0 +1,579 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package regexp |
||||
|
||||
import ( |
||||
"io" |
||||
"sync" |
||||
|
||||
"github.com/grafana/regexp/syntax" |
||||
) |
||||
|
||||
// A queue is a 'sparse array' holding pending threads of execution.
|
||||
// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
|
||||
type queue struct { |
||||
sparse []uint32 |
||||
dense []entry |
||||
} |
||||
|
||||
// An entry is an entry on a queue.
|
||||
// It holds both the instruction pc and the actual thread.
|
||||
// Some queue entries are just place holders so that the machine
|
||||
// knows it has considered that pc. Such entries have t == nil.
|
||||
type entry struct { |
||||
pc uint32 |
||||
t *thread |
||||
} |
||||
|
||||
// thread is the state of a single path through the machine: the
// instruction it is positioned at and the capture array for that path.
// See https://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
	inst *syntax.Inst
	cap  []int
}
||||
|
||||
// A machine holds all the state during an NFA simulation for p.
|
||||
type machine struct { |
||||
re *Regexp // corresponding Regexp
|
||||
p *syntax.Prog // compiled program
|
||||
q0, q1 queue // two queues for runq, nextq
|
||||
pool []*thread // pool of available threads
|
||||
matched bool // whether a match was found
|
||||
matchcap []int // capture information for the match
|
||||
|
||||
inputs inputs |
||||
} |
||||
|
||||
type inputs struct { |
||||
// cached inputs, to avoid allocation
|
||||
bytes inputBytes |
||||
string inputString |
||||
reader inputReader |
||||
} |
||||
|
||||
func (i *inputs) newBytes(b []byte) input { |
||||
i.bytes.str = b |
||||
return &i.bytes |
||||
} |
||||
|
||||
func (i *inputs) newString(s string) input { |
||||
i.string.str = s |
||||
return &i.string |
||||
} |
||||
|
||||
func (i *inputs) newReader(r io.RuneReader) input { |
||||
i.reader.r = r |
||||
i.reader.atEOT = false |
||||
i.reader.pos = 0 |
||||
return &i.reader |
||||
} |
||||
|
||||
func (i *inputs) clear() { |
||||
// We need to clear 1 of these.
|
||||
// Avoid the expense of clearing the others (pointer write barrier).
|
||||
if i.bytes.str != nil { |
||||
i.bytes.str = nil |
||||
} else if i.reader.r != nil { |
||||
i.reader.r = nil |
||||
} else { |
||||
i.string.str = "" |
||||
} |
||||
} |
||||
|
||||
func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) { |
||||
if r != nil { |
||||
return i.newReader(r), 0 |
||||
} |
||||
if b != nil { |
||||
return i.newBytes(b), len(b) |
||||
} |
||||
return i.newString(s), len(s) |
||||
} |
||||
|
||||
func (m *machine) init(ncap int) { |
||||
for _, t := range m.pool { |
||||
t.cap = t.cap[:ncap] |
||||
} |
||||
m.matchcap = m.matchcap[:ncap] |
||||
} |
||||
|
||||
// alloc allocates a new thread with the given instruction.
|
||||
// It uses the free pool if possible.
|
||||
func (m *machine) alloc(i *syntax.Inst) *thread { |
||||
var t *thread |
||||
if n := len(m.pool); n > 0 { |
||||
t = m.pool[n-1] |
||||
m.pool = m.pool[:n-1] |
||||
} else { |
||||
t = new(thread) |
||||
t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) |
||||
} |
||||
t.inst = i |
||||
return t |
||||
} |
||||
|
||||
// A lazyFlag is a lazily-evaluated syntax.EmptyOp,
|
||||
// for checking zero-width flags like ^ $ \A \z \B \b.
|
||||
// It records the pair of relevant runes and does not
|
||||
// determine the implied flags until absolutely necessary
|
||||
// (most of the time, that means never).
|
||||
type lazyFlag uint64 |
||||
|
||||
func newLazyFlag(r1, r2 rune) lazyFlag { |
||||
return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) |
||||
} |
||||
|
||||
func (f lazyFlag) match(op syntax.EmptyOp) bool { |
||||
if op == 0 { |
||||
return true |
||||
} |
||||
r1 := rune(f >> 32) |
||||
if op&syntax.EmptyBeginLine != 0 { |
||||
if r1 != '\n' && r1 >= 0 { |
||||
return false |
||||
} |
||||
op &^= syntax.EmptyBeginLine |
||||
} |
||||
if op&syntax.EmptyBeginText != 0 { |
||||
if r1 >= 0 { |
||||
return false |
||||
} |
||||
op &^= syntax.EmptyBeginText |
||||
} |
||||
if op == 0 { |
||||
return true |
||||
} |
||||
r2 := rune(f) |
||||
if op&syntax.EmptyEndLine != 0 { |
||||
if r2 != '\n' && r2 >= 0 { |
||||
return false |
||||
} |
||||
op &^= syntax.EmptyEndLine |
||||
} |
||||
if op&syntax.EmptyEndText != 0 { |
||||
if r2 >= 0 { |
||||
return false |
||||
} |
||||
op &^= syntax.EmptyEndText |
||||
} |
||||
if op == 0 { |
||||
return true |
||||
} |
||||
if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) { |
||||
op &^= syntax.EmptyWordBoundary |
||||
} else { |
||||
op &^= syntax.EmptyNoWordBoundary |
||||
} |
||||
return op == 0 |
||||
} |
||||
|
||||
// match runs the machine over the input starting at pos.
|
||||
// It reports whether a match was found.
|
||||
// If so, m.matchcap holds the submatch information.
|
||||
func (m *machine) match(i input, pos int) bool { |
||||
startCond := m.re.cond |
||||
if startCond == ^syntax.EmptyOp(0) { // impossible
|
||||
return false |
||||
} |
||||
m.matched = false |
||||
for i := range m.matchcap { |
||||
m.matchcap[i] = -1 |
||||
} |
||||
runq, nextq := &m.q0, &m.q1 |
||||
r, r1 := endOfText, endOfText |
||||
width, width1 := 0, 0 |
||||
r, width = i.step(pos) |
||||
if r != endOfText { |
||||
r1, width1 = i.step(pos + width) |
||||
} |
||||
var flag lazyFlag |
||||
if pos == 0 { |
||||
flag = newLazyFlag(-1, r) |
||||
} else { |
||||
flag = i.context(pos) |
||||
} |
||||
for { |
||||
if len(runq.dense) == 0 { |
||||
if startCond&syntax.EmptyBeginText != 0 && pos != 0 { |
||||
// Anchored match, past beginning of text.
|
||||
break |
||||
} |
||||
if m.matched { |
||||
// Have match; finished exploring alternatives.
|
||||
break |
||||
} |
||||
// Note we don't check foldCase here, because Unicode folding is complicated;
|
||||
// just let it fall through to EqualFold on the whole string.
|
||||
if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { |
||||
// Match requires literal prefix; fast search for it.
|
||||
advance := i.index(m.re, pos) |
||||
if advance < 0 { |
||||
break |
||||
} |
||||
pos += advance |
||||
r, width = i.step(pos) |
||||
r1, width1 = i.step(pos + width) |
||||
} |
||||
} |
||||
if !m.matched { |
||||
if len(m.matchcap) > 0 { |
||||
m.matchcap[0] = pos |
||||
} |
||||
m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil) |
||||
} |
||||
flag = newLazyFlag(r, r1) |
||||
m.step(runq, nextq, pos, pos+width, r, &flag) |
||||
if width == 0 { |
||||
break |
||||
} |
||||
if len(m.matchcap) == 0 && m.matched { |
||||
// Found a match and not paying attention
|
||||
// to where it is, so any match will do.
|
||||
break |
||||
} |
||||
pos += width |
||||
r, width = r1, width1 |
||||
if r != endOfText { |
||||
r1, width1 = i.step(pos + width) |
||||
} |
||||
runq, nextq = nextq, runq |
||||
} |
||||
m.clear(nextq) |
||||
return m.matched |
||||
} |
||||
|
||||
// clear frees all threads on the thread queue.
|
||||
func (m *machine) clear(q *queue) { |
||||
for _, d := range q.dense { |
||||
if d.t != nil { |
||||
m.pool = append(m.pool, d.t) |
||||
} |
||||
} |
||||
q.dense = q.dense[:0] |
||||
} |
||||
|
||||
// step executes one step of the machine, running each of the threads
|
||||
// on runq and appending new threads to nextq.
|
||||
// The step processes the rune c (which may be endOfText),
|
||||
// which starts at position pos and ends at nextPos.
|
||||
// nextCond gives the setting for the empty-width flags after c.
|
||||
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) { |
||||
longest := m.re.longest |
||||
for j := 0; j < len(runq.dense); j++ { |
||||
d := &runq.dense[j] |
||||
t := d.t |
||||
if t == nil { |
||||
continue |
||||
} |
||||
if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { |
||||
m.pool = append(m.pool, t) |
||||
continue |
||||
} |
||||
i := t.inst |
||||
add := false |
||||
switch i.Op { |
||||
default: |
||||
panic("bad inst") |
||||
|
||||
case syntax.InstMatch: |
||||
if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { |
||||
t.cap[1] = pos |
||||
copy(m.matchcap, t.cap) |
||||
} |
||||
if !longest { |
||||
// First-match mode: cut off all lower-priority threads.
|
||||
for _, d := range runq.dense[j+1:] { |
||||
if d.t != nil { |
||||
m.pool = append(m.pool, d.t) |
||||
} |
||||
} |
||||
runq.dense = runq.dense[:0] |
||||
} |
||||
m.matched = true |
||||
|
||||
case syntax.InstRune: |
||||
add = i.MatchRune(c) |
||||
case syntax.InstRune1: |
||||
add = c == i.Rune[0] |
||||
case syntax.InstRuneAny: |
||||
add = true |
||||
case syntax.InstRuneAnyNotNL: |
||||
add = c != '\n' |
||||
} |
||||
if add { |
||||
t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) |
||||
} |
||||
if t != nil { |
||||
m.pool = append(m.pool, t) |
||||
} |
||||
} |
||||
runq.dense = runq.dense[:0] |
||||
} |
||||
|
||||
// add adds an entry to q for pc, unless the q already has such an entry.
|
||||
// It also recursively adds an entry for all instructions reachable from pc by following
|
||||
// empty-width conditions satisfied by cond. pos gives the current position
|
||||
// in the input.
|
||||
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread { |
||||
Again: |
||||
if pc == 0 { |
||||
return t |
||||
} |
||||
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { |
||||
return t |
||||
} |
||||
|
||||
j := len(q.dense) |
||||
q.dense = q.dense[:j+1] |
||||
d := &q.dense[j] |
||||
d.t = nil |
||||
d.pc = pc |
||||
q.sparse[pc] = uint32(j) |
||||
|
||||
i := &m.p.Inst[pc] |
||||
switch i.Op { |
||||
default: |
||||
panic("unhandled") |
||||
case syntax.InstFail: |
||||
// nothing
|
||||
case syntax.InstAlt, syntax.InstAltMatch: |
||||
t = m.add(q, i.Out, pos, cap, cond, t) |
||||
pc = i.Arg |
||||
goto Again |
||||
case syntax.InstEmptyWidth: |
||||
if cond.match(syntax.EmptyOp(i.Arg)) { |
||||
pc = i.Out |
||||
goto Again |
||||
} |
||||
case syntax.InstNop: |
||||
pc = i.Out |
||||
goto Again |
||||
case syntax.InstCapture: |
||||
if int(i.Arg) < len(cap) { |
||||
opos := cap[i.Arg] |
||||
cap[i.Arg] = pos |
||||
m.add(q, i.Out, pos, cap, cond, nil) |
||||
cap[i.Arg] = opos |
||||
} else { |
||||
pc = i.Out |
||||
goto Again |
||||
} |
||||
case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: |
||||
if t == nil { |
||||
t = m.alloc(i) |
||||
} else { |
||||
t.inst = i |
||||
} |
||||
if len(cap) > 0 && &t.cap[0] != &cap[0] { |
||||
copy(t.cap, cap) |
||||
} |
||||
d.t = t |
||||
t = nil |
||||
} |
||||
return t |
||||
} |
||||
|
||||
type onePassMachine struct { |
||||
inputs inputs |
||||
matchcap []int |
||||
} |
||||
|
||||
var onePassPool sync.Pool |
||||
|
||||
func newOnePassMachine() *onePassMachine { |
||||
m, ok := onePassPool.Get().(*onePassMachine) |
||||
if !ok { |
||||
m = new(onePassMachine) |
||||
} |
||||
return m |
||||
} |
||||
|
||||
func freeOnePassMachine(m *onePassMachine) { |
||||
m.inputs.clear() |
||||
onePassPool.Put(m) |
||||
} |
||||
|
||||
// doOnePass implements r.doExecute using the one-pass execution engine.
|
||||
func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int { |
||||
startCond := re.cond |
||||
if startCond == ^syntax.EmptyOp(0) { // impossible
|
||||
return nil |
||||
} |
||||
|
||||
m := newOnePassMachine() |
||||
matched := false |
||||
i, _ := m.inputs.init(ir, ib, is) |
||||
|
||||
r, r1 := endOfText, endOfText |
||||
width, width1 := 0, 0 |
||||
var flag lazyFlag |
||||
var pc int |
||||
var inst *onePassInst |
||||
|
||||
// If there is a simple literal prefix, skip over it.
|
||||
if pos == 0 && len(re.prefix) > 0 && i.canCheckPrefix() { |
||||
// Match requires literal prefix; fast search for it.
|
||||
if !i.hasPrefix(re) { |
||||
goto Return |
||||
} |
||||
pos += len(re.prefix) |
||||
pc = int(re.prefixEnd) |
||||
} else { |
||||
pc = re.onepass.Start |
||||
} |
||||
|
||||
if cap(m.matchcap) < ncap { |
||||
m.matchcap = make([]int, ncap) |
||||
} else { |
||||
m.matchcap = m.matchcap[:ncap] |
||||
} |
||||
for i := range m.matchcap { |
||||
m.matchcap[i] = -1 |
||||
} |
||||
|
||||
r, width = i.step(pos) |
||||
if pos == 0 { |
||||
flag = newLazyFlag(-1, r) |
||||
} else { |
||||
flag = i.context(pos) |
||||
} |
||||
if r != endOfText { |
||||
r1, width1 = i.step(pos + width) |
||||
} |
||||
for { |
||||
inst = &re.onepass.Inst[pc] |
||||
pc = int(inst.Out) |
||||
switch inst.Op { |
||||
default: |
||||
panic("bad inst") |
||||
case syntax.InstMatch: |
||||
matched = true |
||||
if len(m.matchcap) > 0 { |
||||
m.matchcap[0] = 0 |
||||
m.matchcap[1] = pos |
||||
} |
||||
goto Return |
||||
case syntax.InstRune: |
||||
if !inst.MatchRune(r) { |
||||
goto Return |
||||
} |
||||
case syntax.InstRune1: |
||||
if r != inst.Rune[0] { |
||||
goto Return |
||||
} |
||||
case syntax.InstRuneAny: |
||||
// Nothing
|
||||
case syntax.InstRuneAnyNotNL: |
||||
if r == '\n' { |
||||
goto Return |
||||
} |
||||
// peek at the input rune to see which branch of the Alt to take
|
||||
case syntax.InstAlt, syntax.InstAltMatch: |
||||
pc = int(onePassNext(inst, r)) |
||||
continue |
||||
case syntax.InstFail: |
||||
goto Return |
||||
case syntax.InstNop: |
||||
continue |
||||
case syntax.InstEmptyWidth: |
||||
if !flag.match(syntax.EmptyOp(inst.Arg)) { |
||||
goto Return |
||||
} |
||||
continue |
||||
case syntax.InstCapture: |
||||
if int(inst.Arg) < len(m.matchcap) { |
||||
m.matchcap[inst.Arg] = pos |
||||
} |
||||
continue |
||||
} |
||||
if width == 0 { |
||||
break |
||||
} |
||||
flag = newLazyFlag(r, r1) |
||||
pos += width |
||||
r, width = r1, width1 |
||||
if r != endOfText { |
||||
r1, width1 = i.step(pos + width) |
||||
} |
||||
} |
||||
|
||||
Return: |
||||
if !matched { |
||||
freeOnePassMachine(m) |
||||
return nil |
||||
} |
||||
|
||||
dstCap = append(dstCap, m.matchcap...) |
||||
freeOnePassMachine(m) |
||||
return dstCap |
||||
} |
||||
|
||||
// doMatch reports whether either r, b or s match the regexp.
|
||||
func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool { |
||||
return re.doExecute(r, b, s, 0, 0, nil) != nil |
||||
} |
||||
|
||||
// doExecute finds the leftmost match in the input, appends the position
|
||||
// of its subexpressions to dstCap and returns dstCap.
|
||||
//
|
||||
// nil is returned if no matches are found and non-nil if matches are found.
|
||||
func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { |
||||
if dstCap == nil { |
||||
// Make sure 'return dstCap' is non-nil.
|
||||
dstCap = arrayNoInts[:0:0] |
||||
} |
||||
|
||||
if r == nil && len(b)+len(s) < re.minInputLen { |
||||
return nil |
||||
} |
||||
|
||||
// Check prefix match before allocating data structures
|
||||
if len(re.prefix) > 0 && r == nil { |
||||
if re.cond&syntax.EmptyBeginText != 0 { // anchored
|
||||
if b != nil && !(&inputBytes{str: b[pos:]}).hasPrefix(re) { |
||||
return nil |
||||
} |
||||
if s != "" && !(&inputString{str: s[pos:]}).hasPrefix(re) { |
||||
return nil |
||||
} |
||||
} else { // non-anchored
|
||||
var advance int |
||||
if b != nil { |
||||
advance = (&inputBytes{str: b}).index(re, pos) |
||||
} else { |
||||
advance = (&inputString{str: s}).index(re, pos) |
||||
} |
||||
if advance < 0 { |
||||
return nil |
||||
} |
||||
pos += advance |
||||
} |
||||
} |
||||
|
||||
if re.onepass != nil { |
||||
return re.doOnePass(r, b, s, pos, ncap, dstCap) |
||||
} |
||||
if r == nil && len(b)+len(s) < re.maxBitStateLen { |
||||
return re.backtrack(b, s, pos, ncap, dstCap) |
||||
} |
||||
|
||||
m := re.get() |
||||
i, _ := m.inputs.init(r, b, s) |
||||
|
||||
m.init(ncap) |
||||
if !m.match(i, pos) { |
||||
re.put(m) |
||||
return nil |
||||
} |
||||
|
||||
dstCap = append(dstCap, m.matchcap...) |
||||
re.put(m) |
||||
return dstCap |
||||
} |
||||
|
||||
// arrayNoInts backs the empty, non-nil slice that doExecute returns for a
// match when it is called with a nil dstCap and ncap=0.
var arrayNoInts [0]int
@ -0,0 +1,517 @@ |
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package regexp |
||||
|
||||
import ( |
||||
"sort" |
||||
"strings" |
||||
"unicode" |
||||
"unicode/utf8" |
||||
|
||||
"github.com/grafana/regexp/syntax" |
||||
) |
||||
|
||||
// "One-pass" regexp execution.
|
||||
// Some regexps can be analyzed to determine that they never need
|
||||
// backtracking: they are guaranteed to run in one pass over the string
|
||||
// without bothering to save all the usual NFA state.
|
||||
// Detect those and execute them more quickly.
|
||||
|
||||
// onePassProg is a regular expression program compiled for one-pass
// execution. It mirrors syntax.Prog, except that its instructions are
// onePassInst values.
type onePassProg struct {
	Inst   []onePassInst
	Start  int // index of start instruction
	NumCap int // number of InstCapture insts in re
}

// onePassInst is one instruction of a one-pass program: a syntax.Inst
// extended with the Next field.
type onePassInst struct {
	syntax.Inst
	Next []uint32
}
||||
|
||||
// OnePassPrefix returns a literal string that all matches for the
|
||||
// regexp must start with. Complete is true if the prefix
|
||||
// is the entire match. Pc is the index of the last rune instruction
|
||||
// in the string. The OnePassPrefix skips over the mandatory
|
||||
// EmptyBeginText
|
||||
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, foldCase bool, pc uint32) { |
||||
i := &p.Inst[p.Start] |
||||
if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { |
||||
return "", i.Op == syntax.InstMatch, false, uint32(p.Start) |
||||
} |
||||
pc = i.Out |
||||
i = &p.Inst[pc] |
||||
for i.Op == syntax.InstNop { |
||||
pc = i.Out |
||||
i = &p.Inst[pc] |
||||
} |
||||
// Avoid allocation of buffer if prefix is empty.
|
||||
if iop(i) != syntax.InstRune || len(i.Rune) != 1 { |
||||
return "", i.Op == syntax.InstMatch, false, uint32(p.Start) |
||||
} |
||||
|
||||
foldCase = (syntax.Flags(i.Arg)&syntax.FoldCase != 0) |
||||
// Have prefix; gather characters.
|
||||
var buf strings.Builder |
||||
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && (syntax.Flags(i.Arg)&syntax.FoldCase != 0) == foldCase && i.Rune[0] != utf8.RuneError { |
||||
buf.WriteRune(i.Rune[0]) |
||||
pc, i = i.Out, &p.Inst[i.Out] |
||||
} |
||||
if i.Op == syntax.InstEmptyWidth && |
||||
syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && |
||||
p.Inst[i.Out].Op == syntax.InstMatch { |
||||
complete = true |
||||
} |
||||
return buf.String(), complete, foldCase, pc |
||||
} |
||||
|
||||
// OnePassNext selects the next actionable state of the prog, based on the input character.
|
||||
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
|
||||
// One of the alternates may ultimately lead without input to end of line. If the instruction
|
||||
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
|
||||
func onePassNext(i *onePassInst, r rune) uint32 { |
||||
next := i.MatchRunePos(r) |
||||
if next >= 0 { |
||||
return i.Next[next] |
||||
} |
||||
if i.Op == syntax.InstAltMatch { |
||||
return i.Out |
||||
} |
||||
return 0 |
||||
} |
||||
|
||||
// iop returns the opcode of i, reporting the specialised rune opcodes
// (InstRune1, InstRuneAny, InstRuneAnyNotNL) as plain InstRune.
func iop(i *syntax.Inst) syntax.InstOp {
	switch i.Op {
	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
		return syntax.InstRune
	default:
		return i.Op
	}
}
||||
|
||||
// queueOnePass is a queue of instruction indices built on a sparse array:
// dense holds the queued values in insertion order and sparse maps a value
// to its position in dense.
type queueOnePass struct {
	sparse          []uint32
	dense           []uint32
	size, nextIndex uint32
}

// empty reports whether every queued value has already been taken.
func (q *queueOnePass) empty() bool {
	return q.size <= q.nextIndex
}

// next returns the oldest value not yet taken and advances past it.
func (q *queueOnePass) next() (n uint32) {
	n = q.dense[q.nextIndex]
	q.nextIndex++
	return n
}

// clear empties the queue without touching the backing arrays.
func (q *queueOnePass) clear() {
	q.size, q.nextIndex = 0, 0
}

// contains reports whether u has been inserted since the last clear.
func (q *queueOnePass) contains(u uint32) bool {
	if u >= uint32(len(q.sparse)) {
		return false
	}
	slot := q.sparse[u]
	return slot < q.size && q.dense[slot] == u
}

// insert queues u unless it is already present.
func (q *queueOnePass) insert(u uint32) {
	if q.contains(u) {
		return
	}
	q.insertNew(u)
}

// insertNew queues u without checking for a duplicate. Values that do not
// fit in the sparse array are ignored.
func (q *queueOnePass) insertNew(u uint32) {
	if u >= uint32(len(q.sparse)) {
		return
	}
	slot := q.size
	q.sparse[u] = slot
	q.dense[slot] = u
	q.size = slot + 1
}

// newQueue returns an empty queue able to hold the values 0..size-1.
func newQueue(size int) (q *queueOnePass) {
	q = new(queueOnePass)
	q.sparse = make([]uint32, size)
	q.dense = make([]uint32, size)
	return q
}
||||
|
||||
// mergeFailed is the single element of the NextIp array returned by
// mergeRuneSets when its two inputs intersect.
const mergeFailed = uint32(0xffffffff)

// noRune and noNext are the shared results mergeRuneSets hands back on
// failure.
var (
	noRune = []rune{}
	noNext = []uint32{mergeFailed}
)

// mergeRuneSets merges two non-intersecting runesets, and returns the merged
// result, and a NextIp array. The idea is that if a rune matches the
// OnePassRunes at index i, NextIp[i/2] is the target. If the input sets
// intersect, an empty runeset and a NextIp array with the single element
// mergeFailed is returned.
//
// The code assumes that both inputs contain ordered and non-intersecting
// rune pairs. It panics if either input has odd length. Neither input is
// modified.
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
	leftLen := len(*leftRunes)
	rightLen := len(*rightRunes)
	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
		panic("mergeRuneSets odd length []rune")
	}

	// The final sizes are known up front: every pair of either input is
	// copied exactly once on success.
	merged := make([]rune, 0, leftLen+rightLen)
	next := make([]uint32, 0, (leftLen+rightLen)/2)

	// ix is the index in merged of the upper bound of the last pair
	// appended, or -1 before anything has been appended.
	ix := -1

	// extend appends the pair at *newLow of *newArray, recording pc as its
	// target. It fails if that pair starts at or before the end of the
	// pair appended previously.
	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
			return false
		}
		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
		*newLow += 2
		ix += 2
		next = append(next, pc)
		return true
	}

	var lx, rx int
	for lx < leftLen || rx < rightLen {
		var ok bool
		switch {
		case rx >= rightLen:
			ok = extend(&lx, leftRunes, leftPC)
		case lx >= leftLen:
			ok = extend(&rx, rightRunes, rightPC)
		case (*rightRunes)[rx] < (*leftRunes)[lx]:
			ok = extend(&rx, rightRunes, rightPC)
		default:
			ok = extend(&lx, leftRunes, leftPC)
		}
		if !ok {
			return noRune, noNext
		}
	}
	return merged, next
}
||||
|
||||
// cleanupOnePass drops working memory, and restores certain shortcut instructions.
|
||||
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { |
||||
for ix, instOriginal := range original.Inst { |
||||
switch instOriginal.Op { |
||||
case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: |
||||
case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: |
||||
prog.Inst[ix].Next = nil |
||||
case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: |
||||
prog.Inst[ix].Next = nil |
||||
prog.Inst[ix] = onePassInst{Inst: instOriginal} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// onePassCopy creates a copy of the original Prog, as we'll be modifying it
|
||||
func onePassCopy(prog *syntax.Prog) *onePassProg { |
||||
p := &onePassProg{ |
||||
Start: prog.Start, |
||||
NumCap: prog.NumCap, |
||||
Inst: make([]onePassInst, len(prog.Inst)), |
||||
} |
||||
for i, inst := range prog.Inst { |
||||
p.Inst[i] = onePassInst{Inst: inst} |
||||
} |
||||
|
||||
// rewrites one or more common Prog constructs that enable some otherwise
|
||||
// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
|
||||
// ip A, that points to ips B & C.
|
||||
// A:BC + B:DA => A:BC + B:CD
|
||||
// A:BC + B:DC => A:DC + B:DC
|
||||
for pc := range p.Inst { |
||||
switch p.Inst[pc].Op { |
||||
default: |
||||
continue |
||||
case syntax.InstAlt, syntax.InstAltMatch: |
||||
// A:Bx + B:Ay
|
||||
p_A_Other := &p.Inst[pc].Out |
||||
p_A_Alt := &p.Inst[pc].Arg |
||||
// make sure a target is another Alt
|
||||
instAlt := p.Inst[*p_A_Alt] |
||||
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { |
||||
p_A_Alt, p_A_Other = p_A_Other, p_A_Alt |
||||
instAlt = p.Inst[*p_A_Alt] |
||||
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { |
||||
continue |
||||
} |
||||
} |
||||
instOther := p.Inst[*p_A_Other] |
||||
// Analyzing both legs pointing to Alts is for another day
|
||||
if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { |
||||
// too complicated
|
||||
continue |
||||
} |
||||
// simple empty transition loop
|
||||
// A:BC + B:DA => A:BC + B:DC
|
||||
p_B_Alt := &p.Inst[*p_A_Alt].Out |
||||
p_B_Other := &p.Inst[*p_A_Alt].Arg |
||||
patch := false |
||||
if instAlt.Out == uint32(pc) { |
||||
patch = true |
||||
} else if instAlt.Arg == uint32(pc) { |
||||
patch = true |
||||
p_B_Alt, p_B_Other = p_B_Other, p_B_Alt |
||||
} |
||||
if patch { |
||||
*p_B_Alt = *p_A_Other |
||||
} |
||||
|
||||
// empty transition to common target
|
||||
// A:BC + B:DC => A:DC + B:DC
|
||||
if *p_A_Other == *p_B_Alt { |
||||
*p_A_Alt = *p_B_Other |
||||
} |
||||
} |
||||
} |
||||
return p |
||||
} |
||||
|
||||
// runeSlice implements sort.Interface over a []rune so that the
// case-folded rune sets can be sorted.
type runeSlice []rune

// Len returns the number of runes.
func (rs runeSlice) Len() int {
	return len(rs)
}

// Less orders runes by code point.
func (rs runeSlice) Less(i, j int) bool {
	return rs[i] < rs[j]
}

// Swap exchanges the runes at i and j.
func (rs runeSlice) Swap(i, j int) {
	rs[i], rs[j] = rs[j], rs[i]
}

// Rune range pairs for "any rune except newline" and "any rune".
var (
	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	anyRune      = []rune{0, unicode.MaxRune}
)
||||
|
||||
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
|
||||
// the match engine can always tell which branch to take. The routine may modify
|
||||
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
|
||||
// onepass Prog, the Prog nil is returned. makeOnePass is recursive
|
||||
// to the size of the Prog.
|
||||
func makeOnePass(p *onePassProg) *onePassProg { |
||||
// If the machine is very long, it's not worth the time to check if we can use one pass.
|
||||
if len(p.Inst) >= 1000 { |
||||
return nil |
||||
} |
||||
|
||||
var ( |
||||
instQueue = newQueue(len(p.Inst)) |
||||
visitQueue = newQueue(len(p.Inst)) |
||||
check func(uint32, []bool) bool |
||||
onePassRunes = make([][]rune, len(p.Inst)) |
||||
) |
||||
|
||||
// check that paths from Alt instructions are unambiguous, and rebuild the new
|
||||
// program as a onepass program
|
||||
check = func(pc uint32, m []bool) (ok bool) { |
||||
ok = true |
||||
inst := &p.Inst[pc] |
||||
if visitQueue.contains(pc) { |
||||
return |
||||
} |
||||
visitQueue.insert(pc) |
||||
switch inst.Op { |
||||
case syntax.InstAlt, syntax.InstAltMatch: |
||||
ok = check(inst.Out, m) && check(inst.Arg, m) |
||||
// check no-input paths to InstMatch
|
||||
matchOut := m[inst.Out] |
||||
matchArg := m[inst.Arg] |
||||
if matchOut && matchArg { |
||||
ok = false |
||||
break |
||||
} |
||||
// Match on empty goes in inst.Out
|
||||
if matchArg { |
||||
inst.Out, inst.Arg = inst.Arg, inst.Out |
||||
matchOut, matchArg = matchArg, matchOut |
||||
} |
||||
if matchOut { |
||||
m[pc] = true |
||||
inst.Op = syntax.InstAltMatch |
||||
} |
||||
|
||||
// build a dispatch operator from the two legs of the alt.
|
||||
onePassRunes[pc], inst.Next = mergeRuneSets( |
||||
&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) |
||||
if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { |
||||
ok = false |
||||
break |
||||
} |
||||
case syntax.InstCapture, syntax.InstNop: |
||||
ok = check(inst.Out, m) |
||||
m[pc] = m[inst.Out] |
||||
// pass matching runes back through these no-ops.
|
||||
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) |
||||
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) |
||||
for i := range inst.Next { |
||||
inst.Next[i] = inst.Out |
||||
} |
||||
case syntax.InstEmptyWidth: |
||||
ok = check(inst.Out, m) |
||||
m[pc] = m[inst.Out] |
||||
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) |
||||
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) |
||||
for i := range inst.Next { |
||||
inst.Next[i] = inst.Out |
||||
} |
||||
case syntax.InstMatch, syntax.InstFail: |
||||
m[pc] = inst.Op == syntax.InstMatch |
||||
case syntax.InstRune: |
||||
m[pc] = false |
||||
if len(inst.Next) > 0 { |
||||
break |
||||
} |
||||
instQueue.insert(inst.Out) |
||||
if len(inst.Rune) == 0 { |
||||
onePassRunes[pc] = []rune{} |
||||
inst.Next = []uint32{inst.Out} |
||||
break |
||||
} |
||||
runes := make([]rune, 0) |
||||
if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { |
||||
r0 := inst.Rune[0] |
||||
runes = append(runes, r0, r0) |
||||
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { |
||||
runes = append(runes, r1, r1) |
||||
} |
||||
sort.Sort(runeSlice(runes)) |
||||
} else { |
||||
runes = append(runes, inst.Rune...) |
||||
} |
||||
onePassRunes[pc] = runes |
||||
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) |
||||
for i := range inst.Next { |
||||
inst.Next[i] = inst.Out |
||||
} |
||||
inst.Op = syntax.InstRune |
||||
case syntax.InstRune1: |
||||
m[pc] = false |
||||
if len(inst.Next) > 0 { |
||||
break |
||||
} |
||||
instQueue.insert(inst.Out) |
||||
runes := []rune{} |
||||
// expand case-folded runes
|
||||
if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { |
||||
r0 := inst.Rune[0] |
||||
runes = append(runes, r0, r0) |
||||
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { |
||||
runes = append(runes, r1, r1) |
||||
} |
||||
sort.Sort(runeSlice(runes)) |
||||
} else { |
||||
runes = append(runes, inst.Rune[0], inst.Rune[0]) |
||||
} |
||||
onePassRunes[pc] = runes |
||||
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) |
||||
for i := range inst.Next { |
||||
inst.Next[i] = inst.Out |
||||
} |
||||
inst.Op = syntax.InstRune |
||||
case syntax.InstRuneAny: |
||||
m[pc] = false |
||||
if len(inst.Next) > 0 { |
||||
break |
||||
} |
||||
instQueue.insert(inst.Out) |
||||
onePassRunes[pc] = append([]rune{}, anyRune...) |
||||
inst.Next = []uint32{inst.Out} |
||||
case syntax.InstRuneAnyNotNL: |
||||
m[pc] = false |
||||
if len(inst.Next) > 0 { |
||||
break |
||||
} |
||||
instQueue.insert(inst.Out) |
||||
onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) |
||||
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) |
||||
for i := range inst.Next { |
||||
inst.Next[i] = inst.Out |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
instQueue.clear() |
||||
instQueue.insert(uint32(p.Start)) |
||||
m := make([]bool, len(p.Inst)) |
||||
for !instQueue.empty() { |
||||
visitQueue.clear() |
||||
pc := instQueue.next() |
||||
if !check(pc, m) { |
||||
p = nil |
||||
break |
||||
} |
||||
} |
||||
if p != nil { |
||||
for i := range p.Inst { |
||||
p.Inst[i].Rune = onePassRunes[i] |
||||
} |
||||
} |
||||
return p |
||||
} |
||||
|
||||
// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
|
||||
// can be recharacterized as a one-pass regexp program, or syntax.nil if the
|
||||
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
|
||||
// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
|
||||
func compileOnePass(prog *syntax.Prog) (p *onePassProg) { |
||||
if prog.Start == 0 { |
||||
return nil |
||||
} |
||||
// onepass regexp is anchored
|
||||
if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || |
||||
syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { |
||||
return nil |
||||
} |
||||
hasAlt := false |
||||
for _, inst := range prog.Inst { |
||||
if inst.Op == syntax.InstAlt || inst.Op == syntax.InstAltMatch { |
||||
hasAlt = true |
||||
break |
||||
} |
||||
} |
||||
// If we have alternates, every instruction leading to InstMatch must be EmptyEndText.
|
||||
// Also, any match on empty text must be $.
|
||||
for _, inst := range prog.Inst { |
||||
opOut := prog.Inst[inst.Out].Op |
||||
switch inst.Op { |
||||
default: |
||||
if opOut == syntax.InstMatch && hasAlt { |
||||
return nil |
||||
} |
||||
case syntax.InstAlt, syntax.InstAltMatch: |
||||
if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { |
||||
return nil |
||||
} |
||||
case syntax.InstEmptyWidth: |
||||
if opOut == syntax.InstMatch { |
||||
if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { |
||||
continue |
||||
} |
||||
return nil |
||||
} |
||||
} |
||||
} |
||||
// Creates a slightly optimized copy of the original Prog
|
||||
// that cleans up some Prog idioms that block valid onepass programs
|
||||
p = onePassCopy(prog) |
||||
|
||||
// checkAmbiguity on InstAlts, build onepass Prog if possible
|
||||
p = makeOnePass(p) |
||||
|
||||
if p != nil { |
||||
cleanupOnePass(p, prog) |
||||
} |
||||
return p |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,296 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax |
||||
|
||||
import "unicode" |
||||
|
||||
// A patchList is a list of instruction pointers that need to be filled in (patched).
|
||||
// Because the pointers haven't been filled in yet, we can reuse their storage
|
||||
// to hold the list. It's kind of sleazy, but works well in practice.
|
||||
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
|
||||
//
|
||||
// These aren't really pointers: they're integers, so we can reinterpret them
|
||||
// this way without using package unsafe. A value l.head denotes
|
||||
// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
|
||||
// head == 0 denotes the empty list, okay because we start every program
|
||||
// with a fail instruction, so we'll never want to point at its output link.
|
||||
type patchList struct { |
||||
head, tail uint32 |
||||
} |
||||
|
||||
func makePatchList(n uint32) patchList { |
||||
return patchList{n, n} |
||||
} |
||||
|
||||
func (l patchList) patch(p *Prog, val uint32) { |
||||
head := l.head |
||||
for head != 0 { |
||||
i := &p.Inst[head>>1] |
||||
if head&1 == 0 { |
||||
head = i.Out |
||||
i.Out = val |
||||
} else { |
||||
head = i.Arg |
||||
i.Arg = val |
||||
} |
||||
} |
||||
} |
||||
|
||||
func (l1 patchList) append(p *Prog, l2 patchList) patchList { |
||||
if l1.head == 0 { |
||||
return l2 |
||||
} |
||||
if l2.head == 0 { |
||||
return l1 |
||||
} |
||||
|
||||
i := &p.Inst[l1.tail>>1] |
||||
if l1.tail&1 == 0 { |
||||
i.Out = l2.head |
||||
} else { |
||||
i.Arg = l2.head |
||||
} |
||||
return patchList{l1.head, l2.tail} |
||||
} |
||||
|
||||
// A frag represents a compiled program fragment.
|
||||
type frag struct { |
||||
i uint32 // index of first instruction
|
||||
out patchList // where to record end instruction
|
||||
nullable bool // whether fragment can match empty string
|
||||
} |
||||
|
||||
type compiler struct { |
||||
p *Prog |
||||
} |
||||
|
||||
// Compile compiles the regexp into a program to be executed.
|
||||
// The regexp should have been simplified already (returned from re.Simplify).
|
||||
func Compile(re *Regexp) (*Prog, error) { |
||||
var c compiler |
||||
c.init() |
||||
f := c.compile(re) |
||||
f.out.patch(c.p, c.inst(InstMatch).i) |
||||
c.p.Start = int(f.i) |
||||
return c.p, nil |
||||
} |
||||
|
||||
func (c *compiler) init() { |
||||
c.p = new(Prog) |
||||
c.p.NumCap = 2 // implicit ( and ) for whole match $0
|
||||
c.inst(InstFail) |
||||
} |
||||
|
||||
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} |
||||
var anyRune = []rune{0, unicode.MaxRune} |
||||
|
||||
func (c *compiler) compile(re *Regexp) frag { |
||||
switch re.Op { |
||||
case OpNoMatch: |
||||
return c.fail() |
||||
case OpEmptyMatch: |
||||
return c.nop() |
||||
case OpLiteral: |
||||
if len(re.Rune) == 0 { |
||||
return c.nop() |
||||
} |
||||
var f frag |
||||
for j := range re.Rune { |
||||
f1 := c.rune(re.Rune[j:j+1], re.Flags) |
||||
if j == 0 { |
||||
f = f1 |
||||
} else { |
||||
f = c.cat(f, f1) |
||||
} |
||||
} |
||||
return f |
||||
case OpCharClass: |
||||
return c.rune(re.Rune, re.Flags) |
||||
case OpAnyCharNotNL: |
||||
return c.rune(anyRuneNotNL, 0) |
||||
case OpAnyChar: |
||||
return c.rune(anyRune, 0) |
||||
case OpBeginLine: |
||||
return c.empty(EmptyBeginLine) |
||||
case OpEndLine: |
||||
return c.empty(EmptyEndLine) |
||||
case OpBeginText: |
||||
return c.empty(EmptyBeginText) |
||||
case OpEndText: |
||||
return c.empty(EmptyEndText) |
||||
case OpWordBoundary: |
||||
return c.empty(EmptyWordBoundary) |
||||
case OpNoWordBoundary: |
||||
return c.empty(EmptyNoWordBoundary) |
||||
case OpCapture: |
||||
bra := c.cap(uint32(re.Cap << 1)) |
||||
sub := c.compile(re.Sub[0]) |
||||
ket := c.cap(uint32(re.Cap<<1 | 1)) |
||||
return c.cat(c.cat(bra, sub), ket) |
||||
case OpStar: |
||||
return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) |
||||
case OpPlus: |
||||
return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) |
||||
case OpQuest: |
||||
return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0) |
||||
case OpConcat: |
||||
if len(re.Sub) == 0 { |
||||
return c.nop() |
||||
} |
||||
var f frag |
||||
for i, sub := range re.Sub { |
||||
if i == 0 { |
||||
f = c.compile(sub) |
||||
} else { |
||||
f = c.cat(f, c.compile(sub)) |
||||
} |
||||
} |
||||
return f |
||||
case OpAlternate: |
||||
var f frag |
||||
for _, sub := range re.Sub { |
||||
f = c.alt(f, c.compile(sub)) |
||||
} |
||||
return f |
||||
} |
||||
panic("regexp: unhandled case in compile") |
||||
} |
||||
|
||||
func (c *compiler) inst(op InstOp) frag { |
||||
// TODO: impose length limit
|
||||
f := frag{i: uint32(len(c.p.Inst)), nullable: true} |
||||
c.p.Inst = append(c.p.Inst, Inst{Op: op}) |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) nop() frag { |
||||
f := c.inst(InstNop) |
||||
f.out = makePatchList(f.i << 1) |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) fail() frag { |
||||
return frag{} |
||||
} |
||||
|
||||
func (c *compiler) cap(arg uint32) frag { |
||||
f := c.inst(InstCapture) |
||||
f.out = makePatchList(f.i << 1) |
||||
c.p.Inst[f.i].Arg = arg |
||||
|
||||
if c.p.NumCap < int(arg)+1 { |
||||
c.p.NumCap = int(arg) + 1 |
||||
} |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) cat(f1, f2 frag) frag { |
||||
// concat of failure is failure
|
||||
if f1.i == 0 || f2.i == 0 { |
||||
return frag{} |
||||
} |
||||
|
||||
// TODO: elide nop
|
||||
|
||||
f1.out.patch(c.p, f2.i) |
||||
return frag{f1.i, f2.out, f1.nullable && f2.nullable} |
||||
} |
||||
|
||||
func (c *compiler) alt(f1, f2 frag) frag { |
||||
// alt of failure is other
|
||||
if f1.i == 0 { |
||||
return f2 |
||||
} |
||||
if f2.i == 0 { |
||||
return f1 |
||||
} |
||||
|
||||
f := c.inst(InstAlt) |
||||
i := &c.p.Inst[f.i] |
||||
i.Out = f1.i |
||||
i.Arg = f2.i |
||||
f.out = f1.out.append(c.p, f2.out) |
||||
f.nullable = f1.nullable || f2.nullable |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) quest(f1 frag, nongreedy bool) frag { |
||||
f := c.inst(InstAlt) |
||||
i := &c.p.Inst[f.i] |
||||
if nongreedy { |
||||
i.Arg = f1.i |
||||
f.out = makePatchList(f.i << 1) |
||||
} else { |
||||
i.Out = f1.i |
||||
f.out = makePatchList(f.i<<1 | 1) |
||||
} |
||||
f.out = f.out.append(c.p, f1.out) |
||||
return f |
||||
} |
||||
|
||||
// loop returns the fragment for the main loop of a plus or star.
|
||||
// For plus, it can be used after changing the entry to f1.i.
|
||||
// For star, it can be used directly when f1 can't match an empty string.
|
||||
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
|
||||
// to get the priority match order correct.)
|
||||
func (c *compiler) loop(f1 frag, nongreedy bool) frag { |
||||
f := c.inst(InstAlt) |
||||
i := &c.p.Inst[f.i] |
||||
if nongreedy { |
||||
i.Arg = f1.i |
||||
f.out = makePatchList(f.i << 1) |
||||
} else { |
||||
i.Out = f1.i |
||||
f.out = makePatchList(f.i<<1 | 1) |
||||
} |
||||
f1.out.patch(c.p, f.i) |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) star(f1 frag, nongreedy bool) frag { |
||||
if f1.nullable { |
||||
// Use (f1+)? to get priority match order correct.
|
||||
// See golang.org/issue/46123.
|
||||
return c.quest(c.plus(f1, nongreedy), nongreedy) |
||||
} |
||||
return c.loop(f1, nongreedy) |
||||
} |
||||
|
||||
func (c *compiler) plus(f1 frag, nongreedy bool) frag { |
||||
return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable} |
||||
} |
||||
|
||||
func (c *compiler) empty(op EmptyOp) frag { |
||||
f := c.inst(InstEmptyWidth) |
||||
c.p.Inst[f.i].Arg = uint32(op) |
||||
f.out = makePatchList(f.i << 1) |
||||
return f |
||||
} |
||||
|
||||
func (c *compiler) rune(r []rune, flags Flags) frag { |
||||
f := c.inst(InstRune) |
||||
f.nullable = false |
||||
i := &c.p.Inst[f.i] |
||||
i.Rune = r |
||||
flags &= FoldCase // only relevant flag is FoldCase
|
||||
if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] { |
||||
// and sometimes not even that
|
||||
flags &^= FoldCase |
||||
} |
||||
i.Arg = uint32(flags) |
||||
f.out = makePatchList(f.i << 1) |
||||
|
||||
// Special cases for exec machine.
|
||||
switch { |
||||
case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]): |
||||
i.Op = InstRune1 |
||||
case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune: |
||||
i.Op = InstRuneAny |
||||
case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune: |
||||
i.Op = InstRuneAnyNotNL |
||||
} |
||||
|
||||
return f |
||||
} |
@ -0,0 +1,132 @@ |
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
|
||||
|
||||
/* |
||||
Package syntax parses regular expressions into parse trees and compiles |
||||
parse trees into programs. Most clients of regular expressions will use the |
||||
facilities of package regexp (such as Compile and Match) instead of this package. |
||||
|
||||
Syntax |
||||
|
||||
The regular expression syntax understood by this package when parsing with the Perl flag is as follows. |
||||
Parts of the syntax can be disabled by passing alternate flags to Parse. |
||||
|
||||
|
||||
Single characters: |
||||
. any character, possibly including newline (flag s=true) |
||||
[xyz] character class |
||||
[^xyz] negated character class |
||||
\d Perl character class |
||||
\D negated Perl character class |
||||
[[:alpha:]] ASCII character class |
||||
[[:^alpha:]] negated ASCII character class |
||||
\pN Unicode character class (one-letter name) |
||||
\p{Greek} Unicode character class |
||||
\PN negated Unicode character class (one-letter name) |
||||
\P{Greek} negated Unicode character class |
||||
|
||||
Composites: |
||||
xy x followed by y |
||||
x|y x or y (prefer x) |
||||
|
||||
Repetitions: |
||||
x* zero or more x, prefer more |
||||
x+ one or more x, prefer more |
||||
x? zero or one x, prefer one |
||||
x{n,m} n or n+1 or ... or m x, prefer more |
||||
x{n,} n or more x, prefer more |
||||
x{n} exactly n x |
||||
x*? zero or more x, prefer fewer |
||||
x+? one or more x, prefer fewer |
||||
x?? zero or one x, prefer zero |
||||
x{n,m}? n or n+1 or ... or m x, prefer fewer |
||||
x{n,}? n or more x, prefer fewer |
||||
x{n}? exactly n x |
||||
|
||||
Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} |
||||
reject forms that create a minimum or maximum repetition count above 1000. |
||||
Unlimited repetitions are not subject to this restriction. |
||||
|
||||
Grouping: |
||||
(re) numbered capturing group (submatch) |
||||
(?P<name>re) named & numbered capturing group (submatch) |
||||
(?:re) non-capturing group |
||||
(?flags) set flags within current group; non-capturing |
||||
(?flags:re) set flags during re; non-capturing |
||||
|
||||
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are: |
||||
|
||||
i case-insensitive (default false) |
||||
m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false) |
||||
s let . match \n (default false) |
||||
U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false) |
||||
|
||||
Empty strings: |
||||
^ at beginning of text or line (flag m=true) |
||||
$ at end of text (like \z not \Z) or line (flag m=true) |
||||
\A at beginning of text |
||||
\b at ASCII word boundary (\w on one side and \W, \A, or \z on the other) |
||||
\B not at ASCII word boundary |
||||
\z at end of text |
||||
|
||||
Escape sequences: |
||||
\a bell (== \007) |
||||
\f form feed (== \014) |
||||
\t horizontal tab (== \011) |
||||
\n newline (== \012) |
||||
\r carriage return (== \015) |
||||
\v vertical tab character (== \013) |
||||
\* literal *, for any punctuation character * |
||||
\123 octal character code (up to three digits) |
||||
\x7F hex character code (exactly two digits) |
||||
\x{10FFFF} hex character code |
||||
\Q...\E literal text ... even if ... has punctuation |
||||
|
||||
Character class elements: |
||||
x single character |
||||
A-Z character range (inclusive) |
||||
\d Perl character class |
||||
[:foo:] ASCII character class foo |
||||
\p{Foo} Unicode character class Foo |
||||
\pF Unicode character class F (one-letter name) |
||||
|
||||
Named character classes as character class elements: |
||||
[\d] digits (== \d) |
||||
[^\d] not digits (== \D) |
||||
[\D] not digits (== \D) |
||||
[^\D] not not digits (== \d) |
||||
[[:name:]] named ASCII class inside character class (== [:name:]) |
||||
[^[:name:]] named ASCII class inside negated character class (== [:^name:]) |
||||
[\p{Name}] named Unicode property inside character class (== \p{Name}) |
||||
[^\p{Name}] named Unicode property inside negated character class (== \P{Name}) |
||||
|
||||
Perl character classes (all ASCII-only): |
||||
\d digits (== [0-9]) |
||||
\D not digits (== [^0-9]) |
||||
\s whitespace (== [\t\n\f\r ]) |
||||
\S not whitespace (== [^\t\n\f\r ]) |
||||
\w word characters (== [0-9A-Za-z_]) |
||||
\W not word characters (== [^0-9A-Za-z_]) |
||||
|
||||
ASCII character classes: |
||||
[[:alnum:]] alphanumeric (== [0-9A-Za-z]) |
||||
[[:alpha:]] alphabetic (== [A-Za-z]) |
||||
[[:ascii:]] ASCII (== [\x00-\x7F]) |
||||
[[:blank:]] blank (== [\t ]) |
||||
[[:cntrl:]] control (== [\x00-\x1F\x7F]) |
||||
[[:digit:]] digits (== [0-9]) |
||||
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) |
||||
[[:lower:]] lower case (== [a-z]) |
||||
[[:print:]] printable (== [ -~] == [ [:graph:]]) |
||||
[[:punct:]] punctuation (== [!-/:-@[-`{-~]) |
||||
[[:space:]] whitespace (== [\t\n\v\f\r ]) |
||||
[[:upper:]] upper case (== [A-Z]) |
||||
[[:word:]] word characters (== [0-9A-Za-z_]) |
||||
[[:xdigit:]] hex digit (== [0-9A-Fa-f]) |
||||
|
||||
Unicode character classes are those in unicode.Categories and unicode.Scripts. |
||||
*/ |
||||
package syntax |
@ -0,0 +1,113 @@ |
||||
#!/usr/bin/perl |
||||
# Copyright 2008 The Go Authors. All rights reserved. |
||||
# Use of this source code is governed by a BSD-style |
||||
# license that can be found in the LICENSE file. |
||||
|
||||
# Modified version of RE2's make_perl_groups.pl. |
||||
|
||||
# Generate table entries giving character ranges |
||||
# for POSIX/Perl character classes. Rather than |
||||
# figure out what the definition is, it is easier to ask |
||||
# Perl about each letter from 0-128 and write down |
||||
# its answer. |
||||
|
||||
@posixclasses = ( |
||||
"[:alnum:]", |
||||
"[:alpha:]", |
||||
"[:ascii:]", |
||||
"[:blank:]", |
||||
"[:cntrl:]", |
||||
"[:digit:]", |
||||
"[:graph:]", |
||||
"[:lower:]", |
||||
"[:print:]", |
||||
"[:punct:]", |
||||
"[:space:]", |
||||
"[:upper:]", |
||||
"[:word:]", |
||||
"[:xdigit:]", |
||||
); |
||||
|
||||
@perlclasses = ( |
||||
"\\d", |
||||
"\\s", |
||||
"\\w", |
||||
); |
||||
|
||||
%overrides = ( |
||||
# Prior to Perl 5.18, \s did not match vertical tab. |
||||
# RE2 preserves that original behaviour. |
||||
"\\s:11" => 0, |
||||
); |
||||
|
||||
# ComputeClass asks Perl which code points 0..128 belong to the given
# class (consulting %overrides first) and returns the members as a list
# of [lo, hi] array references, one per contiguous run.
sub ComputeClass($) {
	my ($class) = @_;
	my $regexp = "[$class]";
	my @ranges;
	my $start;	# first code point of the run being built; undef when none is open

	foreach my $i (0 .. 128) {
		my $forced = $overrides{"$class:$i"};
		my $member = defined($forced) ? $forced : (chr($i) =~ $regexp);
		if ($member) {
			$start = $i unless defined $start;
		} elsif (defined $start) {
			push @ranges, [$start, $i - 1];
			undef $start;
		}
	}

	# A run still open after 128 is closed at 255.
	push @ranges, [$start, 255] if defined $start;
	return @ranges;
}
||||
|
||||
# PrintClass prints the Go declaration "var code<cname> = []rune{...}"
# holding the given [lo, hi] ranges, and returns the two map-entry lines
# (positive and negated form of $name) that refer to that table.
# The negated name is [:^name:] for POSIX classes and the upper-cased
# escape (e.g. \D) for Perl classes.
sub PrintClass($$@) {
	my ($cname, $name, @ranges) = @_;
	print "var code$cname = []rune{ /* $name */\n";
	foreach my $range (@ranges) {
		printf "\t0x%x, 0x%x,\n", $range->[0], $range->[1];
	}
	print "}\n\n";

	# Lexical, so the value does not leak into a package global.
	my $negname = $name;
	if ($negname =~ /:/) {
		$negname =~ s/:/:^/;
	} else {
		$negname =~ y/a-z/A-Z/;
	}
	return "\t`$name`: {+1, code$cname},\n" .
		"\t`$negname`: {-1, code$cname},\n";
}
||||
|
||||
my $gen = 0; |
||||
|
||||
# PrintClasses emits one rune table per class (numbered with the shared
# $gen counter) followed by the "<cname>Group" map that indexes them.
# Its value is the number of map entries produced.
sub PrintClasses($@) {
	my ($cname, @classes) = @_;
	my @entries;
	for my $class (@classes) {
		my @ranges = ComputeClass($class);
		$gen++;
		push @entries, PrintClass($gen, $class, @ranges);
	}
	print "var ${cname}Group = map[string]charGroup{\n";
	print $_ for @entries;
	print "}\n";
	return scalar(@entries);
}
||||
|
||||
print <<EOF; |
||||
// Copyright 2013 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT. |
||||
// make_perl_groups.pl >perl_groups.go |
||||
|
||||
package syntax |
||||
|
||||
EOF |
||||
|
||||
PrintClasses("perl", @perlclasses); |
||||
PrintClasses("posix", @posixclasses); |
@ -0,0 +1,26 @@ |
||||
// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
|
||||
|
||||
package syntax |
||||
|
||||
import "strconv" |
||||
|
||||
const ( |
||||
_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate" |
||||
_Op_name_1 = "opPseudo" |
||||
) |
||||
|
||||
var ( |
||||
_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151} |
||||
) |
||||
|
||||
func (i Op) String() string { |
||||
switch { |
||||
case 1 <= i && i <= 19: |
||||
i -= 1 |
||||
return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]] |
||||
case i == 128: |
||||
return _Op_name_1 |
||||
default: |
||||
return "Op(" + strconv.FormatInt(int64(i), 10) + ")" |
||||
} |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,134 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
||||
// make_perl_groups.pl >perl_groups.go
|
||||
|
||||
package syntax |
||||
|
||||
var code1 = []rune{ /* \d */ |
||||
0x30, 0x39, |
||||
} |
||||
|
||||
var code2 = []rune{ /* \s */ |
||||
0x9, 0xa, |
||||
0xc, 0xd, |
||||
0x20, 0x20, |
||||
} |
||||
|
||||
var code3 = []rune{ /* \w */ |
||||
0x30, 0x39, |
||||
0x41, 0x5a, |
||||
0x5f, 0x5f, |
||||
0x61, 0x7a, |
||||
} |
||||
|
||||
var perlGroup = map[string]charGroup{ |
||||
`\d`: {+1, code1}, |
||||
`\D`: {-1, code1}, |
||||
`\s`: {+1, code2}, |
||||
`\S`: {-1, code2}, |
||||
`\w`: {+1, code3}, |
||||
`\W`: {-1, code3}, |
||||
} |
||||
var code4 = []rune{ /* [:alnum:] */ |
||||
0x30, 0x39, |
||||
0x41, 0x5a, |
||||
0x61, 0x7a, |
||||
} |
||||
|
||||
var code5 = []rune{ /* [:alpha:] */ |
||||
0x41, 0x5a, |
||||
0x61, 0x7a, |
||||
} |
||||
|
||||
var code6 = []rune{ /* [:ascii:] */ |
||||
0x0, 0x7f, |
||||
} |
||||
|
||||
var code7 = []rune{ /* [:blank:] */ |
||||
0x9, 0x9, |
||||
0x20, 0x20, |
||||
} |
||||
|
||||
var code8 = []rune{ /* [:cntrl:] */ |
||||
0x0, 0x1f, |
||||
0x7f, 0x7f, |
||||
} |
||||
|
||||
var code9 = []rune{ /* [:digit:] */ |
||||
0x30, 0x39, |
||||
} |
||||
|
||||
var code10 = []rune{ /* [:graph:] */ |
||||
0x21, 0x7e, |
||||
} |
||||
|
||||
var code11 = []rune{ /* [:lower:] */ |
||||
0x61, 0x7a, |
||||
} |
||||
|
||||
var code12 = []rune{ /* [:print:] */ |
||||
0x20, 0x7e, |
||||
} |
||||
|
||||
var code13 = []rune{ /* [:punct:] */ |
||||
0x21, 0x2f, |
||||
0x3a, 0x40, |
||||
0x5b, 0x60, |
||||
0x7b, 0x7e, |
||||
} |
||||
|
||||
var code14 = []rune{ /* [:space:] */ |
||||
0x9, 0xd, |
||||
0x20, 0x20, |
||||
} |
||||
|
||||
var code15 = []rune{ /* [:upper:] */ |
||||
0x41, 0x5a, |
||||
} |
||||
|
||||
var code16 = []rune{ /* [:word:] */ |
||||
0x30, 0x39, |
||||
0x41, 0x5a, |
||||
0x5f, 0x5f, |
||||
0x61, 0x7a, |
||||
} |
||||
|
||||
var code17 = []rune{ /* [:xdigit:] */ |
||||
0x30, 0x39, |
||||
0x41, 0x46, |
||||
0x61, 0x66, |
||||
} |
||||
|
||||
var posixGroup = map[string]charGroup{ |
||||
`[:alnum:]`: {+1, code4}, |
||||
`[:^alnum:]`: {-1, code4}, |
||||
`[:alpha:]`: {+1, code5}, |
||||
`[:^alpha:]`: {-1, code5}, |
||||
`[:ascii:]`: {+1, code6}, |
||||
`[:^ascii:]`: {-1, code6}, |
||||
`[:blank:]`: {+1, code7}, |
||||
`[:^blank:]`: {-1, code7}, |
||||
`[:cntrl:]`: {+1, code8}, |
||||
`[:^cntrl:]`: {-1, code8}, |
||||
`[:digit:]`: {+1, code9}, |
||||
`[:^digit:]`: {-1, code9}, |
||||
`[:graph:]`: {+1, code10}, |
||||
`[:^graph:]`: {-1, code10}, |
||||
`[:lower:]`: {+1, code11}, |
||||
`[:^lower:]`: {-1, code11}, |
||||
`[:print:]`: {+1, code12}, |
||||
`[:^print:]`: {-1, code12}, |
||||
`[:punct:]`: {+1, code13}, |
||||
`[:^punct:]`: {-1, code13}, |
||||
`[:space:]`: {+1, code14}, |
||||
`[:^space:]`: {-1, code14}, |
||||
`[:upper:]`: {+1, code15}, |
||||
`[:^upper:]`: {-1, code15}, |
||||
`[:word:]`: {+1, code16}, |
||||
`[:^word:]`: {-1, code16}, |
||||
`[:xdigit:]`: {+1, code17}, |
||||
`[:^xdigit:]`: {-1, code17}, |
||||
} |
@ -0,0 +1,363 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax |
||||
|
||||
import ( |
||||
"strconv" |
||||
"strings" |
||||
"unicode" |
||||
) |
||||
|
||||
// Compiled program.
|
||||
// May not belong in this package, but convenient for now.
|
||||
|
||||
// A Prog is a compiled regular expression program.
|
||||
type Prog struct { |
||||
Inst []Inst |
||||
Start int // index of start instruction
|
||||
NumCap int // number of InstCapture insts in re
|
||||
} |
||||
|
||||
// An InstOp is the opcode of a single program instruction.
type InstOp uint8

// The opcodes are numbered consecutively from zero; instOpNames is
// indexed by these values.
const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
)

// instOpNames holds the printable name of every opcode.
var instOpNames = []string{
	InstAlt:          "InstAlt",
	InstAltMatch:     "InstAltMatch",
	InstCapture:      "InstCapture",
	InstEmptyWidth:   "InstEmptyWidth",
	InstMatch:        "InstMatch",
	InstFail:         "InstFail",
	InstNop:          "InstNop",
	InstRune:         "InstRune",
	InstRune1:        "InstRune1",
	InstRuneAny:      "InstRuneAny",
	InstRuneAnyNotNL: "InstRuneAnyNotNL",
}

// String returns the name of the opcode, or the empty string when i is
// not one of the declared opcodes.
func (i InstOp) String() string {
	if n := int(i); n < len(instOpNames) {
		return instOpNames[n]
	}
	return ""
}
||||
|
||||
// An EmptyOp is a bit set naming one zero-width assertion or a
// combination of several.
type EmptyOp uint8

// Each assertion has its own bit so that values can be OR-ed together.
const (
	EmptyBeginLine EmptyOp = 1 << iota
	EmptyEndLine
	EmptyBeginText
	EmptyEndText
	EmptyWordBoundary
	EmptyNoWordBoundary
)
||||
|
||||
// EmptyOpContext returns the zero-width assertions
|
||||
// satisfied at the position between the runes r1 and r2.
|
||||
// Passing r1 == -1 indicates that the position is
|
||||
// at the beginning of the text.
|
||||
// Passing r2 == -1 indicates that the position is
|
||||
// at the end of the text.
|
||||
func EmptyOpContext(r1, r2 rune) EmptyOp { |
||||
var op EmptyOp = EmptyNoWordBoundary |
||||
var boundary byte |
||||
switch { |
||||
case IsWordChar(r1): |
||||
boundary = 1 |
||||
case r1 == '\n': |
||||
op |= EmptyBeginLine |
||||
case r1 < 0: |
||||
op |= EmptyBeginText | EmptyBeginLine |
||||
} |
||||
switch { |
||||
case IsWordChar(r2): |
||||
boundary ^= 1 |
||||
case r2 == '\n': |
||||
op |= EmptyEndLine |
||||
case r2 < 0: |
||||
op |= EmptyEndText | EmptyEndLine |
||||
} |
||||
if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
|
||||
op ^= (EmptyWordBoundary | EmptyNoWordBoundary) |
||||
} |
||||
return op |
||||
} |
||||
|
||||
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9',
		r == '_':
		return true
	}
	return false
}
||||
|
||||
// An Inst is a single instruction in a regular expression program.
|
||||
type Inst struct { |
||||
Op InstOp |
||||
Out uint32 // all but InstMatch, InstFail
|
||||
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
|
||||
Rune []rune |
||||
} |
||||
|
||||
func (p *Prog) String() string { |
||||
var b strings.Builder |
||||
dumpProg(&b, p) |
||||
return b.String() |
||||
} |
||||
|
||||
// skipNop follows any no-op or capturing instructions.
|
||||
func (p *Prog) skipNop(pc uint32) *Inst { |
||||
i := &p.Inst[pc] |
||||
for i.Op == InstNop || i.Op == InstCapture { |
||||
i = &p.Inst[i.Out] |
||||
} |
||||
return i |
||||
} |
||||
|
||||
// op returns i.Op but merges all the Rune special cases into InstRune
|
||||
func (i *Inst) op() InstOp { |
||||
op := i.Op |
||||
switch op { |
||||
case InstRune1, InstRuneAny, InstRuneAnyNotNL: |
||||
op = InstRune |
||||
} |
||||
return op |
||||
} |
||||
|
||||
// Prefix returns a literal string that all matches for the
|
||||
// regexp must start with. Complete is true if the prefix
|
||||
// is the entire match.
|
||||
func (p *Prog) Prefix() (prefix string, complete bool) { |
||||
prefix, complete, foldCase := p.PrefixAndCase() |
||||
if foldCase { |
||||
return "", false |
||||
} |
||||
return prefix, complete |
||||
} |
||||
|
||||
// Prefix returns a literal string that all matches for the
|
||||
// regexp must start with. Complete is true if the prefix
|
||||
// is the entire match. FoldCase is true if the string should
|
||||
// match in upper or lower case.
|
||||
func (p *Prog) PrefixAndCase() (prefix string, complete bool, foldCase bool) { |
||||
i := &p.Inst[p.Start] |
||||
// Skip any no-op, capturing or begin-text instructions
|
||||
for i.Op == InstNop || i.Op == InstCapture || (i.Op == InstEmptyWidth && EmptyOp(i.Arg)&EmptyBeginText != 0) { |
||||
i = &p.Inst[i.Out] |
||||
} |
||||
|
||||
// Avoid allocation of buffer if prefix is empty.
|
||||
if i.op() != InstRune || len(i.Rune) != 1 { |
||||
return "", i.Op == InstMatch, false |
||||
} |
||||
|
||||
// Have prefix; gather characters.
|
||||
var buf strings.Builder |
||||
foldCase = (Flags(i.Arg)&FoldCase != 0) |
||||
for i.op() == InstRune && len(i.Rune) == 1 && (Flags(i.Arg)&FoldCase != 0) == foldCase { |
||||
buf.WriteRune(i.Rune[0]) |
||||
i = p.skipNop(i.Out) |
||||
} |
||||
return buf.String(), i.Op == InstMatch, foldCase |
||||
} |
||||
|
||||
// StartCond returns the leading empty-width conditions that must
|
||||
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
|
||||
func (p *Prog) StartCond() EmptyOp { |
||||
var flag EmptyOp |
||||
pc := uint32(p.Start) |
||||
i := &p.Inst[pc] |
||||
Loop: |
||||
for { |
||||
switch i.Op { |
||||
case InstEmptyWidth: |
||||
flag |= EmptyOp(i.Arg) |
||||
case InstFail: |
||||
return ^EmptyOp(0) |
||||
case InstCapture, InstNop: |
||||
// skip
|
||||
default: |
||||
break Loop |
||||
} |
||||
pc = i.Out |
||||
i = &p.Inst[pc] |
||||
} |
||||
return flag |
||||
} |
||||
|
||||
const noMatch = -1 |
||||
|
||||
// MatchRune reports whether the instruction matches (and consumes) r.
|
||||
// It should only be called when i.Op == InstRune.
|
||||
func (i *Inst) MatchRune(r rune) bool { |
||||
return i.MatchRunePos(r) != noMatch |
||||
} |
||||
|
||||
// MatchRunePos checks whether the instruction matches (and consumes) r.
|
||||
// If so, MatchRunePos returns the index of the matching rune pair
|
||||
// (or, when len(i.Rune) == 1, rune singleton).
|
||||
// If not, MatchRunePos returns -1.
|
||||
// MatchRunePos should only be called when i.Op == InstRune.
|
||||
func (i *Inst) MatchRunePos(r rune) int { |
||||
rune := i.Rune |
||||
|
||||
switch len(rune) { |
||||
case 0: |
||||
return noMatch |
||||
|
||||
case 1: |
||||
// Special case: single-rune slice is from literal string, not char class.
|
||||
r0 := rune[0] |
||||
if r == r0 { |
||||
return 0 |
||||
} |
||||
if Flags(i.Arg)&FoldCase != 0 { |
||||
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { |
||||
if r == r1 { |
||||
return 0 |
||||
} |
||||
} |
||||
} |
||||
return noMatch |
||||
|
||||
case 2: |
||||
if r >= rune[0] && r <= rune[1] { |
||||
return 0 |
||||
} |
||||
return noMatch |
||||
|
||||
case 4, 6, 8: |
||||
// Linear search for a few pairs.
|
||||
// Should handle ASCII well.
|
||||
for j := 0; j < len(rune); j += 2 { |
||||
if r < rune[j] { |
||||
return noMatch |
||||
} |
||||
if r <= rune[j+1] { |
||||
return j / 2 |
||||
} |
||||
} |
||||
return noMatch |
||||
} |
||||
|
||||
// Otherwise binary search.
|
||||
lo := 0 |
||||
hi := len(rune) / 2 |
||||
for lo < hi { |
||||
m := lo + (hi-lo)/2 |
||||
if c := rune[2*m]; c <= r { |
||||
if r <= rune[2*m+1] { |
||||
return m |
||||
} |
||||
lo = m + 1 |
||||
} else { |
||||
hi = m |
||||
} |
||||
} |
||||
return noMatch |
||||
} |
||||
|
||||
// MatchEmptyWidth reports whether the instruction matches
|
||||
// an empty string between the runes before and after.
|
||||
// It should only be called when i.Op == InstEmptyWidth.
|
||||
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { |
||||
switch EmptyOp(i.Arg) { |
||||
case EmptyBeginLine: |
||||
return before == '\n' || before == -1 |
||||
case EmptyEndLine: |
||||
return after == '\n' || after == -1 |
||||
case EmptyBeginText: |
||||
return before == -1 |
||||
case EmptyEndText: |
||||
return after == -1 |
||||
case EmptyWordBoundary: |
||||
return IsWordChar(before) != IsWordChar(after) |
||||
case EmptyNoWordBoundary: |
||||
return IsWordChar(before) == IsWordChar(after) |
||||
} |
||||
panic("unknown empty width arg") |
||||
} |
||||
|
||||
func (i *Inst) String() string { |
||||
var b strings.Builder |
||||
dumpInst(&b, i) |
||||
return b.String() |
||||
} |
||||
|
||||
// bw appends each of args to b, in order.
func bw(b *strings.Builder, args ...string) {
	for k := 0; k < len(args); k++ {
		b.WriteString(args[k])
	}
}
||||
|
||||
func dumpProg(b *strings.Builder, p *Prog) { |
||||
for j := range p.Inst { |
||||
i := &p.Inst[j] |
||||
pc := strconv.Itoa(j) |
||||
if len(pc) < 3 { |
||||
b.WriteString(" "[len(pc):]) |
||||
} |
||||
if j == p.Start { |
||||
pc += "*" |
||||
} |
||||
bw(b, pc, "\t") |
||||
dumpInst(b, i) |
||||
bw(b, "\n") |
||||
} |
||||
} |
||||
|
||||
// u32 formats i in decimal.
func u32(i uint32) string {
	wide := uint64(i)
	return strconv.FormatUint(wide, 10)
}
||||
|
||||
func dumpInst(b *strings.Builder, i *Inst) { |
||||
switch i.Op { |
||||
case InstAlt: |
||||
bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) |
||||
case InstAltMatch: |
||||
bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) |
||||
case InstCapture: |
||||
bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) |
||||
case InstEmptyWidth: |
||||
bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) |
||||
case InstMatch: |
||||
bw(b, "match") |
||||
case InstFail: |
||||
bw(b, "fail") |
||||
case InstNop: |
||||
bw(b, "nop -> ", u32(i.Out)) |
||||
case InstRune: |
||||
if i.Rune == nil { |
||||
// shouldn't happen
|
||||
bw(b, "rune <nil>") |
||||
} |
||||
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) |
||||
if Flags(i.Arg)&FoldCase != 0 { |
||||
bw(b, "/i") |
||||
} |
||||
bw(b, " -> ", u32(i.Out)) |
||||
case InstRune1: |
||||
bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) |
||||
case InstRuneAny: |
||||
bw(b, "any -> ", u32(i.Out)) |
||||
case InstRuneAnyNotNL: |
||||
bw(b, "anynotnl -> ", u32(i.Out)) |
||||
} |
||||
} |
@ -0,0 +1,320 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax |
||||
|
||||
// Note to implementers:
|
||||
// In this package, re is always a *Regexp and r is always a rune.
|
||||
|
||||
import ( |
||||
"strconv" |
||||
"strings" |
||||
"unicode" |
||||
) |
||||
|
||||
// A Regexp is a node in a regular expression syntax tree.
|
||||
type Regexp struct { |
||||
Op Op // operator
|
||||
Flags Flags |
||||
Sub []*Regexp // subexpressions, if any
|
||||
Sub0 [1]*Regexp // storage for short Sub
|
||||
Rune []rune // matched runes, for OpLiteral, OpCharClass
|
||||
Rune0 [2]rune // storage for short Rune
|
||||
Min, Max int // min, max for OpRepeat
|
||||
Cap int // capturing index, for OpCapture
|
||||
Name string // capturing name, for OpCapture
|
||||
} |
||||
|
||||
//go:generate stringer -type Op -trimprefix Op
|
||||
|
||||
// An Op identifies the operator of a Regexp node.
type Op uint8

// The operators appear in precedence order, from tightest binding to
// weakest. Among the character-class operators the order runs from
// simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).

const (
	OpNoMatch        Op = 1 + iota // matches no strings
	OpEmptyMatch                   // matches empty string
	OpLiteral                      // matches Runes sequence
	OpCharClass                    // matches Runes interpreted as range pair list
	OpAnyCharNotNL                 // matches any character except newline
	OpAnyChar                      // matches any character
	OpBeginLine                    // matches empty string at beginning of line
	OpEndLine                      // matches empty string at end of line
	OpBeginText                    // matches empty string at beginning of text
	OpEndText                      // matches empty string at end of text
	OpWordBoundary                 // matches word boundary `\b`
	OpNoWordBoundary               // matches word non-boundary `\B`
	OpCapture                      // capturing subexpression with index Cap, optional name Name
	OpStar                         // matches Sub[0] zero or more times
	OpPlus                         // matches Sub[0] one or more times
	OpQuest                        // matches Sub[0] zero or one times
	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
	OpConcat                       // matches concatenation of Subs
	OpAlternate                    // matches alternation of Subs
)

const opPseudo Op = 128 // where pseudo-ops start
|
||||
|
||||
// Equal reports whether x and y have identical structure.
|
||||
func (x *Regexp) Equal(y *Regexp) bool { |
||||
if x == nil || y == nil { |
||||
return x == y |
||||
} |
||||
if x.Op != y.Op { |
||||
return false |
||||
} |
||||
switch x.Op { |
||||
case OpEndText: |
||||
// The parse flags remember whether this is \z or \Z.
|
||||
if x.Flags&WasDollar != y.Flags&WasDollar { |
||||
return false |
||||
} |
||||
|
||||
case OpLiteral, OpCharClass: |
||||
if len(x.Rune) != len(y.Rune) { |
||||
return false |
||||
} |
||||
for i, r := range x.Rune { |
||||
if r != y.Rune[i] { |
||||
return false |
||||
} |
||||
} |
||||
|
||||
case OpAlternate, OpConcat: |
||||
if len(x.Sub) != len(y.Sub) { |
||||
return false |
||||
} |
||||
for i, sub := range x.Sub { |
||||
if !sub.Equal(y.Sub[i]) { |
||||
return false |
||||
} |
||||
} |
||||
|
||||
case OpStar, OpPlus, OpQuest: |
||||
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { |
||||
return false |
||||
} |
||||
|
||||
case OpRepeat: |
||||
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { |
||||
return false |
||||
} |
||||
|
||||
case OpCapture: |
||||
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { |
||||
return false |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
||||
func writeRegexp(b *strings.Builder, re *Regexp) { |
||||
switch re.Op { |
||||
default: |
||||
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") |
||||
case OpNoMatch: |
||||
b.WriteString(`[^\x00-\x{10FFFF}]`) |
||||
case OpEmptyMatch: |
||||
b.WriteString(`(?:)`) |
||||
case OpLiteral: |
||||
if re.Flags&FoldCase != 0 { |
||||
b.WriteString(`(?i:`) |
||||
} |
||||
for _, r := range re.Rune { |
||||
escape(b, r, false) |
||||
} |
||||
if re.Flags&FoldCase != 0 { |
||||
b.WriteString(`)`) |
||||
} |
||||
case OpCharClass: |
||||
if len(re.Rune)%2 != 0 { |
||||
b.WriteString(`[invalid char class]`) |
||||
break |
||||
} |
||||
b.WriteRune('[') |
||||
if len(re.Rune) == 0 { |
||||
b.WriteString(`^\x00-\x{10FFFF}`) |
||||
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { |
||||
// Contains 0 and MaxRune. Probably a negated class.
|
||||
// Print the gaps.
|
||||
b.WriteRune('^') |
||||
for i := 1; i < len(re.Rune)-1; i += 2 { |
||||
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 |
||||
escape(b, lo, lo == '-') |
||||
if lo != hi { |
||||
b.WriteRune('-') |
||||
escape(b, hi, hi == '-') |
||||
} |
||||
} |
||||
} else { |
||||
for i := 0; i < len(re.Rune); i += 2 { |
||||
lo, hi := re.Rune[i], re.Rune[i+1] |
||||
escape(b, lo, lo == '-') |
||||
if lo != hi { |
||||
b.WriteRune('-') |
||||
escape(b, hi, hi == '-') |
||||
} |
||||
} |
||||
} |
||||
b.WriteRune(']') |
||||
case OpAnyCharNotNL: |
||||
b.WriteString(`(?-s:.)`) |
||||
case OpAnyChar: |
||||
b.WriteString(`(?s:.)`) |
||||
case OpBeginLine: |
||||
b.WriteString(`(?m:^)`) |
||||
case OpEndLine: |
||||
b.WriteString(`(?m:$)`) |
||||
case OpBeginText: |
||||
b.WriteString(`\A`) |
||||
case OpEndText: |
||||
if re.Flags&WasDollar != 0 { |
||||
b.WriteString(`(?-m:$)`) |
||||
} else { |
||||
b.WriteString(`\z`) |
||||
} |
||||
case OpWordBoundary: |
||||
b.WriteString(`\b`) |
||||
case OpNoWordBoundary: |
||||
b.WriteString(`\B`) |
||||
case OpCapture: |
||||
if re.Name != "" { |
||||
b.WriteString(`(?P<`) |
||||
b.WriteString(re.Name) |
||||
b.WriteRune('>') |
||||
} else { |
||||
b.WriteRune('(') |
||||
} |
||||
if re.Sub[0].Op != OpEmptyMatch { |
||||
writeRegexp(b, re.Sub[0]) |
||||
} |
||||
b.WriteRune(')') |
||||
case OpStar, OpPlus, OpQuest, OpRepeat: |
||||
if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { |
||||
b.WriteString(`(?:`) |
||||
writeRegexp(b, sub) |
||||
b.WriteString(`)`) |
||||
} else { |
||||
writeRegexp(b, sub) |
||||
} |
||||
switch re.Op { |
||||
case OpStar: |
||||
b.WriteRune('*') |
||||
case OpPlus: |
||||
b.WriteRune('+') |
||||
case OpQuest: |
||||
b.WriteRune('?') |
||||
case OpRepeat: |
||||
b.WriteRune('{') |
||||
b.WriteString(strconv.Itoa(re.Min)) |
||||
if re.Max != re.Min { |
||||
b.WriteRune(',') |
||||
if re.Max >= 0 { |
||||
b.WriteString(strconv.Itoa(re.Max)) |
||||
} |
||||
} |
||||
b.WriteRune('}') |
||||
} |
||||
if re.Flags&NonGreedy != 0 { |
||||
b.WriteRune('?') |
||||
} |
||||
case OpConcat: |
||||
for _, sub := range re.Sub { |
||||
if sub.Op == OpAlternate { |
||||
b.WriteString(`(?:`) |
||||
writeRegexp(b, sub) |
||||
b.WriteString(`)`) |
||||
} else { |
||||
writeRegexp(b, sub) |
||||
} |
||||
} |
||||
case OpAlternate: |
||||
for i, sub := range re.Sub { |
||||
if i > 0 { |
||||
b.WriteRune('|') |
||||
} |
||||
writeRegexp(b, sub) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func (re *Regexp) String() string { |
||||
var b strings.Builder |
||||
writeRegexp(&b, re) |
||||
return b.String() |
||||
} |
||||
|
||||
// meta lists the runes that must be backslash-escaped to be matched
// literally.
const meta = `\.+*?()|[]{}^$`

// escape appends r to b in a form that matches r literally.
// Printable runes are written as themselves, preceded by a backslash
// when they are in meta or when force is set. The control characters
// \a \f \n \r \t \v use their named escapes; any other rune is written
// as \xNN (below 0x100) or \x{N...} in hexadecimal.
func escape(b *strings.Builder, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.ContainsRune(meta, r) {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	switch r {
	case '\a':
		b.WriteString(`\a`)
		return
	case '\f':
		b.WriteString(`\f`)
		return
	case '\n':
		b.WriteString(`\n`)
		return
	case '\r':
		b.WriteString(`\r`)
		return
	case '\t':
		b.WriteString(`\t`)
		return
	case '\v':
		b.WriteString(`\v`)
		return
	}

	hex := strconv.FormatInt(int64(r), 16)
	switch {
	case r >= 0x100:
		b.WriteString(`\x{` + hex + `}`)
	case len(hex) == 1:
		b.WriteString(`\x0` + hex)
	default:
		b.WriteString(`\x` + hex)
	}
}
||||
|
||||
// MaxCap walks the regexp to find the maximum capture index.
|
||||
func (re *Regexp) MaxCap() int { |
||||
m := 0 |
||||
if re.Op == OpCapture { |
||||
m = re.Cap |
||||
} |
||||
for _, sub := range re.Sub { |
||||
if n := sub.MaxCap(); m < n { |
||||
m = n |
||||
} |
||||
} |
||||
return m |
||||
} |
||||
|
||||
// CapNames walks the regexp to find the names of capturing groups.
|
||||
func (re *Regexp) CapNames() []string { |
||||
names := make([]string, re.MaxCap()+1) |
||||
re.capNames(names) |
||||
return names |
||||
} |
||||
|
||||
func (re *Regexp) capNames(names []string) { |
||||
if re.Op == OpCapture { |
||||
names[re.Cap] = re.Name |
||||
} |
||||
for _, sub := range re.Sub { |
||||
sub.capNames(names) |
||||
} |
||||
} |
@ -0,0 +1,151 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax |
||||
|
||||
// Simplify returns a regexp equivalent to re but without counted repetitions
|
||||
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
|
||||
// The resulting regexp will execute correctly but its string representation
|
||||
// will not produce the same parse tree, because capturing parentheses
|
||||
// may have been duplicated or removed. For example, the simplified form
|
||||
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
|
||||
// The returned regexp may share structure with or be the original.
|
||||
func (re *Regexp) Simplify() *Regexp { |
||||
if re == nil { |
||||
return nil |
||||
} |
||||
switch re.Op { |
||||
case OpCapture, OpConcat, OpAlternate: |
||||
// Simplify children, building new Regexp if children change.
|
||||
nre := re |
||||
for i, sub := range re.Sub { |
||||
nsub := sub.Simplify() |
||||
if nre == re && nsub != sub { |
||||
// Start a copy.
|
||||
nre = new(Regexp) |
||||
*nre = *re |
||||
nre.Rune = nil |
||||
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...) |
||||
} |
||||
if nre != re { |
||||
nre.Sub = append(nre.Sub, nsub) |
||||
} |
||||
} |
||||
return nre |
||||
|
||||
case OpStar, OpPlus, OpQuest: |
||||
sub := re.Sub[0].Simplify() |
||||
return simplify1(re.Op, re.Flags, sub, re) |
||||
|
||||
case OpRepeat: |
||||
// Special special case: x{0} matches the empty string
|
||||
// and doesn't even need to consider x.
|
||||
if re.Min == 0 && re.Max == 0 { |
||||
return &Regexp{Op: OpEmptyMatch} |
||||
} |
||||
|
||||
// The fun begins.
|
||||
sub := re.Sub[0].Simplify() |
||||
|
||||
// x{n,} means at least n matches of x.
|
||||
if re.Max == -1 { |
||||
// Special case: x{0,} is x*.
|
||||
if re.Min == 0 { |
||||
return simplify1(OpStar, re.Flags, sub, nil) |
||||
} |
||||
|
||||
// Special case: x{1,} is x+.
|
||||
if re.Min == 1 { |
||||
return simplify1(OpPlus, re.Flags, sub, nil) |
||||
} |
||||
|
||||
// General case: x{4,} is xxxx+.
|
||||
nre := &Regexp{Op: OpConcat} |
||||
nre.Sub = nre.Sub0[:0] |
||||
for i := 0; i < re.Min-1; i++ { |
||||
nre.Sub = append(nre.Sub, sub) |
||||
} |
||||
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil)) |
||||
return nre |
||||
} |
||||
|
||||
// Special case x{0} handled above.
|
||||
|
||||
// Special case: x{1} is just x.
|
||||
if re.Min == 1 && re.Max == 1 { |
||||
return sub |
||||
} |
||||
|
||||
// General case: x{n,m} means n copies of x and m copies of x?
|
||||
// The machine will do less work if we nest the final m copies,
|
||||
// so that x{2,5} = xx(x(x(x)?)?)?
|
||||
|
||||
// Build leading prefix: xx.
|
||||
var prefix *Regexp |
||||
if re.Min > 0 { |
||||
prefix = &Regexp{Op: OpConcat} |
||||
prefix.Sub = prefix.Sub0[:0] |
||||
for i := 0; i < re.Min; i++ { |
||||
prefix.Sub = append(prefix.Sub, sub) |
||||
} |
||||
} |
||||
|
||||
// Build and attach suffix: (x(x(x)?)?)?
|
||||
if re.Max > re.Min { |
||||
suffix := simplify1(OpQuest, re.Flags, sub, nil) |
||||
for i := re.Min + 1; i < re.Max; i++ { |
||||
nre2 := &Regexp{Op: OpConcat} |
||||
nre2.Sub = append(nre2.Sub0[:0], sub, suffix) |
||||
suffix = simplify1(OpQuest, re.Flags, nre2, nil) |
||||
} |
||||
if prefix == nil { |
||||
return suffix |
||||
} |
||||
prefix.Sub = append(prefix.Sub, suffix) |
||||
} |
||||
if prefix != nil { |
||||
return prefix |
||||
} |
||||
|
||||
// Some degenerate case like min > max or min < max < 0.
|
||||
// Handle as impossible match.
|
||||
return &Regexp{Op: OpNoMatch} |
||||
} |
||||
|
||||
return re |
||||
} |
||||
|
||||
// simplify1 implements Simplify for the unary OpStar,
|
||||
// OpPlus, and OpQuest operators. It returns the simple regexp
|
||||
// equivalent to
|
||||
//
|
||||
// Regexp{Op: op, Flags: flags, Sub: {sub}}
|
||||
//
|
||||
// under the assumption that sub is already simple, and
|
||||
// without first allocating that structure. If the regexp
|
||||
// to be returned turns out to be equivalent to re, simplify1
|
||||
// returns re instead.
|
||||
//
|
||||
// simplify1 is factored out of Simplify because the implementation
|
||||
// for other operators generates these unary expressions.
|
||||
// Letting them call simplify1 makes sure the expressions they
|
||||
// generate are simple.
|
||||
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp { |
||||
// Special case: repeat the empty string as much as
|
||||
// you want, but it's still the empty string.
|
||||
if sub.Op == OpEmptyMatch { |
||||
return sub |
||||
} |
||||
// The operators are idempotent if the flags match.
|
||||
if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy { |
||||
return sub |
||||
} |
||||
if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] { |
||||
return re |
||||
} |
||||
|
||||
re = &Regexp{Op: op, Flags: flags} |
||||
re.Sub = append(re.Sub0[:0], sub) |
||||
return re |
||||
} |
Loading…
Reference in new issue