filters: use faster regexp package (#5315)

* regexp-filter: improve benchmark

Add a few more cases based on real-world usage.

Also simplify the loop to run just the number of iterations requested.
Go benchmarks run the body 'b.N' times, scaling N until the run reaches a
target duration (1 second by default). Previously the benchmark was running
N*1000000 times, which took a minimum of 7 seconds for some cases.
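
For context, a minimal sketch of the loop shape the simplified benchmark follows; the `dummyFilter` type and the log line are invented for illustration and are not part of the Loki test code:

```go
package filterbench

import (
	"bytes"
	"testing"
)

// dummyFilter stands in for Loki's line filters in this sketch.
type dummyFilter struct{ needle []byte }

func (f dummyFilter) Filter(line []byte) bool { return bytes.Contains(line, f.needle) }

// The benchmark body runs exactly b.N times; the testing framework grows N
// between runs until the measured time reaches the -benchtime target
// (1 second by default), so no hand-rolled inner loop is needed.
func BenchmarkFilter(b *testing.B) {
	f := dummyFilter{needle: []byte("buzz")}
	line := []byte(`level=info msg="something went buzz"`)
	var m bool
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		m = f.Filter(line)
	}
	_ = m // keep the result live so the call is not optimised away
}
```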

* regexp filter: use modified package with optimisations

See https://github.com/grafana/regexp/tree/speedup#readme

Includes the following changes proposed upstream:
* [regexp: allow patterns with no alternates to be one-pass](https://go-review.googlesource.com/c/go/+/353711)
* [regexp: speed up onepass prefix check](https://go-review.googlesource.com/c/go/+/354909)
* [regexp: handle prefix string with fold-case](https://go-review.googlesource.com/c/go/+/358756)
* [regexp: avoid copying each instruction executed](https://go-review.googlesource.com/c/go/+/355789)
* [regexp: allow prefix string anchored at beginning](https://go-review.googlesource.com/c/go/+/377294)

* Add grafana/regexp to vendor directory
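
Because the fork keeps the standard library's exported `regexp` API, adopting it is purely an import-path change. A minimal, self-contained sketch of that usage (the pattern and log line below are invented for illustration, not taken from Loki):

```go
package main

import (
	"fmt"

	// Drop-in replacement for the standard library "regexp" package:
	// same exported API and match semantics, with the optimisations
	// listed above (one-pass, prefix and fold-case handling).
	"github.com/grafana/regexp"
)

func main() {
	// An anchored pattern with a literal prefix, the kind of expression
	// the upstream prefix optimisations target.
	re := regexp.MustCompile(`^level=(error|warn)`)
	fmt.Println(re.MatchString(`level=error msg="boom"`)) // true
	fmt.Println(re.MatchString(`level=info msg="ok"`))    // false
}
```

The Loki side of this PR is exactly that swap: the `regexp` and `regexp/syntax` imports in `pkg/logql/log` are replaced with their `github.com/grafana/regexp` equivalents, as shown in the diffs below.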
Author: Bryan Boreham
Commit: a50cac7674 (parent: f598484a94)
Showing 24 changed files (lines changed in parentheses):

1. CHANGELOG.md (1)
2. go.mod (1)
3. go.sum (2)
4. pkg/logql/log/filter.go (5)
5. pkg/logql/log/filter_test.go (12)
6. pkg/logql/log/fmt.go (2)
7. pkg/logql/log/parser.go (2)
8. vendor/github.com/grafana/regexp/.gitignore (15)
9. vendor/github.com/grafana/regexp/LICENSE (27)
10. vendor/github.com/grafana/regexp/README.md (19)
11. vendor/github.com/grafana/regexp/backtrack.go (369)
12. vendor/github.com/grafana/regexp/exec.go (579)
13. vendor/github.com/grafana/regexp/onepass.go (517)
14. vendor/github.com/grafana/regexp/regexp.go (1310)
15. vendor/github.com/grafana/regexp/syntax/compile.go (296)
16. vendor/github.com/grafana/regexp/syntax/doc.go (132)
17. vendor/github.com/grafana/regexp/syntax/make_perl_groups.pl (113)
18. vendor/github.com/grafana/regexp/syntax/op_string.go (26)
19. vendor/github.com/grafana/regexp/syntax/parse.go (1910)
20. vendor/github.com/grafana/regexp/syntax/perl_groups.go (134)
21. vendor/github.com/grafana/regexp/syntax/prog.go (363)
22. vendor/github.com/grafana/regexp/syntax/regexp.go (320)
23. vendor/github.com/grafana/regexp/syntax/simplify.go (151)
24. vendor/modules.txt (4)

CHANGELOG.md
@@ -53,6 +53,7 @@
* [5077](https://github.com/grafana/loki/pull/5077) **trevorwhitney**: Change some default values for better out-of-the-box performance
* [5204](https://github.com/grafana/loki/pull/5204) **trevorwhitney**: Default `max_outstanding_per_tenant` to `2048`
* [5253](https://github.com/grafana/loki/pull/5253) **Juneezee**: refactor: use `T.TempDir` to create temporary test directory
+* [5315](https://github.com/grafana/loki/pull/5315) **bboreham**: filters: use faster regexp package
# 2.4.1 (2021/11/07)

go.mod
@@ -288,6 +288,7 @@ require (
github.com/cloudflare/cloudflare-go v0.27.0
github.com/gofrs/flock v0.7.1 // indirect
github.com/gogo/status v1.1.0
+github.com/grafana/regexp v0.0.0-20220202152701-6a046c4caf32
github.com/oklog/ulid v1.3.1
)

go.sum
@@ -1039,6 +1039,8 @@ github.com/grafana/go-gelf v0.0.0-20211112153804-126646b86de8 h1:aEOagXOTqtN9gd4
github.com/grafana/go-gelf v0.0.0-20211112153804-126646b86de8/go.mod h1:QAvS2C7TtQRhhv9Uf/sxD+BUhpkrPFm5jK/9MzUiDCY=
github.com/grafana/gocql v0.0.0-20200605141915-ba5dc39ece85 h1:xLuzPoOzdfNb/RF/IENCw+oLVdZB4G21VPhkHBgwSHY=
github.com/grafana/gocql v0.0.0-20200605141915-ba5dc39ece85/go.mod h1:crI9WX6p0IhrqB+DqIUHulRW853PaNFf7o4UprV//3I=
+github.com/grafana/regexp v0.0.0-20220202152701-6a046c4caf32 h1:M3wP8Hwic62qJsiydSgXtev03d4f92uN1I52nVjRgw0=
+github.com/grafana/regexp v0.0.0-20220202152701-6a046c4caf32/go.mod h1:M5qHK+eWfAv8VR/265dIuEpL3fNfeC21tXXp9itM24A=
github.com/grafana/tail v0.0.0-20201004203643-7aa4e4a91f03 h1:fGgFrAraMB0BaPfYumu+iulfDXwHm+GFyHA4xEtBqI8=
github.com/grafana/tail v0.0.0-20201004203643-7aa4e4a91f03/go.mod h1:GIMXMPB/lRAllP5rVDvcGif87ryO2hgD7tCtHMdHrho=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=

pkg/logql/log/filter.go
@@ -3,11 +3,12 @@ package log
import (
"bytes"
"fmt"
-"regexp"
-"regexp/syntax"
"unicode"
"unicode/utf8"
+"github.com/grafana/regexp"
+"github.com/grafana/regexp/syntax"
"github.com/prometheus/prometheus/model/labels"
)

pkg/logql/log/filter_test.go
@@ -156,6 +156,10 @@ func Benchmark_LineFilter(b *testing.B) {
{".*foo.*|bar|uzz"},
{"((f.*)|foobar.*)|.*buzz"},
{"(?P<foo>.*foo.*|bar)"},
+{"(\\s|\")+(?i)bar"},
+{"(node:24) buzz*"},
+{"(HTTP/.*\\\"|HEAD|GET) (2..|5..)"},
+{"\"@l\":\"(Warning|Error|Fatal)\""},
} {
benchmarkRegex(b, test.re, logline, true)
benchmarkRegex(b, test.re, logline, false)
@@ -180,16 +184,12 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) {
b.ResetTimer()
b.Run(fmt.Sprintf("default_%v_%s", match, re), func(b *testing.B) {
for i := 0; i < b.N; i++ {
-for j := 0; j < 1e6; j++ {
-m = d.Filter(l)
-}
+m = d.Filter(l)
}
})
b.Run(fmt.Sprintf("simplified_%v_%s", match, re), func(b *testing.B) {
for i := 0; i < b.N; i++ {
-for j := 0; j < 1e6; j++ {
-m = s.Filter(l)
-}
+m = s.Filter(l)
}
})
res = m

pkg/logql/log/fmt.go
@@ -3,12 +3,12 @@ package log
import (
"bytes"
"fmt"
-"regexp"
"strings"
"text/template"
"text/template/parse"
"github.com/Masterminds/sprig/v3"
+"github.com/grafana/regexp"
"github.com/grafana/loki/pkg/logqlmodel"
)

pkg/logql/log/parser.go
@@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"io"
-"regexp"
"strings"
"unicode/utf8"
@@ -14,6 +13,7 @@ import (
"github.com/grafana/loki/pkg/logql/log/pattern"
"github.com/grafana/loki/pkg/logqlmodel"
+"github.com/grafana/regexp"
jsoniter "github.com/json-iterator/go"
"github.com/prometheus/common/model"
)

vendor/github.com/grafana/regexp/.gitignore (new file)
@@ -0,0 +1,15 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, built with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/

vendor/github.com/grafana/regexp/LICENSE (new file)
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vendor/github.com/grafana/regexp/README.md (new file)
@@ -0,0 +1,19 @@
# Grafana Go regexp package
This repo is a fork of the upstream Go `regexp` package, with some code optimisations to make it run faster.
All the optimisations have been submitted upstream, but not yet merged.
All semantics are the same, and the optimised code passes all tests from upstream.
The `main` branch is non-optimised: switch over to [`speedup`](https://github.com/grafana/regexp/tree/speedup) branch for the improved code.
## Benchmarks:
![image](https://user-images.githubusercontent.com/8125524/152182951-856549ed-6044-4285-b799-69b31f598e32.png)
## Links to upstream changes:
* [regexp: allow patterns with no alternates to be one-pass](https://go-review.googlesource.com/c/go/+/353711)
* [regexp: speed up onepass prefix check](https://go-review.googlesource.com/c/go/+/354909)
* [regexp: handle prefix string with fold-case](https://go-review.googlesource.com/c/go/+/358756)
* [regexp: avoid copying each instruction executed](https://go-review.googlesource.com/c/go/+/355789)
* [regexp: allow prefix string anchored at beginning](https://go-review.googlesource.com/c/go/+/377294)

vendor/github.com/grafana/regexp/backtrack.go (new file)
@@ -0,0 +1,369 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// backtrack is a regular expression search with submatch
// tracking for small regular expressions and texts. It allocates
// a bit vector with (length of input) * (length of prog) bits,
// to make sure it never explores the same (character position, instruction)
// state multiple times. This limits the search to run in time linear in
// the length of the text.
//
// backtrack is a fast replacement for the NFA code on small
// regexps when onepass cannot be used.
package regexp
import (
"sync"
"github.com/grafana/regexp/syntax"
)
// A job is an entry on the backtracker's job stack. It holds
// the instruction pc and the position in the input.
type job struct {
pc uint32
arg bool
pos int
}
const (
visitedBits = 32
maxBacktrackProg = 500 // len(prog.Inst) <= max
maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
)
// bitState holds state for the backtracker.
type bitState struct {
end int
cap []int
matchcap []int
jobs []job
visited []uint32
inputs inputs
}
var bitStatePool sync.Pool
func newBitState() *bitState {
b, ok := bitStatePool.Get().(*bitState)
if !ok {
b = new(bitState)
}
return b
}
func freeBitState(b *bitState) {
b.inputs.clear()
bitStatePool.Put(b)
}
// maxBitStateLen returns the maximum length of a string to search with
// the backtracker using prog.
func maxBitStateLen(prog *syntax.Prog) int {
if !shouldBacktrack(prog) {
return 0
}
return maxBacktrackVector / len(prog.Inst)
}
// shouldBacktrack reports whether the program is too
// long for the backtracker to run.
func shouldBacktrack(prog *syntax.Prog) bool {
return len(prog.Inst) <= maxBacktrackProg
}
// reset resets the state of the backtracker.
// end is the end position in the input.
// ncap is the number of captures.
func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
b.end = end
if cap(b.jobs) == 0 {
b.jobs = make([]job, 0, 256)
} else {
b.jobs = b.jobs[:0]
}
visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
if cap(b.visited) < visitedSize {
b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
} else {
b.visited = b.visited[:visitedSize]
for i := range b.visited {
b.visited[i] = 0
}
}
if cap(b.cap) < ncap {
b.cap = make([]int, ncap)
} else {
b.cap = b.cap[:ncap]
}
for i := range b.cap {
b.cap[i] = -1
}
if cap(b.matchcap) < ncap {
b.matchcap = make([]int, ncap)
} else {
b.matchcap = b.matchcap[:ncap]
}
for i := range b.matchcap {
b.matchcap[i] = -1
}
}
// shouldVisit reports whether the combination of (pc, pos) has not
// been visited yet.
func (b *bitState) shouldVisit(pc uint32, pos int) bool {
n := uint(int(pc)*(b.end+1) + pos)
if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
return false
}
b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
return true
}
// push pushes (pc, pos, arg) onto the job stack if it should be
// visited.
func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
// Only check shouldVisit when arg is false.
// When arg is true, we are continuing a previous visit.
if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
}
}
// tryBacktrack runs a backtracking search starting at pos.
func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
longest := re.longest
b.push(re, pc, pos, false)
for len(b.jobs) > 0 {
l := len(b.jobs) - 1
// Pop job off the stack.
pc := b.jobs[l].pc
pos := b.jobs[l].pos
arg := b.jobs[l].arg
b.jobs = b.jobs[:l]
// Optimization: rather than push and pop,
// code that is going to Push and continue
// the loop simply updates ip, p, and arg
// and jumps to CheckAndLoop. We have to
// do the ShouldVisit check that Push
// would have, but we avoid the stack
// manipulation.
goto Skip
CheckAndLoop:
if !b.shouldVisit(pc, pos) {
continue
}
Skip:
inst := &re.prog.Inst[pc]
switch inst.Op {
default:
panic("bad inst")
case syntax.InstFail:
panic("unexpected InstFail")
case syntax.InstAlt:
// Cannot just
// b.push(inst.Out, pos, false)
// b.push(inst.Arg, pos, false)
// If during the processing of inst.Out, we encounter
// inst.Arg via another path, we want to process it then.
// Pushing it here will inhibit that. Instead, re-push
// inst with arg==true as a reminder to push inst.Arg out
// later.
if arg {
// Finished inst.Out; try inst.Arg.
arg = false
pc = inst.Arg
goto CheckAndLoop
} else {
b.push(re, pc, pos, true)
pc = inst.Out
goto CheckAndLoop
}
case syntax.InstAltMatch:
// One opcode consumes runes; the other leads to match.
switch re.prog.Inst[inst.Out].Op {
case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
// inst.Arg is the match.
b.push(re, inst.Arg, pos, false)
pc = inst.Arg
pos = b.end
goto CheckAndLoop
}
// inst.Out is the match - non-greedy
b.push(re, inst.Out, b.end, false)
pc = inst.Out
goto CheckAndLoop
case syntax.InstRune:
r, width := i.step(pos)
if !inst.MatchRune(r) {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRune1:
r, width := i.step(pos)
if r != inst.Rune[0] {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRuneAnyNotNL:
r, width := i.step(pos)
if r == '\n' || r == endOfText {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstRuneAny:
r, width := i.step(pos)
if r == endOfText {
continue
}
pos += width
pc = inst.Out
goto CheckAndLoop
case syntax.InstCapture:
if arg {
// Finished inst.Out; restore the old value.
b.cap[inst.Arg] = pos
continue
} else {
if inst.Arg < uint32(len(b.cap)) {
// Capture pos to register, but save old value.
b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
b.cap[inst.Arg] = pos
}
pc = inst.Out
goto CheckAndLoop
}
case syntax.InstEmptyWidth:
flag := i.context(pos)
if !flag.match(syntax.EmptyOp(inst.Arg)) {
continue
}
pc = inst.Out
goto CheckAndLoop
case syntax.InstNop:
pc = inst.Out
goto CheckAndLoop
case syntax.InstMatch:
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if len(b.cap) == 0 {
return true
}
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
if len(b.cap) > 1 {
b.cap[1] = pos
}
if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
copy(b.matchcap, b.cap)
}
// If going for first match, we're done.
if !longest {
return true
}
// If we used the entire text, no longer match is possible.
if pos == b.end {
return true
}
// Otherwise, continue on in hope of a longer match.
continue
}
}
return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
}
// backtrack runs a backtracking search of prog on the input starting at pos.
func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
startCond := re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return nil
}
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
// Anchored match, past beginning of text.
return nil
}
b := newBitState()
i, end := b.inputs.init(nil, ib, is)
b.reset(re.prog, end, ncap)
// Anchored search must start at the beginning of the input
if startCond&syntax.EmptyBeginText != 0 {
if len(b.cap) > 0 {
b.cap[0] = pos
}
if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
freeBitState(b)
return nil
}
} else {
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is pos <= end, not pos < end.
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
width := -1
for pos <= end && width != 0 {
if len(b.cap) > 0 {
b.cap[0] = pos
}
if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
// Match must be leftmost; done.
goto Match
}
_, width = i.step(pos)
pos += width
if len(re.prefix) > 0 {
// Match requires literal prefix; fast search for next occurrence.
advance := i.index(re, pos)
if advance < 0 {
freeBitState(b)
return nil
}
pos += advance
}
}
freeBitState(b)
return nil
}
Match:
dstCap = append(dstCap, b.matchcap...)
freeBitState(b)
return dstCap
}

vendor/github.com/grafana/regexp/exec.go (new file)
@@ -0,0 +1,579 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"io"
"sync"
"github.com/grafana/regexp/syntax"
)
// A queue is a 'sparse array' holding pending threads of execution.
// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
type queue struct {
sparse []uint32
dense []entry
}
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
pc uint32
t *thread
}
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See https://swtch.com/~rsc/regexp/regexp2.html
type thread struct {
inst *syntax.Inst
cap []int
}
// A machine holds all the state during an NFA simulation for p.
type machine struct {
re *Regexp // corresponding Regexp
p *syntax.Prog // compiled program
q0, q1 queue // two queues for runq, nextq
pool []*thread // pool of available threads
matched bool // whether a match was found
matchcap []int // capture information for the match
inputs inputs
}
type inputs struct {
// cached inputs, to avoid allocation
bytes inputBytes
string inputString
reader inputReader
}
func (i *inputs) newBytes(b []byte) input {
i.bytes.str = b
return &i.bytes
}
func (i *inputs) newString(s string) input {
i.string.str = s
return &i.string
}
func (i *inputs) newReader(r io.RuneReader) input {
i.reader.r = r
i.reader.atEOT = false
i.reader.pos = 0
return &i.reader
}
func (i *inputs) clear() {
// We need to clear 1 of these.
// Avoid the expense of clearing the others (pointer write barrier).
if i.bytes.str != nil {
i.bytes.str = nil
} else if i.reader.r != nil {
i.reader.r = nil
} else {
i.string.str = ""
}
}
func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) {
if r != nil {
return i.newReader(r), 0
}
if b != nil {
return i.newBytes(b), len(b)
}
return i.newString(s), len(s)
}
func (m *machine) init(ncap int) {
for _, t := range m.pool {
t.cap = t.cap[:ncap]
}
m.matchcap = m.matchcap[:ncap]
}
// alloc allocates a new thread with the given instruction.
// It uses the free pool if possible.
func (m *machine) alloc(i *syntax.Inst) *thread {
var t *thread
if n := len(m.pool); n > 0 {
t = m.pool[n-1]
m.pool = m.pool[:n-1]
} else {
t = new(thread)
t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
}
t.inst = i
return t
}
// A lazyFlag is a lazily-evaluated syntax.EmptyOp,
// for checking zero-width flags like ^ $ \A \z \B \b.
// It records the pair of relevant runes and does not
// determine the implied flags until absolutely necessary
// (most of the time, that means never).
type lazyFlag uint64
func newLazyFlag(r1, r2 rune) lazyFlag {
return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2)))
}
func (f lazyFlag) match(op syntax.EmptyOp) bool {
if op == 0 {
return true
}
r1 := rune(f >> 32)
if op&syntax.EmptyBeginLine != 0 {
if r1 != '\n' && r1 >= 0 {
return false
}
op &^= syntax.EmptyBeginLine
}
if op&syntax.EmptyBeginText != 0 {
if r1 >= 0 {
return false
}
op &^= syntax.EmptyBeginText
}
if op == 0 {
return true
}
r2 := rune(f)
if op&syntax.EmptyEndLine != 0 {
if r2 != '\n' && r2 >= 0 {
return false
}
op &^= syntax.EmptyEndLine
}
if op&syntax.EmptyEndText != 0 {
if r2 >= 0 {
return false
}
op &^= syntax.EmptyEndText
}
if op == 0 {
return true
}
if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
op &^= syntax.EmptyWordBoundary
} else {
op &^= syntax.EmptyNoWordBoundary
}
return op == 0
}
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
func (m *machine) match(i input, pos int) bool {
startCond := m.re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return false
}
m.matched = false
for i := range m.matchcap {
m.matchcap[i] = -1
}
runq, nextq := &m.q0, &m.q1
r, r1 := endOfText, endOfText
width, width1 := 0, 0
r, width = i.step(pos)
if r != endOfText {
r1, width1 = i.step(pos + width)
}
var flag lazyFlag
if pos == 0 {
flag = newLazyFlag(-1, r)
} else {
flag = i.context(pos)
}
for {
if len(runq.dense) == 0 {
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
// Anchored match, past beginning of text.
break
}
if m.matched {
// Have match; finished exploring alternatives.
break
}
// Note we don't check foldCase here, because Unicode folding is complicated;
// just let it fall through to EqualFold on the whole string.
if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
advance := i.index(m.re, pos)
if advance < 0 {
break
}
pos += advance
r, width = i.step(pos)
r1, width1 = i.step(pos + width)
}
}
if !m.matched {
if len(m.matchcap) > 0 {
m.matchcap[0] = pos
}
m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
}
flag = newLazyFlag(r, r1)
m.step(runq, nextq, pos, pos+width, r, &flag)
if width == 0 {
break
}
if len(m.matchcap) == 0 && m.matched {
// Found a match and not paying attention
// to where it is, so any match will do.
break
}
pos += width
r, width = r1, width1
if r != endOfText {
r1, width1 = i.step(pos + width)
}
runq, nextq = nextq, runq
}
m.clear(nextq)
return m.matched
}
// clear frees all threads on the thread queue.
func (m *machine) clear(q *queue) {
for _, d := range q.dense {
if d.t != nil {
m.pool = append(m.pool, d.t)
}
}
q.dense = q.dense[:0]
}
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
longest := m.re.longest
for j := 0; j < len(runq.dense); j++ {
d := &runq.dense[j]
t := d.t
if t == nil {
continue
}
if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
m.pool = append(m.pool, t)
continue
}
i := t.inst
add := false
switch i.Op {
default:
panic("bad inst")
case syntax.InstMatch:
if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
t.cap[1] = pos
copy(m.matchcap, t.cap)
}
if !longest {
// First-match mode: cut off all lower-priority threads.
for _, d := range runq.dense[j+1:] {
if d.t != nil {
m.pool = append(m.pool, d.t)
}
}
runq.dense = runq.dense[:0]
}
m.matched = true
case syntax.InstRune:
add = i.MatchRune(c)
case syntax.InstRune1:
add = c == i.Rune[0]
case syntax.InstRuneAny:
add = true
case syntax.InstRuneAnyNotNL:
add = c != '\n'
}
if add {
t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
}
if t != nil {
m.pool = append(m.pool, t)
}
}
runq.dense = runq.dense[:0]
}
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond. pos gives the current position
// in the input.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread {
Again:
if pc == 0 {
return t
}
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
return t
}
j := len(q.dense)
q.dense = q.dense[:j+1]
d := &q.dense[j]
d.t = nil
d.pc = pc
q.sparse[pc] = uint32(j)
i := &m.p.Inst[pc]
switch i.Op {
default:
panic("unhandled")
case syntax.InstFail:
// nothing
case syntax.InstAlt, syntax.InstAltMatch:
t = m.add(q, i.Out, pos, cap, cond, t)
pc = i.Arg
goto Again
case syntax.InstEmptyWidth:
if cond.match(syntax.EmptyOp(i.Arg)) {
pc = i.Out
goto Again
}
case syntax.InstNop:
pc = i.Out
goto Again
case syntax.InstCapture:
if int(i.Arg) < len(cap) {
opos := cap[i.Arg]
cap[i.Arg] = pos
m.add(q, i.Out, pos, cap, cond, nil)
cap[i.Arg] = opos
} else {
pc = i.Out
goto Again
}
case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
if t == nil {
t = m.alloc(i)
} else {
t.inst = i
}
if len(cap) > 0 && &t.cap[0] != &cap[0] {
copy(t.cap, cap)
}
d.t = t
t = nil
}
return t
}
type onePassMachine struct {
inputs inputs
matchcap []int
}
var onePassPool sync.Pool
func newOnePassMachine() *onePassMachine {
m, ok := onePassPool.Get().(*onePassMachine)
if !ok {
m = new(onePassMachine)
}
return m
}
func freeOnePassMachine(m *onePassMachine) {
m.inputs.clear()
onePassPool.Put(m)
}
// doOnePass implements r.doExecute using the one-pass execution engine.
func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
startCond := re.cond
if startCond == ^syntax.EmptyOp(0) { // impossible
return nil
}
m := newOnePassMachine()
matched := false
i, _ := m.inputs.init(ir, ib, is)
r, r1 := endOfText, endOfText
width, width1 := 0, 0
var flag lazyFlag
var pc int
var inst *onePassInst
// If there is a simple literal prefix, skip over it.
if pos == 0 && len(re.prefix) > 0 && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
if !i.hasPrefix(re) {
goto Return
}
pos += len(re.prefix)
pc = int(re.prefixEnd)
} else {
pc = re.onepass.Start
}
if cap(m.matchcap) < ncap {
m.matchcap = make([]int, ncap)
} else {
m.matchcap = m.matchcap[:ncap]
}
for i := range m.matchcap {
m.matchcap[i] = -1
}
r, width = i.step(pos)
if pos == 0 {
flag = newLazyFlag(-1, r)
} else {
flag = i.context(pos)
}
if r != endOfText {
r1, width1 = i.step(pos + width)
}
for {
inst = &re.onepass.Inst[pc]
pc = int(inst.Out)
switch inst.Op {
default:
panic("bad inst")
case syntax.InstMatch:
matched = true
if len(m.matchcap) > 0 {
m.matchcap[0] = 0
m.matchcap[1] = pos
}
goto Return
case syntax.InstRune:
if !inst.MatchRune(r) {
goto Return
}
case syntax.InstRune1:
if r != inst.Rune[0] {
goto Return
}
case syntax.InstRuneAny:
// Nothing
case syntax.InstRuneAnyNotNL:
if r == '\n' {
goto Return
}
// peek at the input rune to see which branch of the Alt to take
case syntax.InstAlt, syntax.InstAltMatch:
pc = int(onePassNext(inst, r))
continue
case syntax.InstFail:
goto Return
case syntax.InstNop:
continue
case syntax.InstEmptyWidth:
if !flag.match(syntax.EmptyOp(inst.Arg)) {
goto Return
}
continue
case syntax.InstCapture:
if int(inst.Arg) < len(m.matchcap) {
m.matchcap[inst.Arg] = pos
}
continue
}
if width == 0 {
break
}
flag = newLazyFlag(r, r1)
pos += width
r, width = r1, width1
if r != endOfText {
r1, width1 = i.step(pos + width)
}
}
Return:
if !matched {
freeOnePassMachine(m)
return nil
}
dstCap = append(dstCap, m.matchcap...)
freeOnePassMachine(m)
return dstCap
}
// doMatch reports whether either r, b or s match the regexp.
func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
return re.doExecute(r, b, s, 0, 0, nil) != nil
}
// doExecute finds the leftmost match in the input, appends the position
// of its subexpressions to dstCap and returns dstCap.
//
// nil is returned if no matches are found and non-nil if matches are found.
func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
if dstCap == nil {
// Make sure 'return dstCap' is non-nil.
dstCap = arrayNoInts[:0:0]
}
if r == nil && len(b)+len(s) < re.minInputLen {
return nil
}
// Check prefix match before allocating data structures
if len(re.prefix) > 0 && r == nil {
if re.cond&syntax.EmptyBeginText != 0 { // anchored
if b != nil && !(&inputBytes{str: b[pos:]}).hasPrefix(re) {
return nil
}
if s != "" && !(&inputString{str: s[pos:]}).hasPrefix(re) {
return nil
}
} else { // non-anchored
var advance int
if b != nil {
advance = (&inputBytes{str: b}).index(re, pos)
} else {
advance = (&inputString{str: s}).index(re, pos)
}
if advance < 0 {
return nil
}
pos += advance
}
}
if re.onepass != nil {
return re.doOnePass(r, b, s, pos, ncap, dstCap)
}
if r == nil && len(b)+len(s) < re.maxBitStateLen {
return re.backtrack(b, s, pos, ncap, dstCap)
}
m := re.get()
i, _ := m.inputs.init(r, b, s)
m.init(ncap)
if !m.match(i, pos) {
re.put(m)
return nil
}
dstCap = append(dstCap, m.matchcap...)
re.put(m)
return dstCap
}
// arrayNoInts is returned by doExecute match if nil dstCap is passed
// to it with ncap=0.
var arrayNoInts [0]int

vendor/github.com/grafana/regexp/onepass.go (new file)
@@ -0,0 +1,517 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"sort"
"strings"
"unicode"
"unicode/utf8"
"github.com/grafana/regexp/syntax"
)
// "One-pass" regexp execution.
// Some regexps can be analyzed to determine that they never need
// backtracking: they are guaranteed to run in one pass over the string
// without bothering to save all the usual NFA state.
// Detect those and execute them more quickly.
// A onePassProg is a compiled one-pass regular expression program.
// It is the same as syntax.Prog except for the use of onePassInst.
type onePassProg struct {
Inst []onePassInst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// A onePassInst is a single instruction in a one-pass regular expression program.
// It is the same as syntax.Inst except for the new 'Next' field.
type onePassInst struct {
syntax.Inst
Next []uint32
}
// OnePassPrefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match. Pc is the index of the last rune instruction
// in the string. The OnePassPrefix skips over the mandatory
// EmptyBeginText
func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, foldCase bool, pc uint32) {
i := &p.Inst[p.Start]
if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
return "", i.Op == syntax.InstMatch, false, uint32(p.Start)
}
pc = i.Out
i = &p.Inst[pc]
for i.Op == syntax.InstNop {
pc = i.Out
i = &p.Inst[pc]
}
// Avoid allocation of buffer if prefix is empty.
if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
return "", i.Op == syntax.InstMatch, false, uint32(p.Start)
}
foldCase = (syntax.Flags(i.Arg)&syntax.FoldCase != 0)
// Have prefix; gather characters.
var buf strings.Builder
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && (syntax.Flags(i.Arg)&syntax.FoldCase != 0) == foldCase && i.Rune[0] != utf8.RuneError {
buf.WriteRune(i.Rune[0])
pc, i = i.Out, &p.Inst[i.Out]
}
if i.Op == syntax.InstEmptyWidth &&
syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
p.Inst[i.Out].Op == syntax.InstMatch {
complete = true
}
return buf.String(), complete, foldCase, pc
}
// OnePassNext selects the next actionable state of the prog, based on the input character.
// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
// One of the alternates may ultimately lead without input to end of line. If the instruction
// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
func onePassNext(i *onePassInst, r rune) uint32 {
next := i.MatchRunePos(r)
if next >= 0 {
return i.Next[next]
}
if i.Op == syntax.InstAltMatch {
return i.Out
}
return 0
}
func iop(i *syntax.Inst) syntax.InstOp {
op := i.Op
switch op {
case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
op = syntax.InstRune
}
return op
}
// Sparse Array implementation is used as a queueOnePass.
type queueOnePass struct {
sparse []uint32
dense []uint32
size, nextIndex uint32
}
func (q *queueOnePass) empty() bool {
return q.nextIndex >= q.size
}
func (q *queueOnePass) next() (n uint32) {
n = q.dense[q.nextIndex]
q.nextIndex++
return
}
func (q *queueOnePass) clear() {
q.size = 0
q.nextIndex = 0
}
func (q *queueOnePass) contains(u uint32) bool {
if u >= uint32(len(q.sparse)) {
return false
}
return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
}
func (q *queueOnePass) insert(u uint32) {
if !q.contains(u) {
q.insertNew(u)
}
}
func (q *queueOnePass) insertNew(u uint32) {
if u >= uint32(len(q.sparse)) {
return
}
q.sparse[u] = q.size
q.dense[q.size] = u
q.size++
}
func newQueue(size int) (q *queueOnePass) {
return &queueOnePass{
sparse: make([]uint32, size),
dense: make([]uint32, size),
}
}
// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
// NextIp array with the single element mergeFailed is returned.
// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
const mergeFailed = uint32(0xffffffff)
var (
noRune = []rune{}
noNext = []uint32{mergeFailed}
)
func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
leftLen := len(*leftRunes)
rightLen := len(*rightRunes)
if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
panic("mergeRuneSets odd length []rune")
}
var (
lx, rx int
)
merged := make([]rune, 0)
next := make([]uint32, 0)
ok := true
defer func() {
if !ok {
merged = nil
next = nil
}
}()
ix := -1
extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
return false
}
merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
*newLow += 2
ix += 2
next = append(next, pc)
return true
}
for lx < leftLen || rx < rightLen {
switch {
case rx >= rightLen:
ok = extend(&lx, leftRunes, leftPC)
case lx >= leftLen:
ok = extend(&rx, rightRunes, rightPC)
case (*rightRunes)[rx] < (*leftRunes)[lx]:
ok = extend(&rx, rightRunes, rightPC)
default:
ok = extend(&lx, leftRunes, leftPC)
}
if !ok {
return noRune, noNext
}
}
return merged, next
}
// cleanupOnePass drops working memory, and restores certain shortcut instructions.
func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
for ix, instOriginal := range original.Inst {
switch instOriginal.Op {
case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
prog.Inst[ix].Next = nil
case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
prog.Inst[ix].Next = nil
prog.Inst[ix] = onePassInst{Inst: instOriginal}
}
}
}
// onePassCopy creates a copy of the original Prog, as we'll be modifying it
func onePassCopy(prog *syntax.Prog) *onePassProg {
p := &onePassProg{
Start: prog.Start,
NumCap: prog.NumCap,
Inst: make([]onePassInst, len(prog.Inst)),
}
for i, inst := range prog.Inst {
p.Inst[i] = onePassInst{Inst: inst}
}
// rewrites one or more common Prog constructs that enable some otherwise
// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
// ip A, that points to ips B & C.
// A:BC + B:DA => A:BC + B:CD
// A:BC + B:DC => A:DC + B:DC
for pc := range p.Inst {
switch p.Inst[pc].Op {
default:
continue
case syntax.InstAlt, syntax.InstAltMatch:
// A:Bx + B:Ay
p_A_Other := &p.Inst[pc].Out
p_A_Alt := &p.Inst[pc].Arg
// make sure a target is another Alt
instAlt := p.Inst[*p_A_Alt]
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
instAlt = p.Inst[*p_A_Alt]
if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
continue
}
}
instOther := p.Inst[*p_A_Other]
// Analyzing both legs pointing to Alts is for another day
if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
// too complicated
continue
}
// simple empty transition loop
// A:BC + B:DA => A:BC + B:DC
p_B_Alt := &p.Inst[*p_A_Alt].Out
p_B_Other := &p.Inst[*p_A_Alt].Arg
patch := false
if instAlt.Out == uint32(pc) {
patch = true
} else if instAlt.Arg == uint32(pc) {
patch = true
p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
}
if patch {
*p_B_Alt = *p_A_Other
}
// empty transition to common target
// A:BC + B:DC => A:DC + B:DC
if *p_A_Other == *p_B_Alt {
*p_A_Alt = *p_B_Other
}
}
}
return p
}
// runeSlice exists to permit sorting the case-folded rune sets.
type runeSlice []rune
func (p runeSlice) Len() int { return len(p) }
func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
// the match engine can always tell which branch to take. The routine may modify
// p if it is turned into a onepass Prog. If it isn't possible for this to be a
// onepass Prog, the Prog nil is returned. makeOnePass is recursive
// to the size of the Prog.
func makeOnePass(p *onePassProg) *onePassProg {
// If the machine is very long, it's not worth the time to check if we can use one pass.
if len(p.Inst) >= 1000 {
return nil
}
var (
instQueue = newQueue(len(p.Inst))
visitQueue = newQueue(len(p.Inst))
check func(uint32, []bool) bool
onePassRunes = make([][]rune, len(p.Inst))
)
// check that paths from Alt instructions are unambiguous, and rebuild the new
// program as a onepass program
check = func(pc uint32, m []bool) (ok bool) {
ok = true
inst := &p.Inst[pc]
if visitQueue.contains(pc) {
return
}
visitQueue.insert(pc)
switch inst.Op {
case syntax.InstAlt, syntax.InstAltMatch:
ok = check(inst.Out, m) && check(inst.Arg, m)
// check no-input paths to InstMatch
matchOut := m[inst.Out]
matchArg := m[inst.Arg]
if matchOut && matchArg {
ok = false
break
}
// Match on empty goes in inst.Out
if matchArg {
inst.Out, inst.Arg = inst.Arg, inst.Out
matchOut, matchArg = matchArg, matchOut
}
if matchOut {
m[pc] = true
inst.Op = syntax.InstAltMatch
}
// build a dispatch operator from the two legs of the alt.
onePassRunes[pc], inst.Next = mergeRuneSets(
&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
ok = false
break
}
case syntax.InstCapture, syntax.InstNop:
ok = check(inst.Out, m)
m[pc] = m[inst.Out]
// pass matching runes back through these no-ops.
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
case syntax.InstEmptyWidth:
ok = check(inst.Out, m)
m[pc] = m[inst.Out]
onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
case syntax.InstMatch, syntax.InstFail:
m[pc] = inst.Op == syntax.InstMatch
case syntax.InstRune:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
if len(inst.Rune) == 0 {
onePassRunes[pc] = []rune{}
inst.Next = []uint32{inst.Out}
break
}
runes := make([]rune, 0)
if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
r0 := inst.Rune[0]
runes = append(runes, r0, r0)
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
runes = append(runes, r1, r1)
}
sort.Sort(runeSlice(runes))
} else {
runes = append(runes, inst.Rune...)
}
onePassRunes[pc] = runes
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
inst.Op = syntax.InstRune
case syntax.InstRune1:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
runes := []rune{}
// expand case-folded runes
if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
r0 := inst.Rune[0]
runes = append(runes, r0, r0)
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
runes = append(runes, r1, r1)
}
sort.Sort(runeSlice(runes))
} else {
runes = append(runes, inst.Rune[0], inst.Rune[0])
}
onePassRunes[pc] = runes
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
inst.Op = syntax.InstRune
case syntax.InstRuneAny:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
onePassRunes[pc] = append([]rune{}, anyRune...)
inst.Next = []uint32{inst.Out}
case syntax.InstRuneAnyNotNL:
m[pc] = false
if len(inst.Next) > 0 {
break
}
instQueue.insert(inst.Out)
onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
for i := range inst.Next {
inst.Next[i] = inst.Out
}
}
return
}
instQueue.clear()
instQueue.insert(uint32(p.Start))
m := make([]bool, len(p.Inst))
for !instQueue.empty() {
visitQueue.clear()
pc := instQueue.next()
if !check(pc, m) {
p = nil
break
}
}
if p != nil {
for i := range p.Inst {
p.Inst[i].Rune = onePassRunes[i]
}
}
return p
}
// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
// can be recharacterized as a one-pass regexp program, or syntax.nil if the
// Prog cannot be converted. For a one pass prog, the fundamental condition that must
// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
if prog.Start == 0 {
return nil
}
// onepass regexp is anchored
if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
return nil
}
hasAlt := false
for _, inst := range prog.Inst {
if inst.Op == syntax.InstAlt || inst.Op == syntax.InstAltMatch {
hasAlt = true
break
}
}
// If we have alternates, every instruction leading to InstMatch must be EmptyEndText.
// Also, any match on empty text must be $.
for _, inst := range prog.Inst {
opOut := prog.Inst[inst.Out].Op
switch inst.Op {
default:
if opOut == syntax.InstMatch && hasAlt {
return nil
}
case syntax.InstAlt, syntax.InstAltMatch:
if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
return nil
}
case syntax.InstEmptyWidth:
if opOut == syntax.InstMatch {
if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
continue
}
return nil
}
}
}
// Creates a slightly optimized copy of the original Prog
// that cleans up some Prog idioms that block valid onepass programs
p = onePassCopy(prog)
// checkAmbiguity on InstAlts, build onepass Prog if possible
p = makeOnePass(p)
if p != nil {
cleanupOnePass(p, prog)
}
return p
}

vendor/github.com/grafana/regexp/regexp.go (new file, 1310 lines): diff suppressed because it is too large

vendor/github.com/grafana/regexp/syntax/compile.go (new file)
@@ -0,0 +1,296 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "unicode"
// A patchList is a list of instruction pointers that need to be filled in (patched).
// Because the pointers haven't been filled in yet, we can reuse their storage
// to hold the list. It's kind of sleazy, but works well in practice.
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// These aren't really pointers: they're integers, so we can reinterpret them
// this way without using package unsafe. A value l.head denotes
// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
// head == 0 denotes the empty list, okay because we start every program
// with a fail instruction, so we'll never want to point at its output link.
type patchList struct {
head, tail uint32
}
func makePatchList(n uint32) patchList {
return patchList{n, n}
}
func (l patchList) patch(p *Prog, val uint32) {
head := l.head
for head != 0 {
i := &p.Inst[head>>1]
if head&1 == 0 {
head = i.Out
i.Out = val
} else {
head = i.Arg
i.Arg = val
}
}
}
func (l1 patchList) append(p *Prog, l2 patchList) patchList {
if l1.head == 0 {
return l2
}
if l2.head == 0 {
return l1
}
i := &p.Inst[l1.tail>>1]
if l1.tail&1 == 0 {
i.Out = l2.head
} else {
i.Arg = l2.head
}
return patchList{l1.head, l2.tail}
}
// A frag represents a compiled program fragment.
type frag struct {
i uint32 // index of first instruction
out patchList // where to record end instruction
nullable bool // whether fragment can match empty string
}
type compiler struct {
p *Prog
}
// Compile compiles the regexp into a program to be executed.
// The regexp should have been simplified already (returned from re.Simplify).
func Compile(re *Regexp) (*Prog, error) {
var c compiler
c.init()
f := c.compile(re)
f.out.patch(c.p, c.inst(InstMatch).i)
c.p.Start = int(f.i)
return c.p, nil
}
func (c *compiler) init() {
c.p = new(Prog)
c.p.NumCap = 2 // implicit ( and ) for whole match $0
c.inst(InstFail)
}
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
func (c *compiler) compile(re *Regexp) frag {
switch re.Op {
case OpNoMatch:
return c.fail()
case OpEmptyMatch:
return c.nop()
case OpLiteral:
if len(re.Rune) == 0 {
return c.nop()
}
var f frag
for j := range re.Rune {
f1 := c.rune(re.Rune[j:j+1], re.Flags)
if j == 0 {
f = f1
} else {
f = c.cat(f, f1)
}
}
return f
case OpCharClass:
return c.rune(re.Rune, re.Flags)
case OpAnyCharNotNL:
return c.rune(anyRuneNotNL, 0)
case OpAnyChar:
return c.rune(anyRune, 0)
case OpBeginLine:
return c.empty(EmptyBeginLine)
case OpEndLine:
return c.empty(EmptyEndLine)
case OpBeginText:
return c.empty(EmptyBeginText)
case OpEndText:
return c.empty(EmptyEndText)
case OpWordBoundary:
return c.empty(EmptyWordBoundary)
case OpNoWordBoundary:
return c.empty(EmptyNoWordBoundary)
case OpCapture:
bra := c.cap(uint32(re.Cap << 1))
sub := c.compile(re.Sub[0])
ket := c.cap(uint32(re.Cap<<1 | 1))
return c.cat(c.cat(bra, sub), ket)
case OpStar:
return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpPlus:
return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpQuest:
return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
case OpConcat:
if len(re.Sub) == 0 {
return c.nop()
}
var f frag
for i, sub := range re.Sub {
if i == 0 {
f = c.compile(sub)
} else {
f = c.cat(f, c.compile(sub))
}
}
return f
case OpAlternate:
var f frag
for _, sub := range re.Sub {
f = c.alt(f, c.compile(sub))
}
return f
}
panic("regexp: unhandled case in compile")
}
func (c *compiler) inst(op InstOp) frag {
// TODO: impose length limit
f := frag{i: uint32(len(c.p.Inst)), nullable: true}
c.p.Inst = append(c.p.Inst, Inst{Op: op})
return f
}
func (c *compiler) nop() frag {
f := c.inst(InstNop)
f.out = makePatchList(f.i << 1)
return f
}
func (c *compiler) fail() frag {
return frag{}
}
func (c *compiler) cap(arg uint32) frag {
f := c.inst(InstCapture)
f.out = makePatchList(f.i << 1)
c.p.Inst[f.i].Arg = arg
if c.p.NumCap < int(arg)+1 {
c.p.NumCap = int(arg) + 1
}
return f
}
func (c *compiler) cat(f1, f2 frag) frag {
// concat of failure is failure
if f1.i == 0 || f2.i == 0 {
return frag{}
}
// TODO: elide nop
f1.out.patch(c.p, f2.i)
return frag{f1.i, f2.out, f1.nullable && f2.nullable}
}
func (c *compiler) alt(f1, f2 frag) frag {
// alt of failure is other
if f1.i == 0 {
return f2
}
if f2.i == 0 {
return f1
}
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
i.Out = f1.i
i.Arg = f2.i
f.out = f1.out.append(c.p, f2.out)
f.nullable = f1.nullable || f2.nullable
return f
}
func (c *compiler) quest(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
i.Arg = f1.i
f.out = makePatchList(f.i << 1)
} else {
i.Out = f1.i
f.out = makePatchList(f.i<<1 | 1)
}
f.out = f.out.append(c.p, f1.out)
return f
}
// loop returns the fragment for the main loop of a plus or star.
// For plus, it can be used after changing the entry to f1.i.
// For star, it can be used directly when f1 can't match an empty string.
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
// to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) frag {
f := c.inst(InstAlt)
i := &c.p.Inst[f.i]
if nongreedy {
i.Arg = f1.i
f.out = makePatchList(f.i << 1)
} else {
i.Out = f1.i
f.out = makePatchList(f.i<<1 | 1)
}
f1.out.patch(c.p, f.i)
return f
}
func (c *compiler) star(f1 frag, nongreedy bool) frag {
if f1.nullable {
// Use (f1+)? to get priority match order correct.
// See golang.org/issue/46123.
return c.quest(c.plus(f1, nongreedy), nongreedy)
}
return c.loop(f1, nongreedy)
}
func (c *compiler) plus(f1 frag, nongreedy bool) frag {
return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
}
func (c *compiler) empty(op EmptyOp) frag {
f := c.inst(InstEmptyWidth)
c.p.Inst[f.i].Arg = uint32(op)
f.out = makePatchList(f.i << 1)
return f
}
func (c *compiler) rune(r []rune, flags Flags) frag {
f := c.inst(InstRune)
f.nullable = false
i := &c.p.Inst[f.i]
i.Rune = r
flags &= FoldCase // only relevant flag is FoldCase
if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
// and sometimes not even that
flags &^= FoldCase
}
i.Arg = uint32(flags)
f.out = makePatchList(f.i << 1)
// Special cases for exec machine.
switch {
case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
i.Op = InstRune1
case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
i.Op = InstRuneAny
case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
i.Op = InstRuneAnyNotNL
}
return f
}

vendor/github.com/grafana/regexp/syntax/doc.go (new file)
@@ -0,0 +1,132 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package regexp (such as Compile and Match) instead of this package.
Syntax
The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to Parse.
Single characters:
. any character, possibly including newline (flag s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[[:alpha:]] ASCII character class
[[:^alpha:]] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy x followed by y
x|y x or y (prefer x)
Repetitions:
x* zero or more x, prefer more
x+ one or more x, prefer more
x? zero or one x, prefer one
x{n,m} n or n+1 or ... or m x, prefer more
x{n,} n or more x, prefer more
x{n} exactly n x
x*? zero or more x, prefer fewer
x+? one or more x, prefer fewer
x?? zero or one x, prefer zero
x{n,m}? n or n+1 or ... or m x, prefer fewer
x{n,}? n or more x, prefer fewer
x{n}? exactly n x
Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
reject forms that create a minimum or maximum repetition count above 1000.
Unlimited repetitions are not subject to this restriction.
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
i case-insensitive (default false)
m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
s let . match \n (default false)
U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
Empty strings:
^ at beginning of text or line (flag m=true)
$ at end of text (like \z not \Z) or line (flag m=true)
\A at beginning of text
\b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
\B not at ASCII word boundary
\z at end of text
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal *, for any punctuation character *
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\Q...\E literal text ... even if ... has punctuation
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class foo
\p{Foo} Unicode character class Foo
\pF Unicode character class F (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes (all ASCII-only):
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
ASCII character classes:
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
[[:alpha:]] alphabetic (== [A-Za-z])
[[:ascii:]] ASCII (== [\x00-\x7F])
[[:blank:]] blank (== [\t ])
[[:cntrl:]] control (== [\x00-\x1F\x7F])
[[:digit:]] digits (== [0-9])
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[[:lower:]] lower case (== [a-z])
[[:print:]] printable (== [ -~] == [ [:graph:]])
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
[[:space:]] whitespace (== [\t\n\v\f\r ])
[[:upper:]] upper case (== [A-Z])
[[:word:]] word characters (== [0-9A-Za-z_])
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
Unicode character classes are those in unicode.Categories and unicode.Scripts.
*/
package syntax

vendor/github.com/grafana/regexp/syntax/make_perl_groups.pl (new file)
@@ -0,0 +1,113 @@
#!/usr/bin/perl
# Copyright 2008 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Modified version of RE2's make_perl_groups.pl.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
@posixclasses = (
"[:alnum:]",
"[:alpha:]",
"[:ascii:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:word:]",
"[:xdigit:]",
);
@perlclasses = (
"\\d",
"\\s",
"\\w",
);
%overrides = (
# Prior to Perl 5.18, \s did not match vertical tab.
# RE2 preserves that original behaviour.
"\\s:11" => 0,
);
sub ComputeClass($) {
my @ranges;
my ($class) = @_;
my $regexp = "[$class]";
my $start = -1;
for (my $i=0; $i<=129; $i++) {
if ($i == 129) { $i = 256; }
if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) {
if ($start < 0) {
$start = $i;
}
} else {
if ($start >= 0) {
push @ranges, [$start, $i-1];
}
$start = -1;
}
}
return @ranges;
}
sub PrintClass($$@) {
my ($cname, $name, @ranges) = @_;
print "var code$cname = []rune{ /* $name */\n";
for (my $i=0; $i<@ranges; $i++) {
my @a = @{$ranges[$i]};
printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
}
print "}\n\n";
my $n = @ranges;
$negname = $name;
if ($negname =~ /:/) {
$negname =~ s/:/:^/;
} else {
$negname =~ y/a-z/A-Z/;
}
return "\t`$name`: {+1, code$cname},\n" .
"\t`$negname`: {-1, code$cname},\n";
}
my $gen = 0;
sub PrintClasses($@) {
my ($cname, @classes) = @_;
my @entries;
foreach my $cl (@classes) {
my @ranges = ComputeClass($cl);
push @entries, PrintClass(++$gen, $cl, @ranges);
}
print "var ${cname}Group = map[string]charGroup{\n";
foreach my $e (@entries) {
print $e;
}
print "}\n";
my $count = @entries;
}
print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.go
package syntax
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

vendor/github.com/grafana/regexp/syntax/op_string.go (new file)
@@ -0,0 +1,26 @@
// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
package syntax
import "strconv"
const (
_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
_Op_name_1 = "opPseudo"
)
var (
_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
)
func (i Op) String() string {
switch {
case 1 <= i && i <= 19:
i -= 1
return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
case i == 128:
return _Op_name_1
default:
return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
}
}

File diff suppressed because it is too large

@ -0,0 +1,134 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.go
package syntax
var code1 = []rune{ /* \d */
0x30, 0x39,
}
var code2 = []rune{ /* \s */
0x9, 0xa,
0xc, 0xd,
0x20, 0x20,
}
var code3 = []rune{ /* \w */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var perlGroup = map[string]charGroup{
`\d`: {+1, code1},
`\D`: {-1, code1},
`\s`: {+1, code2},
`\S`: {-1, code2},
`\w`: {+1, code3},
`\W`: {-1, code3},
}
var code4 = []rune{ /* [:alnum:] */
0x30, 0x39,
0x41, 0x5a,
0x61, 0x7a,
}
var code5 = []rune{ /* [:alpha:] */
0x41, 0x5a,
0x61, 0x7a,
}
var code6 = []rune{ /* [:ascii:] */
0x0, 0x7f,
}
var code7 = []rune{ /* [:blank:] */
0x9, 0x9,
0x20, 0x20,
}
var code8 = []rune{ /* [:cntrl:] */
0x0, 0x1f,
0x7f, 0x7f,
}
var code9 = []rune{ /* [:digit:] */
0x30, 0x39,
}
var code10 = []rune{ /* [:graph:] */
0x21, 0x7e,
}
var code11 = []rune{ /* [:lower:] */
0x61, 0x7a,
}
var code12 = []rune{ /* [:print:] */
0x20, 0x7e,
}
var code13 = []rune{ /* [:punct:] */
0x21, 0x2f,
0x3a, 0x40,
0x5b, 0x60,
0x7b, 0x7e,
}
var code14 = []rune{ /* [:space:] */
0x9, 0xd,
0x20, 0x20,
}
var code15 = []rune{ /* [:upper:] */
0x41, 0x5a,
}
var code16 = []rune{ /* [:word:] */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var code17 = []rune{ /* [:xdigit:] */
0x30, 0x39,
0x41, 0x46,
0x61, 0x66,
}
var posixGroup = map[string]charGroup{
`[:alnum:]`: {+1, code4},
`[:^alnum:]`: {-1, code4},
`[:alpha:]`: {+1, code5},
`[:^alpha:]`: {-1, code5},
`[:ascii:]`: {+1, code6},
`[:^ascii:]`: {-1, code6},
`[:blank:]`: {+1, code7},
`[:^blank:]`: {-1, code7},
`[:cntrl:]`: {+1, code8},
`[:^cntrl:]`: {-1, code8},
`[:digit:]`: {+1, code9},
`[:^digit:]`: {-1, code9},
`[:graph:]`: {+1, code10},
`[:^graph:]`: {-1, code10},
`[:lower:]`: {+1, code11},
`[:^lower:]`: {-1, code11},
`[:print:]`: {+1, code12},
`[:^print:]`: {-1, code12},
`[:punct:]`: {+1, code13},
`[:^punct:]`: {-1, code13},
`[:space:]`: {+1, code14},
`[:^space:]`: {-1, code14},
`[:upper:]`: {+1, code15},
`[:^upper:]`: {-1, code15},
`[:word:]`: {+1, code16},
`[:^word:]`: {-1, code16},
`[:xdigit:]`: {+1, code17},
`[:^xdigit:]`: {-1, code17},
}

@ -0,0 +1,363 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"strconv"
"strings"
"unicode"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
type Prog struct {
Inst []Inst
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// An InstOp is an instruction opcode.
type InstOp uint8
const (
InstAlt InstOp = iota
InstAltMatch
InstCapture
InstEmptyWidth
InstMatch
InstFail
InstNop
InstRune
InstRune1
InstRuneAny
InstRuneAnyNotNL
)
var instOpNames = []string{
"InstAlt",
"InstAltMatch",
"InstCapture",
"InstEmptyWidth",
"InstMatch",
"InstFail",
"InstNop",
"InstRune",
"InstRune1",
"InstRuneAny",
"InstRuneAnyNotNL",
}
func (i InstOp) String() string {
if uint(i) >= uint(len(instOpNames)) {
return ""
}
return instOpNames[i]
}
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8
const (
EmptyBeginLine EmptyOp = 1 << iota
EmptyEndLine
EmptyBeginText
EmptyEndText
EmptyWordBoundary
EmptyNoWordBoundary
)
// EmptyOpContext returns the zero-width assertions
// satisfied at the position between the runes r1 and r2.
// Passing r1 == -1 indicates that the position is
// at the beginning of the text.
// Passing r2 == -1 indicates that the position is
// at the end of the text.
func EmptyOpContext(r1, r2 rune) EmptyOp {
var op EmptyOp = EmptyNoWordBoundary
var boundary byte
switch {
case IsWordChar(r1):
boundary = 1
case r1 == '\n':
op |= EmptyBeginLine
case r1 < 0:
op |= EmptyBeginText | EmptyBeginLine
}
switch {
case IsWordChar(r2):
boundary ^= 1
case r2 == '\n':
op |= EmptyEndLine
case r2 < 0:
op |= EmptyEndText | EmptyEndLine
}
if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
}
return op
}
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
}
// An Inst is a single instruction in a regular expression program.
type Inst struct {
Op InstOp
Out uint32 // all but InstMatch, InstFail
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune []rune
}
func (p *Prog) String() string {
var b strings.Builder
dumpProg(&b, p)
return b.String()
}
// skipNop follows any no-op or capturing instructions.
func (p *Prog) skipNop(pc uint32) *Inst {
i := &p.Inst[pc]
for i.Op == InstNop || i.Op == InstCapture {
i = &p.Inst[i.Out]
}
return i
}
// op returns i.Op but merges all the Rune special cases into InstRune
func (i *Inst) op() InstOp {
op := i.Op
switch op {
case InstRune1, InstRuneAny, InstRuneAnyNotNL:
op = InstRune
}
return op
}
// Prefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
prefix, complete, foldCase := p.PrefixAndCase()
if foldCase {
return "", false
}
return prefix, complete
}
// PrefixAndCase returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match. FoldCase is true if the string should
// match in upper or lower case.
func (p *Prog) PrefixAndCase() (prefix string, complete bool, foldCase bool) {
i := &p.Inst[p.Start]
// Skip any no-op, capturing or begin-text instructions
for i.Op == InstNop || i.Op == InstCapture || (i.Op == InstEmptyWidth && EmptyOp(i.Arg)&EmptyBeginText != 0) {
i = &p.Inst[i.Out]
}
// Avoid allocation of buffer if prefix is empty.
if i.op() != InstRune || len(i.Rune) != 1 {
return "", i.Op == InstMatch, false
}
// Have prefix; gather characters.
var buf strings.Builder
foldCase = (Flags(i.Arg)&FoldCase != 0)
for i.op() == InstRune && len(i.Rune) == 1 && (Flags(i.Arg)&FoldCase != 0) == foldCase {
buf.WriteRune(i.Rune[0])
i = p.skipNop(i.Out)
}
return buf.String(), i.Op == InstMatch, foldCase
}
// StartCond returns the leading empty-width conditions that must
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
var flag EmptyOp
pc := uint32(p.Start)
i := &p.Inst[pc]
Loop:
for {
switch i.Op {
case InstEmptyWidth:
flag |= EmptyOp(i.Arg)
case InstFail:
return ^EmptyOp(0)
case InstCapture, InstNop:
// skip
default:
break Loop
}
pc = i.Out
i = &p.Inst[pc]
}
return flag
}
const noMatch = -1
// MatchRune reports whether the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r rune) bool {
return i.MatchRunePos(r) != noMatch
}
// MatchRunePos checks whether the instruction matches (and consumes) r.
// If so, MatchRunePos returns the index of the matching rune pair
// (or, when len(i.Rune) == 1, rune singleton).
// If not, MatchRunePos returns -1.
// MatchRunePos should only be called when i.Op == InstRune.
func (i *Inst) MatchRunePos(r rune) int {
rune := i.Rune
switch len(rune) {
case 0:
return noMatch
case 1:
// Special case: single-rune slice is from literal string, not char class.
r0 := rune[0]
if r == r0 {
return 0
}
if Flags(i.Arg)&FoldCase != 0 {
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
if r == r1 {
return 0
}
}
}
return noMatch
case 2:
if r >= rune[0] && r <= rune[1] {
return 0
}
return noMatch
case 4, 6, 8:
// Linear search for a few pairs.
// Should handle ASCII well.
for j := 0; j < len(rune); j += 2 {
if r < rune[j] {
return noMatch
}
if r <= rune[j+1] {
return j / 2
}
}
return noMatch
}
// Otherwise binary search.
lo := 0
hi := len(rune) / 2
for lo < hi {
m := lo + (hi-lo)/2
if c := rune[2*m]; c <= r {
if r <= rune[2*m+1] {
return m
}
lo = m + 1
} else {
hi = m
}
}
return noMatch
}
// MatchEmptyWidth reports whether the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == InstEmptyWidth.
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
switch EmptyOp(i.Arg) {
case EmptyBeginLine:
return before == '\n' || before == -1
case EmptyEndLine:
return after == '\n' || after == -1
case EmptyBeginText:
return before == -1
case EmptyEndText:
return after == -1
case EmptyWordBoundary:
return IsWordChar(before) != IsWordChar(after)
case EmptyNoWordBoundary:
return IsWordChar(before) == IsWordChar(after)
}
panic("unknown empty width arg")
}
func (i *Inst) String() string {
var b strings.Builder
dumpInst(&b, i)
return b.String()
}
func bw(b *strings.Builder, args ...string) {
for _, s := range args {
b.WriteString(s)
}
}
func dumpProg(b *strings.Builder, p *Prog) {
for j := range p.Inst {
i := &p.Inst[j]
pc := strconv.Itoa(j)
if len(pc) < 3 {
b.WriteString("   "[len(pc):])
}
if j == p.Start {
pc += "*"
}
bw(b, pc, "\t")
dumpInst(b, i)
bw(b, "\n")
}
}
func u32(i uint32) string {
return strconv.FormatUint(uint64(i), 10)
}
func dumpInst(b *strings.Builder, i *Inst) {
switch i.Op {
case InstAlt:
bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
case InstAltMatch:
bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
case InstCapture:
bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
case InstEmptyWidth:
bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
case InstMatch:
bw(b, "match")
case InstFail:
bw(b, "fail")
case InstNop:
bw(b, "nop -> ", u32(i.Out))
case InstRune:
if i.Rune == nil {
// shouldn't happen
bw(b, "rune <nil>")
}
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
if Flags(i.Arg)&FoldCase != 0 {
bw(b, "/i")
}
bw(b, " -> ", u32(i.Out))
case InstRune1:
bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
case InstRuneAny:
bw(b, "any -> ", u32(i.Out))
case InstRuneAnyNotNL:
bw(b, "anynotnl -> ", u32(i.Out))
}
}
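
PrefixAndCase above is the fork's addition that reports a usable literal prefix even when it must match case-insensitively (one of the upstream changes listed in the commit message). A minimal sketch of calling it through the public syntax API; the printed values are what the code above implies, not taken from upstream tests:

package main

import (
	"fmt"

	"github.com/grafana/regexp/syntax"
)

func main() {
	re, err := syntax.Parse(`(?i)error`, syntax.Perl)
	if err != nil {
		panic(err)
	}
	prog, err := syntax.Compile(re.Simplify())
	if err != nil {
		panic(err)
	}
	prefix, complete, foldCase := prog.PrefixAndCase()
	fmt.Println(prefix, complete, foldCase) // expected: "error" true true
}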

@ -0,0 +1,320 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op // operator
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
//go:generate stringer -type Op -trimprefix Op
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character except newline
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// Equal reports whether x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool {
if x == nil || y == nil {
return x == y
}
if x.Op != y.Op {
return false
}
switch x.Op {
case OpEndText:
// The parse flags remember whether this is \z or \Z.
if x.Flags&WasDollar != y.Flags&WasDollar {
return false
}
case OpLiteral, OpCharClass:
if len(x.Rune) != len(y.Rune) {
return false
}
for i, r := range x.Rune {
if r != y.Rune[i] {
return false
}
}
case OpAlternate, OpConcat:
if len(x.Sub) != len(y.Sub) {
return false
}
for i, sub := range x.Sub {
if !sub.Equal(y.Sub[i]) {
return false
}
}
case OpStar, OpPlus, OpQuest:
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpRepeat:
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpCapture:
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
}
return true
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp) {
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
if re.Flags&FoldCase != 0 {
b.WriteString(`(?i:`)
}
for _, r := range re.Rune {
escape(b, r, false)
}
if re.Flags&FoldCase != 0 {
b.WriteString(`)`)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL:
b.WriteString(`(?-s:.)`)
case OpAnyChar:
b.WriteString(`(?s:.)`)
case OpBeginLine:
b.WriteString(`(?m:^)`)
case OpEndLine:
b.WriteString(`(?m:$)`)
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`(?-m:$)`)
} else {
b.WriteString(`\z`)
}
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0])
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
if re.Flags&NonGreedy != 0 {
b.WriteRune('?')
}
case OpConcat:
for _, sub := range re.Sub {
if sub.Op == OpAlternate {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub)
}
}
}
func (re *Regexp) String() string {
var b strings.Builder
writeRegexp(&b, re)
return b.String()
}
const meta = `\.+*?()|[]{}^$`
func escape(b *strings.Builder, r rune, force bool) {
if unicode.IsPrint(r) {
if strings.ContainsRune(meta, r) || force {
b.WriteRune('\\')
}
b.WriteRune(r)
return
}
switch r {
case '\a':
b.WriteString(`\a`)
case '\f':
b.WriteString(`\f`)
case '\n':
b.WriteString(`\n`)
case '\r':
b.WriteString(`\r`)
case '\t':
b.WriteString(`\t`)
case '\v':
b.WriteString(`\v`)
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.FormatInt(int64(r), 16)
if len(s) == 1 {
b.WriteRune('0')
}
b.WriteString(s)
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.FormatInt(int64(r), 16))
b.WriteString(`}`)
}
}
// MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int {
m := 0
if re.Op == OpCapture {
m = re.Cap
}
for _, sub := range re.Sub {
if n := sub.MaxCap(); m < n {
m = n
}
}
return m
}
// CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string {
names := make([]string, re.MaxCap()+1)
re.capNames(names)
return names
}
func (re *Regexp) capNames(names []string) {
if re.Op == OpCapture {
names[re.Cap] = re.Name
}
for _, sub := range re.Sub {
sub.capNames(names)
}
}
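
A short usage sketch (not part of the vendored file) for MaxCap and CapNames defined above, assuming the usual Parse entry point:

package main

import (
	"fmt"

	"github.com/grafana/regexp/syntax"
)

func main() {
	re, err := syntax.Parse(`(?P<level>\w+)=(\w+)`, syntax.Perl)
	if err != nil {
		panic(err)
	}
	fmt.Println(re.MaxCap())   // 2
	fmt.Println(re.CapNames()) // [ level ]  (index 0 is the whole match; unnamed groups are "")
}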

@ -0,0 +1,151 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Simplify returns a regexp equivalent to re but without counted repetitions
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
// The resulting regexp will execute correctly but its string representation
// will not produce the same parse tree, because capturing parentheses
// may have been duplicated or removed. For example, the simplified form
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
// The returned regexp may share structure with or be the original.
func (re *Regexp) Simplify() *Regexp {
if re == nil {
return nil
}
switch re.Op {
case OpCapture, OpConcat, OpAlternate:
// Simplify children, building new Regexp if children change.
nre := re
for i, sub := range re.Sub {
nsub := sub.Simplify()
if nre == re && nsub != sub {
// Start a copy.
nre = new(Regexp)
*nre = *re
nre.Rune = nil
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
}
if nre != re {
nre.Sub = append(nre.Sub, nsub)
}
}
return nre
case OpStar, OpPlus, OpQuest:
sub := re.Sub[0].Simplify()
return simplify1(re.Op, re.Flags, sub, re)
case OpRepeat:
// Special special case: x{0} matches the empty string
// and doesn't even need to consider x.
if re.Min == 0 && re.Max == 0 {
return &Regexp{Op: OpEmptyMatch}
}
// The fun begins.
sub := re.Sub[0].Simplify()
// x{n,} means at least n matches of x.
if re.Max == -1 {
// Special case: x{0,} is x*.
if re.Min == 0 {
return simplify1(OpStar, re.Flags, sub, nil)
}
// Special case: x{1,} is x+.
if re.Min == 1 {
return simplify1(OpPlus, re.Flags, sub, nil)
}
// General case: x{4,} is xxxx+.
nre := &Regexp{Op: OpConcat}
nre.Sub = nre.Sub0[:0]
for i := 0; i < re.Min-1; i++ {
nre.Sub = append(nre.Sub, sub)
}
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
return nre
}
// Special case x{0} handled above.
// Special case: x{1} is just x.
if re.Min == 1 && re.Max == 1 {
return sub
}
// General case: x{n,m} means n copies of x and m copies of x?
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx.
var prefix *Regexp
if re.Min > 0 {
prefix = &Regexp{Op: OpConcat}
prefix.Sub = prefix.Sub0[:0]
for i := 0; i < re.Min; i++ {
prefix.Sub = append(prefix.Sub, sub)
}
}
// Build and attach suffix: (x(x(x)?)?)?
if re.Max > re.Min {
suffix := simplify1(OpQuest, re.Flags, sub, nil)
for i := re.Min + 1; i < re.Max; i++ {
nre2 := &Regexp{Op: OpConcat}
nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
suffix = simplify1(OpQuest, re.Flags, nre2, nil)
}
if prefix == nil {
return suffix
}
prefix.Sub = append(prefix.Sub, suffix)
}
if prefix != nil {
return prefix
}
// Some degenerate case like min > max or min < max < 0.
// Handle as impossible match.
return &Regexp{Op: OpNoMatch}
}
return re
}
// simplify1 implements Simplify for the unary OpStar,
// OpPlus, and OpQuest operators. It returns the simple regexp
// equivalent to
//
// Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// under the assumption that sub is already simple, and
// without first allocating that structure. If the regexp
// to be returned turns out to be equivalent to re, simplify1
// returns re instead.
//
// simplify1 is factored out of Simplify because the implementation
// for other operators generates these unary expressions.
// Letting them call simplify1 makes sure the expressions they
// generate are simple.
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if sub.Op == OpEmptyMatch {
return sub
}
// The operators are idempotent if the flags match.
if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
return sub
}
if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
return re
}
re = &Regexp{Op: op, Flags: flags}
re.Sub = append(re.Sub0[:0], sub)
return re
}
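
A small sketch (not part of the vendored file) showing the rewrite described in the Simplify comment, using its own /(x){1,2}/ example:

package main

import (
	"fmt"

	"github.com/grafana/regexp/syntax"
)

func main() {
	re, err := syntax.Parse(`(x){1,2}`, syntax.Perl)
	if err != nil {
		panic(err)
	}
	// Counted repetition is rewritten away; both parentheses still capture as $1.
	fmt.Println(re.Simplify().String()) // (x)(x)?
}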

@ -649,6 +649,10 @@ github.com/grafana/dskit/ring/util
github.com/grafana/dskit/runtimeconfig
github.com/grafana/dskit/services
github.com/grafana/dskit/spanlogger
# github.com/grafana/regexp v0.0.0-20220202152701-6a046c4caf32
## explicit; go 1.17
github.com/grafana/regexp
github.com/grafana/regexp/syntax
# github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
## explicit; go 1.14
github.com/grpc-ecosystem/go-grpc-middleware
