gitea/vendor/github.com/blevesearch/vellum/regexp/dfa.go

197 lines
4.1 KiB
Go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import (
"encoding/binary"
"fmt"
)
// StateLimit is the maximum number of states allowed in a DFA produced by
// this package. The builder aborts with ErrTooManyStates once the state
// table grows past this size.
const StateLimit = 10000
// ErrTooManyStates is returned by the DFA builder when the automaton being
// constructed for a regular expression grows to more than StateLimit states.
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
StateLimit)
// dfaBuilder incrementally constructs a dfa from a compiled program,
// de-duplicating states that hold the same set of instructions.
type dfaBuilder struct {
// dfa is the automaton under construction.
dfa *dfa
// cache maps an encoded instruction set (see instsKey) to the index of
// the state already created for it.
cache map[string]int
// keyBuf is a scratch buffer reused across calls when encoding cache keys.
keyBuf []byte
}
// newDfaBuilder returns a builder for the given program. The builder's DFA
// starts out with a single entry, state 0, which is reserved as the invalid
// state: it holds no instructions, never matches, and every one of its 256
// transitions is zero.
func newDfaBuilder(insts prog) *dfaBuilder {
	// Reserve index 0 for the invalid state before any real state exists.
	invalid := state{next: make([]int, 256)}
	states := make([]state, 0, 16)
	states = append(states, invalid)

	return &dfaBuilder{
		dfa:   &dfa{insts: insts, states: states},
		cache: make(map[string]int, 1024),
	}
}
// build constructs the DFA for the builder's program. Starting from the
// state reached by following instruction 0, it computes the transition for
// every input byte (0-255) of each reachable state, creating new states as
// previously unseen instruction sets are encountered.
//
// It returns ErrTooManyStates (and a nil dfa) as soon as the state table
// holds more than StateLimit entries.
func (d *dfaBuilder) build() (*dfa, error) {
	cur := newSparseSet(uint(len(d.dfa.insts)))
	next := newSparseSet(uint(len(d.dfa.insts)))

	d.dfa.add(cur, 0)
	ns, instsReuse := d.cachedState(cur, nil)
	states := intStack{ns}
	// Mark the start state as seen up front. Without this, the first
	// transition that loops back to the start state would push it onto the
	// work stack a second time and all 256 of its transitions would be
	// recomputed to exactly the same values.
	seen := map[int]struct{}{ns: {}}

	var s int
	states, s = states.Pop()
	// State 0 is the invalid state; Pop also yields 0 once the stack is
	// empty, so 0 doubles as the termination sentinel.
	for s != 0 {
		for b := 0; b < 256; b++ {
			var ns int
			ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse)
			if ns != 0 {
				if _, ok := seen[ns]; !ok {
					seen[ns] = struct{}{}
					states = states.Push(ns)
				}
			}
			if len(d.dfa.states) > StateLimit {
				return nil, ErrTooManyStates
			}
		}
		states, s = states.Pop()
	}
	return d.dfa, nil
}
// runState computes the transition taken from the given state on input
// byte b, stores it in that state's transition table, and returns the index
// of the destination state (0 if the destination is the invalid state).
//
// cur and next are scratch sets that are overwritten on every call.
// instsReuse is passed through to cachedState, and the slice it hands back
// is returned as the second result for use in the next call.
func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) (
	int, []uint) {
	// Reload the working set from the instructions stored on the state.
	cur.Clear()
	stored := d.dfa.states[state].insts
	for i := range stored {
		cur.Add(stored[i])
	}

	d.dfa.run(cur, next, b)

	target, reuse := d.cachedState(next, instsReuse)
	// cachedState may have appended to d.dfa.states, so the state is
	// indexed only after that call has returned.
	d.dfa.states[state].next[b] = target
	return target, reuse
}
// instsKey encodes insts as a byte string, 8 little-endian bytes per
// instruction pointer, in order. buf is used as the destination when its
// capacity is large enough; otherwise a new buffer is allocated. The
// returned slice has length 8*len(insts).
func instsKey(insts []uint, buf []byte) []byte {
	const width = 8

	need := width * len(insts)
	if need > cap(buf) {
		buf = make([]byte, need)
	}
	buf = buf[:need]

	off := 0
	for _, ip := range insts {
		binary.LittleEndian.PutUint64(buf[off:], uint64(ip))
		off += width
	}
	return buf
}
// cachedState returns the index of the DFA state corresponding to the
// instruction set in set, creating the state if no equivalent one exists.
//
// Only OpRange and OpMatch instructions are retained to identify a state.
// If none are present, the invalid state 0 is returned. A state is marked
// as matching when the set contains an OpMatch instruction.
//
// instsReuse is scratch storage for the retained instructions. The second
// result is the slice the caller may pass in on the next call: the scratch
// slice when nothing was created, or nil when a new state was created and
// now owns that slice.
func (d *dfaBuilder) cachedState(set *sparseSet,
	instsReuse []uint) (int, []uint) {
	kept := instsReuse[:0]
	if cap(kept) == 0 {
		kept = make([]uint, 0, set.Len())
	}

	matched := false
	for i, n := uint(0), uint(set.Len()); i < n; i++ {
		ip := set.Get(i)
		op := d.dfa.insts[ip].op
		if op == OpMatch {
			matched = true
		}
		if op == OpRange || op == OpMatch {
			kept = append(kept, ip)
		}
	}
	if len(kept) == 0 {
		return 0, kept
	}

	d.keyBuf = instsKey(kept, d.keyBuf)
	if id, found := d.cache[string(d.keyBuf)]; found {
		return id, kept
	}

	// No equivalent state yet: the new state takes ownership of kept, so
	// nil is returned to stop the caller from reusing its backing array.
	id := len(d.dfa.states)
	d.dfa.states = append(d.dfa.states, state{
		insts: kept,
		next:  make([]int, 256),
		match: matched,
	})
	d.cache[string(d.keyBuf)] = id
	return id, nil
}
// dfa is a deterministic automaton built from a compiled program.
type dfa struct {
// insts is the program the automaton was built from.
insts prog
// states is the state table; index 0 is reserved for the invalid state.
states []state
}
// add inserts instruction pointer ip into set and then recursively inserts
// the instructions reachable from it without consuming input: the target
// of an OpJmp, or both branches of an OpSplit (splitA before splitB).
// It does nothing if ip is already in set.
func (d *dfa) add(set *sparseSet, ip uint) {
	if set.Contains(ip) {
		return
	}
	set.Add(ip)

	instr := d.insts[ip]
	if instr.op == OpJmp {
		d.add(set, instr.to)
		return
	}
	if instr.op == OpSplit {
		d.add(set, instr.splitA)
		d.add(set, instr.splitB)
	}
}
// run advances the instruction set in from by one input byte b, writing the
// resulting set into to (which is cleared first). For every OpRange
// instruction in from whose range contains b, the following instruction
// (ip+1) is added to to via add. It reports whether from contained an
// OpMatch instruction.
func (d *dfa) run(from, to *sparseSet, b byte) bool {
	to.Clear()

	sawMatch := false
	for i, n := uint(0), uint(from.Len()); i < n; i++ {
		ip := from.Get(i)
		instr := d.insts[ip]
		if instr.op == OpMatch {
			sawMatch = true
			continue
		}
		if instr.op != OpRange {
			continue
		}
		if b < instr.rangeStart || instr.rangeEnd < b {
			continue
		}
		d.add(to, ip+1)
	}
	return sawMatch
}
// state is a single DFA state.
type state struct {
// insts holds the instruction pointers (OpRange and OpMatch only) that
// identify this state.
insts []uint
// next is the transition table, indexed by input byte; each entry is the
// index of the destination state, with 0 denoting the invalid state.
next []int
// match reports whether this state's instruction set contains OpMatch.
match bool
}
// intStack is a LIFO stack of ints backed by a slice. Its methods return
// the updated stack rather than modifying the receiver in place.
type intStack []int

// Push returns the stack with v added on top.
func (s intStack) Push(v int) intStack {
	grown := append(s, v)
	return grown
}

// Pop removes the top element and returns the shortened stack together
// with that element. Popping an empty stack returns the stack unchanged
// and 0.
func (s intStack) Pop() (intStack, int) {
	if len(s) == 0 {
		return s, 0
	}
	top := len(s) - 1
	return s[:top], s[top]
}