// gitea/vendor/github.com/blevesearch/vellum/regexp/compile.go
//
// 344 lines
// 7.4 KiB
// Go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import (
"regexp/syntax"
"unicode"
unicode_utf8 "unicode/utf8"
"github.com/blevesearch/vellum/utf8"
)
// compiler holds the state used while translating a regexp/syntax AST into
// a prog: the instructions emitted so far plus scratch storage that is
// reused between calls to utf8.NewSequencesPrealloc.
type compiler struct {
// sizeLimit is the bound checkSize compares len(insts)*instSize against.
sizeLimit uint
// insts is the program being built; every emitted instruction is appended here.
insts prog
// instsPool is a slab of inst values; allocInst hands out its first element
// and refills it when it runs empty.
instsPool []inst
// sequences and rangeStack are passed to utf8.NewSequencesPrealloc and
// overwritten with the slices it returns.
sequences utf8.Sequences
rangeStack utf8.RangeStack
// startBytes and endBytes are UTFMax-sized buffers (see newCompiler) also
// passed to utf8.NewSequencesPrealloc.
startBytes []byte
endBytes []byte
}
// newCompiler returns a compiler that enforces sizeLimit in checkSize and
// owns two scratch byte buffers, each unicode_utf8.UTFMax bytes long.
func newCompiler(sizeLimit uint) *compiler {
	c := new(compiler)
	c.sizeLimit = sizeLimit
	c.startBytes = make([]byte, unicode_utf8.UTFMax)
	c.endBytes = make([]byte, unicode_utf8.UTFMax)
	return c
}
// compile translates ast into instructions, terminates the program with an
// OpMatch instruction and returns the accumulated instruction list. If the
// translation fails the error is returned together with a nil prog.
func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
	if err := c.c(ast); err != nil {
		return nil, err
	}

	match := c.allocInst()
	match.op = OpMatch
	c.insts = append(c.insts, match)

	return c.insts, nil
}
// c compiles a single AST node, recursing into its sub-expressions, and
// appends the resulting instructions to c.insts.
//
// It returns ErrNoLazy for non-greedy nodes, ErrNoEmpty for line/text
// anchors, ErrNoWordBoundary for word-boundary assertions, any error coming
// from utf8.NewSequencesPrealloc or a recursive call, and otherwise the
// result of checkSize. Operators that have no case in the switch emit no
// instructions and fall through to checkSize.
func (c *compiler) c(ast *syntax.Regexp) (err error) {
// Reject lazy (non-greedy) operators up front.
if ast.Flags&syntax.NonGreedy > 1 {
return ErrNoLazy
}
switch ast.Op {
case syntax.OpEndLine, syntax.OpBeginLine,
syntax.OpBeginText, syntax.OpEndText:
// Line and text anchors are not supported.
return ErrNoEmpty
case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
return ErrNoWordBoundary
case syntax.OpEmptyMatch:
// Nothing to emit for the empty match.
return nil
case syntax.OpLiteral:
// Each rune of the literal is compiled in turn.
for _, r := range ast.Rune {
if ast.Flags&syntax.FoldCase > 0 {
// Case-insensitive: compile the rune as a character class made of
// single-rune ranges, starting with the rune itself.
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune0: [2]rune{r, r},
}
next.Rune = next.Rune0[0:2]
// Walk the SimpleFold orbit until it cycles back to r, adding every
// case variant as its own [r1, r1] range.
for r1 := unicode.SimpleFold(r); r1 != r; r1 = unicode.SimpleFold(r1) {
next.Rune = append(next.Rune, r1, r1)
}
err = c.c(&next)
if err != nil {
return err
}
} else {
// Case-sensitive: expand the one-rune range [r, r] into byte-range
// sequences and emit every sequence back to back.
c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
if err != nil {
return err
}
for _, seq := range c.sequences {
c.compileUtf8Ranges(seq)
}
}
}
case syntax.OpAnyChar:
// Any character is compiled as the class [0, unicode.MaxRune].
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune0: [2]rune{0, unicode.MaxRune},
}
next.Rune = next.Rune0[:2]
return c.c(&next)
case syntax.OpAnyCharNotNL:
// Two ranges, [0, 0x09] and [0x0B, unicode.MaxRune]: everything except
// 0x0A (newline).
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
}
return c.c(&next)
case syntax.OpCharClass:
return c.compileClass(ast)
case syntax.OpCapture:
// Only the captured sub-expression is compiled; no instruction is
// emitted for the group itself.
return c.c(ast.Sub[0])
case syntax.OpConcat:
// Sub-expressions are emitted one after another. This case returns nil
// directly instead of reaching the checkSize call at the bottom.
for _, sub := range ast.Sub {
err := c.c(sub)
if err != nil {
return err
}
}
return nil
case syntax.OpAlternate:
if len(ast.Sub) == 0 {
return nil
}
// Layout for every alternative except the last:
//   split(j1, j2); j1: <alternative>; jmp end; j2: ...
// The jumps are collected here and patched once the end is known.
jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
// all alternatives but the last one
for i := 0; i < len(ast.Sub)-1; i++ {
sub := ast.Sub[i]
split := c.emptySplit()
j1 := c.top()
err := c.c(sub)
if err != nil {
return err
}
jmpsToEnd = append(jmpsToEnd, c.emptyJump())
j2 := c.top()
c.setSplit(split, j1, j2)
}
// the last alternative needs neither a split nor a jump
err := c.c(ast.Sub[len(ast.Sub)-1])
if err != nil {
return err
}
end := uint(len(c.insts))
for _, jmpToEnd := range jmpsToEnd {
c.setJump(jmpToEnd, end)
}
case syntax.OpQuest:
// split(j1, j2); j1: <sub>; j2: ...
split := c.emptySplit()
j1 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
j2 := c.top()
c.setSplit(split, j1, j2)
case syntax.OpStar:
// j1: split(j2, j3); j2: <sub>; jmp j1; j3: ...
j1 := c.top()
split := c.emptySplit()
j2 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
jmp := c.emptyJump()
j3 := uint(len(c.insts))
c.setJump(jmp, j1)
c.setSplit(split, j2, j3)
case syntax.OpPlus:
// j1: <sub>; split(j1, j2); j2: ...
j1 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
split := c.emptySplit()
j2 := c.top()
c.setSplit(split, j1, j2)
case syntax.OpRepeat:
if ast.Max == -1 {
// No upper bound: emit Min copies of the sub-expression, then compile
// the same sub-expression once more as an OpStar node.
for i := 0; i < ast.Min; i++ {
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
next := syntax.Regexp{
Op: syntax.OpStar,
Flags: ast.Flags,
Sub: ast.Sub,
Sub0: ast.Sub0,
Rune: ast.Rune,
Rune0: ast.Rune0,
}
return c.c(&next)
}
// Bounded repeat: Min mandatory copies first ...
for i := 0; i < ast.Min; i++ {
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
// ... then Max-Min further copies, each preceded by a split. splits[i]
// is the index of the split and starts[i] the first instruction of the
// copy that follows it.
splits := make([]uint, 0, ast.Max-ast.Min)
starts := make([]uint, 0, ast.Max-ast.Min)
for i := ast.Min; i < ast.Max; i++ {
splits = append(splits, c.emptySplit())
starts = append(starts, uint(len(c.insts)))
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
// Every split either enters its own copy or goes straight to the end of
// the whole repeat.
end := uint(len(c.insts))
for i := 0; i < len(splits); i++ {
c.setSplit(splits[i], starts[i], end)
}
}
return c.checkSize()
}
// checkSize returns ErrCompiledTooBig when the number of emitted
// instructions multiplied by instSize is larger than c.sizeLimit, and nil
// otherwise.
func (c *compiler) checkSize() error {
	used := uint(len(c.insts) * instSize)
	if used <= c.sizeLimit {
		return nil
	}
	return ErrCompiledTooBig
}
// compileClass emits instructions for a character class. ast.Rune is read
// as consecutive (low, high) pairs; an empty ast.Rune emits nothing.
//
// Every pair except the last is laid out as
//
//	split(j1, j2); j1: <range>; jmp end; j2: ...
//
// and the last pair is emitted bare. The jumps are patched to the
// instruction following the class once it is known.
func (c *compiler) compileClass(ast *syntax.Regexp) error {
	n := len(ast.Rune)
	if n == 0 {
		return nil
	}

	lastPair := n - 2
	pending := make([]uint, 0, lastPair)

	// every pair before the final one gets a split in front and a jump behind
	for lo := 0; lo < lastPair; lo += 2 {
		low, high := ast.Rune[lo], ast.Rune[lo+1]
		splitPC := c.emptySplit()
		bodyPC := c.top()
		if err := c.compileClassRange(low, high); err != nil {
			return err
		}
		pending = append(pending, c.emptyJump())
		c.setSplit(splitPC, bodyPC, c.top())
	}

	// final pair: nothing to skip over, so no split or jump
	if err := c.compileClassRange(ast.Rune[lastPair], ast.Rune[n-1]); err != nil {
		return err
	}

	endPC := c.top()
	for _, pc := range pending {
		c.setJump(pc, endPC)
	}
	return nil
}
// compileClassRange emits instructions for the rune range [startR, endR].
//
// The range is expanded by utf8.NewSequencesPrealloc (reusing the compiler's
// scratch buffers) into byte-range sequences. Every sequence except the last
// is laid out as
//
//	split(j1, j2); j1: <sequence>; jmp end; j2: ...
//
// and the last one is emitted bare; the jumps are then patched to the
// instruction following the range.
//
// If the expansion yields no sequences at all, a single OpRange whose start
// is greater than its end is emitted instead. Without this guard the code
// would panic: the jump slice would be created with capacity -1 and the
// "last entry" lookup would index c.sequences[-1].
//
// It returns any error reported by utf8.NewSequencesPrealloc.
func (c *compiler) compileClassRange(startR, endR rune) (err error) {
	c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
		startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
	if err != nil {
		return err
	}

	if len(c.sequences) == 0 {
		// NOTE(review): presumably reachable for ranges with no valid UTF-8
		// encoding (e.g. entirely inside the surrogate gap U+D800..U+DFFF) or
		// with startR > endR — confirm against utf8.NewSequencesPrealloc.
		// NOTE(review): assumes OpRange is evaluated as an inclusive
		// rangeStart <= b <= rangeEnd test, so start > end can never match —
		// confirm against the code that executes the program.
		never := c.allocInst()
		never.op = OpRange
		never.rangeStart = 1
		never.rangeEnd = 0
		c.insts = append(c.insts, never)
		return nil
	}

	last := len(c.sequences) - 1
	jmps := make([]uint, 0, last)
	// every sequence before the final one is an alternative that can be skipped
	for _, seq := range c.sequences[:last] {
		split := c.emptySplit()
		j1 := c.top()
		c.compileUtf8Ranges(seq)
		jmps = append(jmps, c.emptyJump())
		c.setSplit(split, j1, c.top())
	}
	// the final sequence needs neither a split nor a jump
	c.compileUtf8Ranges(c.sequences[last])

	end := c.top()
	for _, jmp := range jmps {
		c.setJump(jmp, end)
	}
	return nil
}
// compileUtf8Ranges appends one OpRange instruction per element of seq, in
// order, copying the element's Start and End into the instruction.
func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
	for _, byteRange := range seq {
		op := c.allocInst()
		op.op = OpRange
		op.rangeStart, op.rangeEnd = byteRange.Start, byteRange.End
		c.insts = append(c.insts, op)
	}
}
// emptySplit appends an OpSplit instruction without setting its targets and
// returns the index it occupies in c.insts, for a later setSplit call.
func (c *compiler) emptySplit() uint {
	pc := c.top()
	placeholder := c.allocInst()
	placeholder.op = OpSplit
	c.insts = append(c.insts, placeholder)
	return pc
}
// emptyJump appends an OpJmp instruction without setting its target and
// returns the index it occupies in c.insts, for a later setJump call.
func (c *compiler) emptyJump() uint {
	pc := c.top()
	placeholder := c.allocInst()
	placeholder.op = OpJmp
	c.insts = append(c.insts, placeholder)
	return pc
}
// setSplit stores pc1 and pc2 as the splitA and splitB targets of the
// instruction at index i.
func (c *compiler) setSplit(i, pc1, pc2 uint) {
	target := c.insts[i]
	target.splitA, target.splitB = pc1, pc2
}
// setJump stores pc as the jump target of the instruction at index i.
func (c *compiler) setJump(i, pc uint) {
	c.insts[i].to = pc
}
// top returns the number of instructions emitted so far, which is also the
// index the next appended instruction will get.
func (c *compiler) top() uint {
	emitted := len(c.insts)
	return uint(emitted)
}
// allocInst returns a pointer to the first element of c.instsPool and drops
// that element from the pool. When the pool is empty a new slab of 16
// instructions is allocated first.
func (c *compiler) allocInst() *inst {
	const slabSize = 16
	if len(c.instsPool) == 0 {
		c.instsPool = make([]inst, slabSize)
	}
	next := &c.instsPool[0]
	c.instsPool = c.instsPool[1:]
	return next
}