gitea/vendor/github.com/blevesearch/vellum/utf8/utf8.go

269 lines
6.0 KiB
Go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package utf8
import (
"fmt"
"unicode/utf8"
)
// Sequences is a collection of Sequence
type Sequences []Sequence
// NewSequences constructs a collection of Sequence which describe the
// byte ranges covered between the start and end runes.
func NewSequences(start, end rune) (Sequences, error) {
rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil)
return rv, err
}
func NewSequencesPrealloc(start, end rune,
preallocSequences Sequences,
preallocRangeStack RangeStack,
preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) {
rv := preallocSequences[:0]
startBytes := preallocStartBytes
if cap(startBytes) < utf8.UTFMax {
startBytes = make([]byte, utf8.UTFMax)
}
startBytes = startBytes[:utf8.UTFMax]
endBytes := preallocEndBytes
if cap(endBytes) < utf8.UTFMax {
endBytes = make([]byte, utf8.UTFMax)
}
endBytes = endBytes[:utf8.UTFMax]
rangeStack := preallocRangeStack[:0]
rangeStack = rangeStack.Push(scalarRange{start, end})
rangeStack, r := rangeStack.Pop()
TOP:
for r != nilScalarRange {
INNER:
for {
r1, r2 := r.split()
if r1 != nilScalarRange {
rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end})
r.start = r1.start
r.end = r1.end
continue INNER
}
if !r.valid() {
rangeStack, r = rangeStack.Pop()
continue TOP
}
for i := 1; i < utf8.UTFMax; i++ {
max := maxScalarValue(i)
if r.start <= max && max < r.end {
rangeStack = rangeStack.Push(scalarRange{max + 1, r.end})
r.end = max
continue INNER
}
}
asciiRange := r.ascii()
if asciiRange != nilRange {
rv = append(rv, Sequence{
asciiRange,
})
rangeStack, r = rangeStack.Pop()
continue TOP
}
for i := uint(1); i < utf8.UTFMax; i++ {
m := rune((1 << (6 * i)) - 1)
if (r.start & ^m) != (r.end & ^m) {
if (r.start & m) != 0 {
rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end})
r.end = r.start | m
continue INNER
}
if (r.end & m) != m {
rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end})
r.end = (r.end & ^m) - 1
continue INNER
}
}
}
n, m := r.encode(startBytes, endBytes)
seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m])
if err != nil {
return nil, nil, err
}
rv = append(rv, seq)
rangeStack, r = rangeStack.Pop()
continue TOP
}
}
return rv, rangeStack, nil
}
// Sequence is a collection of Range
type Sequence []Range
// SequenceFromEncodedRange creates sequence from the encoded bytes
func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
if len(start) != len(end) {
return nil, fmt.Errorf("byte slices must be the same length")
}
switch len(start) {
case 2:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
}, nil
case 3:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
Range{start[2], end[2]},
}, nil
case 4:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
Range{start[2], end[2]},
Range{start[3], end[3]},
}, nil
}
return nil, fmt.Errorf("invalid encoded byte length")
}
// Matches checks to see if the provided byte slice matches the Sequence
func (u Sequence) Matches(bytes []byte) bool {
if len(bytes) < len(u) {
return false
}
for i := 0; i < len(u); i++ {
if !u[i].matches(bytes[i]) {
return false
}
}
return true
}
func (u Sequence) String() string {
switch len(u) {
case 1:
return fmt.Sprintf("%v", u[0])
case 2:
return fmt.Sprintf("%v%v", u[0], u[1])
case 3:
return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
case 4:
return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
default:
return fmt.Sprintf("invalid utf8 sequence")
}
}
// Range describes a single range of byte values
type Range struct {
Start byte
End byte
}
var nilRange = Range{0xff, 0}
func (u Range) matches(b byte) bool {
if u.Start <= b && b <= u.End {
return true
}
return false
}
func (u Range) String() string {
if u.Start == u.End {
return fmt.Sprintf("[%X]", u.Start)
}
return fmt.Sprintf("[%X-%X]", u.Start, u.End)
}
type scalarRange struct {
start rune
end rune
}
var nilScalarRange = scalarRange{0xffff, 0}
func (s *scalarRange) String() string {
return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end)
}
// split this scalar range if it overlaps with a surrogate codepoint
func (s *scalarRange) split() (scalarRange, scalarRange) {
if s.start < 0xe000 && s.end > 0xd7ff {
return scalarRange{
start: s.start,
end: 0xd7ff,
},
scalarRange{
start: 0xe000,
end: s.end,
}
}
return nilScalarRange, nilScalarRange
}
func (s *scalarRange) valid() bool {
return s.start <= s.end
}
func (s *scalarRange) ascii() Range {
if s.valid() && s.end <= 0x7f {
return Range{
Start: byte(s.start),
End: byte(s.end),
}
}
return nilRange
}
// start and end MUST have capacity for utf8.UTFMax bytes
func (s *scalarRange) encode(start, end []byte) (int, int) {
n := utf8.EncodeRune(start, s.start)
m := utf8.EncodeRune(end, s.end)
return n, m
}
type RangeStack []scalarRange
func (s RangeStack) Push(v scalarRange) RangeStack {
return append(s, v)
}
func (s RangeStack) Pop() (RangeStack, scalarRange) {
l := len(s)
if l < 1 {
return s, nilScalarRange
}
return s[:l-1], s[l-1]
}
func maxScalarValue(nbytes int) rune {
switch nbytes {
case 1:
return 0x007f
case 2:
return 0x07FF
case 3:
return 0xFFFF
default:
return 0x10FFFF
}
}