From ef0f1a726901d6c614040cfc2d7e8f9a2ca97816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Thu, 13 May 2021 13:10:32 +0200 Subject: [PATCH] publisher: Make the HTML element collector more robust Fixes #8530 --- common/text/transform.go | 22 ++ publisher/htmlElementsCollector.go | 405 ++++++++++++++---------- publisher/htmlElementsCollector_test.go | 38 ++- 3 files changed, 299 insertions(+), 166 deletions(-) diff --git a/common/text/transform.go b/common/text/transform.go index f59577803..2d51f6c33 100644 --- a/common/text/transform.go +++ b/common/text/transform.go @@ -45,3 +45,25 @@ func RemoveAccentsString(s string) string { accentTransformerPool.Put(t) return s } + +// Chunk splits s into strings of size. +func Chunk(s string, size int) []string { + if size >= len(s) { + return []string{s} + } + var chunks []string + chunk := make([]rune, size) + l := 0 + for _, r := range s { + chunk[l] = r + l++ + if l == size { + chunks = append(chunks, string(chunk)) + l = 0 + } + } + if l > 0 { + chunks = append(chunks, string(chunk[:l])) + } + return chunks +} diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index 9dc28c4c2..1bc1a09bc 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -19,12 +19,51 @@ import ( "sort" "strings" "sync" + "unicode" + "unicode/utf8" "golang.org/x/net/html" "github.com/gohugoio/hugo/helpers" ) +const eof = -1 + +var ( + htmlJsonFixer = strings.NewReplacer(", ", "\n") + jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) + classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) + + skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) + skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) + endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) + + exceptionList = map[string]bool{ + "thead": true, + "tbody": true, + "tfoot": true, + "td": true, + "tr": true, + } +) + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + w := &htmlElementsCollectorWriter{ + collector: collector, + state: htmlLexStart, + } + + w.defaultLexElementInside = w.lexElementInside(htmlLexStart) + + return w +} + // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` @@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } +type htmlElement struct { + Tag string + Classes []string + IDs []string +} + type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this @@ -59,12 +104,6 @@ type htmlElementsCollector struct { mu sync.RWMutex } -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} - func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string @@ -93,114 +132,118 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements { type htmlElementsCollectorWriter struct { collector *htmlElementsCollector - buff bytes.Buffer - isCollecting bool - inPreTag string + r rune // Current rune + width int // The width in bytes of r + input []byte // The current slice written to Write + pos int // The current position in input - inQuote bool - quoteValue byte + err error + + inQuote rune + + buff bytes.Buffer + + // Current state + state htmlCollectorStateFunc + + // Precompiled state funcs + defaultLexElementInside htmlCollectorStateFunc } -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { - return &htmlElementsCollectorWriter{ - collector: collector, +// Write collects HTML elements from p. +func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { + n = len(p) + w.input = p + w.pos = 0 + + for { + w.r = w.next() + if w.r == eof { + return + } + w.state = w.state(w) } } -// Write splits the incoming stream into single html element. -func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { - n = len(p) - i := 0 +func (l *htmlElementsCollectorWriter) backup() { + l.pos -= l.width + l.r, _ = utf8.DecodeRune(l.input[l.pos:]) +} - for i < len(p) { - // If we are not collecting, cycle through byte stream until start bracket "<" is found. - if !w.isCollecting { - for ; i < len(p); i++ { - b := p[i] - if b == '<' { - w.startCollecting() - break - } +func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { + var s htmlCollectorStateFunc + s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { + w.buff.WriteRune(w.r) + if condition() { + w.buff.Reset() + return resolve + } + return s + } + return s +} + +func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { + var s htmlCollectorStateFunc + s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { + if condition(w.r) { + return resolve + } + return s + } + return s +} + +// Starts with e.g. "" is found, - // disregard any ">" if within a quote, - // write bytes until found to buffer. - for ; i < len(p); i++ { - b := p[i] - w.toggleIfQuote(b) - w.buff.WriteByte(b) - - if !w.inQuote && b == '>' { - w.endCollecting() - break - } - } + if w.inQuote != 0 { + return s } - // If no end bracket ">" is found while collecting, but the stream ended - // this could mean we received chunks of a stream from e.g. the minify functionality - // next if loop will be skipped. - - // At this point we have collected an element line between angle brackets "<" and ">". - if !w.isCollecting { - if w.buff.Len() == 0 { - continue - } - - if w.inPreTag != "" { // within preformatted code block - s := w.buff.String() - w.buff.Reset() - if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { - w.inPreTag = "" - } - continue - } - - // First check if we have processed this element before. - w.collector.mu.RLock() + if w.r == '>' { // Work with the bytes slice as long as it's practical, // to save memory allocations. b := w.buff.Bytes() - // See https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. + defer func() { + w.buff.Reset() + }() + + // First check if we have processed this element before. + w.collector.mu.RLock() + seen := w.collector.elementSet[string(b)] w.collector.mu.RUnlock() if seen { - w.buff.Reset() - continue - } - - // Filter out unwanted tags - // if within preformatted code blocks
, 
`, f("div textarea", "foo textareaclass", "")}, {"DOCTYPE should beskipped", ``, f("", "", "")}, {"Comments should be skipped", ``, f("", "", "")}, + {"Comments with elements before and after", `
`, f("div span", "", "")}, + // Issue #8530 + {"Comment with single quote", ``, f("i", "foo", "")}, + {"Uppercase tags", `
`, f("div", "", "")}, + {"Predefined tags with distinct casing", `
`, f("div script", "", "")}, // Issue #8417 {"Tabs inline", `
d
`, f("div hr", "bar foo", "a")}, {"Tabs on multiple rows", `
d`, f("div form", "foo", "a b")}, } { - for _, minify := range []bool{false, true} { - c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { + for _, variant := range []struct { + minify bool + stream bool + }{ + {minify: false, stream: false}, + {minify: true, stream: false}, + {minify: false, stream: true}, + } { + + c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) - if minify { + if variant.minify { if skipMinifyTest[test.name] { c.Skip("skip minify test") } v := viper.New() m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) m.Minify(media.HTMLType, w, strings.NewReader(test.html)) + } else if variant.stream { + chunks := text.Chunk(test.html, rnd.Intn(41)+1) + for _, chunk := range chunks { + fmt.Fprint(w, chunk) + } } else { fmt.Fprint(w, test.html) } @@ -126,6 +155,7 @@ func TestClassCollector(t *testing.T) { }) } } + } func BenchmarkElementsCollectorWriter(b *testing.B) {