From 79bd8d2ab6cae1c0be3233a9a7551d0b7bcc5944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= Date: Sat, 28 Sep 2019 18:41:49 +0200 Subject: [PATCH 1/4] tree-sitter: update vendored tree-sitter runtime tree-sitter/tree-sitter commit edb569310005c66838b7d69fa60850acac6abeee Included files are: lib/include/tree-sitter/*.h lib/src/*.[ch] lib/src/unicode/* LICENSE --- src/tree_sitter/api.h | 241 ++++- src/tree_sitter/bits.h | 29 + src/tree_sitter/language.c | 102 +- src/tree_sitter/language.h | 3 + src/tree_sitter/lexer.c | 342 ++++--- src/tree_sitter/lexer.h | 2 +- src/tree_sitter/lib.c | 5 +- src/tree_sitter/node.c | 10 +- src/tree_sitter/parser.c | 41 +- src/tree_sitter/parser.h | 5 +- src/tree_sitter/point.h | 1 + src/tree_sitter/query.c | 1450 ++++++++++++++++++++++++++++ src/tree_sitter/subtree.c | 13 +- src/tree_sitter/tree.c | 7 +- src/tree_sitter/tree_cursor.c | 79 +- src/tree_sitter/tree_cursor.h | 1 + src/tree_sitter/unicode.h | 50 + src/tree_sitter/unicode/ICU_SHA | 1 + src/tree_sitter/unicode/LICENSE | 414 ++++++++ src/tree_sitter/unicode/README.md | 29 + src/tree_sitter/unicode/ptypes.h | 1 + src/tree_sitter/unicode/umachine.h | 448 +++++++++ src/tree_sitter/unicode/urename.h | 1 + src/tree_sitter/unicode/utf.h | 1 + src/tree_sitter/unicode/utf16.h | 733 ++++++++++++++ src/tree_sitter/unicode/utf8.h | 881 +++++++++++++++++ 26 files changed, 4657 insertions(+), 233 deletions(-) create mode 100644 src/tree_sitter/bits.h create mode 100644 src/tree_sitter/query.c create mode 100644 src/tree_sitter/unicode.h create mode 100644 src/tree_sitter/unicode/ICU_SHA create mode 100644 src/tree_sitter/unicode/LICENSE create mode 100644 src/tree_sitter/unicode/README.md create mode 100644 src/tree_sitter/unicode/ptypes.h create mode 100644 src/tree_sitter/unicode/umachine.h create mode 100644 src/tree_sitter/unicode/urename.h create mode 100644 src/tree_sitter/unicode/utf.h create mode 100644 src/tree_sitter/unicode/utf16.h create mode 100644 src/tree_sitter/unicode/utf8.h diff --git a/src/tree_sitter/api.h b/src/tree_sitter/api.h index d39d0521ee..40187e3db0 100644 --- a/src/tree_sitter/api.h +++ b/src/tree_sitter/api.h @@ -14,7 +14,19 @@ extern "C" { /* Section - ABI Versioning */ /****************************/ +/** + * The latest ABI version that is supported by the current version of the + * library. When Languages are generated by the Tree-sitter CLI, they are + * assigned an ABI version number that corresponds to the current CLI version. + * The Tree-sitter library is generally backwards-compatible with languages + * generated using older CLI versions, but is not forwards-compatible. + */ #define TREE_SITTER_LANGUAGE_VERSION 11 + +/** + * The earliest ABI version that is supported by the current version of the + * library. + */ #define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9 /*******************/ @@ -26,6 +38,8 @@ typedef uint16_t TSFieldId; typedef struct TSLanguage TSLanguage; typedef struct TSParser TSParser; typedef struct TSTree TSTree; +typedef struct TSQuery TSQuery; +typedef struct TSQueryCursor TSQueryCursor; typedef enum { TSInputEncodingUTF8, @@ -87,6 +101,37 @@ typedef struct { uint32_t context[2]; } TSTreeCursor; +typedef struct { + TSNode node; + uint32_t index; +} TSQueryCapture; + +typedef struct { + uint32_t id; + uint16_t pattern_index; + uint16_t capture_count; + const TSQueryCapture *captures; +} TSQueryMatch; + +typedef enum { + TSQueryPredicateStepTypeDone, + TSQueryPredicateStepTypeCapture, + TSQueryPredicateStepTypeString, +} TSQueryPredicateStepType; + +typedef struct { + TSQueryPredicateStepType type; + uint32_t value_id; +} TSQueryPredicateStep; + +typedef enum { + TSQueryErrorNone = 0, + TSQueryErrorSyntax, + TSQueryErrorNodeType, + TSQueryErrorField, + TSQueryErrorCapture, +} TSQueryError; + /********************/ /* Section - Parser */ /********************/ @@ -119,7 +164,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language); const TSLanguage *ts_parser_language(const TSParser *self); /** - * Set the spans of text that the parser should include when parsing. + * Set the ranges of text that the parser should include when parsing. * * By default, the parser will always include entire documents. This function * allows you to parse only a *portion* of a document but still return a syntax @@ -226,14 +271,16 @@ TSTree *ts_parser_parse_string_encoding( * by default, it will resume where it left off on the next call to * `ts_parser_parse` or other parsing functions. If you don't want to resume, * and instead intend to use this parser to parse some other document, you must - * call this `ts_parser_reset` first. + * call `ts_parser_reset` first. */ void ts_parser_reset(TSParser *self); /** * Set the maximum duration in microseconds that parsing should be allowed to - * take before halting. If parsing takes longer than this, it will halt early, - * returning NULL. See `ts_parser_parse` for more information. + * take before halting. + * + * If parsing takes longer than this, it will halt early, returning NULL. + * See `ts_parser_parse` for more information. */ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout); @@ -243,10 +290,11 @@ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout); uint64_t ts_parser_timeout_micros(const TSParser *self); /** - * Set the parser's current cancellation flag pointer. If a non-null pointer is - * assigned, then the parser will periodically read from this pointer during - * parsing. If it reads a non-zero value, it will halt early, returning NULL. - * See `ts_parser_parse` for more information. + * Set the parser's current cancellation flag pointer. + * + * If a non-null pointer is assigned, then the parser will periodically read + * from this pointer during parsing. If it reads a non-zero value, it will + * halt early, returning NULL. See `ts_parser_parse` for more information. */ void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag); @@ -322,22 +370,22 @@ const TSLanguage *ts_tree_language(const TSTree *); void ts_tree_edit(TSTree *self, const TSInputEdit *edit); /** - * Compare a new syntax tree to a previous syntax tree representing the same + * Compare an old edited syntax tree to a new syntax tree representing the same * document, returning an array of ranges whose syntactic structure has changed. * * For this to work correctly, the old syntax tree must have been edited such * that its ranges match up to the new tree. Generally, you'll want to call - * this function right after calling one of the `ts_parser_parse` functions, - * passing in the new tree that was returned from `ts_parser_parse` and the old - * tree that was passed as a parameter. + * this function right after calling one of the `ts_parser_parse` functions. + * You need to pass the old tree that was passed to parse, as well as the new + * tree that was returned from that function. * * The returned array is allocated using `malloc` and the caller is responsible * for freeing it using `free`. The length of the array will be written to the * given `length` pointer. */ TSRange *ts_tree_get_changed_ranges( - const TSTree *self, const TSTree *old_tree, + const TSTree *new_tree, uint32_t *length ); @@ -409,8 +457,8 @@ bool ts_node_is_named(TSNode); bool ts_node_is_missing(TSNode); /** - * Check if the node is *missing*. Missing nodes are inserted by the parser in - * order to recover from certain kinds of syntax errors. + * Check if the node is *extra*. Extra nodes represent things like comments, + * which are not required the grammar, but can appear anywhere. */ bool ts_node_is_extra(TSNode); @@ -542,7 +590,7 @@ TSTreeCursor ts_tree_cursor_new(TSNode); void ts_tree_cursor_delete(TSTreeCursor *); /** - * Re-initialize a tree cursor to start at a different ndoe. + * Re-initialize a tree cursor to start at a different node. */ void ts_tree_cursor_reset(TSTreeCursor *, TSNode); @@ -584,7 +632,7 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *); bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); /** - * Move the cursor to the first schild of its current node. + * Move the cursor to the first child of its current node. * * This returns `true` if the cursor successfully moved, and returns `false` * if there were no children. @@ -592,7 +640,7 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); bool ts_tree_cursor_goto_first_child(TSTreeCursor *); /** - * Move the cursor to the first schild of its current node that extends beyond + * Move the cursor to the first child of its current node that extends beyond * the given byte offset. * * This returns the index of the child node if one was found, and returns -1 @@ -602,6 +650,156 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t); TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *); +/*******************/ +/* Section - Query */ +/*******************/ + +/** + * Create a new query from a string containing one or more S-expression + * patterns. The query is associated with a particular language, and can + * only be run on syntax nodes parsed with that language. + * + * If all of the given patterns are valid, this returns a `TSQuery`. + * If a pattern is invalid, this returns `NULL`, and provides two pieces + * of information about the problem: + * 1. The byte offset of the error is written to the `error_offset` parameter. + * 2. The type of error is written to the `error_type` parameter. + */ +TSQuery *ts_query_new( + const TSLanguage *language, + const char *source, + uint32_t source_len, + uint32_t *error_offset, + TSQueryError *error_type +); + +/** + * Delete a query, freeing all of the memory that it used. + */ +void ts_query_delete(TSQuery *); + +/** + * Get the number of patterns, captures, or string literals in the query. + */ +uint32_t ts_query_pattern_count(const TSQuery *); +uint32_t ts_query_capture_count(const TSQuery *); +uint32_t ts_query_string_count(const TSQuery *); + +/** + * Get the byte offset where the given pattern starts in the query's source. + * + * This can be useful when combining queries by concatenating their source + * code strings. + */ +uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t); + +/** + * Get all of the predicates for the given pattern in the query. + * + * The predicates are represented as a single array of steps. There are three + * types of steps in this array, which correspond to the three legal values for + * the `type` field: + * - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names + * of captures. Their `value_id` can be used with the + * `ts_query_capture_name_for_id` function to obtain the name of the capture. + * - `TSQueryPredicateStepTypeString` - Steps with this type represent literal + * strings. Their `value_id` can be used with the + * `ts_query_string_value_for_id` function to obtain their string value. + * - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels* + * that represent the end of an individual predicate. If a pattern has two + * predicates, then there will be two steps with this `type` in the array. + */ +const TSQueryPredicateStep *ts_query_predicates_for_pattern( + const TSQuery *self, + uint32_t pattern_index, + uint32_t *length +); + +/** + * Get the name and length of one of the query's captures, or one of the + * query's string literals. Each capture and string is associated with a + * numeric id based on the order that it appeared in the query's source. + */ +const char *ts_query_capture_name_for_id( + const TSQuery *, + uint32_t id, + uint32_t *length +); +const char *ts_query_string_value_for_id( + const TSQuery *, + uint32_t id, + uint32_t *length +); + +/** + * Disable a certain capture within a query. This prevents the capture + * from being returned in matches, and also avoids any resource usage + * associated with recording the capture. + */ +void ts_query_disable_capture(TSQuery *, const char *, uint32_t); + +/** + * Create a new cursor for executing a given query. + * + * The cursor stores the state that is needed to iteratively search + * for matches. To use the query cursor, first call `ts_query_cursor_exec` + * to start running a given query on a given syntax node. Then, there are + * two options for consuming the results of the query: + * 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the + * the *matches* in the order that they were found. Each match contains the + * index of the pattern that matched, and an array of captures. Because + * multiple patterns can match the same set of nodes, one match may contain + * captures that appear *before* some of the captures from a previous match. + * 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the + * individual *captures* in the order that they appear. This is useful if + * don't care about which pattern matched, and just want a single ordered + * sequence of captures. + * + * If you don't care about consuming all of the results, you can stop calling + * `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point. + * You can then start executing another query on another node by calling + * `ts_query_cursor_exec` again. + */ +TSQueryCursor *ts_query_cursor_new(void); + +/** + * Delete a query cursor, freeing all of the memory that it used. + */ +void ts_query_cursor_delete(TSQueryCursor *); + +/** + * Start running a given query on a given node. + */ +void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode); + +/** + * Set the range of bytes or (row, column) positions in which the query + * will be executed. + */ +void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t); +void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint); + +/** + * Advance to the next match of the currently running query. + * + * If there is a match, write it to `*match` and return `true`. + * Otherwise, return `false`. + */ +bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match); +void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id); + +/** + * Advance to the next capture of the currently running query. + * + * If there is a capture, write its match to `*match` and its index within + * the matche's capture list to `*capture_index`. Otherwise, return `false`. + */ +bool ts_query_cursor_next_capture( + TSQueryCursor *, + TSQueryMatch *match, + uint32_t *capture_index +); + /**********************/ /* Section - Language */ /**********************/ @@ -619,7 +817,12 @@ const char *ts_language_symbol_name(const TSLanguage *, TSSymbol); /** * Get the numerical id for the given node type string. */ -TSSymbol ts_language_symbol_for_name(const TSLanguage *, const char *); +TSSymbol ts_language_symbol_for_name( + const TSLanguage *self, + const char *string, + uint32_t length, + bool is_named +); /** * Get the number of distinct field names in the language. diff --git a/src/tree_sitter/bits.h b/src/tree_sitter/bits.h new file mode 100644 index 0000000000..3bec455dd1 --- /dev/null +++ b/src/tree_sitter/bits.h @@ -0,0 +1,29 @@ +#ifndef TREE_SITTER_BITS_H_ +#define TREE_SITTER_BITS_H_ + +#include + +static inline uint32_t bitmask_for_index(uint16_t id) { + return (1u << (31 - id)); +} + +#ifdef _WIN32 + +#include + +static inline uint32_t count_leading_zeros(uint32_t x) { + if (x == 0) return 32; + uint32_t result; + _BitScanReverse(&result, x); + return 31 - result; +} + +#else + +static inline uint32_t count_leading_zeros(uint32_t x) { + if (x == 0) return 32; + return __builtin_clz(x); +} + +#endif +#endif // TREE_SITTER_BITS_H_ diff --git a/src/tree_sitter/language.c b/src/tree_sitter/language.c index 1bfb1a8d03..e240ef2a53 100644 --- a/src/tree_sitter/language.c +++ b/src/tree_sitter/language.c @@ -3,8 +3,28 @@ #include "./error_costs.h" #include -void ts_language_table_entry(const TSLanguage *self, TSStateId state, - TSSymbol symbol, TableEntry *result) { +uint32_t ts_language_symbol_count(const TSLanguage *self) { + return self->symbol_count + self->alias_count; +} + +uint32_t ts_language_version(const TSLanguage *self) { + return self->version; +} + +uint32_t ts_language_field_count(const TSLanguage *self) { + if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) { + return self->field_count; + } else { + return 0; + } +} + +void ts_language_table_entry( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol, + TableEntry *result +) { if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { result->action_count = 0; result->is_reusable = false; @@ -19,48 +39,72 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state, } } -uint32_t ts_language_symbol_count(const TSLanguage *language) { - return language->symbol_count + language->alias_count; -} - -uint32_t ts_language_version(const TSLanguage *language) { - return language->version; -} - -TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) { +TSSymbolMetadata ts_language_symbol_metadata( + const TSLanguage *self, + TSSymbol symbol +) { if (symbol == ts_builtin_sym_error) { return (TSSymbolMetadata){.visible = true, .named = true}; } else if (symbol == ts_builtin_sym_error_repeat) { return (TSSymbolMetadata){.visible = false, .named = false}; } else { - return language->symbol_metadata[symbol]; + return self->symbol_metadata[symbol]; } } -const char *ts_language_symbol_name(const TSLanguage *language, TSSymbol symbol) { +TSSymbol ts_language_public_symbol( + const TSLanguage *self, + TSSymbol symbol +) { + if (symbol == ts_builtin_sym_error) return symbol; + if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { + return self->public_symbol_map[symbol]; + } else { + return symbol; + } +} + +const char *ts_language_symbol_name( + const TSLanguage *self, + TSSymbol symbol +) { if (symbol == ts_builtin_sym_error) { return "ERROR"; } else if (symbol == ts_builtin_sym_error_repeat) { return "_ERROR"; } else { - return language->symbol_names[symbol]; + return self->symbol_names[symbol]; } } -TSSymbol ts_language_symbol_for_name(const TSLanguage *self, const char *name) { - if (!strcmp(name, "ERROR")) return ts_builtin_sym_error; - +TSSymbol ts_language_symbol_for_name( + const TSLanguage *self, + const char *string, + uint32_t length, + bool is_named +) { + if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error; uint32_t count = ts_language_symbol_count(self); for (TSSymbol i = 0; i < count; i++) { - if (!strcmp(self->symbol_names[i], name)) { - return i; + TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); + if (!metadata.visible || metadata.named != is_named) continue; + const char *symbol_name = self->symbol_names[i]; + if (!strncmp(symbol_name, string, length) && !symbol_name[length]) { + if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { + return self->public_symbol_map[i]; + } else { + return i; + } } } return 0; } -TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); +TSSymbolType ts_language_symbol_type( + const TSLanguage *self, + TSSymbol symbol +) { + TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol); if (metadata.named) { return TSSymbolTypeRegular; } else if (metadata.visible) { @@ -70,15 +114,10 @@ TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol } } -uint32_t ts_language_field_count(const TSLanguage *self) { - if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) { - return self->field_count; - } else { - return 0; - } -} - -const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id) { +const char *ts_language_field_name_for_id( + const TSLanguage *self, + TSFieldId id +) { uint32_t count = ts_language_field_count(self); if (count) { return self->field_names[id]; @@ -96,7 +135,8 @@ TSFieldId ts_language_field_id_for_name( for (TSSymbol i = 1; i < count + 1; i++) { switch (strncmp(name, self->field_names[i], name_length)) { case 0: - return i; + if (self->field_names[i][name_length] == 0) return i; + break; case -1: return 0; default: diff --git a/src/tree_sitter/language.h b/src/tree_sitter/language.h index 0741486a1b..d7e17c3d70 100644 --- a/src/tree_sitter/language.h +++ b/src/tree_sitter/language.h @@ -10,6 +10,7 @@ extern "C" { #define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) #define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10 +#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11 #define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11 typedef struct { @@ -22,6 +23,8 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); +TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol); + static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { return 0 < symbol && symbol < self->external_token_count + 1; } diff --git a/src/tree_sitter/lexer.c b/src/tree_sitter/lexer.c index fdc127466f..e2ca851973 100644 --- a/src/tree_sitter/lexer.c +++ b/src/tree_sitter/lexer.c @@ -2,26 +2,58 @@ #include "./lexer.h" #include "./subtree.h" #include "./length.h" -#include "./utf16.h" -#include "utf8proc.h" +#include "./unicode.h" -#define LOG(...) \ - if (self->logger.log) { \ - snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ - self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \ +#define LOG(message, character) \ + if (self->logger.log) { \ + snprintf( \ + self->debug_buffer, \ + TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \ + 32 <= character && character < 127 ? \ + message " character:'%c'" : \ + message " character:%d", \ + character \ + ); \ + self->logger.log( \ + self->logger.payload, \ + TSLogTypeLex, \ + self->debug_buffer \ + ); \ } -#define LOG_CHARACTER(message, character) \ - LOG( \ - 32 <= character && character < 127 ? \ - message " character:'%c'" : \ - message " character:%d", character \ - ) - -static const char empty_chunk[3] = { 0, 0 }; - static const int32_t BYTE_ORDER_MARK = 0xFEFF; +static const TSRange DEFAULT_RANGE = { + .start_point = { + .row = 0, + .column = 0, + }, + .end_point = { + .row = UINT32_MAX, + .column = UINT32_MAX, + }, + .start_byte = 0, + .end_byte = UINT32_MAX +}; + +// Check if the lexer has reached EOF. This state is stored +// by setting the lexer's `current_included_range_index` such that +// it has consumed all of its available ranges. +static bool ts_lexer__eof(const TSLexer *_self) { + Lexer *self = (Lexer *)_self; + return self->current_included_range_index == self->included_range_count; +} + +// Clear the currently stored chunk of source code, because the lexer's +// position has changed. +static void ts_lexer__clear_chunk(Lexer *self) { + self->chunk = NULL; + self->chunk_size = 0; + self->chunk_start = 0; +} + +// Call the lexer's input callback to obtain a new chunk of source code +// for the current position. static void ts_lexer__get_chunk(Lexer *self) { self->chunk_start = self->current_position.bytes; self->chunk = self->input.read( @@ -30,15 +62,15 @@ static void ts_lexer__get_chunk(Lexer *self) { self->current_position.extent, &self->chunk_size ); - if (!self->chunk_size) self->chunk = empty_chunk; + if (!self->chunk_size) { + self->current_included_range_index = self->included_range_count; + self->chunk = NULL; + } } -typedef utf8proc_ssize_t (*DecodeFunction)( - const utf8proc_uint8_t *, - utf8proc_ssize_t, - utf8proc_int32_t * -); - +// Decode the next unicode character in the current chunk of source code. +// This assumes that the lexer has already retrieved a chunk of source +// code that spans the current position. static void ts_lexer__get_lookahead(Lexer *self) { uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start; const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; @@ -50,29 +82,37 @@ static void ts_lexer__get_lookahead(Lexer *self) { return; } - DecodeFunction decode = - self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate; + UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 + ? ts_decode_utf8 + : ts_decode_utf16; self->lookahead_size = decode(chunk, size, &self->data.lookahead); // If this chunk ended in the middle of a multi-byte character, // try again with a fresh chunk. - if (self->data.lookahead == -1 && size < 4) { + if (self->data.lookahead == TS_DECODE_ERROR && size < 4) { ts_lexer__get_chunk(self); chunk = (const uint8_t *)self->chunk; size = self->chunk_size; self->lookahead_size = decode(chunk, size, &self->data.lookahead); } - if (self->data.lookahead == -1) { + if (self->data.lookahead == TS_DECODE_ERROR) { self->lookahead_size = 1; } } -static void ts_lexer__advance(TSLexer *payload, bool skip) { - Lexer *self = (Lexer *)payload; - if (self->chunk == empty_chunk) - return; +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +static void ts_lexer__advance(TSLexer *_self, bool skip) { + Lexer *self = (Lexer *)_self; + if (!self->chunk) return; + + if (skip) { + LOG("skip", self->data.lookahead); + } else { + LOG("consume", self->data.lookahead); + } if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; @@ -84,53 +124,65 @@ static void ts_lexer__advance(TSLexer *payload, bool skip) { } } - TSRange *current_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_position.bytes == current_range->end_byte) { - self->current_included_range_index++; - if (self->current_included_range_index == self->included_range_count) { - self->data.lookahead = '\0'; - self->lookahead_size = 1; - return; - } else { - current_range++; - self->current_position = (Length) { - current_range->start_byte, - current_range->start_point, - }; + const TSRange *current_range = NULL; + if (self->current_included_range_index < self->included_range_count) { + current_range = &self->included_ranges[self->current_included_range_index]; + if (self->current_position.bytes == current_range->end_byte) { + self->current_included_range_index++; + if (self->current_included_range_index < self->included_range_count) { + current_range++; + self->current_position = (Length) { + current_range->start_byte, + current_range->start_point, + }; + } else { + current_range = NULL; + } } } - if (skip) { - LOG_CHARACTER("skip", self->data.lookahead); - self->token_start_position = self->current_position; + if (skip) self->token_start_position = self->current_position; + + if (current_range) { + if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { + ts_lexer__get_chunk(self); + } + ts_lexer__get_lookahead(self); } else { - LOG_CHARACTER("consume", self->data.lookahead); - } - - if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { - ts_lexer__get_chunk(self); - } - - ts_lexer__get_lookahead(self); -} - -static void ts_lexer__mark_end(TSLexer *payload) { - Lexer *self = (Lexer *)payload; - TSRange *current_included_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_included_range_index > 0 && - self->current_position.bytes == current_included_range->start_byte) { - TSRange *previous_included_range = current_included_range - 1; - self->token_end_position = (Length) { - previous_included_range->end_byte, - previous_included_range->end_point, - }; - } else { - self->token_end_position = self->current_position; + ts_lexer__clear_chunk(self); + self->data.lookahead = '\0'; + self->lookahead_size = 1; } } -static uint32_t ts_lexer__get_column(TSLexer *payload) { - Lexer *self = (Lexer *)payload; +// Mark that a token match has completed. This can be called multiple +// times if a longer match is found later. +static void ts_lexer__mark_end(TSLexer *_self) { + Lexer *self = (Lexer *)_self; + if (!ts_lexer__eof(&self->data)) { + // If the lexer is right at the beginning of included range, + // then the token should be considered to end at the *end* of the + // previous included range, rather than here. + TSRange *current_included_range = &self->included_ranges[ + self->current_included_range_index + ]; + if ( + self->current_included_range_index > 0 && + self->current_position.bytes == current_included_range->start_byte + ) { + TSRange *previous_included_range = current_included_range - 1; + self->token_end_position = (Length) { + previous_included_range->end_byte, + previous_included_range->end_point, + }; + return; + } + } + self->token_end_position = self->current_position; +} + +static uint32_t ts_lexer__get_column(TSLexer *_self) { + Lexer *self = (Lexer *)_self; uint32_t goal_byte = self->current_position.bytes; self->current_position.bytes -= self->current_position.extent.column; @@ -142,67 +194,69 @@ static uint32_t ts_lexer__get_column(TSLexer *payload) { uint32_t result = 0; while (self->current_position.bytes < goal_byte) { - ts_lexer__advance(payload, false); + ts_lexer__advance(&self->data, false); result++; } return result; } -static bool ts_lexer__is_at_included_range_start(TSLexer *payload) { - const Lexer *self = (const Lexer *)payload; - TSRange *current_range = &self->included_ranges[self->current_included_range_index]; - return self->current_position.bytes == current_range->start_byte; +// Is the lexer at a boundary between two disjoint included ranges of +// source code? This is exposed as an API because some languages' external +// scanners need to perform custom actions at these bounaries. +static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { + const Lexer *self = (const Lexer *)_self; + if (self->current_included_range_index < self->included_range_count) { + TSRange *current_range = &self->included_ranges[self->current_included_range_index]; + return self->current_position.bytes == current_range->start_byte; + } else { + return false; + } } -// The lexer's methods are stored as a struct field so that generated -// parsers can call them without needing to be linked against this library. - void ts_lexer_init(Lexer *self) { *self = (Lexer) { .data = { + // The lexer's methods are stored as struct fields so that generated + // parsers can call them without needing to be linked against this + // library. .advance = ts_lexer__advance, .mark_end = ts_lexer__mark_end, .get_column = ts_lexer__get_column, .is_at_included_range_start = ts_lexer__is_at_included_range_start, + .eof = ts_lexer__eof, .lookahead = 0, .result_symbol = 0, }, .chunk = NULL, + .chunk_size = 0, .chunk_start = 0, - .current_position = {UINT32_MAX, {0, 0}}, + .current_position = {0, {0, 0}}, .logger = { .payload = NULL, .log = NULL }, + .included_ranges = NULL, + .included_range_count = 0, .current_included_range_index = 0, }; - - self->included_ranges = NULL; ts_lexer_set_included_ranges(self, NULL, 0); - ts_lexer_reset(self, length_zero()); } void ts_lexer_delete(Lexer *self) { ts_free(self->included_ranges); } -void ts_lexer_set_input(Lexer *self, TSInput input) { - self->input = input; - self->data.lookahead = 0; - self->lookahead_size = 0; - self->chunk = 0; - self->chunk_start = 0; - self->chunk_size = 0; -} - static void ts_lexer_goto(Lexer *self, Length position) { + self->current_position = position; bool found_included_range = false; + + // Move to the first valid position at or after the given position. for (unsigned i = 0; i < self->included_range_count; i++) { TSRange *included_range = &self->included_ranges[i]; if (included_range->end_byte > position.bytes) { if (included_range->start_byte > position.bytes) { - position = (Length) { + self->current_position = (Length) { .bytes = included_range->start_byte, .extent = included_range->start_point, }; @@ -214,46 +268,61 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } - if (!found_included_range) { + if (found_included_range) { + // If the current position is outside of the current chunk of text, + // then clear out the current chunk of text. + if (self->chunk && ( + position.bytes < self->chunk_start || + position.bytes >= self->chunk_start + self->chunk_size + )) { + ts_lexer__clear_chunk(self); + } + + self->lookahead_size = 0; + self->data.lookahead = '\0'; + } + + // If the given position is beyond any of included ranges, move to the EOF + // state - past the end of the included ranges. + else { + self->current_included_range_index = self->included_range_count; TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; - position = (Length) { + self->current_position = (Length) { .bytes = last_included_range->end_byte, .extent = last_included_range->end_point, }; - self->chunk = empty_chunk; - self->chunk_start = position.bytes; - self->chunk_size = 2; + ts_lexer__clear_chunk(self); + self->lookahead_size = 1; + self->data.lookahead = '\0'; } - - self->token_start_position = position; - self->token_end_position = LENGTH_UNDEFINED; - self->current_position = position; - - if (self->chunk && (position.bytes < self->chunk_start || - position.bytes >= self->chunk_start + self->chunk_size)) { - self->chunk = 0; - self->chunk_start = 0; - self->chunk_size = 0; - } - - self->lookahead_size = 0; - self->data.lookahead = 0; } +void ts_lexer_set_input(Lexer *self, TSInput input) { + self->input = input; + ts_lexer__clear_chunk(self); + ts_lexer_goto(self, self->current_position); +} + +// Move the lexer to the given position. This doesn't do any work +// if the parser is already at the given position. void ts_lexer_reset(Lexer *self, Length position) { - if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position); + if (position.bytes != self->current_position.bytes) { + ts_lexer_goto(self, position); + } } void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->token_end_position = LENGTH_UNDEFINED; self->data.result_symbol = 0; - if (!self->chunk) ts_lexer__get_chunk(self); - if (!self->lookahead_size) ts_lexer__get_lookahead(self); - if ( - self->current_position.bytes == 0 && - self->data.lookahead == BYTE_ORDER_MARK - ) ts_lexer__advance((TSLexer *)self, true); + if (!ts_lexer__eof(&self->data)) { + if (!self->chunk_size) ts_lexer__get_chunk(self); + if (!self->lookahead_size) ts_lexer__get_lookahead(self); + if ( + self->current_position.bytes == 0 && + self->data.lookahead == BYTE_ORDER_MARK + ) ts_lexer__advance(&self->data, true); + } } void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { @@ -267,7 +336,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { // the character decoding algorithm may have looked at the following byte. // Therefore, the next byte *after* the current (invalid) character // affects the interpretation of the current character. - if (self->data.lookahead == -1) { + if (self->data.lookahead == TS_DECODE_ERROR) { current_lookahead_end_byte++; } @@ -277,8 +346,8 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { } void ts_lexer_advance_to_end(Lexer *self) { - while (self->data.lookahead != 0) { - ts_lexer__advance((TSLexer *)self, false); + while (self->chunk) { + ts_lexer__advance(&self->data, false); } } @@ -286,30 +355,19 @@ void ts_lexer_mark_end(Lexer *self) { ts_lexer__mark_end(&self->data); } -static const TSRange DEFAULT_RANGES[] = { - { - .start_point = { - .row = 0, - .column = 0, - }, - .end_point = { - .row = UINT32_MAX, - .column = UINT32_MAX, - }, - .start_byte = 0, - .end_byte = UINT32_MAX - } -}; - -void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count) { - if (!ranges) { - ranges = DEFAULT_RANGES; +void ts_lexer_set_included_ranges( + Lexer *self, + const TSRange *ranges, + uint32_t count +) { + if (count == 0 || !ranges) { + ranges = &DEFAULT_RANGE; count = 1; } - size_t sz = count * sizeof(TSRange); - self->included_ranges = ts_realloc(self->included_ranges, sz); - memcpy(self->included_ranges, ranges, sz); + size_t size = count * sizeof(TSRange); + self->included_ranges = ts_realloc(self->included_ranges, size); + memcpy(self->included_ranges, ranges, size); self->included_range_count = count; ts_lexer_goto(self, self->current_position); } diff --git a/src/tree_sitter/lexer.h b/src/tree_sitter/lexer.h index f523d88f65..8cd9c26706 100644 --- a/src/tree_sitter/lexer.h +++ b/src/tree_sitter/lexer.h @@ -16,7 +16,7 @@ typedef struct { Length token_start_position; Length token_end_position; - TSRange * included_ranges; + TSRange *included_ranges; size_t included_range_count; size_t current_included_range_index; diff --git a/src/tree_sitter/lib.c b/src/tree_sitter/lib.c index fc5fbc9210..289d32f4c5 100644 --- a/src/tree_sitter/lib.c +++ b/src/tree_sitter/lib.c @@ -2,19 +2,16 @@ // // The following directories must be added to the include path: // - include -// - utf8proc #define _POSIX_C_SOURCE 200112L -#define UTF8PROC_STATIC #include "./get_changed_ranges.c" #include "./language.c" #include "./lexer.c" #include "./node.c" #include "./parser.c" +#include "./query.c" #include "./stack.c" #include "./subtree.c" #include "./tree_cursor.c" #include "./tree.c" -#include "./utf16.c" -#include "utf8proc.c" diff --git a/src/tree_sitter/node.c b/src/tree_sitter/node.c index 6b2be36ee5..b03e2fc979 100644 --- a/src/tree_sitter/node.c +++ b/src/tree_sitter/node.c @@ -415,13 +415,15 @@ TSPoint ts_node_end_point(TSNode self) { } TSSymbol ts_node_symbol(TSNode self) { - return ts_node__alias(&self) - ? ts_node__alias(&self) - : ts_subtree_symbol(ts_node__subtree(self)); + TSSymbol symbol = ts_node__alias(&self); + if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); + return ts_language_public_symbol(self.tree->language, symbol); } const char *ts_node_type(TSNode self) { - return ts_language_symbol_name(self.tree->language, ts_node_symbol(self)); + TSSymbol symbol = ts_node__alias(&self); + if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); + return ts_language_symbol_name(self.tree->language, symbol); } char *ts_node_string(TSNode self) { diff --git a/src/tree_sitter/parser.c b/src/tree_sitter/parser.c index 88b20845fd..f381afccab 100644 --- a/src/tree_sitter/parser.c +++ b/src/tree_sitter/parser.c @@ -351,6 +351,7 @@ static Subtree ts_parser__lex( Length start_position = ts_stack_position(self->stack, version); Subtree external_token = ts_stack_last_external_token(self->stack, version); TSLexMode lex_mode = self->language->lex_modes[parse_state]; + if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE; const bool *valid_external_tokens = ts_language_enabled_external_tokens( self->language, lex_mode.external_lex_state @@ -438,7 +439,7 @@ static Subtree ts_parser__lex( } if (self->lexer.current_position.bytes == error_end_position.bytes) { - if (self->lexer.data.lookahead == 0) { + if (self->lexer.data.eof(&self->lexer.data)) { self->lexer.data.result_symbol = ts_builtin_sym_error; break; } @@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce( uint32_t count, int dynamic_precedence, uint16_t production_id, - bool fragile + bool is_fragile, + bool is_extra ) { uint32_t initial_version_count = ts_stack_version_count(self->stack); uint32_t removed_version_count = 0; @@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce( TSStateId state = ts_stack_state(self->stack, slice_version); TSStateId next_state = ts_language_next_state(self->language, state, symbol); - if (fragile || pop.size > 1 || initial_version_count > 1) { + if (is_extra) parent.ptr->extra = true; + if (is_fragile || pop.size > 1 || initial_version_count > 1) { parent.ptr->fragile_left = true; parent.ptr->fragile_right = true; parent.ptr->parse_state = TS_TREE_STATE_NONE; @@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions( reduction_version = ts_parser__reduce( self, version, action.symbol, action.count, action.dynamic_precedence, action.production_id, - true + true, false ); } @@ -1366,8 +1369,17 @@ static bool ts_parser__advance( // Otherwise, re-run the lexer. if (!lookahead.ptr) { lookahead = ts_parser__lex(self, version, state); - ts_parser__set_cached_token(self, position, last_external_token, lookahead); - ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); + if (lookahead.ptr) { + ts_parser__set_cached_token(self, position, last_external_token, lookahead); + ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); + } + + // When parsing a non-terminal extra, a null lookahead indicates the + // end of the rule. The reduction is stored in the EOF table entry. + // After the reduction, the lexer needs to be run again. + else { + ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); + } } for (;;) { @@ -1422,11 +1434,12 @@ static bool ts_parser__advance( case TSParseActionTypeReduce: { bool is_fragile = table_entry.action_count > 1; + bool is_extra = lookahead.ptr == NULL; LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count); StackVersion reduction_version = ts_parser__reduce( self, version, action.params.symbol, action.params.child_count, action.params.dynamic_precedence, action.params.production_id, - is_fragile + is_fragile, is_extra ); if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; @@ -1459,6 +1472,15 @@ static bool ts_parser__advance( ts_stack_renumber_version(self->stack, last_reduction_version, version); LOG_STACK(); state = ts_stack_state(self->stack, version); + + // At the end of a non-terminal extra rule, the lexer will return a + // null subtree, because the parser needs to perform a fixed reduction + // regardless of the lookahead node. After performing that reduction, + // (and completing the non-terminal extra rule) run the lexer again based + // on the current parse state. + if (!lookahead.ptr) { + lookahead = ts_parser__lex(self, version, state); + } ts_language_table_entry( self->language, state, @@ -1655,6 +1677,7 @@ TSParser *ts_parser_new(void) { void ts_parser_delete(TSParser *self) { if (!self) return; + ts_parser_set_language(self, NULL); ts_stack_delete(self->stack); if (self->reduce_actions.contents) { array_delete(&self->reduce_actions); @@ -1670,7 +1693,6 @@ void ts_parser_delete(TSParser *self) { ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); ts_subtree_pool_delete(&self->tree_pool); reusable_node_delete(&self->reusable_node); - ts_parser_set_language(self, NULL); ts_free(self); } @@ -1695,6 +1717,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language) { } self->language = language; + ts_parser_reset(self); return true; } @@ -1747,7 +1770,7 @@ const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count) } void ts_parser_reset(TSParser *self) { - if (self->language->external_scanner.deserialize) { + if (self->language && self->language->external_scanner.deserialize) { self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); } diff --git a/src/tree_sitter/parser.h b/src/tree_sitter/parser.h index 974a7ca52f..9df91f8c3c 100644 --- a/src/tree_sitter/parser.h +++ b/src/tree_sitter/parser.h @@ -45,7 +45,8 @@ struct TSLexer { void (*advance)(TSLexer *, bool); void (*mark_end)(TSLexer *); uint32_t (*get_column)(TSLexer *); - bool (*is_at_included_range_start)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); }; typedef enum { @@ -117,6 +118,7 @@ struct TSLanguage { uint32_t large_state_count; const uint16_t *small_parse_table; const uint32_t *small_parse_table_map; + const TSSymbol *public_symbol_map; }; /* @@ -126,6 +128,7 @@ struct TSLanguage { #define START_LEXER() \ bool result = false; \ bool skip = false; \ + bool eof = false; \ int32_t lookahead; \ goto start; \ next_state: \ diff --git a/src/tree_sitter/point.h b/src/tree_sitter/point.h index 4d0aed18ef..a50d20214b 100644 --- a/src/tree_sitter/point.h +++ b/src/tree_sitter/point.h @@ -3,6 +3,7 @@ #include "tree_sitter/api.h" +#define POINT_ZERO ((TSPoint) {0, 0}) #define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) static inline TSPoint point__new(unsigned row, unsigned column) { diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c new file mode 100644 index 0000000000..a5ce86032c --- /dev/null +++ b/src/tree_sitter/query.c @@ -0,0 +1,1450 @@ +#include "tree_sitter/api.h" +#include "./alloc.h" +#include "./array.h" +#include "./bits.h" +#include "./language.h" +#include "./point.h" +#include "./tree_cursor.h" +#include "./unicode.h" +#include + +/* + * Stream - A sequence of unicode characters derived from a UTF8 string. + * This struct is used in parsing queries from S-expressions. + */ +typedef struct { + const char *input; + const char *end; + int32_t next; + uint8_t next_size; +} Stream; + +/* + * QueryStep - A step in the process of matching a query. Each node within + * a query S-expression maps to one of these steps. An entire pattern is + * represented as a sequence of these steps. Fields: + * + * - `symbol` - The grammar symbol to match. A zero value represents the + * wildcard symbol, '*'. + * - `field` - The field name to match. A zero value means that a field name + * was not specified. + * - `capture_id` - An integer representing the name of the capture associated + * with this node in the pattern. A `NONE` value means this node is not + * captured in this pattern. + * - `depth` - The depth where this node occurs in the pattern. The root node + * of the pattern has depth zero. + */ +typedef struct { + TSSymbol symbol; + TSFieldId field; + uint16_t capture_id; + uint16_t depth: 15; + bool contains_captures: 1; +} QueryStep; + +/* + * Slice - A slice of an external array. Within a query, capture names, + * literal string values, and predicate step informations are stored in three + * contiguous arrays. Individual captures, string values, and predicates are + * represented as slices of these three arrays. + */ +typedef struct { + uint32_t offset; + uint32_t length; +} Slice; + +/* + * SymbolTable - a two-way mapping of strings to ids. + */ +typedef struct { + Array(char) characters; + Array(Slice) slices; +} SymbolTable; + +/* + * PatternEntry - The set of steps needed to match a particular pattern, + * represented as a slice of a shared array. These entries are stored in a + * 'pattern map' - a sorted array that makes it possible to efficiently lookup + * patterns based on the symbol for their first step. + */ +typedef struct { + uint16_t step_index; + uint16_t pattern_index; +} PatternEntry; + +/* + * QueryState - The state of an in-progress match of a particular pattern + * in a query. While executing, a `TSQueryCursor` must keep track of a number + * of possible in-progress matches. Each of those possible matches is + * represented as one of these states. + */ +typedef struct { + uint16_t start_depth; + uint16_t pattern_index; + uint16_t step_index; + uint16_t capture_count; + uint16_t capture_list_id; + uint16_t consumed_capture_count; + uint32_t id; +} QueryState; + +/* + * CaptureListPool - A collection of *lists* of captures. Each QueryState + * needs to maintain its own list of captures. They are all represented as + * slices of one shared array. The CaptureListPool keeps track of which + * parts of the shared array are currently in use by a QueryState. + */ +typedef struct { + Array(TSQueryCapture) list; + uint32_t usage_map; +} CaptureListPool; + +/* + * TSQuery - A tree query, compiled from a string of S-expressions. The query + * itself is immutable. The mutable state used in the process of executing the + * query is stored in a `TSQueryCursor`. + */ +struct TSQuery { + SymbolTable captures; + SymbolTable predicate_values; + Array(QueryStep) steps; + Array(PatternEntry) pattern_map; + Array(TSQueryPredicateStep) predicate_steps; + Array(Slice) predicates_by_pattern; + Array(uint32_t) start_bytes_by_pattern; + const TSLanguage *language; + uint16_t max_capture_count; + uint16_t wildcard_root_pattern_count; + TSSymbol *symbol_map; +}; + +/* + * TSQueryCursor - A stateful struct used to execute a query on a tree. + */ +struct TSQueryCursor { + const TSQuery *query; + TSTreeCursor cursor; + Array(QueryState) states; + Array(QueryState) finished_states; + CaptureListPool capture_list_pool; + uint32_t depth; + uint32_t start_byte; + uint32_t end_byte; + uint32_t next_state_id; + TSPoint start_point; + TSPoint end_point; + bool ascending; +}; + +static const TSQueryError PARENT_DONE = -1; +static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; +static const uint16_t NONE = UINT16_MAX; +static const TSSymbol WILDCARD_SYMBOL = 0; +static const uint16_t MAX_STATE_COUNT = 32; + +// #define LOG(...) fprintf(stderr, __VA_ARGS__) +#define LOG(...) + +/********** + * Stream + **********/ + +// Advance to the next unicode code point in the stream. +static bool stream_advance(Stream *self) { + self->input += self->next_size; + if (self->input < self->end) { + uint32_t size = ts_decode_utf8( + (const uint8_t *)self->input, + self->end - self->input, + &self->next + ); + if (size > 0) { + self->next_size = size; + return true; + } + } else { + self->next_size = 0; + self->next = '\0'; + } + return false; +} + +// Reset the stream to the given input position, represented as a pointer +// into the input string. +static void stream_reset(Stream *self, const char *input) { + self->input = input; + self->next_size = 0; + stream_advance(self); +} + +static Stream stream_new(const char *string, uint32_t length) { + Stream self = { + .next = 0, + .input = string, + .end = string + length, + }; + stream_advance(&self); + return self; +} + +static void stream_skip_whitespace(Stream *stream) { + for (;;) { + if (iswspace(stream->next)) { + stream_advance(stream); + } else if (stream->next == ';') { + // skip over comments + stream_advance(stream); + while (stream->next && stream->next != '\n') { + if (!stream_advance(stream)) break; + } + } else { + break; + } + } +} + +static bool stream_is_ident_start(Stream *stream) { + return iswalnum(stream->next) || stream->next == '_' || stream->next == '-'; +} + +static void stream_scan_identifier(Stream *stream) { + do { + stream_advance(stream); + } while ( + iswalnum(stream->next) || + stream->next == '_' || + stream->next == '-' || + stream->next == '.' || + stream->next == '?' || + stream->next == '!' + ); +} + +/****************** + * CaptureListPool + ******************/ + +static CaptureListPool capture_list_pool_new() { + return (CaptureListPool) { + .list = array_new(), + .usage_map = UINT32_MAX, + }; +} + +static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { + self->usage_map = UINT32_MAX; + uint32_t total_size = MAX_STATE_COUNT * list_size; + array_reserve(&self->list, total_size); + self->list.size = total_size; +} + +static void capture_list_pool_delete(CaptureListPool *self) { + array_delete(&self->list); +} + +static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { + return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; +} + +static bool capture_list_pool_is_empty(const CaptureListPool *self) { + return self->usage_map == 0; +} + +static uint16_t capture_list_pool_acquire(CaptureListPool *self) { + // In the usage_map bitmask, ones represent free lists, and zeros represent + // lists that are in use. A free list id can quickly be found by counting + // the leading zeros in the usage map. An id of zero corresponds to the + // highest-order bit in the bitmask. + uint16_t id = count_leading_zeros(self->usage_map); + if (id == 32) return NONE; + self->usage_map &= ~bitmask_for_index(id); + return id; +} + +static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + self->usage_map |= bitmask_for_index(id); +} + +/************** + * SymbolTable + **************/ + +static SymbolTable symbol_table_new() { + return (SymbolTable) { + .characters = array_new(), + .slices = array_new(), + }; +} + +static void symbol_table_delete(SymbolTable *self) { + array_delete(&self->characters); + array_delete(&self->slices); +} + +static int symbol_table_id_for_name( + const SymbolTable *self, + const char *name, + uint32_t length +) { + for (unsigned i = 0; i < self->slices.size; i++) { + Slice slice = self->slices.contents[i]; + if ( + slice.length == length && + !strncmp(&self->characters.contents[slice.offset], name, length) + ) return i; + } + return -1; +} + +static const char *symbol_table_name_for_id( + const SymbolTable *self, + uint16_t id, + uint32_t *length +) { + Slice slice = self->slices.contents[id]; + *length = slice.length; + return &self->characters.contents[slice.offset]; +} + +static uint16_t symbol_table_insert_name( + SymbolTable *self, + const char *name, + uint32_t length +) { + int id = symbol_table_id_for_name(self, name, length); + if (id >= 0) return (uint16_t)id; + Slice slice = { + .offset = self->characters.size, + .length = length, + }; + array_grow_by(&self->characters, length + 1); + memcpy(&self->characters.contents[slice.offset], name, length); + self->characters.contents[self->characters.size - 1] = 0; + array_push(&self->slices, slice); + return self->slices.size - 1; +} + +/********* + * Query + *********/ + +// The `pattern_map` contains a mapping from TSSymbol values to indices in the +// `steps` array. For a given syntax node, the `pattern_map` makes it possible +// to quickly find the starting steps of all of the patterns whose root matches +// that node. Each entry has two fields: a `pattern_index`, which identifies one +// of the patterns in the query, and a `step_index`, which indicates the start +// offset of that pattern's steps pattern within the `steps` array. +// +// The entries are sorted by the patterns' root symbols, and lookups use a +// binary search. This ensures that the cost of this initial lookup step +// scales logarithmically with the number of patterns in the query. +// +// This returns `true` if the symbol is present and `false` otherwise. +// If the symbol is not present `*result` is set to the index where the +// symbol should be inserted. +static inline bool ts_query__pattern_map_search( + const TSQuery *self, + TSSymbol needle, + uint32_t *result +) { + uint32_t base_index = self->wildcard_root_pattern_count; + uint32_t size = self->pattern_map.size - base_index; + if (size == 0) { + *result = base_index; + return false; + } + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = base_index + half_size; + TSSymbol mid_symbol = self->steps.contents[ + self->pattern_map.contents[mid_index].step_index + ].symbol; + if (needle > mid_symbol) base_index = mid_index; + size -= half_size; + } + + TSSymbol symbol = self->steps.contents[ + self->pattern_map.contents[base_index].step_index + ].symbol; + + if (needle > symbol) { + base_index++; + if (base_index < self->pattern_map.size) { + symbol = self->steps.contents[ + self->pattern_map.contents[base_index].step_index + ].symbol; + } + } + + *result = base_index; + return needle == symbol; +} + +// Insert a new pattern's start index into the pattern map, maintaining +// the pattern map's ordering invariant. +static inline void ts_query__pattern_map_insert( + TSQuery *self, + TSSymbol symbol, + uint32_t start_step_index +) { + uint32_t index; + ts_query__pattern_map_search(self, symbol, &index); + array_insert(&self->pattern_map, index, ((PatternEntry) { + .step_index = start_step_index, + .pattern_index = self->pattern_map.size, + })); +} + +static void ts_query__finalize_steps(TSQuery *self) { + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + uint32_t depth = step->depth; + if (step->capture_id != NONE) { + step->contains_captures = true; + } else { + step->contains_captures = false; + for (unsigned j = i + 1; j < self->steps.size; j++) { + QueryStep *s = &self->steps.contents[j]; + if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; + if (s->capture_id != NONE) step->contains_captures = true; + } + } + } +} + +// Parse a single predicate associated with a pattern, adding it to the +// query's internal `predicate_steps` array. Predicates are arbitrary +// S-expressions associated with a pattern which are meant to be handled at +// a higher level of abstraction, such as the Rust/JavaScript bindings. They +// can contain '@'-prefixed capture names, double-quoted strings, and bare +// symbols, which also represent strings. +static TSQueryError ts_query__parse_predicate( + TSQuery *self, + Stream *stream +) { + if (stream->next == ')') return PARENT_DONE; + if (stream->next != '(') return TSQueryErrorSyntax; + stream_advance(stream); + stream_skip_whitespace(stream); + + unsigned step_count = 0; + for (;;) { + if (stream->next == ')') { + stream_advance(stream); + stream_skip_whitespace(stream); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeDone, + .value_id = 0, + })); + break; + } + + // Parse an '@'-prefixed capture name + else if (stream->next == '@') { + stream_advance(stream); + + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + int capture_id = symbol_table_id_for_name( + &self->captures, + capture_name, + length + ); + if (capture_id == -1) { + stream_reset(stream, capture_name); + return TSQueryErrorCapture; + } + + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeCapture, + .value_id = capture_id, + })); + } + + // Parse a string literal + else if (stream->next == '"') { + stream_advance(stream); + + // Parse the string content + const char *string_content = stream->input; + while (stream->next != '"') { + if (stream->next == '\n' || !stream_advance(stream)) { + stream_reset(stream, string_content - 1); + return TSQueryErrorSyntax; + } + } + uint32_t length = stream->input - string_content; + + // Add a step for the node + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + string_content, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); + + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + } + + // Parse a bare symbol + else if (stream_is_ident_start(stream)) { + const char *symbol_start = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - symbol_start; + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + symbol_start, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); + } + + else { + return TSQueryErrorSyntax; + } + + step_count++; + stream_skip_whitespace(stream); + } + + return 0; +} + +// Read one S-expression pattern from the stream, and incorporate it into +// the query's internal state machine representation. For nested patterns, +// this function calls itself recursively. +static TSQueryError ts_query__parse_pattern( + TSQuery *self, + Stream *stream, + uint32_t depth, + uint32_t *capture_count +) { + uint16_t starting_step_index = self->steps.size; + + if (stream->next == 0) return TSQueryErrorSyntax; + + // Finish the parent S-expression + if (stream->next == ')') { + return PARENT_DONE; + } + + // Parse a parenthesized node expression + else if (stream->next == '(') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse a nested list, which represents a pattern followed by + // zero-or-more predicates. + if (stream->next == '(' && depth == 0) { + TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count); + if (e) return e; + + // Parse the predicates. + stream_skip_whitespace(stream); + for (;;) { + TSQueryError e = ts_query__parse_predicate(self, stream); + if (e == PARENT_DONE) { + stream_advance(stream); + stream_skip_whitespace(stream); + return 0; + } else if (e) { + return e; + } + } + } + + TSSymbol symbol; + + // Parse the wildcard symbol + if (stream->next == '*') { + symbol = WILDCARD_SYMBOL; + stream_advance(stream); + } + + // Parse a normal node name + else if (stream_is_ident_start(stream)) { + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + } else { + return TSQueryErrorSyntax; + } + + // Add a step for the node. + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = symbol, + .field = 0, + .capture_id = NONE, + .contains_captures = false, + })); + + // Parse the child patterns + stream_skip_whitespace(stream); + for (;;) { + TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count); + if (e == PARENT_DONE) { + stream_advance(stream); + break; + } else if (e) { + return e; + } + } + } + + // Parse a double-quoted anonymous leaf node expression + else if (stream->next == '"') { + stream_advance(stream); + + // Parse the string content + const char *string_content = stream->input; + while (stream->next != '"') { + if (!stream_advance(stream)) { + stream_reset(stream, string_content - 1); + return TSQueryErrorSyntax; + } + } + uint32_t length = stream->input - string_content; + + // Add a step for the node + TSSymbol symbol = ts_language_symbol_for_name( + self->language, + string_content, + length, + false + ); + if (!symbol) { + stream_reset(stream, string_content); + return TSQueryErrorNodeType; + } + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = symbol, + .field = 0, + .capture_id = NONE, + .contains_captures = false, + })); + + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + } + + // Parse a field-prefixed pattern + else if (stream_is_ident_start(stream)) { + // Parse the field name + const char *field_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - field_name; + stream_skip_whitespace(stream); + + if (stream->next != ':') { + stream_reset(stream, field_name); + return TSQueryErrorSyntax; + } + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse the pattern + uint32_t step_index = self->steps.size; + TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count); + if (e == PARENT_DONE) return TSQueryErrorSyntax; + if (e) return e; + + // Add the field name to the first step of the pattern + TSFieldId field_id = ts_language_field_id_for_name( + self->language, + field_name, + length + ); + if (!field_id) { + stream->input = field_name; + return TSQueryErrorField; + } + self->steps.contents[step_index].field = field_id; + } + + // Parse a wildcard pattern + else if (stream->next == '*') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Add a step that matches any kind of node + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = WILDCARD_SYMBOL, + .field = 0, + .contains_captures = false, + })); + } + + else { + return TSQueryErrorSyntax; + } + + stream_skip_whitespace(stream); + + // Parse an '@'-prefixed capture pattern + if (stream->next == '@') { + stream_advance(stream); + + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + uint16_t capture_id = symbol_table_insert_name( + &self->captures, + capture_name, + length + ); + self->steps.contents[starting_step_index].capture_id = capture_id; + (*capture_count)++; + + stream_skip_whitespace(stream); + } + + return 0; +} + +TSQuery *ts_query_new( + const TSLanguage *language, + const char *source, + uint32_t source_len, + uint32_t *error_offset, + TSQueryError *error_type +) { + TSSymbol *symbol_map; + if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { + symbol_map = NULL; + } else { + // Work around the fact that multiple symbols can currently be + // associated with the same name, due to "simple aliases". + // In the next language ABI version, this map will be contained + // in the language's `public_symbol_map` field. + uint32_t symbol_count = ts_language_symbol_count(language); + symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count); + for (unsigned i = 0; i < symbol_count; i++) { + const char *name = ts_language_symbol_name(language, i); + const TSSymbolType symbol_type = ts_language_symbol_type(language, i); + + symbol_map[i] = i; + + for (unsigned j = 0; j < i; j++) { + if (ts_language_symbol_type(language, j) == symbol_type) { + if (!strcmp(name, ts_language_symbol_name(language, j))) { + symbol_map[i] = j; + break; + } + } + } + } + } + + TSQuery *self = ts_malloc(sizeof(TSQuery)); + *self = (TSQuery) { + .steps = array_new(), + .pattern_map = array_new(), + .captures = symbol_table_new(), + .predicate_values = symbol_table_new(), + .predicate_steps = array_new(), + .predicates_by_pattern = array_new(), + .symbol_map = symbol_map, + .wildcard_root_pattern_count = 0, + .max_capture_count = 0, + .language = language, + }; + + // Parse all of the S-expressions in the given string. + Stream stream = stream_new(source, source_len); + stream_skip_whitespace(&stream); + uint32_t start_step_index; + while (stream.input < stream.end) { + start_step_index = self->steps.size; + uint32_t capture_count = 0; + array_push(&self->start_bytes_by_pattern, stream.input - source); + array_push(&self->predicates_by_pattern, ((Slice) { + .offset = self->predicate_steps.size, + .length = 0, + })); + *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count); + array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER })); + + // If any pattern could not be parsed, then report the error information + // and terminate. + if (*error_type) { + *error_offset = stream.input - source; + ts_query_delete(self); + return NULL; + } + + // Maintain a map that can look up patterns for a given root symbol. + ts_query__pattern_map_insert( + self, + self->steps.contents[start_step_index].symbol, + start_step_index + ); + if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { + self->wildcard_root_pattern_count++; + } + + // Keep track of the maximum number of captures in pattern, because + // that numer determines how much space is needed to store each capture + // list. + if (capture_count > self->max_capture_count) { + self->max_capture_count = capture_count; + } + } + + ts_query__finalize_steps(self); + return self; +} + +void ts_query_delete(TSQuery *self) { + if (self) { + array_delete(&self->steps); + array_delete(&self->pattern_map); + array_delete(&self->predicate_steps); + array_delete(&self->predicates_by_pattern); + array_delete(&self->start_bytes_by_pattern); + symbol_table_delete(&self->captures); + symbol_table_delete(&self->predicate_values); + ts_free(self->symbol_map); + ts_free(self); + } +} + +uint32_t ts_query_pattern_count(const TSQuery *self) { + return self->predicates_by_pattern.size; +} + +uint32_t ts_query_capture_count(const TSQuery *self) { + return self->captures.slices.size; +} + +uint32_t ts_query_string_count(const TSQuery *self) { + return self->predicate_values.slices.size; +} + +const char *ts_query_capture_name_for_id( + const TSQuery *self, + uint32_t index, + uint32_t *length +) { + return symbol_table_name_for_id(&self->captures, index, length); +} + +const char *ts_query_string_value_for_id( + const TSQuery *self, + uint32_t index, + uint32_t *length +) { + return symbol_table_name_for_id(&self->predicate_values, index, length); +} + +const TSQueryPredicateStep *ts_query_predicates_for_pattern( + const TSQuery *self, + uint32_t pattern_index, + uint32_t *step_count +) { + Slice slice = self->predicates_by_pattern.contents[pattern_index]; + *step_count = slice.length; + return &self->predicate_steps.contents[slice.offset]; +} + +uint32_t ts_query_start_byte_for_pattern( + const TSQuery *self, + uint32_t pattern_index +) { + return self->start_bytes_by_pattern.contents[pattern_index]; +} + +void ts_query_disable_capture( + TSQuery *self, + const char *name, + uint32_t length +) { + int id = symbol_table_id_for_name(&self->captures, name, length); + if (id != -1) { + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + if (step->capture_id == id) { + step->capture_id = NONE; + } + } + } + ts_query__finalize_steps(self); +} + +/*************** + * QueryCursor + ***************/ + +TSQueryCursor *ts_query_cursor_new() { + TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); + *self = (TSQueryCursor) { + .ascending = false, + .states = array_new(), + .finished_states = array_new(), + .capture_list_pool = capture_list_pool_new(), + .start_byte = 0, + .end_byte = UINT32_MAX, + .start_point = {0, 0}, + .end_point = POINT_MAX, + }; + array_reserve(&self->states, MAX_STATE_COUNT); + array_reserve(&self->finished_states, MAX_STATE_COUNT); + return self; +} + +void ts_query_cursor_delete(TSQueryCursor *self) { + array_delete(&self->states); + array_delete(&self->finished_states); + ts_tree_cursor_delete(&self->cursor); + capture_list_pool_delete(&self->capture_list_pool); + ts_free(self); +} + +void ts_query_cursor_exec( + TSQueryCursor *self, + const TSQuery *query, + TSNode node +) { + array_clear(&self->states); + array_clear(&self->finished_states); + ts_tree_cursor_reset(&self->cursor, node); + capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); + self->next_state_id = 0; + self->depth = 0; + self->ascending = false; + self->query = query; +} + +void ts_query_cursor_set_byte_range( + TSQueryCursor *self, + uint32_t start_byte, + uint32_t end_byte +) { + if (end_byte == 0) { + start_byte = 0; + end_byte = UINT32_MAX; + } + self->start_byte = start_byte; + self->end_byte = end_byte; +} + +void ts_query_cursor_set_point_range( + TSQueryCursor *self, + TSPoint start_point, + TSPoint end_point +) { + if (end_point.row == 0 && end_point.column == 0) { + start_point = POINT_ZERO; + end_point = POINT_MAX; + } + self->start_point = start_point; + self->end_point = end_point; +} + +// Search through all of the in-progress states, and find the captured +// node that occurs earliest in the document. +static bool ts_query_cursor__first_in_progress_capture( + TSQueryCursor *self, + uint32_t *state_index, + uint32_t *byte_offset, + uint32_t *pattern_index +) { + bool result = false; + for (unsigned i = 0; i < self->states.size; i++) { + const QueryState *state = &self->states.contents[i]; + if (state->capture_count > 0) { + const TSQueryCapture *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + uint32_t capture_byte = ts_node_start_byte(captures[0].node); + if ( + !result || + capture_byte < *byte_offset || + ( + capture_byte == *byte_offset && + state->pattern_index < *pattern_index + ) + ) { + result = true; + *state_index = i; + *byte_offset = capture_byte; + *pattern_index = state->pattern_index; + } + } + } + return result; +} + +static bool ts_query__cursor_add_state( + TSQueryCursor *self, + const PatternEntry *slice +) { + uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. + if (list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if (ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index + )) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + list_id = self->states.contents[state_index].capture_list_id; + array_erase(&self->states, state_index); + } else { + LOG(" too many finished states.\n"); + return false; + } + } + + LOG(" start state. pattern:%u\n", slice->pattern_index); + array_push(&self->states, ((QueryState) { + .capture_list_id = list_id, + .step_index = slice->step_index, + .pattern_index = slice->pattern_index, + .start_depth = self->depth, + .capture_count = 0, + .consumed_capture_count = 0, + })); + return true; +} + +static QueryState *ts_query__cursor_copy_state( + TSQueryCursor *self, + const QueryState *state +) { + uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); + if (new_list_id == NONE) return NULL; + array_push(&self->states, *state); + QueryState *new_state = array_back(&self->states); + new_state->capture_list_id = new_list_id; + TSQueryCapture *old_captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + TSQueryCapture *new_captures = capture_list_pool_get( + &self->capture_list_pool, + new_list_id + ); + memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); + return new_state; +} + +// Walk the tree, processing patterns until at least one pattern finishes, +// If one or more patterns finish, return `true` and store their states in the +// `finished_states` array. Multiple patterns can finish on the same node. If +// there are no more matches, return `false`. +static inline bool ts_query_cursor__advance(TSQueryCursor *self) { + do { + if (self->ascending) { + LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); + + // When leaving a node, remove any unfinished states whose next step + // needed to match something within that node. + uint32_t deleted_count = 0; + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + LOG( + " failed to match. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + deleted_count++; + } else if (deleted_count > 0) { + self->states.contents[i - deleted_count] = *state; + } + } + + self->states.size -= deleted_count; + + if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = false; + } else if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; + } else { + return self->finished_states.size > 0; + } + } else { + bool can_have_later_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = ts_tree_cursor_current_status( + &self->cursor, + &can_have_later_siblings, + &can_have_later_siblings_with_this_field + ); + TSNode node = ts_tree_cursor_current_node(&self->cursor); + TSSymbol symbol = ts_node_symbol(node); + if (symbol != ts_builtin_sym_error && self->query->symbol_map) { + symbol = self->query->symbol_map[symbol]; + } + + // If this node is before the selected range, then avoid descending + // into it. + if ( + ts_node_end_byte(node) <= self->start_byte || + point_lte(ts_node_end_point(node), self->start_point) + ) { + if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = true; + } + continue; + } + + // If this node is after the selected range, then stop walking. + if ( + self->end_byte <= ts_node_start_byte(node) || + point_lte(self->end_point, ts_node_start_point(node)) + ) return false; + + LOG( + "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n", + ts_node_type(node), + ts_language_field_name_for_id(self->query->language, field_id), + ts_node_start_point(node).row, + self->states.size, + self->finished_states.size, + can_have_later_siblings, + can_have_later_siblings_with_this_field + ); + + // Add new states for any patterns whose root node is a wildcard. + for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { + PatternEntry *slice = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[slice->step_index]; + + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if (step->field && field_id != step->field) continue; + if (!ts_query__cursor_add_state(self, slice)) break; + } + + // Add new states for any patterns whose root node matches this node. + unsigned i; + if (ts_query__pattern_map_search(self->query, symbol, &i)) { + PatternEntry *slice = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[slice->step_index]; + do { + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if (step->field && field_id != step->field) continue; + if (!ts_query__cursor_add_state(self, slice)) break; + + // Advance to the next pattern whose root node matches this node. + i++; + if (i == self->query->pattern_map.size) break; + slice = &self->query->pattern_map.contents[i]; + step = &self->query->steps.contents[slice->step_index]; + } while (step->symbol == symbol); + } + + // Update all of the in-progress states with current node. + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + // Check that the node matches all of the criteria for the next + // step of the pattern.if ( + if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; + + // Determine if this node matches this step of the pattern, and also + // if this node can have later siblings that match this step of the + // pattern. + bool node_does_match = !step->symbol || step->symbol == symbol; + bool later_sibling_can_match = can_have_later_siblings; + if (step->field) { + if (step->field == field_id) { + if (!can_have_later_siblings_with_this_field) { + later_sibling_can_match = false; + } + } else { + node_does_match = false; + } + } + + if (!node_does_match) { + if (!later_sibling_can_match) { + LOG( + " discard state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->states, i); + i--; + n--; + } + continue; + } + + // Some patterns can match their root node in multiple ways, + // capturing different children. If this pattern step could match + // later children within the same parent, then this query state + // cannot simply be updated in place. It must be split into two + // states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later + // siblings. + QueryState *next_state = state; + if ( + step->depth > 0 && + step->contains_captures && + later_sibling_can_match + ) { + QueryState *copy = ts_query__cursor_copy_state(self, state); + if (copy) { + LOG( + " split state. pattern:%u, step:%u\n", + copy->pattern_index, + copy->step_index + ); + next_state = copy; + } else { + LOG(" canot split state.\n"); + } + } + + LOG( + " advance state. pattern:%u, step:%u\n", + next_state->pattern_index, + next_state->step_index + ); + + // If the current node is captured in this pattern, add it to the + // capture list. + if (step->capture_id != NONE) { + LOG( + " capture node. pattern:%u, capture_id:%u\n", + next_state->pattern_index, + step->capture_id + ); + TSQueryCapture *capture_list = capture_list_pool_get( + &self->capture_list_pool, + next_state->capture_list_id + ); + capture_list[next_state->capture_count++] = (TSQueryCapture) { + node, + step->capture_id + }; + } + + // If the pattern is now done, then remove it from the list of + // in-progress states, and add it to the list of finished states. + next_state->step_index++; + QueryStep *next_step = step + 1; + if (next_step->depth == PATTERN_DONE_MARKER) { + LOG(" finish pattern %u\n", next_state->pattern_index); + + next_state->id = self->next_state_id++; + array_push(&self->finished_states, *next_state); + if (next_state == state) { + array_erase(&self->states, i); + i--; + n--; + } else { + self->states.size--; + } + } + } + + // Continue descending if possible. + if (ts_tree_cursor_goto_first_child(&self->cursor)) { + self->depth++; + } else { + self->ascending = true; + } + } + } while (self->finished_states.size == 0); + + return true; +} + +bool ts_query_cursor_next_match( + TSQueryCursor *self, + TSQueryMatch *match +) { + if (self->finished_states.size == 0) { + if (!ts_query_cursor__advance(self)) { + return false; + } + } + + QueryState *state = &self->finished_states.contents[0]; + match->id = state->id; + match->pattern_index = state->pattern_index; + match->capture_count = state->capture_count; + match->captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->finished_states, 0); + return true; +} + +void ts_query_cursor_remove_match( + TSQueryCursor *self, + uint32_t match_id +) { + for (unsigned i = 0; i < self->finished_states.size; i++) { + const QueryState *state = &self->finished_states.contents[i]; + if (state->id == match_id) { + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->finished_states, i); + return; + } + } +} + +bool ts_query_cursor_next_capture( + TSQueryCursor *self, + TSQueryMatch *match, + uint32_t *capture_index +) { + for (;;) { + // The goal here is to return captures in order, even though they may not + // be discovered in order, because patterns can overlap. If there are any + // finished patterns, then try to find one that contains a capture that + // is *definitely* before any capture in an *unfinished* pattern. + if (self->finished_states.size > 0) { + // First, identify the position of the earliest capture in an unfinished + // match. For a finished capture to be returned, it must be *before* + // this position. + uint32_t first_unfinished_capture_byte = UINT32_MAX; + uint32_t first_unfinished_pattern_index = UINT32_MAX; + uint32_t first_unfinished_state_index; + ts_query_cursor__first_in_progress_capture( + self, + &first_unfinished_state_index, + &first_unfinished_capture_byte, + &first_unfinished_pattern_index + ); + + // Find the earliest capture in a finished match. + int first_finished_state_index = -1; + uint32_t first_finished_capture_byte = first_unfinished_capture_byte; + uint32_t first_finished_pattern_index = first_unfinished_pattern_index; + for (unsigned i = 0; i < self->finished_states.size; i++) { + const QueryState *state = &self->finished_states.contents[i]; + if (state->capture_count > state->consumed_capture_count) { + const TSQueryCapture *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + uint32_t capture_byte = ts_node_start_byte( + captures[state->consumed_capture_count].node + ); + if ( + capture_byte < first_finished_capture_byte || + ( + capture_byte == first_finished_capture_byte && + state->pattern_index < first_finished_pattern_index + ) + ) { + first_finished_state_index = i; + first_finished_capture_byte = capture_byte; + first_finished_pattern_index = state->pattern_index; + } + } else { + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->finished_states, i); + i--; + } + } + + // If there is finished capture that is clearly before any unfinished + // capture, then return its match, and its capture index. Internally + // record the fact that the capture has been 'consumed'. + if (first_finished_state_index != -1) { + QueryState *state = &self->finished_states.contents[ + first_finished_state_index + ]; + match->id = state->id; + match->pattern_index = state->pattern_index; + match->capture_count = state->capture_count; + match->captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + *capture_index = state->consumed_capture_count; + state->consumed_capture_count++; + return true; + } + + if (capture_list_pool_is_empty(&self->capture_list_pool)) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + first_unfinished_state_index, + first_unfinished_pattern_index, + first_unfinished_capture_byte + ); + capture_list_pool_release( + &self->capture_list_pool, + self->states.contents[first_unfinished_state_index].capture_list_id + ); + array_erase(&self->states, first_unfinished_state_index); + } + } + + // If there are no finished matches that are ready to be returned, then + // continue finding more matches. + if (!ts_query_cursor__advance(self)) return false; + } +} + +#undef LOG diff --git a/src/tree_sitter/subtree.c b/src/tree_sitter/subtree.c index e95733eb46..30144fa175 100644 --- a/src/tree_sitter/subtree.c +++ b/src/tree_sitter/subtree.c @@ -18,18 +18,9 @@ typedef struct { Length new_end; } Edit; -#ifdef TREE_SITTER_TEST - -#define TS_MAX_INLINE_TREE_LENGTH 2 -#define TS_MAX_TREE_POOL_SIZE 0 - -#else - #define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX #define TS_MAX_TREE_POOL_SIZE 32 -#endif - static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}}; // ExternalScannerState @@ -775,10 +766,10 @@ Subtree ts_subtree_last_external_token(Subtree tree) { } static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) { - if (c == 0) - return snprintf(s, n, "EOF"); if (c == -1) return snprintf(s, n, "INVALID"); + else if (c == '\0') + return snprintf(s, n, "'\\0'"); else if (c == '\n') return snprintf(s, n, "'\\n'"); else if (c == '\t') diff --git a/src/tree_sitter/tree.c b/src/tree_sitter/tree.c index 04cb1d242f..391fa7f592 100644 --- a/src/tree_sitter/tree.c +++ b/src/tree_sitter/tree.c @@ -84,12 +84,10 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { } TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) { - TSRange *result; TreeCursor cursor1 = {NULL, array_new()}; TreeCursor cursor2 = {NULL, array_new()}; - TSNode root = ts_tree_root_node(self); - ts_tree_cursor_init(&cursor1, root); - ts_tree_cursor_init(&cursor2, root); + ts_tree_cursor_init(&cursor1, ts_tree_root_node(self)); + ts_tree_cursor_init(&cursor2, ts_tree_root_node(other)); TSRangeArray included_range_differences = array_new(); ts_range_array_get_changed_ranges( @@ -98,6 +96,7 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin &included_range_differences ); + TSRange *result; *count = ts_subtree_get_changed_ranges( &self->root, &other->root, &cursor1, &cursor2, self->language, &included_range_differences, &result diff --git a/src/tree_sitter/tree_cursor.c b/src/tree_sitter/tree_cursor.c index 7103fc411d..00b9679d73 100644 --- a/src/tree_sitter/tree_cursor.c +++ b/src/tree_sitter/tree_cursor.c @@ -244,6 +244,72 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { ); } +TSFieldId ts_tree_cursor_current_status( + const TSTreeCursor *_self, + bool *can_have_later_siblings, + bool *can_have_later_siblings_with_this_field +) { + const TreeCursor *self = (const TreeCursor *)_self; + TSFieldId result = 0; + *can_have_later_siblings = false; + *can_have_later_siblings_with_this_field = false; + + // Walk up the tree, visiting the current node and its invisible ancestors, + // because fields can refer to nodes through invisible *wrapper* nodes, + for (unsigned i = self->stack.size - 1; i > 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + + // Stop walking up when a visible ancestor is found. + if (i != self->stack.size - 1) { + if (ts_subtree_visible(*entry->subtree)) break; + const TSSymbol *alias_sequence = ts_language_alias_sequence( + self->tree->language, + parent_entry->subtree->ptr->production_id + ); + if (alias_sequence && alias_sequence[entry->structural_child_index]) { + break; + } + } + + if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) { + *can_have_later_siblings = true; + } + + if (ts_subtree_extra(*entry->subtree)) break; + + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self->tree->language, + parent_entry->subtree->ptr->production_id, + &field_map, &field_map_end + ); + + // Look for a field name associated with the current node. + if (!result) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (!i->inherited && i->child_index == entry->structural_child_index) { + result = i->field_id; + *can_have_later_siblings_with_this_field = false; + break; + } + } + } + + // Determine if there other later siblings with the same field name. + if (result) { + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (i->field_id == result && i->child_index > entry->structural_child_index) { + *can_have_later_siblings_with_this_field = true; + break; + } + } + } + } + + return result; +} + TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { const TreeCursor *self = (const TreeCursor *)_self; @@ -264,19 +330,18 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { } } + if (ts_subtree_extra(*entry->subtree)) break; + const TSFieldMapEntry *field_map, *field_map_end; ts_language_field_map( self->tree->language, parent_entry->subtree->ptr->production_id, &field_map, &field_map_end ); - - while (field_map < field_map_end) { - if ( - !field_map->inherited && - field_map->child_index == entry->structural_child_index - ) return field_map->field_id; - field_map++; + for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { + if (!i->inherited && i->child_index == entry->structural_child_index) { + return i->field_id; + } } } return 0; diff --git a/src/tree_sitter/tree_cursor.h b/src/tree_sitter/tree_cursor.h index 55bdad86da..5a39dd278c 100644 --- a/src/tree_sitter/tree_cursor.h +++ b/src/tree_sitter/tree_cursor.h @@ -16,5 +16,6 @@ typedef struct { } TreeCursor; void ts_tree_cursor_init(TreeCursor *, TSNode); +TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *); #endif // TREE_SITTER_TREE_CURSOR_H_ diff --git a/src/tree_sitter/unicode.h b/src/tree_sitter/unicode.h new file mode 100644 index 0000000000..0fba56a612 --- /dev/null +++ b/src/tree_sitter/unicode.h @@ -0,0 +1,50 @@ +#ifndef TREE_SITTER_UNICODE_H_ +#define TREE_SITTER_UNICODE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define U_EXPORT +#define U_EXPORT2 +#include "unicode/utf8.h" +#include "unicode/utf16.h" + +static const int32_t TS_DECODE_ERROR = U_SENTINEL; + +// These functions read one unicode code point from the given string, +// returning the number of bytes consumed. +typedef uint32_t (*UnicodeDecodeFunction)( + const uint8_t *string, + uint32_t length, + int32_t *code_point +); + +static inline uint32_t ts_decode_utf8( + const uint8_t *string, + uint32_t length, + int32_t *code_point +) { + uint32_t i = 0; + U8_NEXT(string, i, length, *code_point); + return i; +} + +static inline uint32_t ts_decode_utf16( + const uint8_t *string, + uint32_t length, + int32_t *code_point +) { + uint32_t i = 0; + U16_NEXT(((uint16_t *)string), i, length, *code_point); + return i * 2; +} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_UNICODE_H_ diff --git a/src/tree_sitter/unicode/ICU_SHA b/src/tree_sitter/unicode/ICU_SHA new file mode 100644 index 0000000000..3622283ba3 --- /dev/null +++ b/src/tree_sitter/unicode/ICU_SHA @@ -0,0 +1 @@ +552b01f61127d30d6589aa4bf99468224979b661 diff --git a/src/tree_sitter/unicode/LICENSE b/src/tree_sitter/unicode/LICENSE new file mode 100644 index 0000000000..2e01e36876 --- /dev/null +++ b/src/tree_sitter/unicode/LICENSE @@ -0,0 +1,414 @@ +COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) + +Copyright © 1991-2019 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +--------------------- + +Third-Party Software Licenses + +This section contains third-party software notices and/or additional +terms for licensed third-party software components included within ICU +libraries. + +1. ICU License - ICU 1.8.1 to ICU 57.1 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1995-2016 International Business Machines Corporation and others +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies of +the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY +SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, use +or other dealings in this Software without prior written authorization +of the copyright holder. + +All trademarks and registered trademarks mentioned herein are the +property of their respective owners. + +2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt) + + # The Google Chrome software developed by Google is licensed under + # the BSD license. Other software included in this distribution is + # provided under other licenses, as set forth below. + # + # The BSD License + # http://opensource.org/licenses/bsd-license.php + # Copyright (C) 2006-2008, Google Inc. + # + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions are met: + # + # Redistributions of source code must retain the above copyright notice, + # this list of conditions and the following disclaimer. + # Redistributions in binary form must reproduce the above + # copyright notice, this list of conditions and the following + # disclaimer in the documentation and/or other materials provided with + # the distribution. + # Neither the name of Google Inc. nor the names of its + # contributors may be used to endorse or promote products derived from + # this software without specific prior written permission. + # + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # + # + # The word list in cjdict.txt are generated by combining three word lists + # listed below with further processing for compound word breaking. The + # frequency is generated with an iterative training against Google web + # corpora. + # + # * Libtabe (Chinese) + # - https://sourceforge.net/project/?group_id=1519 + # - Its license terms and conditions are shown below. + # + # * IPADIC (Japanese) + # - http://chasen.aist-nara.ac.jp/chasen/distribution.html + # - Its license terms and conditions are shown below. + # + # ---------COPYING.libtabe ---- BEGIN-------------------- + # + # /* + # * Copyright (c) 1999 TaBE Project. + # * Copyright (c) 1999 Pai-Hsiang Hsiao. + # * All rights reserved. + # * + # * Redistribution and use in source and binary forms, with or without + # * modification, are permitted provided that the following conditions + # * are met: + # * + # * . Redistributions of source code must retain the above copyright + # * notice, this list of conditions and the following disclaimer. + # * . Redistributions in binary form must reproduce the above copyright + # * notice, this list of conditions and the following disclaimer in + # * the documentation and/or other materials provided with the + # * distribution. + # * . Neither the name of the TaBE Project nor the names of its + # * contributors may be used to endorse or promote products derived + # * from this software without specific prior written permission. + # * + # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # * OF THE POSSIBILITY OF SUCH DAMAGE. + # */ + # + # /* + # * Copyright (c) 1999 Computer Systems and Communication Lab, + # * Institute of Information Science, Academia + # * Sinica. All rights reserved. + # * + # * Redistribution and use in source and binary forms, with or without + # * modification, are permitted provided that the following conditions + # * are met: + # * + # * . Redistributions of source code must retain the above copyright + # * notice, this list of conditions and the following disclaimer. + # * . Redistributions in binary form must reproduce the above copyright + # * notice, this list of conditions and the following disclaimer in + # * the documentation and/or other materials provided with the + # * distribution. + # * . Neither the name of the Computer Systems and Communication Lab + # * nor the names of its contributors may be used to endorse or + # * promote products derived from this software without specific + # * prior written permission. + # * + # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # * OF THE POSSIBILITY OF SUCH DAMAGE. + # */ + # + # Copyright 1996 Chih-Hao Tsai @ Beckman Institute, + # University of Illinois + # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4 + # + # ---------------COPYING.libtabe-----END-------------------------------- + # + # + # ---------------COPYING.ipadic-----BEGIN------------------------------- + # + # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science + # and Technology. All Rights Reserved. + # + # Use, reproduction, and distribution of this software is permitted. + # Any copy of this software, whether in its original form or modified, + # must include both the above copyright notice and the following + # paragraphs. + # + # Nara Institute of Science and Technology (NAIST), + # the copyright holders, disclaims all warranties with regard to this + # software, including all implied warranties of merchantability and + # fitness, in no event shall NAIST be liable for + # any special, indirect or consequential damages or any damages + # whatsoever resulting from loss of use, data or profits, whether in an + # action of contract, negligence or other tortuous action, arising out + # of or in connection with the use or performance of this software. + # + # A large portion of the dictionary entries + # originate from ICOT Free Software. The following conditions for ICOT + # Free Software applies to the current dictionary as well. + # + # Each User may also freely distribute the Program, whether in its + # original form or modified, to any third party or parties, PROVIDED + # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear + # on, or be attached to, the Program, which is distributed substantially + # in the same form as set out herein and that such intended + # distribution, if actually made, will neither violate or otherwise + # contravene any of the laws and regulations of the countries having + # jurisdiction over the User or the intended distribution itself. + # + # NO WARRANTY + # + # The program was produced on an experimental basis in the course of the + # research and development conducted during the project and is provided + # to users as so produced on an experimental basis. Accordingly, the + # program is provided without any warranty whatsoever, whether express, + # implied, statutory or otherwise. The term "warranty" used herein + # includes, but is not limited to, any warranty of the quality, + # performance, merchantability and fitness for a particular purpose of + # the program and the nonexistence of any infringement or violation of + # any right of any third party. + # + # Each user of the program will agree and understand, and be deemed to + # have agreed and understood, that there is no warranty whatsoever for + # the program and, accordingly, the entire risk arising from or + # otherwise connected with the program is assumed by the user. + # + # Therefore, neither ICOT, the copyright holder, or any other + # organization that participated in or was otherwise related to the + # development of the program and their respective officials, directors, + # officers and other employees shall be held liable for any and all + # damages, including, without limitation, general, special, incidental + # and consequential damages, arising out of or otherwise in connection + # with the use or inability to use the program or any product, material + # or result produced or otherwise obtained by using the program, + # regardless of whether they have been advised of, or otherwise had + # knowledge of, the possibility of such damages at any time during the + # project or thereafter. Each user will be deemed to have agreed to the + # foregoing by his or her commencement of use of the program. The term + # "use" as used herein includes, but is not limited to, the use, + # modification, copying and distribution of the program and the + # production of secondary products from the program. + # + # In the case where the program, whether in its original form or + # modified, was distributed or delivered to or received by a user from + # any person, organization or entity other than ICOT, unless it makes or + # grants independently of ICOT any specific warranty to the user in + # writing, such person, organization or entity, will also be exempted + # from and not be held liable to the user for any such damages as noted + # above as far as the program is concerned. + # + # ---------------COPYING.ipadic-----END---------------------------------- + +3. Lao Word Break Dictionary Data (laodict.txt) + + # Copyright (c) 2013 International Business Machines Corporation + # and others. All Rights Reserved. + # + # Project: http://code.google.com/p/lao-dictionary/ + # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt + # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt + # (copied below) + # + # This file is derived from the above dictionary, with slight + # modifications. + # ---------------------------------------------------------------------- + # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell. + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, + # are permitted provided that the following conditions are met: + # + # + # Redistributions of source code must retain the above copyright notice, this + # list of conditions and the following disclaimer. Redistributions in + # binary form must reproduce the above copyright notice, this list of + # conditions and the following disclaimer in the documentation and/or + # other materials provided with the distribution. + # + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + # OF THE POSSIBILITY OF SUCH DAMAGE. + # -------------------------------------------------------------------------- + +4. Burmese Word Break Dictionary Data (burmesedict.txt) + + # Copyright (c) 2014 International Business Machines Corporation + # and others. All Rights Reserved. + # + # This list is part of a project hosted at: + # github.com/kanyawtech/myanmar-karen-word-lists + # + # -------------------------------------------------------------------------- + # Copyright (c) 2013, LeRoy Benjamin Sharon + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions + # are met: Redistributions of source code must retain the above + # copyright notice, this list of conditions and the following + # disclaimer. Redistributions in binary form must reproduce the + # above copyright notice, this list of conditions and the following + # disclaimer in the documentation and/or other materials provided + # with the distribution. + # + # Neither the name Myanmar Karen Word Lists, nor the names of its + # contributors may be used to endorse or promote products derived + # from this software without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + # SUCH DAMAGE. + # -------------------------------------------------------------------------- + +5. Time Zone Database + + ICU uses the public domain data and code derived from Time Zone +Database for its time zone support. The ownership of the TZ database +is explained in BCP 175: Procedure for Maintaining the Time Zone +Database section 7. + + # 7. Database Ownership + # + # The TZ database itself is not an IETF Contribution or an IETF + # document. Rather it is a pre-existing and regularly updated work + # that is in the public domain, and is intended to remain in the + # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do + # not apply to the TZ Database or contributions that individuals make + # to it. Should any claims be made and substantiated against the TZ + # Database, the organization that is providing the IANA + # Considerations defined in this RFC, under the memorandum of + # understanding with the IETF, currently ICANN, may act in accordance + # with all competent court orders. No ownership claims will be made + # by ICANN or the IETF Trust on the database or the code. Any person + # making a contribution to the database or code waives all rights to + # future claims in that contribution or in the TZ Database. + +6. Google double-conversion + +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/tree_sitter/unicode/README.md b/src/tree_sitter/unicode/README.md new file mode 100644 index 0000000000..623b8e3843 --- /dev/null +++ b/src/tree_sitter/unicode/README.md @@ -0,0 +1,29 @@ +# ICU Parts + +This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu). + +### License + +The license for these files is contained in the `LICENSE` file within this directory. + +### Contents + +* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory: + * `utf8.h` + * `utf16.h` + * `umachine.h` +* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed: + * `ptypes.h` + * `urename.h` + * `utf.h` +* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained. +* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository. +* `README.md` - This text file. + +### Updating ICU + +To incorporate changes from the upstream `icu` repository: + +* Update `ICU_SHA` with the new Git SHA. +* Update `LICENSE` with the license text from the directory mentioned above. +* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository. diff --git a/src/tree_sitter/unicode/ptypes.h b/src/tree_sitter/unicode/ptypes.h new file mode 100644 index 0000000000..ac79ad0f98 --- /dev/null +++ b/src/tree_sitter/unicode/ptypes.h @@ -0,0 +1 @@ +// This file must exist in order for `utf8.h` and `utf16.h` to be used. diff --git a/src/tree_sitter/unicode/umachine.h b/src/tree_sitter/unicode/umachine.h new file mode 100644 index 0000000000..9195824d5b --- /dev/null +++ b/src/tree_sitter/unicode/umachine.h @@ -0,0 +1,448 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: umachine.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep13 +* created by: Markus W. Scherer +* +* This file defines basic types and constants for ICU to be +* platform-independent. umachine.h and utf.h are included into +* utypes.h to provide all the general definitions for ICU. +* All of these definitions used to be in utypes.h before +* the UTF-handling macros made this unmaintainable. +*/ + +#ifndef __UMACHINE_H__ +#define __UMACHINE_H__ + + +/** + * \file + * \brief Basic types and constants for UTF + * + *

Basic types and constants for UTF

+ * This file defines basic types and constants for utf.h to be + * platform-independent. umachine.h and utf.h are included into + * utypes.h to provide all the general definitions for ICU. + * All of these definitions used to be in utypes.h before + * the UTF-handling macros made this unmaintainable. + * + */ +/*==========================================================================*/ +/* Include platform-dependent definitions */ +/* which are contained in the platform-specific file platform.h */ +/*==========================================================================*/ + +#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */ + +/* + * ANSI C headers: + * stddef.h defines wchar_t + */ +#include + +/*==========================================================================*/ +/* For C wrappers, we use the symbol U_STABLE. */ +/* This works properly if the includer is C or C++. */ +/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */ +/*==========================================================================*/ + +/** + * \def U_CFUNC + * This is used in a declaration of a library private ICU C function. + * @stable ICU 2.4 + */ + +/** + * \def U_CDECL_BEGIN + * This is used to begin a declaration of a library private ICU C API. + * @stable ICU 2.4 + */ + +/** + * \def U_CDECL_END + * This is used to end a declaration of a library private ICU C API + * @stable ICU 2.4 + */ + +#ifdef __cplusplus +# define U_CFUNC extern "C" +# define U_CDECL_BEGIN extern "C" { +# define U_CDECL_END } +#else +# define U_CFUNC extern +# define U_CDECL_BEGIN +# define U_CDECL_END +#endif + +#ifndef U_ATTRIBUTE_DEPRECATED +/** + * \def U_ATTRIBUTE_DEPRECATED + * This is used for GCC specific attributes + * @internal + */ +#if U_GCC_MAJOR_MINOR >= 302 +# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated)) +/** + * \def U_ATTRIBUTE_DEPRECATED + * This is used for Visual C++ specific attributes + * @internal + */ +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) +# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated) +#else +# define U_ATTRIBUTE_DEPRECATED +#endif +#endif + +/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/ +#define U_CAPI U_CFUNC U_EXPORT +/** This is used to declare a function as a stable public ICU C API*/ +#define U_STABLE U_CAPI +/** This is used to declare a function as a draft public ICU C API */ +#define U_DRAFT U_CAPI +/** This is used to declare a function as a deprecated public ICU C API */ +#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED +/** This is used to declare a function as an obsolete public ICU C API */ +#define U_OBSOLETE U_CAPI +/** This is used to declare a function as an internal ICU C API */ +#define U_INTERNAL U_CAPI + +/** + * \def U_OVERRIDE + * Defined to the C++11 "override" keyword if available. + * Denotes a class or member which is an override of the base class. + * May result in an error if it applied to something not an override. + * @internal + */ +#ifndef U_OVERRIDE +#define U_OVERRIDE override +#endif + +/** + * \def U_FINAL + * Defined to the C++11 "final" keyword if available. + * Denotes a class or member which may not be overridden in subclasses. + * May result in an error if subclasses attempt to override. + * @internal + */ +#if !defined(U_FINAL) || defined(U_IN_DOXYGEN) +#define U_FINAL final +#endif + +// Before ICU 65, function-like, multi-statement ICU macros were just defined as +// series of statements wrapped in { } blocks and the caller could choose to +// either treat them as if they were actual functions and end the invocation +// with a trailing ; creating an empty statement after the block or else omit +// this trailing ; using the knowledge that the macro would expand to { }. +// +// But doing so doesn't work well with macros that look like functions and +// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore +// switches to the standard solution of wrapping such macros in do { } while. +// +// This will however break existing code that depends on being able to invoke +// these macros without a trailing ; so to be able to remain compatible with +// such code the wrapper is itself defined as macros so that it's possible to +// build ICU 65 and later with the old macro behaviour, like this: +// +// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""' +// runConfigureICU ... + +/** + * \def UPRV_BLOCK_MACRO_BEGIN + * Defined as the "do" keyword by default. + * @internal + */ +#ifndef UPRV_BLOCK_MACRO_BEGIN +#define UPRV_BLOCK_MACRO_BEGIN do +#endif + +/** + * \def UPRV_BLOCK_MACRO_END + * Defined as "while (FALSE)" by default. + * @internal + */ +#ifndef UPRV_BLOCK_MACRO_END +#define UPRV_BLOCK_MACRO_END while (FALSE) +#endif + +/*==========================================================================*/ +/* limits for int32_t etc., like in POSIX inttypes.h */ +/*==========================================================================*/ + +#ifndef INT8_MIN +/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */ +# define INT8_MIN ((int8_t)(-128)) +#endif +#ifndef INT16_MIN +/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */ +# define INT16_MIN ((int16_t)(-32767-1)) +#endif +#ifndef INT32_MIN +/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */ +# define INT32_MIN ((int32_t)(-2147483647-1)) +#endif + +#ifndef INT8_MAX +/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */ +# define INT8_MAX ((int8_t)(127)) +#endif +#ifndef INT16_MAX +/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */ +# define INT16_MAX ((int16_t)(32767)) +#endif +#ifndef INT32_MAX +/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */ +# define INT32_MAX ((int32_t)(2147483647)) +#endif + +#ifndef UINT8_MAX +/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT8_MAX ((uint8_t)(255U)) +#endif +#ifndef UINT16_MAX +/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT16_MAX ((uint16_t)(65535U)) +#endif +#ifndef UINT32_MAX +/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */ +# define UINT32_MAX ((uint32_t)(4294967295U)) +#endif + +#if defined(U_INT64_T_UNAVAILABLE) +# error int64_t is required for decimal format and rule-based number format. +#else +# ifndef INT64_C +/** + * Provides a platform independent way to specify a signed 64-bit integer constant. + * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C + * @stable ICU 2.8 + */ +# define INT64_C(c) c ## LL +# endif +# ifndef UINT64_C +/** + * Provides a platform independent way to specify an unsigned 64-bit integer constant. + * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C + * @stable ICU 2.8 + */ +# define UINT64_C(c) c ## ULL +# endif +# ifndef U_INT64_MIN +/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */ +# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1)) +# endif +# ifndef U_INT64_MAX +/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */ +# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807))) +# endif +# ifndef U_UINT64_MAX +/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */ +# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615))) +# endif +#endif + +/*==========================================================================*/ +/* Boolean data type */ +/*==========================================================================*/ + +/** The ICU boolean type @stable ICU 2.0 */ +typedef int8_t UBool; + +#ifndef TRUE +/** The TRUE value of a UBool @stable ICU 2.0 */ +# define TRUE 1 +#endif +#ifndef FALSE +/** The FALSE value of a UBool @stable ICU 2.0 */ +# define FALSE 0 +#endif + + +/*==========================================================================*/ +/* Unicode data types */ +/*==========================================================================*/ + +/* wchar_t-related definitions -------------------------------------------- */ + +/* + * \def U_WCHAR_IS_UTF16 + * Defined if wchar_t uses UTF-16. + * + * @stable ICU 2.0 + */ +/* + * \def U_WCHAR_IS_UTF32 + * Defined if wchar_t uses UTF-32. + * + * @stable ICU 2.0 + */ +#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) +# ifdef __STDC_ISO_10646__ +# if (U_SIZEOF_WCHAR_T==2) +# define U_WCHAR_IS_UTF16 +# elif (U_SIZEOF_WCHAR_T==4) +# define U_WCHAR_IS_UTF32 +# endif +# elif defined __UCS2__ +# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) +# define U_WCHAR_IS_UTF16 +# endif +# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) +# if (U_SIZEOF_WCHAR_T==4) +# define U_WCHAR_IS_UTF32 +# endif +# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) +# define U_WCHAR_IS_UTF32 +# elif U_PLATFORM_HAS_WIN32_API +# define U_WCHAR_IS_UTF16 +# endif +#endif + +/* UChar and UChar32 definitions -------------------------------------------- */ + +/** Number of bytes in a UChar. @stable ICU 2.0 */ +#define U_SIZEOF_UCHAR 2 + +/** + * \def U_CHAR16_IS_TYPEDEF + * If 1, then char16_t is a typedef and not a real type (yet) + * @internal + */ +#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) +// for AIX, uchar.h needs to be included +# include +# define U_CHAR16_IS_TYPEDEF 1 +#elif defined(_MSC_VER) && (_MSC_VER < 1900) +// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type, +// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx +# define U_CHAR16_IS_TYPEDEF 1 +#else +# define U_CHAR16_IS_TYPEDEF 0 +#endif + + +/** + * \var UChar + * + * The base type for UTF-16 code units and pointers. + * Unsigned 16-bit integer. + * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar. + * + * UChar is configurable by defining the macro UCHAR_TYPE + * on the preprocessor or compiler command line: + * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc. + * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.) + * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16. + * + * The default is UChar=char16_t. + * + * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type. + * + * In C, char16_t is a simple typedef of uint_least16_t. + * ICU requires uint_least16_t=uint16_t for data memory mapping. + * On macOS, char16_t is not available because the uchar.h standard header is missing. + * + * @stable ICU 4.4 + */ + +#if 1 + // #if 1 is normal. UChar defaults to char16_t in C++. + // For configuration testing of UChar=uint16_t temporarily change this to #if 0. + // The intltest Makefile #defines UCHAR_TYPE=char16_t, + // so we only #define it to uint16_t if it is undefined so far. +#elif !defined(UCHAR_TYPE) +# define UCHAR_TYPE uint16_t +#endif + +#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \ + defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) + // Inside the ICU library code, never configurable. + typedef char16_t UChar; +#elif defined(UCHAR_TYPE) + typedef UCHAR_TYPE UChar; +#elif defined(__cplusplus) + typedef char16_t UChar; +#else + typedef uint16_t UChar; +#endif + +/** + * \var OldUChar + * Default ICU 58 definition of UChar. + * A base type for UTF-16 code units and pointers. + * Unsigned 16-bit integer. + * + * Define OldUChar to be wchar_t if that is 16 bits wide. + * If wchar_t is not 16 bits wide, then define UChar to be uint16_t. + * + * This makes the definition of OldUChar platform-dependent + * but allows direct string type compatibility with platforms with + * 16-bit wchar_t types. + * + * This is how UChar was defined in ICU 58, for transition convenience. + * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined. + * The current UChar responds to UCHAR_TYPE but OldUChar does not. + * + * @stable ICU 59 + */ +#if U_SIZEOF_WCHAR_T==2 + typedef wchar_t OldUChar; +#elif defined(__CHAR16_TYPE__) + typedef __CHAR16_TYPE__ OldUChar; +#else + typedef uint16_t OldUChar; +#endif + +/** + * Define UChar32 as a type for single Unicode code points. + * UChar32 is a signed 32-bit integer (same as int32_t). + * + * The Unicode code point range is 0..0x10ffff. + * All other values (negative or >=0x110000) are illegal as Unicode code points. + * They may be used as sentinel values to indicate "done", "error" + * or similar non-code point conditions. + * + * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined + * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) + * or else to be uint32_t. + * That is, the definition of UChar32 was platform-dependent. + * + * @see U_SENTINEL + * @stable ICU 2.4 + */ +typedef int32_t UChar32; + +/** + * This value is intended for sentinel values for APIs that + * (take or) return single code points (UChar32). + * It is outside of the Unicode code point range 0..0x10ffff. + * + * For example, a "done" or "error" value in a new API + * could be indicated with U_SENTINEL. + * + * ICU APIs designed before ICU 2.4 usually define service-specific "done" + * values, mostly 0xffff. + * Those may need to be distinguished from + * actual U+ffff text contents by calling functions like + * CharacterIterator::hasNext() or UnicodeString::length(). + * + * @return -1 + * @see UChar32 + * @stable ICU 2.4 + */ +#define U_SENTINEL (-1) + +#include "unicode/urename.h" + +#endif diff --git a/src/tree_sitter/unicode/urename.h b/src/tree_sitter/unicode/urename.h new file mode 100644 index 0000000000..ac79ad0f98 --- /dev/null +++ b/src/tree_sitter/unicode/urename.h @@ -0,0 +1 @@ +// This file must exist in order for `utf8.h` and `utf16.h` to be used. diff --git a/src/tree_sitter/unicode/utf.h b/src/tree_sitter/unicode/utf.h new file mode 100644 index 0000000000..ac79ad0f98 --- /dev/null +++ b/src/tree_sitter/unicode/utf.h @@ -0,0 +1 @@ +// This file must exist in order for `utf8.h` and `utf16.h` to be used. diff --git a/src/tree_sitter/unicode/utf16.h b/src/tree_sitter/unicode/utf16.h new file mode 100644 index 0000000000..9fd7d5c8a7 --- /dev/null +++ b/src/tree_sitter/unicode/utf16.h @@ -0,0 +1,733 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf16.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep09 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: 16-bit Unicode handling macros + * + * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://userguide.icu-project.org/strings). + * + * Usage: + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ + +#ifndef __UTF16_H__ +#define __UTF16_H__ + +#include "unicode/umachine.h" +#ifndef __UTF_H__ +# include "unicode/utf.h" +#endif + +/* single-code point definitions -------------------------------------------- */ + +/** + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) + +/** + * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) + +/** + * Is this code unit a trail surrogate (U+dc00..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) + +/** + * Is this code unit a surrogate (U+d800..U+dfff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) + +/** + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) + +/** + * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), + * is it a trail surrogate? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @stable ICU 4.2 + */ +#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) + +/** + * Helper constant for U16_GET_SUPPLEMENTARY. + * @internal + */ +#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) + +/** + * Get a supplementary code point value (U+10000..U+10ffff) + * from its lead and trail surrogates. + * The result is undefined if the input values are not + * lead and trail surrogates. + * + * @param lead lead surrogate (U+d800..U+dbff) + * @param trail trail surrogate (U+dc00..U+dfff) + * @return supplementary code point (U+10000..U+10ffff) + * @stable ICU 2.4 + */ +#define U16_GET_SUPPLEMENTARY(lead, trail) \ + (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) + + +/** + * Get the lead surrogate (0xd800..0xdbff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return lead surrogate (U+d800..U+dbff) for supplementary + * @stable ICU 2.4 + */ +#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) + +/** + * Get the trail surrogate (0xdc00..0xdfff) for a + * supplementary code point (0x10000..0x10ffff). + * @param supplementary 32-bit code point (U+10000..U+10ffff) + * @return trail surrogate (U+dc00..U+dfff) for supplementary + * @stable ICU 2.4 + */ +#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +/** + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). + * @param c 32-bit code point + * @return 1 or 2 + * @stable ICU 2.4 + */ +#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). + * @return 2 + * @stable ICU 2.4 + */ +#define U16_MAX_LENGTH 2 + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * The result is undefined if the offset points to a single, unpaired surrogate. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_GET + * @stable ICU 2.4 + */ +#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[i]; \ + if(U16_IS_SURROGATE(c)) { \ + if(U16_IS_SURROGATE_LEAD(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \ + } else { \ + (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to a single, unpaired surrogate, then + * c is set to that unpaired surrogate. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to a single, unpaired surrogate, then + * c is set to U+FFFD. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } else { \ + (c)=0xfffd; \ + } \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/* definitions with forward iteration --------------------------------------- */ + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate, then that itself + * will be returned as the code point. + * The result is undefined if the offset points to a single, unpaired lead surrogate. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_NEXT + * @stable ICU 2.4 + */ +#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[(i)++]; \ + if(U16_IS_LEAD(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. + * + * @param s const UChar * string + * @param i string offset, must be i>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Append a code point to a string, overwriting 1 or 2 code units. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Safe" macro, checks for a valid code point. + * If a surrogate pair is written, checks for sufficient space in the string. + * If the code point is not valid or a trail surrogate does not fit, + * then isError is set to TRUE. + * + * @param s const UChar * string buffer + * @param i string offset, must be i>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=TRUE; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_FWD_1 + * @stable ICU 2.4 + */ +#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U16_IS_LEAD((s)[(i)++])) { \ + ++(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param i string offset, must be i0) { \ + U16_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ + U16_FWD_1(s, i, length); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_START + * @stable ICU 2.4 + */ +#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U16_IS_TRAIL((s)[i])) { \ + --(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i + * @see U16_SET_CP_START_UNSAFE + * @stable ICU 2.4 + */ +#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/* definitions with backward iteration -------------------------------------- */ + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate, then that itself + * will be returned as the code point. + * The result is undefined if the offset is behind a single, unpaired trail surrogate. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_PREV + * @stable ICU 2.4 + */ +#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[--(i)]; \ + if(U16_IS_TRAIL(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate or behind a single, unpaired + * trail surrogate, then c is set to that unpaired surrogate. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate or behind a single, unpaired + * trail surrogate, then c is set to U+FFFD. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } else { \ + (c)=0xfffd; \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_BACK_1 + * @stable ICU 2.4 + */ +#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U16_IS_TRAIL((s)[--(i)])) { \ + --(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U16_BACK_N + * @stable ICU 2.4 + */ +#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ + int32_t __N=(n); \ + while(__N>0) { \ + U16_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start start of string + * @param i string offset, must be start0 && (i)>(start)) { \ + U16_BACK_1(s, start, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_LIMIT + * @stable ICU 2.4 + */ +#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U16_IS_LEAD((s)[(i)-1])) { \ + ++(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, start<=i<=length + * @param length int32_t string length + * @see U16_SET_CP_LIMIT_UNSAFE + * @stable ICU 2.4 + */ +#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ + if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ +} UPRV_BLOCK_MACRO_END + +#endif diff --git a/src/tree_sitter/unicode/utf8.h b/src/tree_sitter/unicode/utf8.h new file mode 100644 index 0000000000..bb00130374 --- /dev/null +++ b/src/tree_sitter/unicode/utf8.h @@ -0,0 +1,881 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf8.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep13 +* created by: Markus W. Scherer +*/ + +/** + * \file + * \brief C API: 8-bit Unicode handling macros + * + * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://userguide.icu-project.org/strings). + * + * Usage: + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ + +#ifndef __UTF8_H__ +#define __UTF8_H__ + +#include "unicode/umachine.h" +#ifndef __UTF_H__ +# include "unicode/utf.h" +#endif + +/* internal definitions ----------------------------------------------------- */ + +/** + * Counts the trail bytes for a UTF-8 lead byte. + * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. + * leadByte might be evaluated multiple times. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES(leadByte) \ + (U8_IS_LEAD(leadByte) ? \ + ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) + +/** + * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. + * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. + * leadByte might be evaluated multiple times. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ + (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) + +/** + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * @internal + */ +#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) + +/** + * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte E0..EF bits 3..0 are used as byte index, + * first trail byte bits 7..5 are used as bit index into that byte. + * @see U8_IS_VALID_LEAD3_AND_T1 + * @internal + */ +#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" + +/** + * Internal 3-byte UTF-8 validity check. + * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. + * @internal + */ +#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) + +/** + * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * First trail byte bits 7..4 are used as byte index, + * lead byte F0..F4 bits 2..0 are used as bit index into that byte. + * @see U8_IS_VALID_LEAD4_AND_T1 + * @internal + */ +#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" + +/** + * Internal 4-byte UTF-8 validity check. + * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. + * @internal + */ +#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) + +/** + * Function for handling "next code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE UChar32 U_EXPORT2 +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); + +/** + * Function for handling "append code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE int32_t U_EXPORT2 +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); + +/** + * Function for handling "previous code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE UChar32 U_EXPORT2 +utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); + +/** + * Function for handling "skip backward one code point" with error-checking. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this + * file and thus must remain stable, and should not be hidden when other internal + * functions are hidden (otherwise public macros would fail to compile). + * @internal + */ +U_STABLE int32_t U_EXPORT2 +utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); + +/* single-code point definitions -------------------------------------------- */ + +/** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_SINGLE(c) (((c)&0x80)==0) + +/** + * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) +// 0x32=0xf4-0xc2 + +/** + * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ +#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) + +/** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + * @stable ICU 2.4 + */ +#define U8_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xd7ff ? 3 : \ + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ + ((uint32_t)(c)<=0xffff ? 3 : 4)\ + ) \ + ) \ + ) \ + ) + +/** + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). + * @return 4 + * @stable ICU 2.4 + */ +#define U8_MAX_LENGTH 4 + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * The result is undefined if the offset points to an illegal UTF-8 + * byte sequence. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const uint8_t * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_GET + * @stable ICU 2.4 + */ +#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + int32_t _u8_get_unsafe_index=(int32_t)(i); \ + U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ + U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ +} UPRV_BLOCK_MACRO_END + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to a negative value. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i=0xe0 ? \ + ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \ + U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ + (__t&=0x3f, 1) \ + : /* U+10000..U+10FFFF */ \ + ((c)-=0xf0)<=4 && \ + U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ + ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ + (__t=(s)[i]-0x80)<=0x3f) && \ + /* valid second-to-last trail byte */ \ + ((c)=((c)<<6)|__t, ++(i)!=(length)) \ + : /* U+0080..U+07FF */ \ + (c)>=0xc2 && ((c)&=0x1f, 1)) && \ + /* last trail byte */ \ + (__t=(s)[i]-0x80)<=0x3f && \ + ((c)=((c)<<6)|__t, ++(i), 1)) { \ + } else { \ + (c)=(sub); /* ill-formed*/ \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. + * + * @param s const uint8_t * string buffer + * @param i string offset + * @param c code point to append + * @see U8_APPEND + * @stable ICU 2.4 + */ +#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + uint32_t __uc=(c); \ + if(__uc<=0x7f) { \ + (s)[(i)++]=(uint8_t)__uc; \ + } else { \ + if(__uc<=0x7ff) { \ + (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ + } else { \ + if(__uc<=0xffff) { \ + (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Safe" macro, checks for a valid code point. + * If a non-ASCII code point is written, checks for sufficient space in the string. + * If the code point is not valid or trail bytes do not fit, + * then isError is set to TRUE. + * + * @param s const uint8_t * string buffer + * @param i int32_t string offset, must be i>6)|0xc0); \ + (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ + } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \ + (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ + (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ + (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ + } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \ + (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ + (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ + (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ + } else { \ + (isError)=TRUE; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_FWD_1 + * @stable ICU 2.4 + */ +#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i=0xf0 */ { \ + if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ + ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ + ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @param n number of code points to skip + * @see U8_FWD_N + * @stable ICU 2.4 + */ +#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ + int32_t __N=(n); \ + while(__N>0) { \ + U8_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ + U8_FWD_1(s, i, length); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_SET_CP_START + * @stable ICU 2.4 + */ +#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + while(U8_IS_TRAIL((s)[i])) { --(i); } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * + * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i + * @see U8_SET_CP_START_UNSAFE + * @see U8_TRUNCATE_IF_INCOMPLETE + * @stable ICU 2.4 + */ +#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ + if(U8_IS_TRAIL((s)[(i)])) { \ + (i)=utf8_back1SafeBody(s, start, (i)); \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * If the string ends with a UTF-8 byte sequence that is valid so far + * but incomplete, then reduce the length of the string to end before + * the lead byte of that incomplete sequence. + * For example, if the string ends with E1 80, the length is reduced by 2. + * + * In all other cases (the string ends with a complete sequence, or it is not + * possible for any further trail byte to extend the trailing sequence) + * the length remains unchanged. + * + * Useful for processing text split across multiple buffers + * (save the incomplete sequence for later) + * and for optimizing iteration + * (check for string length only once per character). + * + * "Safe" macro, checks for illegal sequences and for string boundaries. + * Unlike U8_SET_CP_START(), this macro never reads s[length]. + * + * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param length int32_t string length (usually start<=length) + * @see U8_SET_CP_START + * @stable ICU 61 + */ +#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \ + if((length)>(start)) { \ + uint8_t __b1=s[(length)-1]; \ + if(U8_IS_SINGLE(__b1)) { \ + /* common ASCII character */ \ + } else if(U8_IS_LEAD(__b1)) { \ + --(length); \ + } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ + uint8_t __b2=s[(length)-2]; \ + if(0xe0<=__b2 && __b2<=0xf4) { \ + if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ + U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ + (length)-=2; \ + } \ + } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ + uint8_t __b3=s[(length)-3]; \ + if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ + (length)-=3; \ + } \ + } \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/* definitions with backward iteration -------------------------------------- */ + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * The result is undefined if the offset is behind an illegal UTF-8 sequence. + * + * @param s const uint8_t * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_PREV + * @stable ICU 2.4 + */ +#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=(uint8_t)(s)[--(i)]; \ + if(U8_IS_TRAIL(c)) { \ + uint8_t __b, __count=1, __shift=6; \ +\ + /* c is a trail byte */ \ + (c)&=0x3f; \ + for(;;) { \ + __b=(s)[--(i)]; \ + if(__b>=0xc0) { \ + U8_MASK_LEAD_BYTE(__b, __count); \ + (c)|=(UChar32)__b<<__shift; \ + break; \ + } else { \ + (c)|=(UChar32)(__b&0x3f)<<__shift; \ + ++__count; \ + __shift+=6; \ + } \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start0) { \ + U8_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const uint8_t * string + * @param start int32_t index of the start of the string + * @param i int32_t string offset, must be start0 && (i)>(start)) { \ + U8_BACK_1(s, start, i); \ + --__N; \ + } \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const uint8_t * string + * @param i string offset + * @see U8_SET_CP_LIMIT + * @stable ICU 2.4 + */ +#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ + U8_BACK_1_UNSAFE(s, i); \ + U8_FWD_1_UNSAFE(s, i); \ +} UPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i<=length + * @param length int32_t string length + * @see U8_SET_CP_LIMIT_UNSAFE + * @stable ICU 2.4 + */ +#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ + if((start)<(i) && ((i)<(length) || (length)<0)) { \ + U8_BACK_1(s, start, i); \ + U8_FWD_1(s, i, length); \ + } \ +} UPRV_BLOCK_MACRO_END + +#endif From f486d2145a5c252d7f37ea399f9a769ddd745934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= Date: Mon, 16 Dec 2019 19:41:15 +0100 Subject: [PATCH 2/4] tree-sitter: fix relative paths in unicode/ subdir --- src/tree_sitter/unicode.h | 4 ++-- src/tree_sitter/unicode/umachine.h | 4 ++-- src/tree_sitter/unicode/utf16.h | 4 ++-- src/tree_sitter/unicode/utf8.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/tree_sitter/unicode.h b/src/tree_sitter/unicode.h index 0fba56a612..2ab51c2a3a 100644 --- a/src/tree_sitter/unicode.h +++ b/src/tree_sitter/unicode.h @@ -10,8 +10,8 @@ extern "C" { #define U_EXPORT #define U_EXPORT2 -#include "unicode/utf8.h" -#include "unicode/utf16.h" +#include "./unicode/utf8.h" +#include "./unicode/utf16.h" static const int32_t TS_DECODE_ERROR = U_SENTINEL; diff --git a/src/tree_sitter/unicode/umachine.h b/src/tree_sitter/unicode/umachine.h index 9195824d5b..bbf6ef9c8b 100644 --- a/src/tree_sitter/unicode/umachine.h +++ b/src/tree_sitter/unicode/umachine.h @@ -43,7 +43,7 @@ /* which are contained in the platform-specific file platform.h */ /*==========================================================================*/ -#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */ +#include "./ptypes.h" /* platform.h is included in ptypes.h */ /* * ANSI C headers: @@ -443,6 +443,6 @@ typedef int32_t UChar32; */ #define U_SENTINEL (-1) -#include "unicode/urename.h" +#include "./urename.h" #endif diff --git a/src/tree_sitter/unicode/utf16.h b/src/tree_sitter/unicode/utf16.h index 9fd7d5c8a7..b547922441 100644 --- a/src/tree_sitter/unicode/utf16.h +++ b/src/tree_sitter/unicode/utf16.h @@ -34,9 +34,9 @@ #ifndef __UTF16_H__ #define __UTF16_H__ -#include "unicode/umachine.h" +#include "./umachine.h" #ifndef __UTF_H__ -# include "unicode/utf.h" +# include "./utf.h" #endif /* single-code point definitions -------------------------------------------- */ diff --git a/src/tree_sitter/unicode/utf8.h b/src/tree_sitter/unicode/utf8.h index bb00130374..3b37873e37 100644 --- a/src/tree_sitter/unicode/utf8.h +++ b/src/tree_sitter/unicode/utf8.h @@ -34,9 +34,9 @@ #ifndef __UTF8_H__ #define __UTF8_H__ -#include "unicode/umachine.h" +#include "./umachine.h" #ifndef __UTF_H__ -# include "unicode/utf.h" +# include "./utf.h" #endif /* internal definitions ----------------------------------------------------- */ From c21511b2f48685461bf2655b28eff4434c91d449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= Date: Sat, 21 Dec 2019 09:57:33 +0100 Subject: [PATCH 3/4] tree-sitter: fix prototypes (to be upstreamed) --- src/tree_sitter/query.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index a5ce86032c..2563325248 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -224,7 +224,7 @@ static void stream_scan_identifier(Stream *stream) { * CaptureListPool ******************/ -static CaptureListPool capture_list_pool_new() { +static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { .list = array_new(), .usage_map = UINT32_MAX, @@ -269,7 +269,7 @@ static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { * SymbolTable **************/ -static SymbolTable symbol_table_new() { +static SymbolTable symbol_table_new(void) { return (SymbolTable) { .characters = array_new(), .slices = array_new(), From 440695c29696f261337227e5c419aa1cf313c2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Linse?= Date: Sat, 28 Sep 2019 14:27:20 +0200 Subject: [PATCH 4/4] tree-sitter: implement query functionality and highlighting prototype [skip.lint] --- runtime/doc/lua.txt | 96 +++++ runtime/lua/vim/treesitter.lua | 120 +++++- runtime/lua/vim/tshighlighter.lua | 142 ++++++++ src/nvim/api/buffer.c | 91 ++++- src/nvim/api/private/helpers.h | 6 + src/nvim/api/vim.c | 33 ++ src/nvim/buffer_defs.h | 6 + src/nvim/buffer_updates.c | 11 +- src/nvim/func_attr.h | 2 + src/nvim/generators/c_grammar.lua | 1 + src/nvim/generators/gen_api_dispatch.lua | 15 +- src/nvim/generators/gen_eval.lua | 2 +- src/nvim/globals.h | 7 + src/nvim/lua/executor.c | 25 +- src/nvim/lua/treesitter.c | 321 +++++++++++++++- src/nvim/screen.c | 105 +++++- test/functional/api/highlight_spec.lua | 19 + test/functional/lua/treesitter_spec.lua | 442 ++++++++++++++++++----- 18 files changed, 1293 insertions(+), 151 deletions(-) create mode 100644 runtime/lua/vim/tshighlighter.lua diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt index c0da06ffe3..1c3a7f70c9 100644 --- a/runtime/doc/lua.txt +++ b/runtime/doc/lua.txt @@ -594,6 +594,102 @@ tsnode:named_descendant_for_range(start_row, start_col, end_row, end_col) Get the smallest named node within this node that spans the given range of (row, column) positions +Query methods *lua-treesitter-query* + +Tree-sitter queries are supported, with some limitations. Currently, the only +supported match predicate is `eq?` (both comparing a capture against a string +and two captures against each other). + +vim.treesitter.parse_query(lang, query) + *vim.treesitter.parse_query(()* + Parse the query as a string. (If the query is in a file, the caller + should read the contents into a string before calling). + +query:iter_captures(node, bufnr, start_row, end_row) + *query:iter_captures()* + Iterate over all captures from all matches inside a `node`. + `bufnr` is needed if the query contains predicates, then the caller + must ensure to use a freshly parsed tree consistent with the current + text of the buffer. `start_row` and `end_row` can be used to limit + matches inside a row range (this is typically used with root node + as the node, i e to get syntax highlight matches in the current + viewport) + + The iterator returns two values, a numeric id identifying the capture + and the captured node. The following example shows how to get captures + by name: +> + for id, node in query:iter_captures(tree:root(), bufnr, first, last) do + local name = query.captures[id] -- name of the capture in the query + -- typically useful info about the node: + local type = node:type() -- type of the captured node + local row1, col1, row2, col2 = node:range() -- range of the capture + ... use the info here ... + end +< +query:iter_matches(node, bufnr, start_row, end_row) + *query:iter_matches()* + Iterate over all matches within a node. The arguments are the same as + for |query:iter_captures()| but the iterated values are different: + an (1-based) index of the pattern in the query, and a table mapping + capture indices to nodes. If the query has more than one pattern + the capture table might be sparse, and e.g. `pairs` should be used and not + `ipairs`. Here an example iterating over all captures in + every match: +> + for pattern, match in cquery:iter_matches(tree:root(), bufnr, first, last) do + for id,node in pairs(match) do + local name = query.captures[id] + -- `node` was captured by the `name` capture in the match + ... use the info here ... + end + end +> +Treesitter syntax highlighting (WIP) *lua-treesitter-highlight* + +NOTE: This is a partially implemented feature, and not usable as a default +solution yet. What is documented here is a temporary interface indented +for those who want to experiment with this feature and contribute to +its development. + +Highlights are defined in the same query format as in the tree-sitter highlight +crate, which some limitations and additions. Set a highlight query for a +buffer with this code: > + + local query = [[ + "for" @keyword + "if" @keyword + "return" @keyword + + (string_literal) @string + (number_literal) @number + (comment) @comment + + (preproc_function_def name: (identifier) @function) + + ; ... more definitions + ]] + + highlighter = vim.treesitter.TSHighlighter.new(query, bufnr, lang) + -- alternatively, to use the current buffer and its filetype: + -- highlighter = vim.treesitter.TSHighlighter.new(query) + + -- Don't recreate the highlighter for the same buffer, instead + -- modify the query like this: + local query2 = [[ ... ]] + highlighter:set_query(query2) + +As mentioned above the supported predicate is currently only `eq?`. `match?` +predicates behave like matching always fails. As an addition a capture which +begin with an upper-case letter like `@WarningMsg` will map directly to this +highlight group, if defined. Also if the predicate begins with upper-case and +contains a dot only the part before the first will be interpreted as the +highlight group. As an example, this warns of a binary expression with two +identical identifiers, highlighting both as |hl-WarningMsg|: > + + ((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) + (eq? @WarningMsg.left @WarningMsg.right)) + ------------------------------------------------------------------------------ VIM *lua-builtin* diff --git a/runtime/lua/vim/treesitter.lua b/runtime/lua/vim/treesitter.lua index e0202927bb..aa8b8fcdd1 100644 --- a/runtime/lua/vim/treesitter.lua +++ b/runtime/lua/vim/treesitter.lua @@ -12,9 +12,13 @@ function Parser:parse() if self.valid then return self.tree end - self.tree = self._parser:parse_buf(self.bufnr) + local changes + self.tree, changes = self._parser:parse_buf(self.bufnr) self.valid = true - return self.tree + for _, cb in ipairs(self.change_cbs) do + cb(changes) + end + return self.tree, changes end function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size) @@ -26,17 +30,28 @@ function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_ self.valid = false end -local module = { +local M = { add_language=vim._ts_add_language, inspect_language=vim._ts_inspect_language, + parse_query = vim._ts_parse_query, } -function module.create_parser(bufnr, ft, id) +setmetatable(M, { + __index = function (t, k) + if k == "TSHighlighter" then + t[k] = require'vim.tshighlighter' + return t[k] + end + end + }) + +function M.create_parser(bufnr, ft, id) if bufnr == 0 then bufnr = a.nvim_get_current_buf() end - local self = setmetatable({bufnr=bufnr, valid=false}, Parser) + local self = setmetatable({bufnr=bufnr, lang=ft, valid=false}, Parser) self._parser = vim._create_ts_parser(ft) + self.change_cbs = {} self:parse() -- TODO(bfredl): use weakref to self, so that the parser is free'd is no plugin is -- using it. @@ -55,7 +70,7 @@ function module.create_parser(bufnr, ft, id) return self end -function module.get_parser(bufnr, ft) +function M.get_parser(bufnr, ft, cb) if bufnr == nil or bufnr == 0 then bufnr = a.nvim_get_current_buf() end @@ -65,9 +80,98 @@ function module.get_parser(bufnr, ft) local id = tostring(bufnr)..'_'..ft if parsers[id] == nil then - parsers[id] = module.create_parser(bufnr, ft, id) + parsers[id] = M.create_parser(bufnr, ft, id) + end + if cb ~= nil then + table.insert(parsers[id].change_cbs, cb) end return parsers[id] end -return module +-- query: pattern matching on trees +-- predicate matching is implemented in lua +local Query = {} +Query.__index = Query + +function M.parse_query(lang, query) + local self = setmetatable({}, Query) + self.query = vim._ts_parse_query(lang, query) + self.info = self.query:inspect() + self.captures = self.info.captures + return self +end + +local function get_node_text(node, bufnr) + local start_row, start_col, end_row, end_col = node:range() + if start_row ~= end_row then + return nil + end + local line = a.nvim_buf_get_lines(bufnr, start_row, start_row+1, true)[1] + return string.sub(line, start_col+1, end_col) +end + +local function match_preds(match, preds, bufnr) + for _, pred in pairs(preds) do + if pred[1] == "eq?" then + local node = match[pred[2]] + local node_text = get_node_text(node, bufnr) + + local str + if type(pred[3]) == "string" then + -- (eq? @aa "foo") + str = pred[3] + else + -- (eq? @aa @bb) + str = get_node_text(match[pred[3]], bufnr) + end + + if node_text ~= str or str == nil then + return false + end + else + return false + end + end + return true +end + +function Query:iter_captures(node, bufnr, start, stop) + if bufnr == 0 then + bufnr = vim.api.nvim_get_current_buf() + end + local raw_iter = node:_rawquery(self.query,true,start,stop) + local function iter() + local capture, captured_node, match = raw_iter() + if match ~= nil then + local preds = self.info.patterns[match.pattern] + local active = match_preds(match, preds, bufnr) + match.active = active + if not active then + return iter() -- tail call: try next match + end + end + return capture, captured_node + end + return iter +end + +function Query:iter_matches(node, bufnr, start, stop) + if bufnr == 0 then + bufnr = vim.api.nvim_get_current_buf() + end + local raw_iter = node:_rawquery(self.query,false,start,stop) + local function iter() + local pattern, match = raw_iter() + if match ~= nil then + local preds = self.info.patterns[pattern] + local active = (not preds) or match_preds(match, preds, bufnr) + if not active then + return iter() -- tail call: try next match + end + end + return pattern, match + end + return iter +end + +return M diff --git a/runtime/lua/vim/tshighlighter.lua b/runtime/lua/vim/tshighlighter.lua new file mode 100644 index 0000000000..1544ecbf49 --- /dev/null +++ b/runtime/lua/vim/tshighlighter.lua @@ -0,0 +1,142 @@ +local a = vim.api + +-- support reload for quick experimentation +local TSHighlighter = rawget(vim.treesitter, 'TSHighlighter') or {} +TSHighlighter.__index = TSHighlighter + +-- These are conventions defined by tree-sitter, though it +-- needs to be user extensible also. +-- TODO(bfredl): this is very much incomplete, we will need to +-- go through a few tree-sitter provided queries and decide +-- on translations that makes the most sense. +TSHighlighter.hl_map = { + keyword="Keyword", + string="String", + type="Type", + comment="Comment", + constant="Constant", + operator="Operator", + number="Number", + label="Label", + ["function"]="Function", + ["function.special"]="Function", +} + +function TSHighlighter.new(query, bufnr, ft) + local self = setmetatable({}, TSHighlighter) + self.parser = vim.treesitter.get_parser(bufnr, ft, function(...) self:on_change(...) end) + self.buf = self.parser.bufnr + -- TODO(bfredl): perhaps on_start should be called uncondionally, instead for only on mod? + local tree = self.parser:parse() + self.root = tree:root() + self:set_query(query) + self.edit_count = 0 + self.redraw_count = 0 + self.line_count = {} + a.nvim_buf_set_option(self.buf, "syntax", "") + a.nvim__buf_set_luahl(self.buf, { + on_start=function(...) return self:on_start(...) end, + on_window=function(...) return self:on_window(...) end, + on_line=function(...) return self:on_line(...) end, + }) + + -- Tricky: if syntax hasn't been enabled, we need to reload color scheme + -- but use synload.vim rather than syntax.vim to not enable + -- syntax FileType autocmds. Later on we should integrate with the + -- `:syntax` and `set syntax=...` machinery properly. + if vim.g.syntax_on ~= 1 then + vim.api.nvim_command("runtime! syntax/synload.vim") + end + return self +end + +function TSHighlighter:set_query(query) + if type(query) == "string" then + query = vim.treesitter.parse_query(self.parser.lang, query) + end + self.query = query + + self.id_map = {} + for i, capture in ipairs(self.query.captures) do + local hl = 0 + local firstc = string.sub(capture, 1, 1) + local hl_group = self.hl_map[capture] + if firstc ~= string.lower(firstc) then + hl_group = vim.split(capture, '.', true)[1] + end + if hl_group then + hl = a.nvim_get_hl_id_by_name(hl_group) + end + self.id_map[i] = hl + end +end + +function TSHighlighter:on_change(changes) + for _, ch in ipairs(changes or {}) do + a.nvim__buf_redraw_range(self.buf, ch[1], ch[3]+1) + end + self.edit_count = self.edit_count + 1 +end + +function TSHighlighter:on_start(_, _buf, _tick) + local tree = self.parser:parse() + self.root = tree:root() +end + +function TSHighlighter:on_window(_, _win, _buf, _topline, botline) + self.iter = nil + self.active_nodes = {} + self.nextrow = 0 + self.botline = botline + self.redraw_count = self.redraw_count + 1 +end + +function TSHighlighter:on_line(_, _win, buf, line) + if self.iter == nil then + self.iter = self.query:iter_captures(self.root,buf,line,self.botline) + end + while line >= self.nextrow do + local capture, node, match = self.iter() + local active = true + if capture == nil then + break + end + if match ~= nil then + active = self:run_pred(match) + match.active = active + end + local start_row, start_col, end_row, end_col = node:range() + local hl = self.id_map[capture] + if hl > 0 and active then + if start_row == line and end_row == line then + a.nvim__put_attr(hl, start_col, end_col) + elseif end_row >= line then + -- TODO(bfredl): this is quite messy. Togheter with multiline bufhl we should support + -- luahl generating multiline highlights (and other kinds of annotations) + self.active_nodes[{hl=hl, start_row=start_row, start_col=start_col, end_row=end_row, end_col=end_col}] = true + end + end + if start_row > line then + self.nextrow = start_row + end + end + for node,_ in pairs(self.active_nodes) do + if node.start_row <= line and node.end_row >= line then + local start_col, end_col = node.start_col, node.end_col + if node.start_row < line then + start_col = 0 + end + if node.end_row > line then + end_col = 9000 + end + a.nvim__put_attr(node.hl, start_col, end_col) + end + if node.end_row <= line then + self.active_nodes[node] = nil + end + end + self.line_count[line] = (self.line_count[line] or 0) + 1 + --return tostring(self.line_count[line]) +end + +return TSHighlighter diff --git a/src/nvim/api/buffer.c b/src/nvim/api/buffer.c index 8f5718d97e..e6f8f73b9d 100644 --- a/src/nvim/api/buffer.c +++ b/src/nvim/api/buffer.c @@ -169,21 +169,21 @@ Boolean nvim_buf_attach(uint64_t channel_id, goto error; } cb.on_lines = v->data.luaref; - v->data.integer = LUA_NOREF; + v->data.luaref = LUA_NOREF; } else if (is_lua && strequal("on_changedtick", k.data)) { if (v->type != kObjectTypeLuaRef) { api_set_error(err, kErrorTypeValidation, "callback is not a function"); goto error; } cb.on_changedtick = v->data.luaref; - v->data.integer = LUA_NOREF; + v->data.luaref = LUA_NOREF; } else if (is_lua && strequal("on_detach", k.data)) { if (v->type != kObjectTypeLuaRef) { api_set_error(err, kErrorTypeValidation, "callback is not a function"); goto error; } cb.on_detach = v->data.luaref; - v->data.integer = LUA_NOREF; + v->data.luaref = LUA_NOREF; } else if (is_lua && strequal("utf_sizes", k.data)) { if (v->type != kObjectTypeBoolean) { api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean"); @@ -231,6 +231,90 @@ Boolean nvim_buf_detach(uint64_t channel_id, return true; } +static void buf_clear_luahl(buf_T *buf, bool force) +{ + if (buf->b_luahl || force) { + executor_free_luaref(buf->b_luahl_start); + executor_free_luaref(buf->b_luahl_window); + executor_free_luaref(buf->b_luahl_line); + executor_free_luaref(buf->b_luahl_end); + } + buf->b_luahl_start = LUA_NOREF; + buf->b_luahl_window = LUA_NOREF; + buf->b_luahl_line = LUA_NOREF; + buf->b_luahl_end = LUA_NOREF; +} + +/// Unstabilized interface for defining syntax hl in lua. +/// +/// This is not yet safe for general use, lua callbacks will need to +/// be restricted, like textlock and probably other stuff. +/// +/// The API on_line/nvim__put_attr is quite raw and not intended to be the +/// final shape. Ideally this should operate on chunks larger than a single +/// line to reduce interpreter overhead, and generate annotation objects +/// (bufhl/virttext) on the fly but using the same representation. +void nvim__buf_set_luahl(uint64_t channel_id, Buffer buffer, + DictionaryOf(LuaRef) opts, Error *err) + FUNC_API_LUA_ONLY +{ + buf_T *buf = find_buffer_by_handle(buffer, err); + + if (!buf) { + return; + } + + redraw_buf_later(buf, NOT_VALID); + buf_clear_luahl(buf, false); + + for (size_t i = 0; i < opts.size; i++) { + String k = opts.items[i].key; + Object *v = &opts.items[i].value; + if (strequal("on_start", k.data)) { + if (v->type != kObjectTypeLuaRef) { + api_set_error(err, kErrorTypeValidation, "callback is not a function"); + goto error; + } + buf->b_luahl_start = v->data.luaref; + v->data.luaref = LUA_NOREF; + } else if (strequal("on_window", k.data)) { + if (v->type != kObjectTypeLuaRef) { + api_set_error(err, kErrorTypeValidation, "callback is not a function"); + goto error; + } + buf->b_luahl_window = v->data.luaref; + v->data.luaref = LUA_NOREF; + } else if (strequal("on_line", k.data)) { + if (v->type != kObjectTypeLuaRef) { + api_set_error(err, kErrorTypeValidation, "callback is not a function"); + goto error; + } + buf->b_luahl_line = v->data.luaref; + v->data.luaref = LUA_NOREF; + } else { + api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data); + goto error; + } + } + buf->b_luahl = true; + return; +error: + buf_clear_luahl(buf, true); + buf->b_luahl = false; +} + +void nvim__buf_redraw_range(Buffer buffer, Integer first, Integer last, + Error *err) + FUNC_API_LUA_ONLY +{ + buf_T *buf = find_buffer_by_handle(buffer, err); + if (!buf) { + return; + } + + redraw_buf_range_later(buf, (linenr_T)first+1, (linenr_T)last); +} + /// Sets a buffer line /// /// @deprecated use nvim_buf_set_lines instead. @@ -1112,7 +1196,6 @@ Array nvim_buf_get_extmarks(Buffer buffer, Integer ns_id, Object start, return rv; } limit = v->data.integer; - v->data.integer = LUA_NOREF; } else { api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data); return rv; diff --git a/src/nvim/api/private/helpers.h b/src/nvim/api/private/helpers.h index 8930f252f6..048b937136 100644 --- a/src/nvim/api/private/helpers.h +++ b/src/nvim/api/private/helpers.h @@ -60,6 +60,12 @@ #define ADD(array, item) \ kv_push(array, item) +#define FIXED_TEMP_ARRAY(name, fixsize) \ + Array name = ARRAY_DICT_INIT; \ + Object name##__items[fixsize]; \ + args.size = fixsize; \ + args.items = name##__items; \ + #define STATIC_CSTR_AS_STRING(s) ((String) {.data = s, .size = sizeof(s) - 1}) /// Create a new String instance, putting data in allocated memory diff --git a/src/nvim/api/vim.c b/src/nvim/api/vim.c index 19601b6539..2f59edee33 100644 --- a/src/nvim/api/vim.c +++ b/src/nvim/api/vim.c @@ -189,6 +189,15 @@ Dictionary nvim_get_hl_by_id(Integer hl_id, Boolean rgb, Error *err) return hl_get_attr_by_id(attrcode, rgb, err); } +/// Gets a highlight group by name +/// +/// similar to |hlID()|, but allocates a new ID if not present. +Integer nvim_get_hl_id_by_name(String name) + FUNC_API_SINCE(7) +{ + return syn_check_group((const char_u *)name.data, (int)name.size); +} + /// Sends input-keys to Nvim, subject to various quirks controlled by `mode` /// flags. This is a blocking call, unlike |nvim_input()|. /// @@ -2546,3 +2555,27 @@ Array nvim__inspect_cell(Integer grid, Integer row, Integer col, Error *err) } return ret; } + +/// Set attrs in nvim__buf_set_lua_hl callbacks +/// +/// TODO(bfredl): This is rather pedestrian. The final +/// interface should probably be derived from a reformed +/// bufhl/virttext interface with full support for multi-line +/// ranges etc +void nvim__put_attr(Integer id, Integer c0, Integer c1) + FUNC_API_LUA_ONLY +{ + if (!lua_attr_active) { + return; + } + if (id == 0 || syn_get_final_id((int)id) == 0) { + return; + } + int attr = syn_id2attr((int)id); + c0 = MAX(c0, 0); + c1 = MIN(c1, (Integer)lua_attr_bufsize); + for (Integer c = c0; c < c1; c++) { + lua_attr_buf[c] = (sattr_T)hl_combine_attr(lua_attr_buf[c], (int)attr); + } + return; +} diff --git a/src/nvim/buffer_defs.h b/src/nvim/buffer_defs.h index 700d8b82e6..bc4fb2997d 100644 --- a/src/nvim/buffer_defs.h +++ b/src/nvim/buffer_defs.h @@ -832,6 +832,12 @@ struct file_buffer { // The number for times the current line has been flushed in the memline. int flush_count; + bool b_luahl; + LuaRef b_luahl_start; + LuaRef b_luahl_window; + LuaRef b_luahl_line; + LuaRef b_luahl_end; + int b_diff_failed; // internal diff failed for this buffer }; diff --git a/src/nvim/buffer_updates.c b/src/nvim/buffer_updates.c index 3604578b50..d12527d6ac 100644 --- a/src/nvim/buffer_updates.c +++ b/src/nvim/buffer_updates.c @@ -157,7 +157,7 @@ void buf_updates_unregister_all(buf_T *buf) args.items[0] = BUFFER_OBJ(buf->handle); textlock++; - executor_exec_lua_cb(cb.on_detach, "detach", args, false); + executor_exec_lua_cb(cb.on_detach, "detach", args, false, NULL); textlock--; } free_update_callbacks(cb); @@ -265,7 +265,7 @@ void buf_updates_send_changes(buf_T *buf, args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits); } textlock++; - Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true); + Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true, NULL); textlock--; if (res.type == kObjectTypeBoolean && res.data.boolean == true) { @@ -293,10 +293,7 @@ void buf_updates_changedtick(buf_T *buf) BufUpdateCallbacks cb = kv_A(buf->update_callbacks, i); bool keep = true; if (cb.on_changedtick != LUA_NOREF) { - Array args = ARRAY_DICT_INIT; - Object items[2]; - args.size = 2; - args.items = items; + FIXED_TEMP_ARRAY(args, 2); // the first argument is always the buffer handle args.items[0] = BUFFER_OBJ(buf->handle); @@ -306,7 +303,7 @@ void buf_updates_changedtick(buf_T *buf) textlock++; Object res = executor_exec_lua_cb(cb.on_changedtick, "changedtick", - args, true); + args, true, NULL); textlock--; if (res.type == kObjectTypeBoolean && res.data.boolean == true) { diff --git a/src/nvim/func_attr.h b/src/nvim/func_attr.h index 14b8158c7f..38b51b5564 100644 --- a/src/nvim/func_attr.h +++ b/src/nvim/func_attr.h @@ -211,6 +211,8 @@ # define FUNC_API_NOEXPORT /// API function not exposed in VimL/eval. # define FUNC_API_REMOTE_ONLY +/// API function not exposed in VimL/remote. +# define FUNC_API_LUA_ONLY /// API function introduced at the given API level. # define FUNC_API_SINCE(X) /// API function deprecated since the given API level. diff --git a/src/nvim/generators/c_grammar.lua b/src/nvim/generators/c_grammar.lua index 1aa8223da0..de098b7a7b 100644 --- a/src/nvim/generators/c_grammar.lua +++ b/src/nvim/generators/c_grammar.lua @@ -42,6 +42,7 @@ local c_proto = Ct( (fill * Cg((P('FUNC_API_FAST') * Cc(true)), 'fast') ^ -1) * (fill * Cg((P('FUNC_API_NOEXPORT') * Cc(true)), 'noexport') ^ -1) * (fill * Cg((P('FUNC_API_REMOTE_ONLY') * Cc(true)), 'remote_only') ^ -1) * + (fill * Cg((P('FUNC_API_LUA_ONLY') * Cc(true)), 'lua_only') ^ -1) * (fill * Cg((P('FUNC_API_REMOTE_IMPL') * Cc(true)), 'remote_impl') ^ -1) * (fill * Cg((P('FUNC_API_BRIDGE_IMPL') * Cc(true)), 'bridge_impl') ^ -1) * (fill * Cg((P('FUNC_API_COMPOSITOR_IMPL') * Cc(true)), 'compositor_impl') ^ -1) * diff --git a/src/nvim/generators/gen_api_dispatch.lua b/src/nvim/generators/gen_api_dispatch.lua index 76dcf849d1..e861cfda35 100644 --- a/src/nvim/generators/gen_api_dispatch.lua +++ b/src/nvim/generators/gen_api_dispatch.lua @@ -192,7 +192,7 @@ end -- the real API. for i = 1, #functions do local fn = functions[i] - if fn.impl_name == nil then + if fn.impl_name == nil and not fn.lua_only then local args = {} output:write('Object handle_'..fn.name..'(uint64_t channel_id, Array args, Error *error)') @@ -310,12 +310,13 @@ void msgpack_rpc_init_method_table(void) for i = 1, #functions do local fn = functions[i] - output:write(' msgpack_rpc_add_method_handler('.. - '(String) {.data = "'..fn.name..'", '.. - '.size = sizeof("'..fn.name..'") - 1}, '.. - '(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name).. - ', .fast = '..tostring(fn.fast)..'});\n') - + if not fn.lua_only then + output:write(' msgpack_rpc_add_method_handler('.. + '(String) {.data = "'..fn.name..'", '.. + '.size = sizeof("'..fn.name..'") - 1}, '.. + '(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name).. + ', .fast = '..tostring(fn.fast)..'});\n') + end end output:write('\n}\n\n') diff --git a/src/nvim/generators/gen_eval.lua b/src/nvim/generators/gen_eval.lua index 2c6f8f2603..d16453530f 100644 --- a/src/nvim/generators/gen_eval.lua +++ b/src/nvim/generators/gen_eval.lua @@ -25,7 +25,7 @@ local gperfpipe = io.open(funcsfname .. '.gperf', 'wb') local funcs = require('eval').funcs local metadata = mpack.unpack(io.open(metadata_file, 'rb'):read("*all")) for _,fun in ipairs(metadata) do - if not fun.remote_only then + if not (fun.remote_only or fun.lua_only) then funcs[fun.name] = { args=#fun.parameters, func='api_wrapper', diff --git a/src/nvim/globals.h b/src/nvim/globals.h index 172c190df2..0a7a2d551e 100644 --- a/src/nvim/globals.h +++ b/src/nvim/globals.h @@ -126,6 +126,13 @@ typedef off_t off_T; */ EXTERN int mod_mask INIT(= 0x0); /* current key modifiers */ + +// TODO(bfredl): for the final interface this should find a more suitable +// location. +EXTERN sattr_T *lua_attr_buf INIT(= NULL); +EXTERN size_t lua_attr_bufsize INIT(= 0); +EXTERN bool lua_attr_active INIT(= false); + /* * Cmdline_row is the row where the command line starts, just below the * last window. diff --git a/src/nvim/lua/executor.c b/src/nvim/lua/executor.c index 25f4be1c4d..1d3d9929d3 100644 --- a/src/nvim/lua/executor.c +++ b/src/nvim/lua/executor.c @@ -835,7 +835,7 @@ Object executor_exec_lua_api(const String str, const Array args, Error *err) } Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args, - bool retval) + bool retval, Error *err) { lua_State *const lstate = nlua_enter(); nlua_pushref(lstate, ref); @@ -845,16 +845,24 @@ Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args, } if (lua_pcall(lstate, (int)args.size+1, retval ? 1 : 0, 0)) { - // TODO(bfredl): callbacks:s might not always be msg-safe, for instance - // lua callbacks for redraw events. Later on let the caller deal with the - // error instead. - nlua_error(lstate, _("Error executing lua callback: %.*s")); + // if err is passed, the caller will deal with the error. + if (err) { + size_t len; + const char *errstr = lua_tolstring(lstate, -1, &len); + api_set_error(err, kErrorTypeException, + "Error executing lua: %.*s", (int)len, errstr); + } else { + nlua_error(lstate, _("Error executing lua callback: %.*s")); + } return NIL; } - Error err = ERROR_INIT; if (retval) { - return nlua_pop_Object(lstate, false, &err); + Error dummy = ERROR_INIT; + if (err == NULL) { + err = &dummy; + } + return nlua_pop_Object(lstate, false, err); } else { return NIL; } @@ -1007,4 +1015,7 @@ static void nlua_add_treesitter(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL lua_pushcfunction(lstate, tslua_inspect_lang); lua_setfield(lstate, -2, "_ts_inspect_language"); + + lua_pushcfunction(lstate, ts_lua_parse_query); + lua_setfield(lstate, -2, "_ts_parse_query"); } diff --git a/src/nvim/lua/treesitter.c b/src/nvim/lua/treesitter.c index d2072402bb..874fabd89f 100644 --- a/src/nvim/lua/treesitter.c +++ b/src/nvim/lua/treesitter.c @@ -26,6 +26,11 @@ typedef struct { TSTree *tree; // internal tree, used for editing/reparsing } TSLua_parser; +typedef struct { + TSQueryCursor *cursor; + int predicated_match; +} TSLua_cursor; + #ifdef INCLUDE_GENERATED_DECLARATIONS # include "lua/treesitter.c.generated.h" #endif @@ -66,6 +71,20 @@ static struct luaL_Reg node_meta[] = { { "descendant_for_range", node_descendant_for_range }, { "named_descendant_for_range", node_named_descendant_for_range }, { "parent", node_parent }, + { "_rawquery", node_rawquery }, + { NULL, NULL } +}; + +static struct luaL_Reg query_meta[] = { + { "__gc", query_gc }, + { "__tostring", query_tostring }, + { "inspect", query_inspect }, + { NULL, NULL } +}; + +// cursor is not exposed, but still needs garbage collection +static struct luaL_Reg querycursor_meta[] = { + { "__gc", querycursor_gc }, { NULL, NULL } }; @@ -96,6 +115,8 @@ void tslua_init(lua_State *L) build_meta(L, "treesitter_parser", parser_meta); build_meta(L, "treesitter_tree", tree_meta); build_meta(L, "treesitter_node", node_meta); + build_meta(L, "treesitter_query", query_meta); + build_meta(L, "treesitter_querycursor", querycursor_meta); } int tslua_register_lang(lua_State *L) @@ -276,13 +297,33 @@ static int parser_parse_buf(lua_State *L) } TSInput input = { payload, input_cb, TSInputEncodingUTF8 }; TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input); + + uint32_t n_ranges = 0; + TSRange *changed = p->tree ? ts_tree_get_changed_ranges(p->tree, new_tree, + &n_ranges) : NULL; if (p->tree) { ts_tree_delete(p->tree); } p->tree = new_tree; tslua_push_tree(L, p->tree); - return 1; + + lua_createtable(L, n_ranges, 0); + for (size_t i = 0; i < n_ranges; i++) { + lua_createtable(L, 4, 0); + lua_pushinteger(L, changed[i].start_point.row); + lua_rawseti(L, -2, 1); + lua_pushinteger(L, changed[i].start_point.column); + lua_rawseti(L, -2, 2); + lua_pushinteger(L, changed[i].end_point.row); + lua_rawseti(L, -2, 3); + lua_pushinteger(L, changed[i].end_point.column); + lua_rawseti(L, -2, 4); + + lua_rawseti(L, -2, i+1); + } + xfree(changed); + return 2; } static int parser_tree(lua_State *L) @@ -383,7 +424,7 @@ static int tree_root(lua_State *L) return 0; } TSNode root = ts_tree_root_node(tree); - push_node(L, root); + push_node(L, root, 1); return 1; } @@ -394,18 +435,19 @@ static int tree_root(lua_State *L) /// top of stack must either be the tree this node belongs to or another node /// of the same tree! This value is not popped. Can only be called inside a /// cfunction with the tslua environment. -static void push_node(lua_State *L, TSNode node) +static void push_node(lua_State *L, TSNode node, int uindex) { + assert(uindex > 0 || uindex < -LUA_MINSTACK); if (ts_node_is_null(node)) { - lua_pushnil(L); // [src, nil] + lua_pushnil(L); // [nil] return; } - TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [src, udata] + TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [udata] *ud = node; - lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [src, udata, meta] - lua_setmetatable(L, -2); // [src, udata] - lua_getfenv(L, -2); // [src, udata, reftable] - lua_setfenv(L, -2); // [src, udata] + lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [udata, meta] + lua_setmetatable(L, -2); // [udata] + lua_getfenv(L, uindex); // [udata, reftable] + lua_setfenv(L, -2); // [udata] } static bool node_check(lua_State *L, TSNode *res) @@ -586,8 +628,7 @@ static int node_child(lua_State *L) long num = lua_tointeger(L, 2); TSNode child = ts_node_child(node, (uint32_t)num); - lua_pushvalue(L, 1); - push_node(L, child); + push_node(L, child, 1); return 1; } @@ -600,8 +641,7 @@ static int node_named_child(lua_State *L) long num = lua_tointeger(L, 2); TSNode child = ts_node_named_child(node, (uint32_t)num); - lua_pushvalue(L, 1); - push_node(L, child); + push_node(L, child, 1); return 1; } @@ -617,8 +657,7 @@ static int node_descendant_for_range(lua_State *L) (uint32_t)lua_tointeger(L, 5) }; TSNode child = ts_node_descendant_for_point_range(node, start, end); - lua_pushvalue(L, 1); - push_node(L, child); + push_node(L, child, 1); return 1; } @@ -634,8 +673,7 @@ static int node_named_descendant_for_range(lua_State *L) (uint32_t)lua_tointeger(L, 5) }; TSNode child = ts_node_named_descendant_for_point_range(node, start, end); - lua_pushvalue(L, 1); - push_node(L, child); + push_node(L, child, 1); return 1; } @@ -646,7 +684,254 @@ static int node_parent(lua_State *L) return 0; } TSNode parent = ts_node_parent(node); - push_node(L, parent); + push_node(L, parent, 1); return 1; } +/// assumes the match table being on top of the stack +static void set_match(lua_State *L, TSQueryMatch *match, int nodeidx) +{ + for (int i = 0; i < match->capture_count; i++) { + push_node(L, match->captures[i].node, nodeidx); + lua_rawseti(L, -2, match->captures[i].index+1); + } +} + +static int query_next_match(lua_State *L) +{ + TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1)); + TSQueryCursor *cursor = ud->cursor; + + TSQuery *query = query_check(L, lua_upvalueindex(3)); + TSQueryMatch match; + if (ts_query_cursor_next_match(cursor, &match)) { + lua_pushinteger(L, match.pattern_index+1); // [index] + lua_createtable(L, ts_query_capture_count(query), 2); // [index, match] + set_match(L, &match, lua_upvalueindex(2)); + return 2; + } + return 0; +} + + +static int query_next_capture(lua_State *L) +{ + TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1)); + TSQueryCursor *cursor = ud->cursor; + + TSQuery *query = query_check(L, lua_upvalueindex(3)); + + if (ud->predicated_match > -1) { + lua_getfield(L, lua_upvalueindex(4), "active"); + bool active = lua_toboolean(L, -1); + lua_pop(L, 1); + if (!active) { + ts_query_cursor_remove_match(cursor, ud->predicated_match); + } + ud->predicated_match = -1; + } + + TSQueryMatch match; + uint32_t capture_index; + if (ts_query_cursor_next_capture(cursor, &match, &capture_index)) { + TSQueryCapture capture = match.captures[capture_index]; + + lua_pushinteger(L, capture.index+1); // [index] + push_node(L, capture.node, lua_upvalueindex(2)); // [index, node] + + uint32_t n_pred; + ts_query_predicates_for_pattern(query, match.pattern_index, &n_pred); + if (n_pred > 0 && capture_index == 0) { + lua_pushvalue(L, lua_upvalueindex(4)); // [index, node, match] + set_match(L, &match, lua_upvalueindex(2)); + lua_pushinteger(L, match.pattern_index+1); + lua_setfield(L, -2, "pattern"); + + if (match.capture_count > 1) { + ud->predicated_match = match.id; + lua_pushboolean(L, false); + lua_setfield(L, -2, "active"); + } + return 3; + } + return 2; + } + return 0; +} + +static int node_rawquery(lua_State *L) +{ + TSNode node; + if (!node_check(L, &node)) { + return 0; + } + TSQuery *query = query_check(L, 2); + // TODO(bfredl): these are expensive allegedly, + // use a reuse list later on? + TSQueryCursor *cursor = ts_query_cursor_new(); + ts_query_cursor_exec(cursor, query, node); + + bool captures = lua_toboolean(L, 3); + + if (lua_gettop(L) >= 4) { + int start = luaL_checkinteger(L, 4); + int end = lua_gettop(L) >= 5 ? luaL_checkinteger(L, 5) : MAXLNUM; + ts_query_cursor_set_point_range(cursor, + (TSPoint){ start, 0 }, (TSPoint){ end, 0 }); + } + + TSLua_cursor *ud = lua_newuserdata(L, sizeof(*ud)); // [udata] + ud->cursor = cursor; + ud->predicated_match = -1; + + lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_querycursor"); + lua_setmetatable(L, -2); // [udata] + lua_pushvalue(L, 1); // [udata, node] + + // include query separately, as to keep a ref to it for gc + lua_pushvalue(L, 2); // [udata, node, query] + + if (captures) { + // placeholder for match state + lua_createtable(L, ts_query_capture_count(query), 2); // [u, n, q, match] + lua_pushcclosure(L, query_next_capture, 4); // [closure] + } else { + lua_pushcclosure(L, query_next_match, 3); // [closure] + } + + return 1; +} + +static int querycursor_gc(lua_State *L) +{ + TSLua_cursor *ud = luaL_checkudata(L, 1, "treesitter_querycursor"); + ts_query_cursor_delete(ud->cursor); + return 0; +} + +// Query methods + +int ts_lua_parse_query(lua_State *L) +{ + if (lua_gettop(L) < 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2)) { + return luaL_error(L, "string expected"); + } + + const char *lang_name = lua_tostring(L, 1); + TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name); + if (!lang) { + return luaL_error(L, "no such language: %s", lang_name); + } + + size_t len; + const char *src = lua_tolstring(L, 2, &len); + + uint32_t error_offset; + TSQueryError error_type; + TSQuery *query = ts_query_new(lang, src, len, &error_offset, &error_type); + + if (!query) { + return luaL_error(L, "query: %s at position %d", + query_err_string(error_type), (int)error_offset); + } + + TSQuery **ud = lua_newuserdata(L, sizeof(TSQuery *)); // [udata] + *ud = query; + lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_query"); // [udata, meta] + lua_setmetatable(L, -2); // [udata] + return 1; +} + + +static const char *query_err_string(TSQueryError err) { + switch (err) { + case TSQueryErrorSyntax: return "invalid syntax"; + case TSQueryErrorNodeType: return "invalid node type"; + case TSQueryErrorField: return "invalid field"; + case TSQueryErrorCapture: return "invalid capture"; + default: return "error"; + } +} + +static TSQuery *query_check(lua_State *L, int index) +{ + TSQuery **ud = luaL_checkudata(L, index, "treesitter_query"); + return *ud; +} + +static int query_gc(lua_State *L) +{ + TSQuery *query = query_check(L, 1); + if (!query) { + return 0; + } + + ts_query_delete(query); + return 0; +} + +static int query_tostring(lua_State *L) +{ + lua_pushstring(L, ""); + return 1; +} + +static int query_inspect(lua_State *L) +{ + TSQuery *query = query_check(L, 1); + if (!query) { + return 0; + } + + uint32_t n_pat = ts_query_pattern_count(query); + lua_createtable(L, 0, 2); // [retval] + lua_createtable(L, n_pat, 1); // [retval, patterns] + for (size_t i = 0; i < n_pat; i++) { + uint32_t len; + const TSQueryPredicateStep *step = ts_query_predicates_for_pattern(query, + i, &len); + if (len == 0) { + continue; + } + lua_createtable(L, len/4, 1); // [retval, patterns, pat] + lua_createtable(L, 3, 0); // [retval, patterns, pat, pred] + int nextpred = 1; + int nextitem = 1; + for (size_t k = 0; k < len; k++) { + if (step[k].type == TSQueryPredicateStepTypeDone) { + lua_rawseti(L, -2, nextpred++); // [retval, patterns, pat] + lua_createtable(L, 3, 0); // [retval, patterns, pat, pred] + nextitem = 1; + continue; + } + + if (step[k].type == TSQueryPredicateStepTypeString) { + uint32_t strlen; + const char *str = ts_query_string_value_for_id(query, step[k].value_id, + &strlen); + lua_pushlstring(L, str, strlen); // [retval, patterns, pat, pred, item] + } else if (step[k].type == TSQueryPredicateStepTypeCapture) { + lua_pushnumber(L, step[k].value_id+1); // [..., pat, pred, item] + } else { + abort(); + } + lua_rawseti(L, -2, nextitem++); // [retval, patterns, pat, pred] + } + // last predicate should have ended with TypeDone + lua_pop(L, 1); // [retval, patters, pat] + lua_rawseti(L, -2, i+1); // [retval, patterns] + } + lua_setfield(L, -2, "patterns"); // [retval] + + uint32_t n_captures = ts_query_capture_count(query); + lua_createtable(L, n_captures, 0); // [retval, captures] + for (size_t i = 0; i < n_captures; i++) { + uint32_t strlen; + const char *str = ts_query_capture_name_for_id(query, i, &strlen); + lua_pushlstring(L, str, strlen); // [retval, captures, capture] + lua_rawseti(L, -2, i+1); + } + lua_setfield(L, -2, "captures"); // [retval] + + return 1; +} diff --git a/src/nvim/screen.c b/src/nvim/screen.c index 1d29ae064e..4082208dd4 100644 --- a/src/nvim/screen.c +++ b/src/nvim/screen.c @@ -116,6 +116,8 @@ #include "nvim/window.h" #include "nvim/os/time.h" #include "nvim/api/private/helpers.h" +#include "nvim/api/vim.h" +#include "nvim/lua/executor.h" #define MB_FILLER_CHAR '<' /* character used when a double-width character * doesn't fit. */ @@ -232,6 +234,22 @@ void redraw_buf_line_later(buf_T *buf, linenr_T line) } } +void redraw_buf_range_later(buf_T *buf, linenr_T firstline, linenr_T lastline) +{ + FOR_ALL_WINDOWS_IN_TAB(wp, curtab) { + if (wp->w_buffer == buf + && lastline >= wp->w_topline && firstline < wp->w_botline) { + if (wp->w_redraw_top == 0 || wp->w_redraw_top > firstline) { + wp->w_redraw_top = firstline; + } + if (wp->w_redraw_bot == 0 || wp->w_redraw_bot < lastline) { + wp->w_redraw_bot = lastline; + } + redraw_win_later(wp, VALID); + } + } +} + /* * Changed something in the current window, at buffer line "lnum", that * requires that line and possibly other lines to be redrawn. @@ -477,6 +495,19 @@ int update_screen(int type) if (wwp == wp && syntax_present(wp)) { syn_stack_apply_changes(wp->w_buffer); } + + buf_T *buf = wp->w_buffer; + if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) { + Error err = ERROR_INIT; + FIXED_TEMP_ARRAY(args, 2); + args.items[0] = BUFFER_OBJ(buf->handle); + args.items[1] = INTEGER_OBJ(display_tick); + executor_exec_lua_cb(buf->b_luahl_start, "start", args, false, &err); + if (ERROR_SET(&err)) { + ELOG("error in luahl start: %s", err.msg); + api_clear_error(&err); + } + } } } @@ -1181,7 +1212,27 @@ static void win_update(win_T *wp) idx = 0; /* first entry in w_lines[].wl_size */ row = 0; srow = 0; - lnum = wp->w_topline; /* first line shown in window */ + lnum = wp->w_topline; // first line shown in window + + if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) { + Error err = ERROR_INIT; + FIXED_TEMP_ARRAY(args, 4); + linenr_T knownmax = ((wp->w_valid & VALID_BOTLINE) + ? wp->w_botline + : (wp->w_topline + wp->w_height_inner)); + args.items[0] = WINDOW_OBJ(wp->handle); + args.items[1] = BUFFER_OBJ(buf->handle); + args.items[2] = INTEGER_OBJ(wp->w_topline-1); + args.items[3] = INTEGER_OBJ(knownmax); + // TODO(bfredl): we could allow this callback to change mod_top, mod_bot. + // For now the "start" callback is expected to use nvim__buf_redraw_range. + executor_exec_lua_cb(buf->b_luahl_window, "window", args, false, &err); + if (ERROR_SET(&err)) { + ELOG("error in luahl window: %s", err.msg); + api_clear_error(&err); + } + } + for (;; ) { /* stop updating when reached the end of the window (check for _past_ * the end of the window is at the end of the loop) */ @@ -2229,6 +2280,8 @@ win_line ( row = startrow; + char *luatext = NULL; + if (!number_only) { // To speed up the loop below, set extra_check when there is linebreak, // trailing white space and/or syntax processing to be done. @@ -2454,6 +2507,41 @@ win_line ( line = ml_get_buf(wp->w_buffer, lnum, FALSE); ptr = line; + buf_T *buf = wp->w_buffer; + if (buf->b_luahl && buf->b_luahl_line != LUA_NOREF) { + size_t size = STRLEN(line); + if (lua_attr_bufsize < size) { + xfree(lua_attr_buf); + lua_attr_buf = xcalloc(size, sizeof(*lua_attr_buf)); + lua_attr_bufsize = size; + } else if (lua_attr_buf) { + memset(lua_attr_buf, 0, size * sizeof(*lua_attr_buf)); + } + Error err = ERROR_INIT; + // TODO(bfredl): build a macro for the "static array" pattern + // in buf_updates_send_changes? + FIXED_TEMP_ARRAY(args, 3); + args.items[0] = WINDOW_OBJ(wp->handle); + args.items[1] = BUFFER_OBJ(buf->handle); + args.items[2] = INTEGER_OBJ(lnum-1); + lua_attr_active = true; + extra_check = true; + Object o = executor_exec_lua_cb(buf->b_luahl_line, "line", + args, true, &err); + lua_attr_active = false; + if (o.type == kObjectTypeString) { + // TODO(bfredl): this is a bit of a hack. A final API should use an + // "unified" interface where luahl can add both bufhl and virttext + luatext = o.data.string.data; + do_virttext = true; + } else if (ERROR_SET(&err)) { + ELOG("error in luahl line: %s", err.msg); + luatext = err.msg; + do_virttext = true; + api_clear_error(&err); + } + } + if (has_spell && !number_only) { // For checking first word with a capital skip white space. if (cap_col == 0) { @@ -3429,6 +3517,10 @@ win_line ( } } + if (buf->b_luahl && v > 0 && v < (long)lua_attr_bufsize+1) { + char_attr = hl_combine_attr(char_attr, lua_attr_buf[v-1]); + } + if (wp->w_buffer->terminal) { char_attr = hl_combine_attr(term_attrs[vcol], char_attr); } @@ -3917,8 +4009,14 @@ win_line ( int rightmost_vcol = 0; int i; - VirtText virt_text = do_virttext ? bufhl_info.line->virt_text - : (VirtText)KV_INITIAL_VALUE; + VirtText virt_text; + if (luatext) { + virt_text = (VirtText)KV_INITIAL_VALUE; + kv_push(virt_text, ((VirtTextChunk){ .text = luatext, .hl_id = 0 })); + } else { + virt_text = do_virttext ? bufhl_info.line->virt_text + : (VirtText)KV_INITIAL_VALUE; + } size_t virt_pos = 0; LineState s = LINE_STATE((char_u *)""); int virt_attr = 0; @@ -4319,6 +4417,7 @@ win_line ( } xfree(p_extra_free); + xfree(luatext); return row; } diff --git a/test/functional/api/highlight_spec.lua b/test/functional/api/highlight_spec.lua index 5297c6454e..b6514a105c 100644 --- a/test/functional/api/highlight_spec.lua +++ b/test/functional/api/highlight_spec.lua @@ -4,6 +4,9 @@ local Screen = require('test.functional.ui.screen') local eq, eval = helpers.eq, helpers.eval local command = helpers.command local meths = helpers.meths +local funcs = helpers.funcs +local pcall_err = helpers.pcall_err +local ok = helpers.ok describe('API: highlight',function() local expected_rgb = { @@ -110,4 +113,20 @@ describe('API: highlight',function() meths.get_hl_by_name('cursorline', 0)); end) + + it('nvim_get_hl_id_by_name', function() + -- precondition: use a hl group that does not yet exist + eq('Invalid highlight name: Shrubbery', pcall_err(meths.get_hl_by_name, "Shrubbery", true)) + eq(0, funcs.hlID("Shrubbery")) + + local hl_id = meths.get_hl_id_by_name("Shrubbery") + ok(hl_id > 0) + eq(hl_id, funcs.hlID("Shrubbery")) + + command('hi Shrubbery guifg=#888888 guibg=#888888') + eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")}, + meths.get_hl_by_id(hl_id, true)) + eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")}, + meths.get_hl_by_name("Shrubbery", true)) + end) end) diff --git a/test/functional/lua/treesitter_spec.lua b/test/functional/lua/treesitter_spec.lua index 5a53ca1425..76e9899d34 100644 --- a/test/functional/lua/treesitter_spec.lua +++ b/test/functional/lua/treesitter_spec.lua @@ -1,5 +1,6 @@ -- Test suite for testing interactions with API bindings local helpers = require('test.functional.helpers')(after_each) +local Screen = require('test.functional.ui.screen') local clear = helpers.clear local eq = helpers.eq @@ -26,122 +27,371 @@ describe('treesitter API', function() pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')")) end) +end) + +describe('treesitter API with C parser', function() local ts_path = os.getenv("TREE_SITTER_DIR") - describe('with C parser', function() - if ts_path == nil then - it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end) - return - end + -- The tests after this requires an actual parser + if ts_path == nil then + it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end) + return + end - before_each(function() - local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so') - exec_lua([[ - local path = ... - vim.treesitter.add_language(path,'c') - ]], path) - end) + before_each(function() + local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so') + exec_lua([[ + local path = ... + vim.treesitter.add_language(path,'c') + ]], path) + end) - it('parses buffer', function() - insert([[ - int main() { - int x = 3; - }]]) + it('parses buffer', function() + insert([[ + int main() { + int x = 3; + }]]) - exec_lua([[ - parser = vim.treesitter.get_parser(0, "c") - tree = parser:parse() - root = tree:root() - lang = vim.treesitter.inspect_language('c') - ]]) + exec_lua([[ + parser = vim.treesitter.get_parser(0, "c") + tree = parser:parse() + root = tree:root() + lang = vim.treesitter.inspect_language('c') + ]]) - eq("", exec_lua("return tostring(tree)")) - eq("", exec_lua("return tostring(root)")) - eq({0,0,3,0}, exec_lua("return {root:range()}")) + eq("", exec_lua("return tostring(tree)")) + eq("", exec_lua("return tostring(root)")) + eq({0,0,3,0}, exec_lua("return {root:range()}")) - eq(1, exec_lua("return root:child_count()")) - exec_lua("child = root:child(0)") - eq("", exec_lua("return tostring(child)")) - eq({0,0,2,1}, exec_lua("return {child:range()}")) + eq(1, exec_lua("return root:child_count()")) + exec_lua("child = root:child(0)") + eq("", exec_lua("return tostring(child)")) + eq({0,0,2,1}, exec_lua("return {child:range()}")) - eq("function_definition", exec_lua("return child:type()")) - eq(true, exec_lua("return child:named()")) - eq("number", type(exec_lua("return child:symbol()"))) - eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]")) + eq("function_definition", exec_lua("return child:type()")) + eq(true, exec_lua("return child:named()")) + eq("number", type(exec_lua("return child:symbol()"))) + eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]")) - exec_lua("anon = root:descendant_for_range(0,8,0,9)") - eq("(", exec_lua("return anon:type()")) - eq(false, exec_lua("return anon:named()")) - eq("number", type(exec_lua("return anon:symbol()"))) - eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]")) + exec_lua("anon = root:descendant_for_range(0,8,0,9)") + eq("(", exec_lua("return anon:type()")) + eq(false, exec_lua("return anon:named()")) + eq("number", type(exec_lua("return anon:symbol()"))) + eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]")) - exec_lua("descendant = root:descendant_for_range(1,2,1,12)") - eq("", exec_lua("return tostring(descendant)")) - eq({1,2,1,12}, exec_lua("return {descendant:range()}")) - eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()")) + exec_lua("descendant = root:descendant_for_range(1,2,1,12)") + eq("", exec_lua("return tostring(descendant)")) + eq({1,2,1,12}, exec_lua("return {descendant:range()}")) + eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()")) - eq(true, exec_lua("return child == child")) - -- separate lua object, but represents same node - eq(true, exec_lua("return child == root:child(0)")) - eq(false, exec_lua("return child == descendant2")) - eq(false, exec_lua("return child == nil")) - eq(false, exec_lua("return child == tree")) + eq(true, exec_lua("return child == child")) + -- separate lua object, but represents same node + eq(true, exec_lua("return child == root:child(0)")) + eq(false, exec_lua("return child == descendant2")) + eq(false, exec_lua("return child == nil")) + eq(false, exec_lua("return child == tree")) - feed("2G7|ay") - exec_lua([[ - tree2 = parser:parse() - root2 = tree2:root() - descendant2 = root2:descendant_for_range(1,2,1,13) - ]]) - eq(false, exec_lua("return tree2 == tree1")) - eq(false, exec_lua("return root2 == root")) - eq("", exec_lua("return tostring(descendant2)")) - eq({1,2,1,13}, exec_lua("return {descendant2:range()}")) + feed("2G7|ay") + exec_lua([[ + tree2 = parser:parse() + root2 = tree2:root() + descendant2 = root2:descendant_for_range(1,2,1,13) + ]]) + eq(false, exec_lua("return tree2 == tree1")) + eq(false, exec_lua("return root2 == root")) + eq("", exec_lua("return tostring(descendant2)")) + eq({1,2,1,13}, exec_lua("return {descendant2:range()}")) - -- orginal tree did not change - eq({1,2,1,12}, exec_lua("return {descendant:range()}")) + -- orginal tree did not change + eq({1,2,1,12}, exec_lua("return {descendant:range()}")) - -- unchanged buffer: return the same tree - eq(true, exec_lua("return parser:parse() == tree2")) - end) + -- unchanged buffer: return the same tree + eq(true, exec_lua("return parser:parse() == tree2")) + end) - it('inspects language', function() - local keys, fields, symbols = unpack(exec_lua([[ - local lang = vim.treesitter.inspect_language('c') - local keys, symbols = {}, {} - for k,_ in pairs(lang) do - keys[k] = true - end + local test_text = [[ +void ui_refresh(void) +{ + int width = INT_MAX, height = INT_MAX; + bool ext_widgets[kUIExtCount]; + for (UIExtension i = 0; (int)i < kUIExtCount; i++) { + ext_widgets[i] = true; + } - -- symbols array can have "holes" and is thus not a valid msgpack array - -- but we don't care about the numbers here (checked in the parser test) - for _, v in pairs(lang.symbols) do - table.insert(symbols, v) - end - return {keys, lang.fields, symbols} - ]])) + bool inclusive = ui_override(); + for (size_t i = 0; i < ui_count; i++) { + UI *ui = uis[i]; + width = MIN(ui->width, width); + height = MIN(ui->height, height); + foo = BAR(ui->bazaar, bazaar); + for (UIExtension j = 0; (int)j < kUIExtCount; j++) { + ext_widgets[j] &= (ui->ui_ext[j] || inclusive); + } + } +}]] - eq({fields=true, symbols=true}, keys) + local query = [[ + ((call_expression function: (identifier) @minfunc (argument_list (identifier) @min_id)) (eq? @minfunc "MIN")) + "for" @keyword + (primitive_type) @type + (field_expression argument: (identifier) @fieldarg) + ]] - local fset = {} - for _,f in pairs(fields) do - eq("string", type(f)) - fset[f] = true + it('support query and iter by capture', function() + insert(test_text) + + local res = exec_lua([[ + cquery = vim.treesitter.parse_query("c", ...) + parser = vim.treesitter.get_parser(0, "c") + tree = parser:parse() + res = {} + for cid, node in cquery:iter_captures(tree:root(), 0, 7, 14) do + -- can't transmit node over RPC. just check the name and range + table.insert(res, {cquery.captures[cid], node:type(), node:range()}) + end + return res + ]], query) + + eq({ + { "type", "primitive_type", 8, 2, 8, 6 }, + { "keyword", "for", 9, 2, 9, 5 }, + { "type", "primitive_type", 9, 7, 9, 13 }, + { "minfunc", "identifier", 11, 12, 11, 15 }, + { "fieldarg", "identifier", 11, 16, 11, 18 }, + { "min_id", "identifier", 11, 27, 11, 32 }, + { "minfunc", "identifier", 12, 13, 12, 16 }, + { "fieldarg", "identifier", 12, 17, 12, 19 }, + { "min_id", "identifier", 12, 29, 12, 35 }, + { "fieldarg", "identifier", 13, 14, 13, 16 } + }, res) + end) + + it('support query and iter by match', function() + insert(test_text) + + local res = exec_lua([[ + cquery = vim.treesitter.parse_query("c", ...) + parser = vim.treesitter.get_parser(0, "c") + tree = parser:parse() + res = {} + for pattern, match in cquery:iter_matches(tree:root(), 0, 7, 14) do + -- can't transmit node over RPC. just check the name and range + local mrepr = {} + for cid,node in pairs(match) do + table.insert(mrepr, {cquery.captures[cid], node:type(), node:range()}) end - eq(true, fset["directive"]) - eq(true, fset["initializer"]) + table.insert(res, {pattern, mrepr}) + end + return res + ]], query) - local has_named, has_anonymous - for _,s in pairs(symbols) do - eq("string", type(s[1])) - eq("boolean", type(s[2])) - if s[1] == "for_statement" and s[2] == true then - has_named = true - elseif s[1] == "|=" and s[2] == false then - has_anonymous = true - end + eq({ + { 3, { { "type", "primitive_type", 8, 2, 8, 6 } } }, + { 2, { { "keyword", "for", 9, 2, 9, 5 } } }, + { 3, { { "type", "primitive_type", 9, 7, 9, 13 } } }, + { 4, { { "fieldarg", "identifier", 11, 16, 11, 18 } } }, + { 1, { { "minfunc", "identifier", 11, 12, 11, 15 }, { "min_id", "identifier", 11, 27, 11, 32 } } }, + { 4, { { "fieldarg", "identifier", 12, 17, 12, 19 } } }, + { 1, { { "minfunc", "identifier", 12, 13, 12, 16 }, { "min_id", "identifier", 12, 29, 12, 35 } } }, + { 4, { { "fieldarg", "identifier", 13, 14, 13, 16 } } } + }, res) + end) + + it('supports highlighting', function() + local hl_text = [[ +/// Schedule Lua callback on main loop's event queue +static int nlua_schedule(lua_State *const lstate) +{ + if (lua_type(lstate, 1) != LUA_TFUNCTION + || lstate != lstate) { + lua_pushliteral(lstate, "vim.schedule: expected function"); + return lua_error(lstate); + } + + LuaRef cb = nlua_ref(lstate, 1); + + multiqueue_put(main_loop.events, nlua_schedule_event, + 1, (void *)(ptrdiff_t)cb); + return 0; +}]] + + local hl_query = [[ +(ERROR) @ErrorMsg + +"if" @keyword +"else" @keyword +"for" @keyword +"return" @keyword + +"const" @type +"static" @type +"struct" @type +"enum" @type +"extern" @type + +(string_literal) @string + +(number_literal) @number +(char_literal) @string + +; TODO(bfredl): overlapping matches are unreliable, +; we need a proper priority mechanism +;(type_identifier) @type +((type_identifier) @Special (eq? @Special "LuaRef")) + +(primitive_type) @type +(sized_type_specifier) @type + +((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) (eq? @WarningMsg.left @WarningMsg.right)) + +(comment) @comment +]] + + local screen = Screen.new(65, 18) + screen:attach() + screen:set_default_attr_ids({ + [1] = {bold = true, foreground = Screen.colors.Blue1}, + [2] = {foreground = Screen.colors.Blue1}, + [3] = {bold = true, foreground = Screen.colors.SeaGreen4}, + [4] = {bold = true, foreground = Screen.colors.Brown}, + [5] = {foreground = Screen.colors.Magenta}, + [6] = {foreground = Screen.colors.Red}, + [7] = {foreground = Screen.colors.SlateBlue}, + [8] = {foreground = Screen.colors.Grey100, background = Screen.colors.Red}, + [9] = {foreground = Screen.colors.Magenta, background = Screen.colors.Red}, + [10] = {foreground = Screen.colors.Red, background = Screen.colors.Red}, + + }) + + insert(hl_text) + screen:expect{grid=[[ + /// Schedule Lua callback on main loop's event queue | + static int nlua_schedule(lua_State *const lstate) | + { | + if (lua_type(lstate, 1) != LUA_TFUNCTION | + || lstate != lstate) { | + lua_pushliteral(lstate, "vim.schedule: expected function"); | + return lua_error(lstate); | + } | + | + LuaRef cb = nlua_ref(lstate, 1); | + | + multiqueue_put(main_loop.events, nlua_schedule_event, | + 1, (void *)(ptrdiff_t)cb); | + return 0; | + ^} | + {1:~ }| + {1:~ }| + | + ]]} + + exec_lua([[ + local TSHighlighter = vim.treesitter.TSHighlighter + local query = ... + test_hl = TSHighlighter.new(query, 0, "c") + ]], hl_query) + screen:expect{grid=[[ + {2:/// Schedule Lua callback on main loop's event queue} | + {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) | + { | + {4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION | + || {6:lstate} != {6:lstate}) { | + lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); | + {4:return} lua_error(lstate); | + } | + | + {7:LuaRef} cb = nlua_ref(lstate, {5:1}); | + | + multiqueue_put(main_loop.events, nlua_schedule_event, | + {5:1}, ({3:void} *)(ptrdiff_t)cb); | + {4:return} {5:0}; | + ^} | + {1:~ }| + {1:~ }| + | + ]]} + + feed('7Go*/') + screen:expect{grid=[[ + {2:/// Schedule Lua callback on main loop's event queue} | + {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) | + { | + {4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION | + || {6:lstate} != {6:lstate}) { | + lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); | + {4:return} lua_error(lstate); | + {8:*^/} | + } | + | + {7:LuaRef} cb = nlua_ref(lstate, {5:1}); | + | + multiqueue_put(main_loop.events, nlua_schedule_event, | + {5:1}, ({3:void} *)(ptrdiff_t)cb); | + {4:return} {5:0}; | + } | + {1:~ }| + | + ]]} + + feed('3Go/*') + screen:expect{grid=[[ + {2:/// Schedule Lua callback on main loop's event queue} | + {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) | + { | + {2:/^*} | + {2: if (lua_type(lstate, 1) != LUA_TFUNCTION} | + {2: || lstate != lstate) {} | + {2: lua_pushliteral(lstate, "vim.schedule: expected function");} | + {2: return lua_error(lstate);} | + {2:*/} | + } | + | + {7:LuaRef} cb = nlua_ref(lstate, {5:1}); | + | + multiqueue_put(main_loop.events, nlua_schedule_event, | + {5:1}, ({3:void} *)(ptrdiff_t)cb); | + {4:return} {5:0}; | + {8:}} | + | + ]]} + end) + + it('inspects language', function() + local keys, fields, symbols = unpack(exec_lua([[ + local lang = vim.treesitter.inspect_language('c') + local keys, symbols = {}, {} + for k,_ in pairs(lang) do + keys[k] = true end - eq({true,true}, {has_named,has_anonymous}) - end) + + -- symbols array can have "holes" and is thus not a valid msgpack array + -- but we don't care about the numbers here (checked in the parser test) + for _, v in pairs(lang.symbols) do + table.insert(symbols, v) + end + return {keys, lang.fields, symbols} + ]])) + + eq({fields=true, symbols=true}, keys) + + local fset = {} + for _,f in pairs(fields) do + eq("string", type(f)) + fset[f] = true + end + eq(true, fset["directive"]) + eq(true, fset["initializer"]) + + local has_named, has_anonymous + for _,s in pairs(symbols) do + eq("string", type(s[1])) + eq("boolean", type(s[2])) + if s[1] == "for_statement" and s[2] == true then + has_named = true + elseif s[1] == "|=" and s[2] == false then + has_anonymous = true + end + end + eq({true,true}, {has_named,has_anonymous}) end) end)