Merge pull request #11113 from bfredl/tree-sitter-query

tree-sitter step 2: query API and  highlighting prototype
This commit is contained in:
Björn Linse 2019-12-22 13:47:45 +01:00 committed by GitHub
commit 9e9dcd4bd7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
44 changed files with 5950 additions and 384 deletions

View File

@ -594,6 +594,102 @@ tsnode:named_descendant_for_range(start_row, start_col, end_row, end_col)
Get the smallest named node within this node that spans the given
range of (row, column) positions
Query methods *lua-treesitter-query*
Tree-sitter queries are supported, with some limitations. Currently, the only
supported match predicate is `eq?` (both comparing a capture against a string
and two captures against each other).
vim.treesitter.parse_query(lang, query)
*vim.treesitter.parse_query(()*
Parse the query as a string. (If the query is in a file, the caller
should read the contents into a string before calling).
query:iter_captures(node, bufnr, start_row, end_row)
*query:iter_captures()*
Iterate over all captures from all matches inside a `node`.
`bufnr` is needed if the query contains predicates, then the caller
must ensure to use a freshly parsed tree consistent with the current
text of the buffer. `start_row` and `end_row` can be used to limit
matches inside a row range (this is typically used with root node
as the node, i e to get syntax highlight matches in the current
viewport)
The iterator returns two values, a numeric id identifying the capture
and the captured node. The following example shows how to get captures
by name:
>
for id, node in query:iter_captures(tree:root(), bufnr, first, last) do
local name = query.captures[id] -- name of the capture in the query
-- typically useful info about the node:
local type = node:type() -- type of the captured node
local row1, col1, row2, col2 = node:range() -- range of the capture
... use the info here ...
end
<
query:iter_matches(node, bufnr, start_row, end_row)
*query:iter_matches()*
Iterate over all matches within a node. The arguments are the same as
for |query:iter_captures()| but the iterated values are different:
an (1-based) index of the pattern in the query, and a table mapping
capture indices to nodes. If the query has more than one pattern
the capture table might be sparse, and e.g. `pairs` should be used and not
`ipairs`. Here an example iterating over all captures in
every match:
>
for pattern, match in cquery:iter_matches(tree:root(), bufnr, first, last) do
for id,node in pairs(match) do
local name = query.captures[id]
-- `node` was captured by the `name` capture in the match
... use the info here ...
end
end
>
Treesitter syntax highlighting (WIP) *lua-treesitter-highlight*
NOTE: This is a partially implemented feature, and not usable as a default
solution yet. What is documented here is a temporary interface indented
for those who want to experiment with this feature and contribute to
its development.
Highlights are defined in the same query format as in the tree-sitter highlight
crate, which some limitations and additions. Set a highlight query for a
buffer with this code: >
local query = [[
"for" @keyword
"if" @keyword
"return" @keyword
(string_literal) @string
(number_literal) @number
(comment) @comment
(preproc_function_def name: (identifier) @function)
; ... more definitions
]]
highlighter = vim.treesitter.TSHighlighter.new(query, bufnr, lang)
-- alternatively, to use the current buffer and its filetype:
-- highlighter = vim.treesitter.TSHighlighter.new(query)
-- Don't recreate the highlighter for the same buffer, instead
-- modify the query like this:
local query2 = [[ ... ]]
highlighter:set_query(query2)
As mentioned above the supported predicate is currently only `eq?`. `match?`
predicates behave like matching always fails. As an addition a capture which
begin with an upper-case letter like `@WarningMsg` will map directly to this
highlight group, if defined. Also if the predicate begins with upper-case and
contains a dot only the part before the first will be interpreted as the
highlight group. As an example, this warns of a binary expression with two
identical identifiers, highlighting both as |hl-WarningMsg|: >
((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right)
(eq? @WarningMsg.left @WarningMsg.right))
------------------------------------------------------------------------------
VIM *lua-builtin*

View File

@ -12,9 +12,13 @@ function Parser:parse()
if self.valid then
return self.tree
end
self.tree = self._parser:parse_buf(self.bufnr)
local changes
self.tree, changes = self._parser:parse_buf(self.bufnr)
self.valid = true
return self.tree
for _, cb in ipairs(self.change_cbs) do
cb(changes)
end
return self.tree, changes
end
function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size)
@ -26,17 +30,28 @@ function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_
self.valid = false
end
local module = {
local M = {
add_language=vim._ts_add_language,
inspect_language=vim._ts_inspect_language,
parse_query = vim._ts_parse_query,
}
function module.create_parser(bufnr, ft, id)
setmetatable(M, {
__index = function (t, k)
if k == "TSHighlighter" then
t[k] = require'vim.tshighlighter'
return t[k]
end
end
})
function M.create_parser(bufnr, ft, id)
if bufnr == 0 then
bufnr = a.nvim_get_current_buf()
end
local self = setmetatable({bufnr=bufnr, valid=false}, Parser)
local self = setmetatable({bufnr=bufnr, lang=ft, valid=false}, Parser)
self._parser = vim._create_ts_parser(ft)
self.change_cbs = {}
self:parse()
-- TODO(bfredl): use weakref to self, so that the parser is free'd is no plugin is
-- using it.
@ -55,7 +70,7 @@ function module.create_parser(bufnr, ft, id)
return self
end
function module.get_parser(bufnr, ft)
function M.get_parser(bufnr, ft, cb)
if bufnr == nil or bufnr == 0 then
bufnr = a.nvim_get_current_buf()
end
@ -65,9 +80,98 @@ function module.get_parser(bufnr, ft)
local id = tostring(bufnr)..'_'..ft
if parsers[id] == nil then
parsers[id] = module.create_parser(bufnr, ft, id)
parsers[id] = M.create_parser(bufnr, ft, id)
end
if cb ~= nil then
table.insert(parsers[id].change_cbs, cb)
end
return parsers[id]
end
return module
-- query: pattern matching on trees
-- predicate matching is implemented in lua
local Query = {}
Query.__index = Query
function M.parse_query(lang, query)
local self = setmetatable({}, Query)
self.query = vim._ts_parse_query(lang, query)
self.info = self.query:inspect()
self.captures = self.info.captures
return self
end
local function get_node_text(node, bufnr)
local start_row, start_col, end_row, end_col = node:range()
if start_row ~= end_row then
return nil
end
local line = a.nvim_buf_get_lines(bufnr, start_row, start_row+1, true)[1]
return string.sub(line, start_col+1, end_col)
end
local function match_preds(match, preds, bufnr)
for _, pred in pairs(preds) do
if pred[1] == "eq?" then
local node = match[pred[2]]
local node_text = get_node_text(node, bufnr)
local str
if type(pred[3]) == "string" then
-- (eq? @aa "foo")
str = pred[3]
else
-- (eq? @aa @bb)
str = get_node_text(match[pred[3]], bufnr)
end
if node_text ~= str or str == nil then
return false
end
else
return false
end
end
return true
end
function Query:iter_captures(node, bufnr, start, stop)
if bufnr == 0 then
bufnr = vim.api.nvim_get_current_buf()
end
local raw_iter = node:_rawquery(self.query,true,start,stop)
local function iter()
local capture, captured_node, match = raw_iter()
if match ~= nil then
local preds = self.info.patterns[match.pattern]
local active = match_preds(match, preds, bufnr)
match.active = active
if not active then
return iter() -- tail call: try next match
end
end
return capture, captured_node
end
return iter
end
function Query:iter_matches(node, bufnr, start, stop)
if bufnr == 0 then
bufnr = vim.api.nvim_get_current_buf()
end
local raw_iter = node:_rawquery(self.query,false,start,stop)
local function iter()
local pattern, match = raw_iter()
if match ~= nil then
local preds = self.info.patterns[pattern]
local active = (not preds) or match_preds(match, preds, bufnr)
if not active then
return iter() -- tail call: try next match
end
end
return pattern, match
end
return iter
end
return M

View File

@ -0,0 +1,142 @@
local a = vim.api
-- support reload for quick experimentation
local TSHighlighter = rawget(vim.treesitter, 'TSHighlighter') or {}
TSHighlighter.__index = TSHighlighter
-- These are conventions defined by tree-sitter, though it
-- needs to be user extensible also.
-- TODO(bfredl): this is very much incomplete, we will need to
-- go through a few tree-sitter provided queries and decide
-- on translations that makes the most sense.
TSHighlighter.hl_map = {
keyword="Keyword",
string="String",
type="Type",
comment="Comment",
constant="Constant",
operator="Operator",
number="Number",
label="Label",
["function"]="Function",
["function.special"]="Function",
}
function TSHighlighter.new(query, bufnr, ft)
local self = setmetatable({}, TSHighlighter)
self.parser = vim.treesitter.get_parser(bufnr, ft, function(...) self:on_change(...) end)
self.buf = self.parser.bufnr
-- TODO(bfredl): perhaps on_start should be called uncondionally, instead for only on mod?
local tree = self.parser:parse()
self.root = tree:root()
self:set_query(query)
self.edit_count = 0
self.redraw_count = 0
self.line_count = {}
a.nvim_buf_set_option(self.buf, "syntax", "")
a.nvim__buf_set_luahl(self.buf, {
on_start=function(...) return self:on_start(...) end,
on_window=function(...) return self:on_window(...) end,
on_line=function(...) return self:on_line(...) end,
})
-- Tricky: if syntax hasn't been enabled, we need to reload color scheme
-- but use synload.vim rather than syntax.vim to not enable
-- syntax FileType autocmds. Later on we should integrate with the
-- `:syntax` and `set syntax=...` machinery properly.
if vim.g.syntax_on ~= 1 then
vim.api.nvim_command("runtime! syntax/synload.vim")
end
return self
end
function TSHighlighter:set_query(query)
if type(query) == "string" then
query = vim.treesitter.parse_query(self.parser.lang, query)
end
self.query = query
self.id_map = {}
for i, capture in ipairs(self.query.captures) do
local hl = 0
local firstc = string.sub(capture, 1, 1)
local hl_group = self.hl_map[capture]
if firstc ~= string.lower(firstc) then
hl_group = vim.split(capture, '.', true)[1]
end
if hl_group then
hl = a.nvim_get_hl_id_by_name(hl_group)
end
self.id_map[i] = hl
end
end
function TSHighlighter:on_change(changes)
for _, ch in ipairs(changes or {}) do
a.nvim__buf_redraw_range(self.buf, ch[1], ch[3]+1)
end
self.edit_count = self.edit_count + 1
end
function TSHighlighter:on_start(_, _buf, _tick)
local tree = self.parser:parse()
self.root = tree:root()
end
function TSHighlighter:on_window(_, _win, _buf, _topline, botline)
self.iter = nil
self.active_nodes = {}
self.nextrow = 0
self.botline = botline
self.redraw_count = self.redraw_count + 1
end
function TSHighlighter:on_line(_, _win, buf, line)
if self.iter == nil then
self.iter = self.query:iter_captures(self.root,buf,line,self.botline)
end
while line >= self.nextrow do
local capture, node, match = self.iter()
local active = true
if capture == nil then
break
end
if match ~= nil then
active = self:run_pred(match)
match.active = active
end
local start_row, start_col, end_row, end_col = node:range()
local hl = self.id_map[capture]
if hl > 0 and active then
if start_row == line and end_row == line then
a.nvim__put_attr(hl, start_col, end_col)
elseif end_row >= line then
-- TODO(bfredl): this is quite messy. Togheter with multiline bufhl we should support
-- luahl generating multiline highlights (and other kinds of annotations)
self.active_nodes[{hl=hl, start_row=start_row, start_col=start_col, end_row=end_row, end_col=end_col}] = true
end
end
if start_row > line then
self.nextrow = start_row
end
end
for node,_ in pairs(self.active_nodes) do
if node.start_row <= line and node.end_row >= line then
local start_col, end_col = node.start_col, node.end_col
if node.start_row < line then
start_col = 0
end
if node.end_row > line then
end_col = 9000
end
a.nvim__put_attr(node.hl, start_col, end_col)
end
if node.end_row <= line then
self.active_nodes[node] = nil
end
end
self.line_count[line] = (self.line_count[line] or 0) + 1
--return tostring(self.line_count[line])
end
return TSHighlighter

View File

@ -169,21 +169,21 @@ Boolean nvim_buf_attach(uint64_t channel_id,
goto error;
}
cb.on_lines = v->data.luaref;
v->data.integer = LUA_NOREF;
v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("on_changedtick", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
cb.on_changedtick = v->data.luaref;
v->data.integer = LUA_NOREF;
v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("on_detach", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
cb.on_detach = v->data.luaref;
v->data.integer = LUA_NOREF;
v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("utf_sizes", k.data)) {
if (v->type != kObjectTypeBoolean) {
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
@ -231,6 +231,90 @@ Boolean nvim_buf_detach(uint64_t channel_id,
return true;
}
static void buf_clear_luahl(buf_T *buf, bool force)
{
if (buf->b_luahl || force) {
executor_free_luaref(buf->b_luahl_start);
executor_free_luaref(buf->b_luahl_window);
executor_free_luaref(buf->b_luahl_line);
executor_free_luaref(buf->b_luahl_end);
}
buf->b_luahl_start = LUA_NOREF;
buf->b_luahl_window = LUA_NOREF;
buf->b_luahl_line = LUA_NOREF;
buf->b_luahl_end = LUA_NOREF;
}
/// Unstabilized interface for defining syntax hl in lua.
///
/// This is not yet safe for general use, lua callbacks will need to
/// be restricted, like textlock and probably other stuff.
///
/// The API on_line/nvim__put_attr is quite raw and not intended to be the
/// final shape. Ideally this should operate on chunks larger than a single
/// line to reduce interpreter overhead, and generate annotation objects
/// (bufhl/virttext) on the fly but using the same representation.
void nvim__buf_set_luahl(uint64_t channel_id, Buffer buffer,
DictionaryOf(LuaRef) opts, Error *err)
FUNC_API_LUA_ONLY
{
buf_T *buf = find_buffer_by_handle(buffer, err);
if (!buf) {
return;
}
redraw_buf_later(buf, NOT_VALID);
buf_clear_luahl(buf, false);
for (size_t i = 0; i < opts.size; i++) {
String k = opts.items[i].key;
Object *v = &opts.items[i].value;
if (strequal("on_start", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
buf->b_luahl_start = v->data.luaref;
v->data.luaref = LUA_NOREF;
} else if (strequal("on_window", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
buf->b_luahl_window = v->data.luaref;
v->data.luaref = LUA_NOREF;
} else if (strequal("on_line", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
buf->b_luahl_line = v->data.luaref;
v->data.luaref = LUA_NOREF;
} else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
goto error;
}
}
buf->b_luahl = true;
return;
error:
buf_clear_luahl(buf, true);
buf->b_luahl = false;
}
void nvim__buf_redraw_range(Buffer buffer, Integer first, Integer last,
Error *err)
FUNC_API_LUA_ONLY
{
buf_T *buf = find_buffer_by_handle(buffer, err);
if (!buf) {
return;
}
redraw_buf_range_later(buf, (linenr_T)first+1, (linenr_T)last);
}
/// Sets a buffer line
///
/// @deprecated use nvim_buf_set_lines instead.
@ -1112,7 +1196,6 @@ Array nvim_buf_get_extmarks(Buffer buffer, Integer ns_id, Object start,
return rv;
}
limit = v->data.integer;
v->data.integer = LUA_NOREF;
} else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
return rv;

View File

@ -60,6 +60,12 @@
#define ADD(array, item) \
kv_push(array, item)
#define FIXED_TEMP_ARRAY(name, fixsize) \
Array name = ARRAY_DICT_INIT; \
Object name##__items[fixsize]; \
args.size = fixsize; \
args.items = name##__items; \
#define STATIC_CSTR_AS_STRING(s) ((String) {.data = s, .size = sizeof(s) - 1})
/// Create a new String instance, putting data in allocated memory

View File

@ -189,6 +189,15 @@ Dictionary nvim_get_hl_by_id(Integer hl_id, Boolean rgb, Error *err)
return hl_get_attr_by_id(attrcode, rgb, err);
}
/// Gets a highlight group by name
///
/// similar to |hlID()|, but allocates a new ID if not present.
Integer nvim_get_hl_id_by_name(String name)
FUNC_API_SINCE(7)
{
return syn_check_group((const char_u *)name.data, (int)name.size);
}
/// Sends input-keys to Nvim, subject to various quirks controlled by `mode`
/// flags. This is a blocking call, unlike |nvim_input()|.
///
@ -2546,3 +2555,27 @@ Array nvim__inspect_cell(Integer grid, Integer row, Integer col, Error *err)
}
return ret;
}
/// Set attrs in nvim__buf_set_lua_hl callbacks
///
/// TODO(bfredl): This is rather pedestrian. The final
/// interface should probably be derived from a reformed
/// bufhl/virttext interface with full support for multi-line
/// ranges etc
void nvim__put_attr(Integer id, Integer c0, Integer c1)
FUNC_API_LUA_ONLY
{
if (!lua_attr_active) {
return;
}
if (id == 0 || syn_get_final_id((int)id) == 0) {
return;
}
int attr = syn_id2attr((int)id);
c0 = MAX(c0, 0);
c1 = MIN(c1, (Integer)lua_attr_bufsize);
for (Integer c = c0; c < c1; c++) {
lua_attr_buf[c] = (sattr_T)hl_combine_attr(lua_attr_buf[c], (int)attr);
}
return;
}

View File

@ -832,6 +832,12 @@ struct file_buffer {
// The number for times the current line has been flushed in the memline.
int flush_count;
bool b_luahl;
LuaRef b_luahl_start;
LuaRef b_luahl_window;
LuaRef b_luahl_line;
LuaRef b_luahl_end;
int b_diff_failed; // internal diff failed for this buffer
};

View File

@ -157,7 +157,7 @@ void buf_updates_unregister_all(buf_T *buf)
args.items[0] = BUFFER_OBJ(buf->handle);
textlock++;
executor_exec_lua_cb(cb.on_detach, "detach", args, false);
executor_exec_lua_cb(cb.on_detach, "detach", args, false, NULL);
textlock--;
}
free_update_callbacks(cb);
@ -265,7 +265,7 @@ void buf_updates_send_changes(buf_T *buf,
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
}
textlock++;
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true, NULL);
textlock--;
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
@ -293,10 +293,7 @@ void buf_updates_changedtick(buf_T *buf)
BufUpdateCallbacks cb = kv_A(buf->update_callbacks, i);
bool keep = true;
if (cb.on_changedtick != LUA_NOREF) {
Array args = ARRAY_DICT_INIT;
Object items[2];
args.size = 2;
args.items = items;
FIXED_TEMP_ARRAY(args, 2);
// the first argument is always the buffer handle
args.items[0] = BUFFER_OBJ(buf->handle);
@ -306,7 +303,7 @@ void buf_updates_changedtick(buf_T *buf)
textlock++;
Object res = executor_exec_lua_cb(cb.on_changedtick, "changedtick",
args, true);
args, true, NULL);
textlock--;
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {

View File

@ -211,6 +211,8 @@
# define FUNC_API_NOEXPORT
/// API function not exposed in VimL/eval.
# define FUNC_API_REMOTE_ONLY
/// API function not exposed in VimL/remote.
# define FUNC_API_LUA_ONLY
/// API function introduced at the given API level.
# define FUNC_API_SINCE(X)
/// API function deprecated since the given API level.

View File

@ -42,6 +42,7 @@ local c_proto = Ct(
(fill * Cg((P('FUNC_API_FAST') * Cc(true)), 'fast') ^ -1) *
(fill * Cg((P('FUNC_API_NOEXPORT') * Cc(true)), 'noexport') ^ -1) *
(fill * Cg((P('FUNC_API_REMOTE_ONLY') * Cc(true)), 'remote_only') ^ -1) *
(fill * Cg((P('FUNC_API_LUA_ONLY') * Cc(true)), 'lua_only') ^ -1) *
(fill * Cg((P('FUNC_API_REMOTE_IMPL') * Cc(true)), 'remote_impl') ^ -1) *
(fill * Cg((P('FUNC_API_BRIDGE_IMPL') * Cc(true)), 'bridge_impl') ^ -1) *
(fill * Cg((P('FUNC_API_COMPOSITOR_IMPL') * Cc(true)), 'compositor_impl') ^ -1) *

View File

@ -192,7 +192,7 @@ end
-- the real API.
for i = 1, #functions do
local fn = functions[i]
if fn.impl_name == nil then
if fn.impl_name == nil and not fn.lua_only then
local args = {}
output:write('Object handle_'..fn.name..'(uint64_t channel_id, Array args, Error *error)')
@ -310,12 +310,13 @@ void msgpack_rpc_init_method_table(void)
for i = 1, #functions do
local fn = functions[i]
output:write(' msgpack_rpc_add_method_handler('..
'(String) {.data = "'..fn.name..'", '..
'.size = sizeof("'..fn.name..'") - 1}, '..
'(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
', .fast = '..tostring(fn.fast)..'});\n')
if not fn.lua_only then
output:write(' msgpack_rpc_add_method_handler('..
'(String) {.data = "'..fn.name..'", '..
'.size = sizeof("'..fn.name..'") - 1}, '..
'(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
', .fast = '..tostring(fn.fast)..'});\n')
end
end
output:write('\n}\n\n')

View File

@ -25,7 +25,7 @@ local gperfpipe = io.open(funcsfname .. '.gperf', 'wb')
local funcs = require('eval').funcs
local metadata = mpack.unpack(io.open(metadata_file, 'rb'):read("*all"))
for _,fun in ipairs(metadata) do
if not fun.remote_only then
if not (fun.remote_only or fun.lua_only) then
funcs[fun.name] = {
args=#fun.parameters,
func='api_wrapper',

View File

@ -126,6 +126,13 @@ typedef off_t off_T;
*/
EXTERN int mod_mask INIT(= 0x0); /* current key modifiers */
// TODO(bfredl): for the final interface this should find a more suitable
// location.
EXTERN sattr_T *lua_attr_buf INIT(= NULL);
EXTERN size_t lua_attr_bufsize INIT(= 0);
EXTERN bool lua_attr_active INIT(= false);
/*
* Cmdline_row is the row where the command line starts, just below the
* last window.

View File

@ -835,7 +835,7 @@ Object executor_exec_lua_api(const String str, const Array args, Error *err)
}
Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
bool retval)
bool retval, Error *err)
{
lua_State *const lstate = nlua_enter();
nlua_pushref(lstate, ref);
@ -845,16 +845,24 @@ Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
}
if (lua_pcall(lstate, (int)args.size+1, retval ? 1 : 0, 0)) {
// TODO(bfredl): callbacks:s might not always be msg-safe, for instance
// lua callbacks for redraw events. Later on let the caller deal with the
// error instead.
nlua_error(lstate, _("Error executing lua callback: %.*s"));
// if err is passed, the caller will deal with the error.
if (err) {
size_t len;
const char *errstr = lua_tolstring(lstate, -1, &len);
api_set_error(err, kErrorTypeException,
"Error executing lua: %.*s", (int)len, errstr);
} else {
nlua_error(lstate, _("Error executing lua callback: %.*s"));
}
return NIL;
}
Error err = ERROR_INIT;
if (retval) {
return nlua_pop_Object(lstate, false, &err);
Error dummy = ERROR_INIT;
if (err == NULL) {
err = &dummy;
}
return nlua_pop_Object(lstate, false, err);
} else {
return NIL;
}
@ -1007,4 +1015,7 @@ static void nlua_add_treesitter(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
lua_pushcfunction(lstate, tslua_inspect_lang);
lua_setfield(lstate, -2, "_ts_inspect_language");
lua_pushcfunction(lstate, ts_lua_parse_query);
lua_setfield(lstate, -2, "_ts_parse_query");
}

View File

@ -26,6 +26,11 @@ typedef struct {
TSTree *tree; // internal tree, used for editing/reparsing
} TSLua_parser;
typedef struct {
TSQueryCursor *cursor;
int predicated_match;
} TSLua_cursor;
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "lua/treesitter.c.generated.h"
#endif
@ -66,6 +71,20 @@ static struct luaL_Reg node_meta[] = {
{ "descendant_for_range", node_descendant_for_range },
{ "named_descendant_for_range", node_named_descendant_for_range },
{ "parent", node_parent },
{ "_rawquery", node_rawquery },
{ NULL, NULL }
};
static struct luaL_Reg query_meta[] = {
{ "__gc", query_gc },
{ "__tostring", query_tostring },
{ "inspect", query_inspect },
{ NULL, NULL }
};
// cursor is not exposed, but still needs garbage collection
static struct luaL_Reg querycursor_meta[] = {
{ "__gc", querycursor_gc },
{ NULL, NULL }
};
@ -96,6 +115,8 @@ void tslua_init(lua_State *L)
build_meta(L, "treesitter_parser", parser_meta);
build_meta(L, "treesitter_tree", tree_meta);
build_meta(L, "treesitter_node", node_meta);
build_meta(L, "treesitter_query", query_meta);
build_meta(L, "treesitter_querycursor", querycursor_meta);
}
int tslua_register_lang(lua_State *L)
@ -276,13 +297,33 @@ static int parser_parse_buf(lua_State *L)
}
TSInput input = { payload, input_cb, TSInputEncodingUTF8 };
TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input);
uint32_t n_ranges = 0;
TSRange *changed = p->tree ? ts_tree_get_changed_ranges(p->tree, new_tree,
&n_ranges) : NULL;
if (p->tree) {
ts_tree_delete(p->tree);
}
p->tree = new_tree;
tslua_push_tree(L, p->tree);
return 1;
lua_createtable(L, n_ranges, 0);
for (size_t i = 0; i < n_ranges; i++) {
lua_createtable(L, 4, 0);
lua_pushinteger(L, changed[i].start_point.row);
lua_rawseti(L, -2, 1);
lua_pushinteger(L, changed[i].start_point.column);
lua_rawseti(L, -2, 2);
lua_pushinteger(L, changed[i].end_point.row);
lua_rawseti(L, -2, 3);
lua_pushinteger(L, changed[i].end_point.column);
lua_rawseti(L, -2, 4);
lua_rawseti(L, -2, i+1);
}
xfree(changed);
return 2;
}
static int parser_tree(lua_State *L)
@ -383,7 +424,7 @@ static int tree_root(lua_State *L)
return 0;
}
TSNode root = ts_tree_root_node(tree);
push_node(L, root);
push_node(L, root, 1);
return 1;
}
@ -394,18 +435,19 @@ static int tree_root(lua_State *L)
/// top of stack must either be the tree this node belongs to or another node
/// of the same tree! This value is not popped. Can only be called inside a
/// cfunction with the tslua environment.
static void push_node(lua_State *L, TSNode node)
static void push_node(lua_State *L, TSNode node, int uindex)
{
assert(uindex > 0 || uindex < -LUA_MINSTACK);
if (ts_node_is_null(node)) {
lua_pushnil(L); // [src, nil]
lua_pushnil(L); // [nil]
return;
}
TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [src, udata]
TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [udata]
*ud = node;
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [src, udata, meta]
lua_setmetatable(L, -2); // [src, udata]
lua_getfenv(L, -2); // [src, udata, reftable]
lua_setfenv(L, -2); // [src, udata]
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [udata, meta]
lua_setmetatable(L, -2); // [udata]
lua_getfenv(L, uindex); // [udata, reftable]
lua_setfenv(L, -2); // [udata]
}
static bool node_check(lua_State *L, TSNode *res)
@ -586,8 +628,7 @@ static int node_child(lua_State *L)
long num = lua_tointeger(L, 2);
TSNode child = ts_node_child(node, (uint32_t)num);
lua_pushvalue(L, 1);
push_node(L, child);
push_node(L, child, 1);
return 1;
}
@ -600,8 +641,7 @@ static int node_named_child(lua_State *L)
long num = lua_tointeger(L, 2);
TSNode child = ts_node_named_child(node, (uint32_t)num);
lua_pushvalue(L, 1);
push_node(L, child);
push_node(L, child, 1);
return 1;
}
@ -617,8 +657,7 @@ static int node_descendant_for_range(lua_State *L)
(uint32_t)lua_tointeger(L, 5) };
TSNode child = ts_node_descendant_for_point_range(node, start, end);
lua_pushvalue(L, 1);
push_node(L, child);
push_node(L, child, 1);
return 1;
}
@ -634,8 +673,7 @@ static int node_named_descendant_for_range(lua_State *L)
(uint32_t)lua_tointeger(L, 5) };
TSNode child = ts_node_named_descendant_for_point_range(node, start, end);
lua_pushvalue(L, 1);
push_node(L, child);
push_node(L, child, 1);
return 1;
}
@ -646,7 +684,254 @@ static int node_parent(lua_State *L)
return 0;
}
TSNode parent = ts_node_parent(node);
push_node(L, parent);
push_node(L, parent, 1);
return 1;
}
/// assumes the match table being on top of the stack
static void set_match(lua_State *L, TSQueryMatch *match, int nodeidx)
{
for (int i = 0; i < match->capture_count; i++) {
push_node(L, match->captures[i].node, nodeidx);
lua_rawseti(L, -2, match->captures[i].index+1);
}
}
static int query_next_match(lua_State *L)
{
TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
TSQueryCursor *cursor = ud->cursor;
TSQuery *query = query_check(L, lua_upvalueindex(3));
TSQueryMatch match;
if (ts_query_cursor_next_match(cursor, &match)) {
lua_pushinteger(L, match.pattern_index+1); // [index]
lua_createtable(L, ts_query_capture_count(query), 2); // [index, match]
set_match(L, &match, lua_upvalueindex(2));
return 2;
}
return 0;
}
static int query_next_capture(lua_State *L)
{
TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
TSQueryCursor *cursor = ud->cursor;
TSQuery *query = query_check(L, lua_upvalueindex(3));
if (ud->predicated_match > -1) {
lua_getfield(L, lua_upvalueindex(4), "active");
bool active = lua_toboolean(L, -1);
lua_pop(L, 1);
if (!active) {
ts_query_cursor_remove_match(cursor, ud->predicated_match);
}
ud->predicated_match = -1;
}
TSQueryMatch match;
uint32_t capture_index;
if (ts_query_cursor_next_capture(cursor, &match, &capture_index)) {
TSQueryCapture capture = match.captures[capture_index];
lua_pushinteger(L, capture.index+1); // [index]
push_node(L, capture.node, lua_upvalueindex(2)); // [index, node]
uint32_t n_pred;
ts_query_predicates_for_pattern(query, match.pattern_index, &n_pred);
if (n_pred > 0 && capture_index == 0) {
lua_pushvalue(L, lua_upvalueindex(4)); // [index, node, match]
set_match(L, &match, lua_upvalueindex(2));
lua_pushinteger(L, match.pattern_index+1);
lua_setfield(L, -2, "pattern");
if (match.capture_count > 1) {
ud->predicated_match = match.id;
lua_pushboolean(L, false);
lua_setfield(L, -2, "active");
}
return 3;
}
return 2;
}
return 0;
}
static int node_rawquery(lua_State *L)
{
TSNode node;
if (!node_check(L, &node)) {
return 0;
}
TSQuery *query = query_check(L, 2);
// TODO(bfredl): these are expensive allegedly,
// use a reuse list later on?
TSQueryCursor *cursor = ts_query_cursor_new();
ts_query_cursor_exec(cursor, query, node);
bool captures = lua_toboolean(L, 3);
if (lua_gettop(L) >= 4) {
int start = luaL_checkinteger(L, 4);
int end = lua_gettop(L) >= 5 ? luaL_checkinteger(L, 5) : MAXLNUM;
ts_query_cursor_set_point_range(cursor,
(TSPoint){ start, 0 }, (TSPoint){ end, 0 });
}
TSLua_cursor *ud = lua_newuserdata(L, sizeof(*ud)); // [udata]
ud->cursor = cursor;
ud->predicated_match = -1;
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_querycursor");
lua_setmetatable(L, -2); // [udata]
lua_pushvalue(L, 1); // [udata, node]
// include query separately, as to keep a ref to it for gc
lua_pushvalue(L, 2); // [udata, node, query]
if (captures) {
// placeholder for match state
lua_createtable(L, ts_query_capture_count(query), 2); // [u, n, q, match]
lua_pushcclosure(L, query_next_capture, 4); // [closure]
} else {
lua_pushcclosure(L, query_next_match, 3); // [closure]
}
return 1;
}
static int querycursor_gc(lua_State *L)
{
TSLua_cursor *ud = luaL_checkudata(L, 1, "treesitter_querycursor");
ts_query_cursor_delete(ud->cursor);
return 0;
}
// Query methods
int ts_lua_parse_query(lua_State *L)
{
if (lua_gettop(L) < 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2)) {
return luaL_error(L, "string expected");
}
const char *lang_name = lua_tostring(L, 1);
TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name);
if (!lang) {
return luaL_error(L, "no such language: %s", lang_name);
}
size_t len;
const char *src = lua_tolstring(L, 2, &len);
uint32_t error_offset;
TSQueryError error_type;
TSQuery *query = ts_query_new(lang, src, len, &error_offset, &error_type);
if (!query) {
return luaL_error(L, "query: %s at position %d",
query_err_string(error_type), (int)error_offset);
}
TSQuery **ud = lua_newuserdata(L, sizeof(TSQuery *)); // [udata]
*ud = query;
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_query"); // [udata, meta]
lua_setmetatable(L, -2); // [udata]
return 1;
}
static const char *query_err_string(TSQueryError err) {
switch (err) {
case TSQueryErrorSyntax: return "invalid syntax";
case TSQueryErrorNodeType: return "invalid node type";
case TSQueryErrorField: return "invalid field";
case TSQueryErrorCapture: return "invalid capture";
default: return "error";
}
}
static TSQuery *query_check(lua_State *L, int index)
{
TSQuery **ud = luaL_checkudata(L, index, "treesitter_query");
return *ud;
}
static int query_gc(lua_State *L)
{
TSQuery *query = query_check(L, 1);
if (!query) {
return 0;
}
ts_query_delete(query);
return 0;
}
static int query_tostring(lua_State *L)
{
lua_pushstring(L, "<query>");
return 1;
}
static int query_inspect(lua_State *L)
{
TSQuery *query = query_check(L, 1);
if (!query) {
return 0;
}
uint32_t n_pat = ts_query_pattern_count(query);
lua_createtable(L, 0, 2); // [retval]
lua_createtable(L, n_pat, 1); // [retval, patterns]
for (size_t i = 0; i < n_pat; i++) {
uint32_t len;
const TSQueryPredicateStep *step = ts_query_predicates_for_pattern(query,
i, &len);
if (len == 0) {
continue;
}
lua_createtable(L, len/4, 1); // [retval, patterns, pat]
lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
int nextpred = 1;
int nextitem = 1;
for (size_t k = 0; k < len; k++) {
if (step[k].type == TSQueryPredicateStepTypeDone) {
lua_rawseti(L, -2, nextpred++); // [retval, patterns, pat]
lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
nextitem = 1;
continue;
}
if (step[k].type == TSQueryPredicateStepTypeString) {
uint32_t strlen;
const char *str = ts_query_string_value_for_id(query, step[k].value_id,
&strlen);
lua_pushlstring(L, str, strlen); // [retval, patterns, pat, pred, item]
} else if (step[k].type == TSQueryPredicateStepTypeCapture) {
lua_pushnumber(L, step[k].value_id+1); // [..., pat, pred, item]
} else {
abort();
}
lua_rawseti(L, -2, nextitem++); // [retval, patterns, pat, pred]
}
// last predicate should have ended with TypeDone
lua_pop(L, 1); // [retval, patters, pat]
lua_rawseti(L, -2, i+1); // [retval, patterns]
}
lua_setfield(L, -2, "patterns"); // [retval]
uint32_t n_captures = ts_query_capture_count(query);
lua_createtable(L, n_captures, 0); // [retval, captures]
for (size_t i = 0; i < n_captures; i++) {
uint32_t strlen;
const char *str = ts_query_capture_name_for_id(query, i, &strlen);
lua_pushlstring(L, str, strlen); // [retval, captures, capture]
lua_rawseti(L, -2, i+1);
}
lua_setfield(L, -2, "captures"); // [retval]
return 1;
}

View File

@ -116,6 +116,8 @@
#include "nvim/window.h"
#include "nvim/os/time.h"
#include "nvim/api/private/helpers.h"
#include "nvim/api/vim.h"
#include "nvim/lua/executor.h"
#define MB_FILLER_CHAR '<' /* character used when a double-width character
* doesn't fit. */
@ -232,6 +234,22 @@ void redraw_buf_line_later(buf_T *buf, linenr_T line)
}
}
void redraw_buf_range_later(buf_T *buf, linenr_T firstline, linenr_T lastline)
{
FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
if (wp->w_buffer == buf
&& lastline >= wp->w_topline && firstline < wp->w_botline) {
if (wp->w_redraw_top == 0 || wp->w_redraw_top > firstline) {
wp->w_redraw_top = firstline;
}
if (wp->w_redraw_bot == 0 || wp->w_redraw_bot < lastline) {
wp->w_redraw_bot = lastline;
}
redraw_win_later(wp, VALID);
}
}
}
/*
* Changed something in the current window, at buffer line "lnum", that
* requires that line and possibly other lines to be redrawn.
@ -477,6 +495,19 @@ int update_screen(int type)
if (wwp == wp && syntax_present(wp)) {
syn_stack_apply_changes(wp->w_buffer);
}
buf_T *buf = wp->w_buffer;
if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
Error err = ERROR_INIT;
FIXED_TEMP_ARRAY(args, 2);
args.items[0] = BUFFER_OBJ(buf->handle);
args.items[1] = INTEGER_OBJ(display_tick);
executor_exec_lua_cb(buf->b_luahl_start, "start", args, false, &err);
if (ERROR_SET(&err)) {
ELOG("error in luahl start: %s", err.msg);
api_clear_error(&err);
}
}
}
}
@ -1181,7 +1212,27 @@ static void win_update(win_T *wp)
idx = 0; /* first entry in w_lines[].wl_size */
row = 0;
srow = 0;
lnum = wp->w_topline; /* first line shown in window */
lnum = wp->w_topline; // first line shown in window
if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
Error err = ERROR_INIT;
FIXED_TEMP_ARRAY(args, 4);
linenr_T knownmax = ((wp->w_valid & VALID_BOTLINE)
? wp->w_botline
: (wp->w_topline + wp->w_height_inner));
args.items[0] = WINDOW_OBJ(wp->handle);
args.items[1] = BUFFER_OBJ(buf->handle);
args.items[2] = INTEGER_OBJ(wp->w_topline-1);
args.items[3] = INTEGER_OBJ(knownmax);
// TODO(bfredl): we could allow this callback to change mod_top, mod_bot.
// For now the "start" callback is expected to use nvim__buf_redraw_range.
executor_exec_lua_cb(buf->b_luahl_window, "window", args, false, &err);
if (ERROR_SET(&err)) {
ELOG("error in luahl window: %s", err.msg);
api_clear_error(&err);
}
}
for (;; ) {
/* stop updating when reached the end of the window (check for _past_
* the end of the window is at the end of the loop) */
@ -2229,6 +2280,8 @@ win_line (
row = startrow;
char *luatext = NULL;
if (!number_only) {
// To speed up the loop below, set extra_check when there is linebreak,
// trailing white space and/or syntax processing to be done.
@ -2454,6 +2507,41 @@ win_line (
line = ml_get_buf(wp->w_buffer, lnum, FALSE);
ptr = line;
buf_T *buf = wp->w_buffer;
if (buf->b_luahl && buf->b_luahl_line != LUA_NOREF) {
size_t size = STRLEN(line);
if (lua_attr_bufsize < size) {
xfree(lua_attr_buf);
lua_attr_buf = xcalloc(size, sizeof(*lua_attr_buf));
lua_attr_bufsize = size;
} else if (lua_attr_buf) {
memset(lua_attr_buf, 0, size * sizeof(*lua_attr_buf));
}
Error err = ERROR_INIT;
// TODO(bfredl): build a macro for the "static array" pattern
// in buf_updates_send_changes?
FIXED_TEMP_ARRAY(args, 3);
args.items[0] = WINDOW_OBJ(wp->handle);
args.items[1] = BUFFER_OBJ(buf->handle);
args.items[2] = INTEGER_OBJ(lnum-1);
lua_attr_active = true;
extra_check = true;
Object o = executor_exec_lua_cb(buf->b_luahl_line, "line",
args, true, &err);
lua_attr_active = false;
if (o.type == kObjectTypeString) {
// TODO(bfredl): this is a bit of a hack. A final API should use an
// "unified" interface where luahl can add both bufhl and virttext
luatext = o.data.string.data;
do_virttext = true;
} else if (ERROR_SET(&err)) {
ELOG("error in luahl line: %s", err.msg);
luatext = err.msg;
do_virttext = true;
api_clear_error(&err);
}
}
if (has_spell && !number_only) {
// For checking first word with a capital skip white space.
if (cap_col == 0) {
@ -3429,6 +3517,10 @@ win_line (
}
}
if (buf->b_luahl && v > 0 && v < (long)lua_attr_bufsize+1) {
char_attr = hl_combine_attr(char_attr, lua_attr_buf[v-1]);
}
if (wp->w_buffer->terminal) {
char_attr = hl_combine_attr(term_attrs[vcol], char_attr);
}
@ -3917,8 +4009,14 @@ win_line (
int rightmost_vcol = 0;
int i;
VirtText virt_text = do_virttext ? bufhl_info.line->virt_text
: (VirtText)KV_INITIAL_VALUE;
VirtText virt_text;
if (luatext) {
virt_text = (VirtText)KV_INITIAL_VALUE;
kv_push(virt_text, ((VirtTextChunk){ .text = luatext, .hl_id = 0 }));
} else {
virt_text = do_virttext ? bufhl_info.line->virt_text
: (VirtText)KV_INITIAL_VALUE;
}
size_t virt_pos = 0;
LineState s = LINE_STATE((char_u *)"");
int virt_attr = 0;
@ -4319,6 +4417,7 @@ win_line (
}
xfree(p_extra_free);
xfree(luatext);
return row;
}

View File

@ -14,7 +14,19 @@ extern "C" {
/* Section - ABI Versioning */
/****************************/
/**
* The latest ABI version that is supported by the current version of the
* library. When Languages are generated by the Tree-sitter CLI, they are
* assigned an ABI version number that corresponds to the current CLI version.
* The Tree-sitter library is generally backwards-compatible with languages
* generated using older CLI versions, but is not forwards-compatible.
*/
#define TREE_SITTER_LANGUAGE_VERSION 11
/**
* The earliest ABI version that is supported by the current version of the
* library.
*/
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
/*******************/
@ -26,6 +38,8 @@ typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
typedef struct TSParser TSParser;
typedef struct TSTree TSTree;
typedef struct TSQuery TSQuery;
typedef struct TSQueryCursor TSQueryCursor;
typedef enum {
TSInputEncodingUTF8,
@ -87,6 +101,37 @@ typedef struct {
uint32_t context[2];
} TSTreeCursor;
typedef struct {
TSNode node;
uint32_t index;
} TSQueryCapture;
typedef struct {
uint32_t id;
uint16_t pattern_index;
uint16_t capture_count;
const TSQueryCapture *captures;
} TSQueryMatch;
typedef enum {
TSQueryPredicateStepTypeDone,
TSQueryPredicateStepTypeCapture,
TSQueryPredicateStepTypeString,
} TSQueryPredicateStepType;
typedef struct {
TSQueryPredicateStepType type;
uint32_t value_id;
} TSQueryPredicateStep;
typedef enum {
TSQueryErrorNone = 0,
TSQueryErrorSyntax,
TSQueryErrorNodeType,
TSQueryErrorField,
TSQueryErrorCapture,
} TSQueryError;
/********************/
/* Section - Parser */
/********************/
@ -119,7 +164,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language);
const TSLanguage *ts_parser_language(const TSParser *self);
/**
* Set the spans of text that the parser should include when parsing.
* Set the ranges of text that the parser should include when parsing.
*
* By default, the parser will always include entire documents. This function
* allows you to parse only a *portion* of a document but still return a syntax
@ -226,14 +271,16 @@ TSTree *ts_parser_parse_string_encoding(
* by default, it will resume where it left off on the next call to
* `ts_parser_parse` or other parsing functions. If you don't want to resume,
* and instead intend to use this parser to parse some other document, you must
* call this `ts_parser_reset` first.
* call `ts_parser_reset` first.
*/
void ts_parser_reset(TSParser *self);
/**
* Set the maximum duration in microseconds that parsing should be allowed to
* take before halting. If parsing takes longer than this, it will halt early,
* returning NULL. See `ts_parser_parse` for more information.
* take before halting.
*
* If parsing takes longer than this, it will halt early, returning NULL.
* See `ts_parser_parse` for more information.
*/
void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
@ -243,10 +290,11 @@ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
uint64_t ts_parser_timeout_micros(const TSParser *self);
/**
* Set the parser's current cancellation flag pointer. If a non-null pointer is
* assigned, then the parser will periodically read from this pointer during
* parsing. If it reads a non-zero value, it will halt early, returning NULL.
* See `ts_parser_parse` for more information.
* Set the parser's current cancellation flag pointer.
*
* If a non-null pointer is assigned, then the parser will periodically read
* from this pointer during parsing. If it reads a non-zero value, it will
* halt early, returning NULL. See `ts_parser_parse` for more information.
*/
void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag);
@ -322,22 +370,22 @@ const TSLanguage *ts_tree_language(const TSTree *);
void ts_tree_edit(TSTree *self, const TSInputEdit *edit);
/**
* Compare a new syntax tree to a previous syntax tree representing the same
* Compare an old edited syntax tree to a new syntax tree representing the same
* document, returning an array of ranges whose syntactic structure has changed.
*
* For this to work correctly, the old syntax tree must have been edited such
* that its ranges match up to the new tree. Generally, you'll want to call
* this function right after calling one of the `ts_parser_parse` functions,
* passing in the new tree that was returned from `ts_parser_parse` and the old
* tree that was passed as a parameter.
* this function right after calling one of the `ts_parser_parse` functions.
* You need to pass the old tree that was passed to parse, as well as the new
* tree that was returned from that function.
*
* The returned array is allocated using `malloc` and the caller is responsible
* for freeing it using `free`. The length of the array will be written to the
* given `length` pointer.
*/
TSRange *ts_tree_get_changed_ranges(
const TSTree *self,
const TSTree *old_tree,
const TSTree *new_tree,
uint32_t *length
);
@ -409,8 +457,8 @@ bool ts_node_is_named(TSNode);
bool ts_node_is_missing(TSNode);
/**
* Check if the node is *missing*. Missing nodes are inserted by the parser in
* order to recover from certain kinds of syntax errors.
* Check if the node is *extra*. Extra nodes represent things like comments,
* which are not required the grammar, but can appear anywhere.
*/
bool ts_node_is_extra(TSNode);
@ -542,7 +590,7 @@ TSTreeCursor ts_tree_cursor_new(TSNode);
void ts_tree_cursor_delete(TSTreeCursor *);
/**
* Re-initialize a tree cursor to start at a different ndoe.
* Re-initialize a tree cursor to start at a different node.
*/
void ts_tree_cursor_reset(TSTreeCursor *, TSNode);
@ -584,7 +632,7 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *);
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
/**
* Move the cursor to the first schild of its current node.
* Move the cursor to the first child of its current node.
*
* This returns `true` if the cursor successfully moved, and returns `false`
* if there were no children.
@ -592,7 +640,7 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
bool ts_tree_cursor_goto_first_child(TSTreeCursor *);
/**
* Move the cursor to the first schild of its current node that extends beyond
* Move the cursor to the first child of its current node that extends beyond
* the given byte offset.
*
* This returns the index of the child node if one was found, and returns -1
@ -602,6 +650,156 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t);
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *);
/*******************/
/* Section - Query */
/*******************/
/**
* Create a new query from a string containing one or more S-expression
* patterns. The query is associated with a particular language, and can
* only be run on syntax nodes parsed with that language.
*
* If all of the given patterns are valid, this returns a `TSQuery`.
* If a pattern is invalid, this returns `NULL`, and provides two pieces
* of information about the problem:
* 1. The byte offset of the error is written to the `error_offset` parameter.
* 2. The type of error is written to the `error_type` parameter.
*/
TSQuery *ts_query_new(
const TSLanguage *language,
const char *source,
uint32_t source_len,
uint32_t *error_offset,
TSQueryError *error_type
);
/**
* Delete a query, freeing all of the memory that it used.
*/
void ts_query_delete(TSQuery *);
/**
* Get the number of patterns, captures, or string literals in the query.
*/
uint32_t ts_query_pattern_count(const TSQuery *);
uint32_t ts_query_capture_count(const TSQuery *);
uint32_t ts_query_string_count(const TSQuery *);
/**
* Get the byte offset where the given pattern starts in the query's source.
*
* This can be useful when combining queries by concatenating their source
* code strings.
*/
uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t);
/**
* Get all of the predicates for the given pattern in the query.
*
* The predicates are represented as a single array of steps. There are three
* types of steps in this array, which correspond to the three legal values for
* the `type` field:
* - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names
* of captures. Their `value_id` can be used with the
* `ts_query_capture_name_for_id` function to obtain the name of the capture.
* - `TSQueryPredicateStepTypeString` - Steps with this type represent literal
* strings. Their `value_id` can be used with the
* `ts_query_string_value_for_id` function to obtain their string value.
* - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels*
* that represent the end of an individual predicate. If a pattern has two
* predicates, then there will be two steps with this `type` in the array.
*/
const TSQueryPredicateStep *ts_query_predicates_for_pattern(
const TSQuery *self,
uint32_t pattern_index,
uint32_t *length
);
/**
* Get the name and length of one of the query's captures, or one of the
* query's string literals. Each capture and string is associated with a
* numeric id based on the order that it appeared in the query's source.
*/
const char *ts_query_capture_name_for_id(
const TSQuery *,
uint32_t id,
uint32_t *length
);
const char *ts_query_string_value_for_id(
const TSQuery *,
uint32_t id,
uint32_t *length
);
/**
* Disable a certain capture within a query. This prevents the capture
* from being returned in matches, and also avoids any resource usage
* associated with recording the capture.
*/
void ts_query_disable_capture(TSQuery *, const char *, uint32_t);
/**
* Create a new cursor for executing a given query.
*
* The cursor stores the state that is needed to iteratively search
* for matches. To use the query cursor, first call `ts_query_cursor_exec`
* to start running a given query on a given syntax node. Then, there are
* two options for consuming the results of the query:
* 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the
* the *matches* in the order that they were found. Each match contains the
* index of the pattern that matched, and an array of captures. Because
* multiple patterns can match the same set of nodes, one match may contain
* captures that appear *before* some of the captures from a previous match.
* 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the
* individual *captures* in the order that they appear. This is useful if
* don't care about which pattern matched, and just want a single ordered
* sequence of captures.
*
* If you don't care about consuming all of the results, you can stop calling
* `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point.
* You can then start executing another query on another node by calling
* `ts_query_cursor_exec` again.
*/
TSQueryCursor *ts_query_cursor_new(void);
/**
* Delete a query cursor, freeing all of the memory that it used.
*/
void ts_query_cursor_delete(TSQueryCursor *);
/**
* Start running a given query on a given node.
*/
void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
/**
* Set the range of bytes or (row, column) positions in which the query
* will be executed.
*/
void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t);
void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint);
/**
* Advance to the next match of the currently running query.
*
* If there is a match, write it to `*match` and return `true`.
* Otherwise, return `false`.
*/
bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match);
void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id);
/**
* Advance to the next capture of the currently running query.
*
* If there is a capture, write its match to `*match` and its index within
* the matche's capture list to `*capture_index`. Otherwise, return `false`.
*/
bool ts_query_cursor_next_capture(
TSQueryCursor *,
TSQueryMatch *match,
uint32_t *capture_index
);
/**********************/
/* Section - Language */
/**********************/
@ -619,7 +817,12 @@ const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
/**
* Get the numerical id for the given node type string.
*/
TSSymbol ts_language_symbol_for_name(const TSLanguage *, const char *);
TSSymbol ts_language_symbol_for_name(
const TSLanguage *self,
const char *string,
uint32_t length,
bool is_named
);
/**
* Get the number of distinct field names in the language.

29
src/tree_sitter/bits.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef TREE_SITTER_BITS_H_
#define TREE_SITTER_BITS_H_
#include <stdint.h>
static inline uint32_t bitmask_for_index(uint16_t id) {
return (1u << (31 - id));
}
#ifdef _WIN32
#include <intrin.h>
static inline uint32_t count_leading_zeros(uint32_t x) {
if (x == 0) return 32;
uint32_t result;
_BitScanReverse(&result, x);
return 31 - result;
}
#else
static inline uint32_t count_leading_zeros(uint32_t x) {
if (x == 0) return 32;
return __builtin_clz(x);
}
#endif
#endif // TREE_SITTER_BITS_H_

View File

@ -3,8 +3,28 @@
#include "./error_costs.h"
#include <string.h>
void ts_language_table_entry(const TSLanguage *self, TSStateId state,
TSSymbol symbol, TableEntry *result) {
uint32_t ts_language_symbol_count(const TSLanguage *self) {
return self->symbol_count + self->alias_count;
}
uint32_t ts_language_version(const TSLanguage *self) {
return self->version;
}
uint32_t ts_language_field_count(const TSLanguage *self) {
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
return self->field_count;
} else {
return 0;
}
}
void ts_language_table_entry(
const TSLanguage *self,
TSStateId state,
TSSymbol symbol,
TableEntry *result
) {
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
result->action_count = 0;
result->is_reusable = false;
@ -19,48 +39,72 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state,
}
}
uint32_t ts_language_symbol_count(const TSLanguage *language) {
return language->symbol_count + language->alias_count;
}
uint32_t ts_language_version(const TSLanguage *language) {
return language->version;
}
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) {
TSSymbolMetadata ts_language_symbol_metadata(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) {
return (TSSymbolMetadata){.visible = true, .named = true};
} else if (symbol == ts_builtin_sym_error_repeat) {
return (TSSymbolMetadata){.visible = false, .named = false};
} else {
return language->symbol_metadata[symbol];
return self->symbol_metadata[symbol];
}
}
const char *ts_language_symbol_name(const TSLanguage *language, TSSymbol symbol) {
TSSymbol ts_language_public_symbol(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) return symbol;
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
return self->public_symbol_map[symbol];
} else {
return symbol;
}
}
const char *ts_language_symbol_name(
const TSLanguage *self,
TSSymbol symbol
) {
if (symbol == ts_builtin_sym_error) {
return "ERROR";
} else if (symbol == ts_builtin_sym_error_repeat) {
return "_ERROR";
} else {
return language->symbol_names[symbol];
return self->symbol_names[symbol];
}
}
TSSymbol ts_language_symbol_for_name(const TSLanguage *self, const char *name) {
if (!strcmp(name, "ERROR")) return ts_builtin_sym_error;
TSSymbol ts_language_symbol_for_name(
const TSLanguage *self,
const char *string,
uint32_t length,
bool is_named
) {
if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error;
uint32_t count = ts_language_symbol_count(self);
for (TSSymbol i = 0; i < count; i++) {
if (!strcmp(self->symbol_names[i], name)) {
return i;
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i);
if (!metadata.visible || metadata.named != is_named) continue;
const char *symbol_name = self->symbol_names[i];
if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
return self->public_symbol_map[i];
} else {
return i;
}
}
}
return 0;
}
TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol) {
TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
TSSymbolType ts_language_symbol_type(
const TSLanguage *self,
TSSymbol symbol
) {
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol);
if (metadata.named) {
return TSSymbolTypeRegular;
} else if (metadata.visible) {
@ -70,15 +114,10 @@ TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol
}
}
uint32_t ts_language_field_count(const TSLanguage *self) {
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
return self->field_count;
} else {
return 0;
}
}
const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id) {
const char *ts_language_field_name_for_id(
const TSLanguage *self,
TSFieldId id
) {
uint32_t count = ts_language_field_count(self);
if (count) {
return self->field_names[id];
@ -96,7 +135,8 @@ TSFieldId ts_language_field_id_for_name(
for (TSSymbol i = 1; i < count + 1; i++) {
switch (strncmp(name, self->field_names[i], name_length)) {
case 0:
return i;
if (self->field_names[i][name_length] == 0) return i;
break;
case -1:
return 0;
default:

View File

@ -10,6 +10,7 @@ extern "C" {
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
typedef struct {
@ -22,6 +23,8 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol);
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
return 0 < symbol && symbol < self->external_token_count + 1;
}

View File

@ -2,26 +2,58 @@
#include "./lexer.h"
#include "./subtree.h"
#include "./length.h"
#include "./utf16.h"
#include "utf8proc.h"
#include "./unicode.h"
#define LOG(...) \
if (self->logger.log) { \
snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
#define LOG(message, character) \
if (self->logger.log) { \
snprintf( \
self->debug_buffer, \
TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
32 <= character && character < 127 ? \
message " character:'%c'" : \
message " character:%d", \
character \
); \
self->logger.log( \
self->logger.payload, \
TSLogTypeLex, \
self->debug_buffer \
); \
}
#define LOG_CHARACTER(message, character) \
LOG( \
32 <= character && character < 127 ? \
message " character:'%c'" : \
message " character:%d", character \
)
static const char empty_chunk[3] = { 0, 0 };
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
static const TSRange DEFAULT_RANGE = {
.start_point = {
.row = 0,
.column = 0,
},
.end_point = {
.row = UINT32_MAX,
.column = UINT32_MAX,
},
.start_byte = 0,
.end_byte = UINT32_MAX
};
// Check if the lexer has reached EOF. This state is stored
// by setting the lexer's `current_included_range_index` such that
// it has consumed all of its available ranges.
static bool ts_lexer__eof(const TSLexer *_self) {
Lexer *self = (Lexer *)_self;
return self->current_included_range_index == self->included_range_count;
}
// Clear the currently stored chunk of source code, because the lexer's
// position has changed.
static void ts_lexer__clear_chunk(Lexer *self) {
self->chunk = NULL;
self->chunk_size = 0;
self->chunk_start = 0;
}
// Call the lexer's input callback to obtain a new chunk of source code
// for the current position.
static void ts_lexer__get_chunk(Lexer *self) {
self->chunk_start = self->current_position.bytes;
self->chunk = self->input.read(
@ -30,15 +62,15 @@ static void ts_lexer__get_chunk(Lexer *self) {
self->current_position.extent,
&self->chunk_size
);
if (!self->chunk_size) self->chunk = empty_chunk;
if (!self->chunk_size) {
self->current_included_range_index = self->included_range_count;
self->chunk = NULL;
}
}
typedef utf8proc_ssize_t (*DecodeFunction)(
const utf8proc_uint8_t *,
utf8proc_ssize_t,
utf8proc_int32_t *
);
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
static void ts_lexer__get_lookahead(Lexer *self) {
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
@ -50,29 +82,37 @@ static void ts_lexer__get_lookahead(Lexer *self) {
return;
}
DecodeFunction decode =
self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate;
UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
? ts_decode_utf8
: ts_decode_utf16;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
// If this chunk ended in the middle of a multi-byte character,
// try again with a fresh chunk.
if (self->data.lookahead == -1 && size < 4) {
if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
ts_lexer__get_chunk(self);
chunk = (const uint8_t *)self->chunk;
size = self->chunk_size;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
}
if (self->data.lookahead == -1) {
if (self->data.lookahead == TS_DECODE_ERROR) {
self->lookahead_size = 1;
}
}
static void ts_lexer__advance(TSLexer *payload, bool skip) {
Lexer *self = (Lexer *)payload;
if (self->chunk == empty_chunk)
return;
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
Lexer *self = (Lexer *)_self;
if (!self->chunk) return;
if (skip) {
LOG("skip", self->data.lookahead);
} else {
LOG("consume", self->data.lookahead);
}
if (self->lookahead_size) {
self->current_position.bytes += self->lookahead_size;
@ -84,53 +124,65 @@ static void ts_lexer__advance(TSLexer *payload, bool skip) {
}
}
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
if (self->current_position.bytes == current_range->end_byte) {
self->current_included_range_index++;
if (self->current_included_range_index == self->included_range_count) {
self->data.lookahead = '\0';
self->lookahead_size = 1;
return;
} else {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
const TSRange *current_range = NULL;
if (self->current_included_range_index < self->included_range_count) {
current_range = &self->included_ranges[self->current_included_range_index];
if (self->current_position.bytes == current_range->end_byte) {
self->current_included_range_index++;
if (self->current_included_range_index < self->included_range_count) {
current_range++;
self->current_position = (Length) {
current_range->start_byte,
current_range->start_point,
};
} else {
current_range = NULL;
}
}
}
if (skip) {
LOG_CHARACTER("skip", self->data.lookahead);
self->token_start_position = self->current_position;
if (skip) self->token_start_position = self->current_position;
if (current_range) {
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
ts_lexer__get_chunk(self);
}
ts_lexer__get_lookahead(self);
} else {
LOG_CHARACTER("consume", self->data.lookahead);
}
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
ts_lexer__get_chunk(self);
}
ts_lexer__get_lookahead(self);
}
static void ts_lexer__mark_end(TSLexer *payload) {
Lexer *self = (Lexer *)payload;
TSRange *current_included_range = &self->included_ranges[self->current_included_range_index];
if (self->current_included_range_index > 0 &&
self->current_position.bytes == current_included_range->start_byte) {
TSRange *previous_included_range = current_included_range - 1;
self->token_end_position = (Length) {
previous_included_range->end_byte,
previous_included_range->end_point,
};
} else {
self->token_end_position = self->current_position;
ts_lexer__clear_chunk(self);
self->data.lookahead = '\0';
self->lookahead_size = 1;
}
}
static uint32_t ts_lexer__get_column(TSLexer *payload) {
Lexer *self = (Lexer *)payload;
// Mark that a token match has completed. This can be called multiple
// times if a longer match is found later.
static void ts_lexer__mark_end(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
if (!ts_lexer__eof(&self->data)) {
// If the lexer is right at the beginning of included range,
// then the token should be considered to end at the *end* of the
// previous included range, rather than here.
TSRange *current_included_range = &self->included_ranges[
self->current_included_range_index
];
if (
self->current_included_range_index > 0 &&
self->current_position.bytes == current_included_range->start_byte
) {
TSRange *previous_included_range = current_included_range - 1;
self->token_end_position = (Length) {
previous_included_range->end_byte,
previous_included_range->end_point,
};
return;
}
}
self->token_end_position = self->current_position;
}
static uint32_t ts_lexer__get_column(TSLexer *_self) {
Lexer *self = (Lexer *)_self;
uint32_t goal_byte = self->current_position.bytes;
self->current_position.bytes -= self->current_position.extent.column;
@ -142,67 +194,69 @@ static uint32_t ts_lexer__get_column(TSLexer *payload) {
uint32_t result = 0;
while (self->current_position.bytes < goal_byte) {
ts_lexer__advance(payload, false);
ts_lexer__advance(&self->data, false);
result++;
}
return result;
}
static bool ts_lexer__is_at_included_range_start(TSLexer *payload) {
const Lexer *self = (const Lexer *)payload;
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
return self->current_position.bytes == current_range->start_byte;
// Is the lexer at a boundary between two disjoint included ranges of
// source code? This is exposed as an API because some languages' external
// scanners need to perform custom actions at these bounaries.
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
const Lexer *self = (const Lexer *)_self;
if (self->current_included_range_index < self->included_range_count) {
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
return self->current_position.bytes == current_range->start_byte;
} else {
return false;
}
}
// The lexer's methods are stored as a struct field so that generated
// parsers can call them without needing to be linked against this library.
void ts_lexer_init(Lexer *self) {
*self = (Lexer) {
.data = {
// The lexer's methods are stored as struct fields so that generated
// parsers can call them without needing to be linked against this
// library.
.advance = ts_lexer__advance,
.mark_end = ts_lexer__mark_end,
.get_column = ts_lexer__get_column,
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
.eof = ts_lexer__eof,
.lookahead = 0,
.result_symbol = 0,
},
.chunk = NULL,
.chunk_size = 0,
.chunk_start = 0,
.current_position = {UINT32_MAX, {0, 0}},
.current_position = {0, {0, 0}},
.logger = {
.payload = NULL,
.log = NULL
},
.included_ranges = NULL,
.included_range_count = 0,
.current_included_range_index = 0,
};
self->included_ranges = NULL;
ts_lexer_set_included_ranges(self, NULL, 0);
ts_lexer_reset(self, length_zero());
}
void ts_lexer_delete(Lexer *self) {
ts_free(self->included_ranges);
}
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
self->data.lookahead = 0;
self->lookahead_size = 0;
self->chunk = 0;
self->chunk_start = 0;
self->chunk_size = 0;
}
static void ts_lexer_goto(Lexer *self, Length position) {
self->current_position = position;
bool found_included_range = false;
// Move to the first valid position at or after the given position.
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *included_range = &self->included_ranges[i];
if (included_range->end_byte > position.bytes) {
if (included_range->start_byte > position.bytes) {
position = (Length) {
self->current_position = (Length) {
.bytes = included_range->start_byte,
.extent = included_range->start_point,
};
@ -214,46 +268,61 @@ static void ts_lexer_goto(Lexer *self, Length position) {
}
}
if (!found_included_range) {
if (found_included_range) {
// If the current position is outside of the current chunk of text,
// then clear out the current chunk of text.
if (self->chunk && (
position.bytes < self->chunk_start ||
position.bytes >= self->chunk_start + self->chunk_size
)) {
ts_lexer__clear_chunk(self);
}
self->lookahead_size = 0;
self->data.lookahead = '\0';
}
// If the given position is beyond any of included ranges, move to the EOF
// state - past the end of the included ranges.
else {
self->current_included_range_index = self->included_range_count;
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
position = (Length) {
self->current_position = (Length) {
.bytes = last_included_range->end_byte,
.extent = last_included_range->end_point,
};
self->chunk = empty_chunk;
self->chunk_start = position.bytes;
self->chunk_size = 2;
ts_lexer__clear_chunk(self);
self->lookahead_size = 1;
self->data.lookahead = '\0';
}
self->token_start_position = position;
self->token_end_position = LENGTH_UNDEFINED;
self->current_position = position;
if (self->chunk && (position.bytes < self->chunk_start ||
position.bytes >= self->chunk_start + self->chunk_size)) {
self->chunk = 0;
self->chunk_start = 0;
self->chunk_size = 0;
}
self->lookahead_size = 0;
self->data.lookahead = 0;
}
void ts_lexer_set_input(Lexer *self, TSInput input) {
self->input = input;
ts_lexer__clear_chunk(self);
ts_lexer_goto(self, self->current_position);
}
// Move the lexer to the given position. This doesn't do any work
// if the parser is already at the given position.
void ts_lexer_reset(Lexer *self, Length position) {
if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position);
if (position.bytes != self->current_position.bytes) {
ts_lexer_goto(self, position);
}
}
void ts_lexer_start(Lexer *self) {
self->token_start_position = self->current_position;
self->token_end_position = LENGTH_UNDEFINED;
self->data.result_symbol = 0;
if (!self->chunk) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
if (
self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK
) ts_lexer__advance((TSLexer *)self, true);
if (!ts_lexer__eof(&self->data)) {
if (!self->chunk_size) ts_lexer__get_chunk(self);
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
if (
self->current_position.bytes == 0 &&
self->data.lookahead == BYTE_ORDER_MARK
) ts_lexer__advance(&self->data, true);
}
}
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
@ -267,7 +336,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
// the character decoding algorithm may have looked at the following byte.
// Therefore, the next byte *after* the current (invalid) character
// affects the interpretation of the current character.
if (self->data.lookahead == -1) {
if (self->data.lookahead == TS_DECODE_ERROR) {
current_lookahead_end_byte++;
}
@ -277,8 +346,8 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
}
void ts_lexer_advance_to_end(Lexer *self) {
while (self->data.lookahead != 0) {
ts_lexer__advance((TSLexer *)self, false);
while (self->chunk) {
ts_lexer__advance(&self->data, false);
}
}
@ -286,30 +355,19 @@ void ts_lexer_mark_end(Lexer *self) {
ts_lexer__mark_end(&self->data);
}
static const TSRange DEFAULT_RANGES[] = {
{
.start_point = {
.row = 0,
.column = 0,
},
.end_point = {
.row = UINT32_MAX,
.column = UINT32_MAX,
},
.start_byte = 0,
.end_byte = UINT32_MAX
}
};
void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count) {
if (!ranges) {
ranges = DEFAULT_RANGES;
void ts_lexer_set_included_ranges(
Lexer *self,
const TSRange *ranges,
uint32_t count
) {
if (count == 0 || !ranges) {
ranges = &DEFAULT_RANGE;
count = 1;
}
size_t sz = count * sizeof(TSRange);
self->included_ranges = ts_realloc(self->included_ranges, sz);
memcpy(self->included_ranges, ranges, sz);
size_t size = count * sizeof(TSRange);
self->included_ranges = ts_realloc(self->included_ranges, size);
memcpy(self->included_ranges, ranges, size);
self->included_range_count = count;
ts_lexer_goto(self, self->current_position);
}

View File

@ -16,7 +16,7 @@ typedef struct {
Length token_start_position;
Length token_end_position;
TSRange * included_ranges;
TSRange *included_ranges;
size_t included_range_count;
size_t current_included_range_index;

View File

@ -2,19 +2,16 @@
//
// The following directories must be added to the include path:
// - include
// - utf8proc
#define _POSIX_C_SOURCE 200112L
#define UTF8PROC_STATIC
#include "./get_changed_ranges.c"
#include "./language.c"
#include "./lexer.c"
#include "./node.c"
#include "./parser.c"
#include "./query.c"
#include "./stack.c"
#include "./subtree.c"
#include "./tree_cursor.c"
#include "./tree.c"
#include "./utf16.c"
#include "utf8proc.c"

View File

@ -415,13 +415,15 @@ TSPoint ts_node_end_point(TSNode self) {
}
TSSymbol ts_node_symbol(TSNode self) {
return ts_node__alias(&self)
? ts_node__alias(&self)
: ts_subtree_symbol(ts_node__subtree(self));
TSSymbol symbol = ts_node__alias(&self);
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
return ts_language_public_symbol(self.tree->language, symbol);
}
const char *ts_node_type(TSNode self) {
return ts_language_symbol_name(self.tree->language, ts_node_symbol(self));
TSSymbol symbol = ts_node__alias(&self);
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
return ts_language_symbol_name(self.tree->language, symbol);
}
char *ts_node_string(TSNode self) {

View File

@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
@ -438,7 +439,7 @@ static Subtree ts_parser__lex(
}
if (self->lexer.current_position.bytes == error_end_position.bytes) {
if (self->lexer.data.lookahead == 0) {
if (self->lexer.data.eof(&self->lexer.data)) {
self->lexer.data.result_symbol = ts_builtin_sym_error;
break;
}
@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
uint32_t count,
int dynamic_precedence,
uint16_t production_id,
bool fragile
bool is_fragile,
bool is_extra
) {
uint32_t initial_version_count = ts_stack_version_count(self->stack);
uint32_t removed_version_count = 0;
@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
TSStateId state = ts_stack_state(self->stack, slice_version);
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
if (fragile || pop.size > 1 || initial_version_count > 1) {
if (is_extra) parent.ptr->extra = true;
if (is_fragile || pop.size > 1 || initial_version_count > 1) {
parent.ptr->fragile_left = true;
parent.ptr->fragile_right = true;
parent.ptr->parse_state = TS_TREE_STATE_NONE;
@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
reduction_version = ts_parser__reduce(
self, version, action.symbol, action.count,
action.dynamic_precedence, action.production_id,
true
true, false
);
}
@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
// Otherwise, re-run the lexer.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
if (lookahead.ptr) {
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
}
// When parsing a non-terminal extra, a null lookahead indicates the
// end of the rule. The reduction is stored in the EOF table entry.
// After the reduction, the lexer needs to be run again.
else {
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
}
}
for (;;) {
@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
case TSParseActionTypeReduce: {
bool is_fragile = table_entry.action_count > 1;
bool is_extra = lookahead.ptr == NULL;
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
StackVersion reduction_version = ts_parser__reduce(
self, version, action.params.symbol, action.params.child_count,
action.params.dynamic_precedence, action.params.production_id,
is_fragile
is_fragile, is_extra
);
if (reduction_version != STACK_VERSION_NONE) {
last_reduction_version = reduction_version;
@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
ts_stack_renumber_version(self->stack, last_reduction_version, version);
LOG_STACK();
state = ts_stack_state(self->stack, version);
// At the end of a non-terminal extra rule, the lexer will return a
// null subtree, because the parser needs to perform a fixed reduction
// regardless of the lookahead node. After performing that reduction,
// (and completing the non-terminal extra rule) run the lexer again based
// on the current parse state.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
}
ts_language_table_entry(
self->language,
state,
@ -1655,6 +1677,7 @@ TSParser *ts_parser_new(void) {
void ts_parser_delete(TSParser *self) {
if (!self) return;
ts_parser_set_language(self, NULL);
ts_stack_delete(self->stack);
if (self->reduce_actions.contents) {
array_delete(&self->reduce_actions);
@ -1670,7 +1693,6 @@ void ts_parser_delete(TSParser *self) {
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
ts_subtree_pool_delete(&self->tree_pool);
reusable_node_delete(&self->reusable_node);
ts_parser_set_language(self, NULL);
ts_free(self);
}
@ -1695,6 +1717,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language) {
}
self->language = language;
ts_parser_reset(self);
return true;
}
@ -1747,7 +1770,7 @@ const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count)
}
void ts_parser_reset(TSParser *self) {
if (self->language->external_scanner.deserialize) {
if (self->language && self->language->external_scanner.deserialize) {
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
}

View File

@ -45,7 +45,8 @@ struct TSLexer {
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
};
typedef enum {
@ -117,6 +118,7 @@ struct TSLanguage {
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSSymbol *public_symbol_map;
};
/*
@ -126,6 +128,7 @@ struct TSLanguage {
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \

View File

@ -3,6 +3,7 @@
#include "tree_sitter/api.h"
#define POINT_ZERO ((TSPoint) {0, 0})
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
static inline TSPoint point__new(unsigned row, unsigned column) {

1450
src/tree_sitter/query.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -18,18 +18,9 @@ typedef struct {
Length new_end;
} Edit;
#ifdef TREE_SITTER_TEST
#define TS_MAX_INLINE_TREE_LENGTH 2
#define TS_MAX_TREE_POOL_SIZE 0
#else
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
#define TS_MAX_TREE_POOL_SIZE 32
#endif
static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}};
// ExternalScannerState
@ -775,10 +766,10 @@ Subtree ts_subtree_last_external_token(Subtree tree) {
}
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
if (c == 0)
return snprintf(s, n, "EOF");
if (c == -1)
return snprintf(s, n, "INVALID");
else if (c == '\0')
return snprintf(s, n, "'\\0'");
else if (c == '\n')
return snprintf(s, n, "'\\n'");
else if (c == '\t')

View File

@ -84,12 +84,10 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
}
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
TSRange *result;
TreeCursor cursor1 = {NULL, array_new()};
TreeCursor cursor2 = {NULL, array_new()};
TSNode root = ts_tree_root_node(self);
ts_tree_cursor_init(&cursor1, root);
ts_tree_cursor_init(&cursor2, root);
ts_tree_cursor_init(&cursor1, ts_tree_root_node(self));
ts_tree_cursor_init(&cursor2, ts_tree_root_node(other));
TSRangeArray included_range_differences = array_new();
ts_range_array_get_changed_ranges(
@ -98,6 +96,7 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin
&included_range_differences
);
TSRange *result;
*count = ts_subtree_get_changed_ranges(
&self->root, &other->root, &cursor1, &cursor2,
self->language, &included_range_differences, &result

View File

@ -244,6 +244,72 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
);
}
TSFieldId ts_tree_cursor_current_status(
const TSTreeCursor *_self,
bool *can_have_later_siblings,
bool *can_have_later_siblings_with_this_field
) {
const TreeCursor *self = (const TreeCursor *)_self;
TSFieldId result = 0;
*can_have_later_siblings = false;
*can_have_later_siblings_with_this_field = false;
// Walk up the tree, visiting the current node and its invisible ancestors,
// because fields can refer to nodes through invisible *wrapper* nodes,
for (unsigned i = self->stack.size - 1; i > 0; i--) {
TreeCursorEntry *entry = &self->stack.contents[i];
TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
// Stop walking up when a visible ancestor is found.
if (i != self->stack.size - 1) {
if (ts_subtree_visible(*entry->subtree)) break;
const TSSymbol *alias_sequence = ts_language_alias_sequence(
self->tree->language,
parent_entry->subtree->ptr->production_id
);
if (alias_sequence && alias_sequence[entry->structural_child_index]) {
break;
}
}
if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) {
*can_have_later_siblings = true;
}
if (ts_subtree_extra(*entry->subtree)) break;
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
// Look for a field name associated with the current node.
if (!result) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (!i->inherited && i->child_index == entry->structural_child_index) {
result = i->field_id;
*can_have_later_siblings_with_this_field = false;
break;
}
}
}
// Determine if there other later siblings with the same field name.
if (result) {
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (i->field_id == result && i->child_index > entry->structural_child_index) {
*can_have_later_siblings_with_this_field = true;
break;
}
}
}
}
return result;
}
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
@ -264,19 +330,18 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
}
}
if (ts_subtree_extra(*entry->subtree)) break;
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
while (field_map < field_map_end) {
if (
!field_map->inherited &&
field_map->child_index == entry->structural_child_index
) return field_map->field_id;
field_map++;
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
if (!i->inherited && i->child_index == entry->structural_child_index) {
return i->field_id;
}
}
}
return 0;

View File

@ -16,5 +16,6 @@ typedef struct {
} TreeCursor;
void ts_tree_cursor_init(TreeCursor *, TSNode);
TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *);
#endif // TREE_SITTER_TREE_CURSOR_H_

50
src/tree_sitter/unicode.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef TREE_SITTER_UNICODE_H_
#define TREE_SITTER_UNICODE_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <limits.h>
#include <stdint.h>
#define U_EXPORT
#define U_EXPORT2
#include "./unicode/utf8.h"
#include "./unicode/utf16.h"
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
// These functions read one unicode code point from the given string,
// returning the number of bytes consumed.
typedef uint32_t (*UnicodeDecodeFunction)(
const uint8_t *string,
uint32_t length,
int32_t *code_point
);
static inline uint32_t ts_decode_utf8(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U8_NEXT(string, i, length, *code_point);
return i;
}
static inline uint32_t ts_decode_utf16(
const uint8_t *string,
uint32_t length,
int32_t *code_point
) {
uint32_t i = 0;
U16_NEXT(((uint16_t *)string), i, length, *code_point);
return i * 2;
}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_UNICODE_H_

View File

@ -0,0 +1 @@
552b01f61127d30d6589aa4bf99468224979b661

View File

@ -0,0 +1,414 @@
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
---------------------
Third-Party Software Licenses
This section contains third-party software notices and/or additional
terms for licensed third-party software components included within ICU
libraries.
1. ICU License - ICU 1.8.1 to ICU 57.1
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2016 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies of
the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale, use
or other dealings in this Software without prior written authorization
of the copyright holder.
All trademarks and registered trademarks mentioned herein are the
property of their respective owners.
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
# The Google Chrome software developed by Google is licensed under
# the BSD license. Other software included in this distribution is
# provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution.
# Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists
# listed below with further processing for compound word breaking. The
# frequency is generated with an iterative training against Google web
# corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia
# * Sinica. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
# University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END--------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN-------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END----------------------------------
3. Lao Word Break Dictionary Data (laodict.txt)
# Copyright (c) 2013 International Business Machines Corporation
# and others. All Rights Reserved.
#
# Project: http://code.google.com/p/lao-dictionary/
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
# (copied below)
#
# This file is derived from the above dictionary, with slight
# modifications.
# ----------------------------------------------------------------------
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification,
# are permitted provided that the following conditions are met:
#
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in
# binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------
4. Burmese Word Break Dictionary Data (burmesedict.txt)
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer. Redistributions in binary form must reproduce the
# above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# --------------------------------------------------------------------------
5. Time Zone Database
ICU uses the public domain data and code derived from Time Zone
Database for its time zone support. The ownership of the TZ database
is explained in BCP 175: Procedure for Maintaining the Time Zone
Database section 7.
# 7. Database Ownership
#
# The TZ database itself is not an IETF Contribution or an IETF
# document. Rather it is a pre-existing and regularly updated work
# that is in the public domain, and is intended to remain in the
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
# not apply to the TZ Database or contributions that individuals make
# to it. Should any claims be made and substantiated against the TZ
# Database, the organization that is providing the IANA
# Considerations defined in this RFC, under the memorandum of
# understanding with the IETF, currently ICANN, may act in accordance
# with all competent court orders. No ownership claims will be made
# by ICANN or the IETF Trust on the database or the code. Any person
# making a contribution to the database or code waives all rights to
# future claims in that contribution or in the TZ Database.
6. Google double-conversion
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,29 @@
# ICU Parts
This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
### License
The license for these files is contained in the `LICENSE` file within this directory.
### Contents
* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
* `utf8.h`
* `utf16.h`
* `umachine.h`
* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
* `ptypes.h`
* `urename.h`
* `utf.h`
* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
* `README.md` - This text file.
### Updating ICU
To incorporate changes from the upstream `icu` repository:
* Update `ICU_SHA` with the new Git SHA.
* Update `LICENSE` with the license text from the directory mentioned above.
* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,448 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: umachine.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file defines basic types and constants for ICU to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*/
#ifndef __UMACHINE_H__
#define __UMACHINE_H__
/**
* \file
* \brief Basic types and constants for UTF
*
* <h2> Basic types and constants for UTF </h2>
* This file defines basic types and constants for utf.h to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*
*/
/*==========================================================================*/
/* Include platform-dependent definitions */
/* which are contained in the platform-specific file platform.h */
/*==========================================================================*/
#include "./ptypes.h" /* platform.h is included in ptypes.h */
/*
* ANSI C headers:
* stddef.h defines wchar_t
*/
#include <stddef.h>
/*==========================================================================*/
/* For C wrappers, we use the symbol U_STABLE. */
/* This works properly if the includer is C or C++. */
/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
/*==========================================================================*/
/**
* \def U_CFUNC
* This is used in a declaration of a library private ICU C function.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_BEGIN
* This is used to begin a declaration of a library private ICU C API.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_END
* This is used to end a declaration of a library private ICU C API
* @stable ICU 2.4
*/
#ifdef __cplusplus
# define U_CFUNC extern "C"
# define U_CDECL_BEGIN extern "C" {
# define U_CDECL_END }
#else
# define U_CFUNC extern
# define U_CDECL_BEGIN
# define U_CDECL_END
#endif
#ifndef U_ATTRIBUTE_DEPRECATED
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for GCC specific attributes
* @internal
*/
#if U_GCC_MAJOR_MINOR >= 302
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for Visual C++ specific attributes
* @internal
*/
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
#else
# define U_ATTRIBUTE_DEPRECATED
#endif
#endif
/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
#define U_CAPI U_CFUNC U_EXPORT
/** This is used to declare a function as a stable public ICU C API*/
#define U_STABLE U_CAPI
/** This is used to declare a function as a draft public ICU C API */
#define U_DRAFT U_CAPI
/** This is used to declare a function as a deprecated public ICU C API */
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
/** This is used to declare a function as an obsolete public ICU C API */
#define U_OBSOLETE U_CAPI
/** This is used to declare a function as an internal ICU C API */
#define U_INTERNAL U_CAPI
/**
* \def U_OVERRIDE
* Defined to the C++11 "override" keyword if available.
* Denotes a class or member which is an override of the base class.
* May result in an error if it applied to something not an override.
* @internal
*/
#ifndef U_OVERRIDE
#define U_OVERRIDE override
#endif
/**
* \def U_FINAL
* Defined to the C++11 "final" keyword if available.
* Denotes a class or member which may not be overridden in subclasses.
* May result in an error if subclasses attempt to override.
* @internal
*/
#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
#define U_FINAL final
#endif
// Before ICU 65, function-like, multi-statement ICU macros were just defined as
// series of statements wrapped in { } blocks and the caller could choose to
// either treat them as if they were actual functions and end the invocation
// with a trailing ; creating an empty statement after the block or else omit
// this trailing ; using the knowledge that the macro would expand to { }.
//
// But doing so doesn't work well with macros that look like functions and
// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
// switches to the standard solution of wrapping such macros in do { } while.
//
// This will however break existing code that depends on being able to invoke
// these macros without a trailing ; so to be able to remain compatible with
// such code the wrapper is itself defined as macros so that it's possible to
// build ICU 65 and later with the old macro behaviour, like this:
//
// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
// runConfigureICU ...
/**
* \def UPRV_BLOCK_MACRO_BEGIN
* Defined as the "do" keyword by default.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_BEGIN
#define UPRV_BLOCK_MACRO_BEGIN do
#endif
/**
* \def UPRV_BLOCK_MACRO_END
* Defined as "while (FALSE)" by default.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_END
#define UPRV_BLOCK_MACRO_END while (FALSE)
#endif
/*==========================================================================*/
/* limits for int32_t etc., like in POSIX inttypes.h */
/*==========================================================================*/
#ifndef INT8_MIN
/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MIN ((int8_t)(-128))
#endif
#ifndef INT16_MIN
/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MIN ((int16_t)(-32767-1))
#endif
#ifndef INT32_MIN
/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MIN ((int32_t)(-2147483647-1))
#endif
#ifndef INT8_MAX
/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
# define INT8_MAX ((int8_t)(127))
#endif
#ifndef INT16_MAX
/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
# define INT16_MAX ((int16_t)(32767))
#endif
#ifndef INT32_MAX
/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
# define INT32_MAX ((int32_t)(2147483647))
#endif
#ifndef UINT8_MAX
/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT8_MAX ((uint8_t)(255U))
#endif
#ifndef UINT16_MAX
/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT16_MAX ((uint16_t)(65535U))
#endif
#ifndef UINT32_MAX
/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
# define UINT32_MAX ((uint32_t)(4294967295U))
#endif
#if defined(U_INT64_T_UNAVAILABLE)
# error int64_t is required for decimal format and rule-based number format.
#else
# ifndef INT64_C
/**
* Provides a platform independent way to specify a signed 64-bit integer constant.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
* @stable ICU 2.8
*/
# define INT64_C(c) c ## LL
# endif
# ifndef UINT64_C
/**
* Provides a platform independent way to specify an unsigned 64-bit integer constant.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
* @stable ICU 2.8
*/
# define UINT64_C(c) c ## ULL
# endif
# ifndef U_INT64_MIN
/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
# endif
# ifndef U_INT64_MAX
/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
# endif
# ifndef U_UINT64_MAX
/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
# endif
#endif
/*==========================================================================*/
/* Boolean data type */
/*==========================================================================*/
/** The ICU boolean type @stable ICU 2.0 */
typedef int8_t UBool;
#ifndef TRUE
/** The TRUE value of a UBool @stable ICU 2.0 */
# define TRUE 1
#endif
#ifndef FALSE
/** The FALSE value of a UBool @stable ICU 2.0 */
# define FALSE 0
#endif
/*==========================================================================*/
/* Unicode data types */
/*==========================================================================*/
/* wchar_t-related definitions -------------------------------------------- */
/*
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
*
* @stable ICU 2.0
*/
/*
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
*
* @stable ICU 2.0
*/
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
# ifdef __STDC_ISO_10646__
# if (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# elif (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif defined __UCS2__
# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# endif
# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
# if (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
# define U_WCHAR_IS_UTF32
# elif U_PLATFORM_HAS_WIN32_API
# define U_WCHAR_IS_UTF16
# endif
#endif
/* UChar and UChar32 definitions -------------------------------------------- */
/** Number of bytes in a UChar. @stable ICU 2.0 */
#define U_SIZEOF_UCHAR 2
/**
* \def U_CHAR16_IS_TYPEDEF
* If 1, then char16_t is a typedef and not a real type (yet)
* @internal
*/
#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11)
// for AIX, uchar.h needs to be included
# include <uchar.h>
# define U_CHAR16_IS_TYPEDEF 1
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
# define U_CHAR16_IS_TYPEDEF 1
#else
# define U_CHAR16_IS_TYPEDEF 0
#endif
/**
* \var UChar
*
* The base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
* Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
*
* UChar is configurable by defining the macro UCHAR_TYPE
* on the preprocessor or compiler command line:
* -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
* (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
* This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
*
* The default is UChar=char16_t.
*
* C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
*
* In C, char16_t is a simple typedef of uint_least16_t.
* ICU requires uint_least16_t=uint16_t for data memory mapping.
* On macOS, char16_t is not available because the uchar.h standard header is missing.
*
* @stable ICU 4.4
*/
#if 1
// #if 1 is normal. UChar defaults to char16_t in C++.
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
// The intltest Makefile #defines UCHAR_TYPE=char16_t,
// so we only #define it to uint16_t if it is undefined so far.
#elif !defined(UCHAR_TYPE)
# define UCHAR_TYPE uint16_t
#endif
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
// Inside the ICU library code, never configurable.
typedef char16_t UChar;
#elif defined(UCHAR_TYPE)
typedef UCHAR_TYPE UChar;
#elif defined(__cplusplus)
typedef char16_t UChar;
#else
typedef uint16_t UChar;
#endif
/**
* \var OldUChar
* Default ICU 58 definition of UChar.
* A base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
*
* Define OldUChar to be wchar_t if that is 16 bits wide.
* If wchar_t is not 16 bits wide, then define UChar to be uint16_t.
*
* This makes the definition of OldUChar platform-dependent
* but allows direct string type compatibility with platforms with
* 16-bit wchar_t types.
*
* This is how UChar was defined in ICU 58, for transition convenience.
* Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
* The current UChar responds to UCHAR_TYPE but OldUChar does not.
*
* @stable ICU 59
*/
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t OldUChar;
#elif defined(__CHAR16_TYPE__)
typedef __CHAR16_TYPE__ OldUChar;
#else
typedef uint16_t OldUChar;
#endif
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer (same as int32_t).
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 was platform-dependent.
*
* @see U_SENTINEL
* @stable ICU 2.4
*/
typedef int32_t UChar32;
/**
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with U_SENTINEL.
*
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* @return -1
* @see UChar32
* @stable ICU 2.4
*/
#define U_SENTINEL (-1)
#include "./urename.h"
#endif

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.

View File

@ -0,0 +1,733 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf16.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep09
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 16-bit Unicode handling macros
*
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF16_H__
#define __UTF16_H__
#include "./umachine.h"
#ifndef __UTF_H__
# include "./utf.h"
#endif
/* single-code point definitions -------------------------------------------- */
/**
* Does this code unit alone encode a code point (BMP, not a surrogate)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
/**
* Is this code unit a lead surrogate (U+d800..U+dbff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
/**
* Is this code unit a surrogate (U+d800..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
/**
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
* is it a lead surrogate?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
/**
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
* is it a trail surrogate?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @stable ICU 4.2
*/
#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
/**
* Helper constant for U16_GET_SUPPLEMENTARY.
* @internal
*/
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
/**
* Get a supplementary code point value (U+10000..U+10ffff)
* from its lead and trail surrogates.
* The result is undefined if the input values are not
* lead and trail surrogates.
*
* @param lead lead surrogate (U+d800..U+dbff)
* @param trail trail surrogate (U+dc00..U+dfff)
* @return supplementary code point (U+10000..U+10ffff)
* @stable ICU 2.4
*/
#define U16_GET_SUPPLEMENTARY(lead, trail) \
(((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
/**
* Get the lead surrogate (0xd800..0xdbff) for a
* supplementary code point (0x10000..0x10ffff).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return lead surrogate (U+d800..U+dbff) for supplementary
* @stable ICU 2.4
*/
#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
/**
* Get the trail surrogate (0xdc00..0xdfff) for a
* supplementary code point (0x10000..0x10ffff).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return trail surrogate (U+dc00..U+dfff) for supplementary
* @stable ICU 2.4
*/
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
/**
* How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
* The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
* @param c 32-bit code point
* @return 1 or 2
* @stable ICU 2.4
*/
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
* @return 2
* @stable ICU 2.4
*/
#define U16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* The result is undefined if the offset points to a single, unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_GET
* @stable ICU 2.4
*/
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
if(U16_IS_SURROGATE_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
} else { \
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 2.4
*/
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 60
*/
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset points to a single, unpaired lead surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_NEXT
* @stable ICU 2.4
*/
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 60
*/
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s const UChar * string buffer
* @param i string offset
* @param c code point to append
* @see U16_APPEND
* @stable ICU 2.4
*/
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a surrogate pair is written, checks for sufficient space in the string.
* If the code point is not valid or a trail surrogate does not fit,
* then isError is set to TRUE.
*
* @param s const UChar * string buffer
* @param i string offset, must be i<capacity
* @param capacity size of the string buffer
* @param c code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U16_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* c>0x10ffff or not enough space */ { \
(isError)=TRUE; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_FWD_1
* @stable ICU 2.4
*/
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @see U16_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_FWD_N
* @stable ICU 2.4
*/
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U16_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
U16_FWD_1(s, i, length); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_START
* @stable ICU 2.4
*/
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i
* @see U16_SET_CP_START_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_PREV
* @stable ICU 2.4
*/
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
uint16_t __c2; \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 60
*/
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_BACK_1
* @stable ICU 2.4
*/
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @see U16_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_BACK_N
* @stable ICU 2.4
*/
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start start of string
* @param i string offset, must be start<i
* @param n number of code points to skip
* @see U16_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U16_BACK_1(s, start, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)-1])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const UChar * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, start<=i<=length
* @param length int32_t string length
* @see U16_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
#endif

View File

@ -0,0 +1,881 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf8.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 8-bit Unicode handling macros
*
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://userguide.icu-project.org/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF8_H__
#define __UTF8_H__
#include "./umachine.h"
#ifndef __UTF_H__
# include "./utf.h"
#endif
/* internal definitions ----------------------------------------------------- */
/**
* Counts the trail bytes for a UTF-8 lead byte.
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
(U8_IS_LEAD(leadByte) ? \
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
/**
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
* @internal
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* Lead byte E0..EF bits 3..0 are used as byte index,
* first trail byte bits 7..5 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD3_AND_T1
* @internal
*/
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
* Internal 3-byte UTF-8 validity check.
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
/**
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* First trail byte bits 7..4 are used as byte index,
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD4_AND_T1
* @internal
*/
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
* Internal 4-byte UTF-8 validity check.
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
/**
* Function for handling "next code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
/**
* Function for handling "append code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
* Function for handling "previous code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
/**
* Function for handling "skip backward one code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
* @internal
*/
U_STABLE int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* single-code point definitions -------------------------------------------- */
/**
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
/**
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
// 0x32=0xf4-0xc2
/**
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
/**
* How many code units (bytes) are used for the UTF-8 encoding
* of this Unicode code point?
* @param c 32-bit code point
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
* @stable ICU 2.4
*/
#define U8_LENGTH(c) \
((uint32_t)(c)<=0x7f ? 1 : \
((uint32_t)(c)<=0x7ff ? 2 : \
((uint32_t)(c)<=0xd7ff ? 3 : \
((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
((uint32_t)(c)<=0xffff ? 3 : 4)\
) \
) \
) \
)
/**
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
* @return 4
* @stable ICU 2.4
*/
#define U8_MAX_LENGTH 4
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
* The result is undefined if the offset points to an illegal UTF-8
* byte sequence.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_GET
* @stable ICU 2.4
*/
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_unsafe_index=(int32_t)(i); \
U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to a negative value.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_GET_UNSAFE
* @stable ICU 2.4
*/
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_index=(i); \
U8_SET_CP_START(s, start, _u8_get_index); \
U8_NEXT(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_GET() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_GET
* @stable ICU 51
*/
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
int32_t _u8_get_index=(i); \
U8_SET_CP_START(s, start, _u8_get_index); \
U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* The result is undefined if the offset points to a trail byte
* or an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_NEXT
* @stable ICU 2.4
*/
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
(i)+=2; \
} else { \
(c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
(i)+=3; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to a negative value.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_NEXT() if that distinction is important.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_NEXT
* @stable ICU 51
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
/** @internal */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t = 0; \
if((i)!=(length) && \
/* fetch/validate/assemble all but last trail byte */ \
((c)>=0xe0 ? \
((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
(__t&=0x3f, 1) \
: /* U+10000..U+10FFFF */ \
((c)-=0xf0)<=4 && \
U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
(__t=(s)[i]-0x80)<=0x3f) && \
/* valid second-to-last trail byte */ \
((c)=((c)<<6)|__t, ++(i)!=(length)) \
: /* U+0080..U+07FF */ \
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
/* last trail byte */ \
(__t=(s)[i]-0x80)<=0x3f && \
((c)=((c)<<6)|__t, ++(i), 1)) { \
} else { \
(c)=(sub); /* ill-formed*/ \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s const uint8_t * string buffer
* @param i string offset
* @param c code point to append
* @see U8_APPEND
* @stable ICU 2.4
*/
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
uint32_t __uc=(c); \
if(__uc<=0x7f) { \
(s)[(i)++]=(uint8_t)__uc; \
} else { \
if(__uc<=0x7ff) { \
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
} else { \
if(__uc<=0xffff) { \
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
} else { \
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a non-ASCII code point is written, checks for sufficient space in the string.
* If the code point is not valid or trail bytes do not fit,
* then isError is set to TRUE.
*
* @param s const uint8_t * string buffer
* @param i int32_t string offset, must be i<capacity
* @param capacity int32_t size of the string buffer
* @param c UChar32 code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U8_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
uint32_t __uc=(c); \
if(__uc<=0x7f) { \
(s)[(i)++]=(uint8_t)__uc; \
} else if(__uc<=0x7ff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
} else { \
(isError)=TRUE; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_FWD_1
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @see U8_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
uint8_t __b=(s)[(i)++]; \
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
uint8_t __t1=(s)[i]; \
if((0xe0<=__b && __b<0xf0)) { \
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} else if(__b<0xe0) { \
if(U8_IS_TRAIL(__t1)) { \
++(i); \
} \
} else /* c>=0xf0 */ { \
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_FWD_N
* @stable ICU 2.4
*/
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U8_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
U8_FWD_1(s, i, length); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_START
* @stable ICU 2.4
*/
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
while(U8_IS_TRAIL((s)[i])) { --(i); } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i
* @see U8_SET_CP_START_UNSAFE
* @see U8_TRUNCATE_IF_INCOMPLETE
* @stable ICU 2.4
*/
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U8_IS_TRAIL((s)[(i)])) { \
(i)=utf8_back1SafeBody(s, start, (i)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* If the string ends with a UTF-8 byte sequence that is valid so far
* but incomplete, then reduce the length of the string to end before
* the lead byte of that incomplete sequence.
* For example, if the string ends with E1 80, the length is reduced by 2.
*
* In all other cases (the string ends with a complete sequence, or it is not
* possible for any further trail byte to extend the trailing sequence)
* the length remains unchanged.
*
* Useful for processing text split across multiple buffers
* (save the incomplete sequence for later)
* and for optimizing iteration
* (check for string length only once per character).
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_SET_CP_START(), this macro never reads s[length].
*
* (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param length int32_t string length (usually start<=length)
* @see U8_SET_CP_START
* @stable ICU 61
*/
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
if((length)>(start)) { \
uint8_t __b1=s[(length)-1]; \
if(U8_IS_SINGLE(__b1)) { \
/* common ASCII character */ \
} else if(U8_IS_LEAD(__b1)) { \
--(length); \
} else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
uint8_t __b2=s[(length)-2]; \
if(0xe0<=__b2 && __b2<=0xf4) { \
if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
(length)-=2; \
} \
} else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
uint8_t __b3=s[(length)-3]; \
if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
(length)-=3; \
} \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_PREV
* @stable ICU 2.4
*/
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(U8_IS_TRAIL(c)) { \
uint8_t __b, __count=1, __shift=6; \
\
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
break; \
} else { \
(c)|=(UChar32)(__b&0x3f)<<__shift; \
++__count; \
__shift+=6; \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_PREV() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_PREV
* @stable ICU 51
*/
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[--(i)]; \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_BACK_1
* @stable ICU 2.4
*/
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
while(U8_IS_TRAIL((s)[--(i)])) {} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @see U8_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U8_IS_TRAIL((s)[--(i)])) { \
(i)=utf8_back1SafeBody(s, start, (i)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_BACK_N
* @stable ICU 2.4
*/
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t index of the start of the string
* @param i int32_t string offset, must be start<i
* @param n number of code points to skip
* @see U8_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U8_BACK_1(s, start, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
U8_BACK_1_UNSAFE(s, i); \
U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i<=length
* @param length int32_t string length
* @see U8_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0)) { \
U8_BACK_1(s, start, i); \
U8_FWD_1(s, i, length); \
} \
} UPRV_BLOCK_MACRO_END
#endif

View File

@ -4,6 +4,9 @@ local Screen = require('test.functional.ui.screen')
local eq, eval = helpers.eq, helpers.eval
local command = helpers.command
local meths = helpers.meths
local funcs = helpers.funcs
local pcall_err = helpers.pcall_err
local ok = helpers.ok
describe('API: highlight',function()
local expected_rgb = {
@ -110,4 +113,20 @@ describe('API: highlight',function()
meths.get_hl_by_name('cursorline', 0));
end)
it('nvim_get_hl_id_by_name', function()
-- precondition: use a hl group that does not yet exist
eq('Invalid highlight name: Shrubbery', pcall_err(meths.get_hl_by_name, "Shrubbery", true))
eq(0, funcs.hlID("Shrubbery"))
local hl_id = meths.get_hl_id_by_name("Shrubbery")
ok(hl_id > 0)
eq(hl_id, funcs.hlID("Shrubbery"))
command('hi Shrubbery guifg=#888888 guibg=#888888')
eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
meths.get_hl_by_id(hl_id, true))
eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
meths.get_hl_by_name("Shrubbery", true))
end)
end)

View File

@ -1,5 +1,6 @@
-- Test suite for testing interactions with API bindings
local helpers = require('test.functional.helpers')(after_each)
local Screen = require('test.functional.ui.screen')
local clear = helpers.clear
local eq = helpers.eq
@ -26,122 +27,371 @@ describe('treesitter API', function()
pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')"))
end)
end)
describe('treesitter API with C parser', function()
local ts_path = os.getenv("TREE_SITTER_DIR")
describe('with C parser', function()
if ts_path == nil then
it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
return
end
-- The tests after this requires an actual parser
if ts_path == nil then
it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
return
end
before_each(function()
local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
exec_lua([[
local path = ...
vim.treesitter.add_language(path,'c')
]], path)
end)
before_each(function()
local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
exec_lua([[
local path = ...
vim.treesitter.add_language(path,'c')
]], path)
end)
it('parses buffer', function()
insert([[
int main() {
int x = 3;
}]])
it('parses buffer', function()
insert([[
int main() {
int x = 3;
}]])
exec_lua([[
parser = vim.treesitter.get_parser(0, "c")
tree = parser:parse()
root = tree:root()
lang = vim.treesitter.inspect_language('c')
]])
exec_lua([[
parser = vim.treesitter.get_parser(0, "c")
tree = parser:parse()
root = tree:root()
lang = vim.treesitter.inspect_language('c')
]])
eq("<tree>", exec_lua("return tostring(tree)"))
eq("<node translation_unit>", exec_lua("return tostring(root)"))
eq({0,0,3,0}, exec_lua("return {root:range()}"))
eq("<tree>", exec_lua("return tostring(tree)"))
eq("<node translation_unit>", exec_lua("return tostring(root)"))
eq({0,0,3,0}, exec_lua("return {root:range()}"))
eq(1, exec_lua("return root:child_count()"))
exec_lua("child = root:child(0)")
eq("<node function_definition>", exec_lua("return tostring(child)"))
eq({0,0,2,1}, exec_lua("return {child:range()}"))
eq(1, exec_lua("return root:child_count()"))
exec_lua("child = root:child(0)")
eq("<node function_definition>", exec_lua("return tostring(child)"))
eq({0,0,2,1}, exec_lua("return {child:range()}"))
eq("function_definition", exec_lua("return child:type()"))
eq(true, exec_lua("return child:named()"))
eq("number", type(exec_lua("return child:symbol()")))
eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
eq("function_definition", exec_lua("return child:type()"))
eq(true, exec_lua("return child:named()"))
eq("number", type(exec_lua("return child:symbol()")))
eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
exec_lua("anon = root:descendant_for_range(0,8,0,9)")
eq("(", exec_lua("return anon:type()"))
eq(false, exec_lua("return anon:named()"))
eq("number", type(exec_lua("return anon:symbol()")))
eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
exec_lua("anon = root:descendant_for_range(0,8,0,9)")
eq("(", exec_lua("return anon:type()"))
eq(false, exec_lua("return anon:named()"))
eq("number", type(exec_lua("return anon:symbol()")))
eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
eq("<node declaration>", exec_lua("return tostring(descendant)"))
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
eq("<node declaration>", exec_lua("return tostring(descendant)"))
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
eq(true, exec_lua("return child == child"))
-- separate lua object, but represents same node
eq(true, exec_lua("return child == root:child(0)"))
eq(false, exec_lua("return child == descendant2"))
eq(false, exec_lua("return child == nil"))
eq(false, exec_lua("return child == tree"))
eq(true, exec_lua("return child == child"))
-- separate lua object, but represents same node
eq(true, exec_lua("return child == root:child(0)"))
eq(false, exec_lua("return child == descendant2"))
eq(false, exec_lua("return child == nil"))
eq(false, exec_lua("return child == tree"))
feed("2G7|ay")
exec_lua([[
tree2 = parser:parse()
root2 = tree2:root()
descendant2 = root2:descendant_for_range(1,2,1,13)
]])
eq(false, exec_lua("return tree2 == tree1"))
eq(false, exec_lua("return root2 == root"))
eq("<node declaration>", exec_lua("return tostring(descendant2)"))
eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
feed("2G7|ay")
exec_lua([[
tree2 = parser:parse()
root2 = tree2:root()
descendant2 = root2:descendant_for_range(1,2,1,13)
]])
eq(false, exec_lua("return tree2 == tree1"))
eq(false, exec_lua("return root2 == root"))
eq("<node declaration>", exec_lua("return tostring(descendant2)"))
eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
-- orginal tree did not change
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
-- orginal tree did not change
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
-- unchanged buffer: return the same tree
eq(true, exec_lua("return parser:parse() == tree2"))
end)
-- unchanged buffer: return the same tree
eq(true, exec_lua("return parser:parse() == tree2"))
end)
it('inspects language', function()
local keys, fields, symbols = unpack(exec_lua([[
local lang = vim.treesitter.inspect_language('c')
local keys, symbols = {}, {}
for k,_ in pairs(lang) do
keys[k] = true
end
local test_text = [[
void ui_refresh(void)
{
int width = INT_MAX, height = INT_MAX;
bool ext_widgets[kUIExtCount];
for (UIExtension i = 0; (int)i < kUIExtCount; i++) {
ext_widgets[i] = true;
}
-- symbols array can have "holes" and is thus not a valid msgpack array
-- but we don't care about the numbers here (checked in the parser test)
for _, v in pairs(lang.symbols) do
table.insert(symbols, v)
end
return {keys, lang.fields, symbols}
]]))
bool inclusive = ui_override();
for (size_t i = 0; i < ui_count; i++) {
UI *ui = uis[i];
width = MIN(ui->width, width);
height = MIN(ui->height, height);
foo = BAR(ui->bazaar, bazaar);
for (UIExtension j = 0; (int)j < kUIExtCount; j++) {
ext_widgets[j] &= (ui->ui_ext[j] || inclusive);
}
}
}]]
eq({fields=true, symbols=true}, keys)
local query = [[
((call_expression function: (identifier) @minfunc (argument_list (identifier) @min_id)) (eq? @minfunc "MIN"))
"for" @keyword
(primitive_type) @type
(field_expression argument: (identifier) @fieldarg)
]]
local fset = {}
for _,f in pairs(fields) do
eq("string", type(f))
fset[f] = true
it('support query and iter by capture', function()
insert(test_text)
local res = exec_lua([[
cquery = vim.treesitter.parse_query("c", ...)
parser = vim.treesitter.get_parser(0, "c")
tree = parser:parse()
res = {}
for cid, node in cquery:iter_captures(tree:root(), 0, 7, 14) do
-- can't transmit node over RPC. just check the name and range
table.insert(res, {cquery.captures[cid], node:type(), node:range()})
end
return res
]], query)
eq({
{ "type", "primitive_type", 8, 2, 8, 6 },
{ "keyword", "for", 9, 2, 9, 5 },
{ "type", "primitive_type", 9, 7, 9, 13 },
{ "minfunc", "identifier", 11, 12, 11, 15 },
{ "fieldarg", "identifier", 11, 16, 11, 18 },
{ "min_id", "identifier", 11, 27, 11, 32 },
{ "minfunc", "identifier", 12, 13, 12, 16 },
{ "fieldarg", "identifier", 12, 17, 12, 19 },
{ "min_id", "identifier", 12, 29, 12, 35 },
{ "fieldarg", "identifier", 13, 14, 13, 16 }
}, res)
end)
it('support query and iter by match', function()
insert(test_text)
local res = exec_lua([[
cquery = vim.treesitter.parse_query("c", ...)
parser = vim.treesitter.get_parser(0, "c")
tree = parser:parse()
res = {}
for pattern, match in cquery:iter_matches(tree:root(), 0, 7, 14) do
-- can't transmit node over RPC. just check the name and range
local mrepr = {}
for cid,node in pairs(match) do
table.insert(mrepr, {cquery.captures[cid], node:type(), node:range()})
end
eq(true, fset["directive"])
eq(true, fset["initializer"])
table.insert(res, {pattern, mrepr})
end
return res
]], query)
local has_named, has_anonymous
for _,s in pairs(symbols) do
eq("string", type(s[1]))
eq("boolean", type(s[2]))
if s[1] == "for_statement" and s[2] == true then
has_named = true
elseif s[1] == "|=" and s[2] == false then
has_anonymous = true
end
eq({
{ 3, { { "type", "primitive_type", 8, 2, 8, 6 } } },
{ 2, { { "keyword", "for", 9, 2, 9, 5 } } },
{ 3, { { "type", "primitive_type", 9, 7, 9, 13 } } },
{ 4, { { "fieldarg", "identifier", 11, 16, 11, 18 } } },
{ 1, { { "minfunc", "identifier", 11, 12, 11, 15 }, { "min_id", "identifier", 11, 27, 11, 32 } } },
{ 4, { { "fieldarg", "identifier", 12, 17, 12, 19 } } },
{ 1, { { "minfunc", "identifier", 12, 13, 12, 16 }, { "min_id", "identifier", 12, 29, 12, 35 } } },
{ 4, { { "fieldarg", "identifier", 13, 14, 13, 16 } } }
}, res)
end)
it('supports highlighting', function()
local hl_text = [[
/// Schedule Lua callback on main loop's event queue
static int nlua_schedule(lua_State *const lstate)
{
if (lua_type(lstate, 1) != LUA_TFUNCTION
|| lstate != lstate) {
lua_pushliteral(lstate, "vim.schedule: expected function");
return lua_error(lstate);
}
LuaRef cb = nlua_ref(lstate, 1);
multiqueue_put(main_loop.events, nlua_schedule_event,
1, (void *)(ptrdiff_t)cb);
return 0;
}]]
local hl_query = [[
(ERROR) @ErrorMsg
"if" @keyword
"else" @keyword
"for" @keyword
"return" @keyword
"const" @type
"static" @type
"struct" @type
"enum" @type
"extern" @type
(string_literal) @string
(number_literal) @number
(char_literal) @string
; TODO(bfredl): overlapping matches are unreliable,
; we need a proper priority mechanism
;(type_identifier) @type
((type_identifier) @Special (eq? @Special "LuaRef"))
(primitive_type) @type
(sized_type_specifier) @type
((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) (eq? @WarningMsg.left @WarningMsg.right))
(comment) @comment
]]
local screen = Screen.new(65, 18)
screen:attach()
screen:set_default_attr_ids({
[1] = {bold = true, foreground = Screen.colors.Blue1},
[2] = {foreground = Screen.colors.Blue1},
[3] = {bold = true, foreground = Screen.colors.SeaGreen4},
[4] = {bold = true, foreground = Screen.colors.Brown},
[5] = {foreground = Screen.colors.Magenta},
[6] = {foreground = Screen.colors.Red},
[7] = {foreground = Screen.colors.SlateBlue},
[8] = {foreground = Screen.colors.Grey100, background = Screen.colors.Red},
[9] = {foreground = Screen.colors.Magenta, background = Screen.colors.Red},
[10] = {foreground = Screen.colors.Red, background = Screen.colors.Red},
})
insert(hl_text)
screen:expect{grid=[[
/// Schedule Lua callback on main loop's event queue |
static int nlua_schedule(lua_State *const lstate) |
{ |
if (lua_type(lstate, 1) != LUA_TFUNCTION |
|| lstate != lstate) { |
lua_pushliteral(lstate, "vim.schedule: expected function"); |
return lua_error(lstate); |
} |
|
LuaRef cb = nlua_ref(lstate, 1); |
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
1, (void *)(ptrdiff_t)cb); |
return 0; |
^} |
{1:~ }|
{1:~ }|
|
]]}
exec_lua([[
local TSHighlighter = vim.treesitter.TSHighlighter
local query = ...
test_hl = TSHighlighter.new(query, 0, "c")
]], hl_query)
screen:expect{grid=[[
{2:/// Schedule Lua callback on main loop's event queue} |
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
{ |
{4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
|| {6:lstate} != {6:lstate}) { |
lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
{4:return} lua_error(lstate); |
} |
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
{4:return} {5:0}; |
^} |
{1:~ }|
{1:~ }|
|
]]}
feed('7Go*/<esc>')
screen:expect{grid=[[
{2:/// Schedule Lua callback on main loop's event queue} |
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
{ |
{4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
|| {6:lstate} != {6:lstate}) { |
lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
{4:return} lua_error(lstate); |
{8:*^/} |
} |
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
{4:return} {5:0}; |
} |
{1:~ }|
|
]]}
feed('3Go/*<esc>')
screen:expect{grid=[[
{2:/// Schedule Lua callback on main loop's event queue} |
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
{ |
{2:/^*} |
{2: if (lua_type(lstate, 1) != LUA_TFUNCTION} |
{2: || lstate != lstate) {} |
{2: lua_pushliteral(lstate, "vim.schedule: expected function");} |
{2: return lua_error(lstate);} |
{2:*/} |
} |
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
{4:return} {5:0}; |
{8:}} |
|
]]}
end)
it('inspects language', function()
local keys, fields, symbols = unpack(exec_lua([[
local lang = vim.treesitter.inspect_language('c')
local keys, symbols = {}, {}
for k,_ in pairs(lang) do
keys[k] = true
end
eq({true,true}, {has_named,has_anonymous})
end)
-- symbols array can have "holes" and is thus not a valid msgpack array
-- but we don't care about the numbers here (checked in the parser test)
for _, v in pairs(lang.symbols) do
table.insert(symbols, v)
end
return {keys, lang.fields, symbols}
]]))
eq({fields=true, symbols=true}, keys)
local fset = {}
for _,f in pairs(fields) do
eq("string", type(f))
fset[f] = true
end
eq(true, fset["directive"])
eq(true, fset["initializer"])
local has_named, has_anonymous
for _,s in pairs(symbols) do
eq("string", type(s[1]))
eq("boolean", type(s[2]))
if s[1] == "for_statement" and s[2] == true then
has_named = true
elseif s[1] == "|=" and s[2] == false then
has_anonymous = true
end
end
eq({true,true}, {has_named,has_anonymous})
end)
end)