Merge pull request #11113 from bfredl/tree-sitter-query
tree-sitter step 2: query API and highlighting prototype
This commit is contained in:
commit
9e9dcd4bd7
|
@ -594,6 +594,102 @@ tsnode:named_descendant_for_range(start_row, start_col, end_row, end_col)
|
||||||
Get the smallest named node within this node that spans the given
|
Get the smallest named node within this node that spans the given
|
||||||
range of (row, column) positions
|
range of (row, column) positions
|
||||||
|
|
||||||
|
Query methods *lua-treesitter-query*
|
||||||
|
|
||||||
|
Tree-sitter queries are supported, with some limitations. Currently, the only
|
||||||
|
supported match predicate is `eq?` (both comparing a capture against a string
|
||||||
|
and two captures against each other).
|
||||||
|
|
||||||
|
vim.treesitter.parse_query(lang, query)
|
||||||
|
*vim.treesitter.parse_query(()*
|
||||||
|
Parse the query as a string. (If the query is in a file, the caller
|
||||||
|
should read the contents into a string before calling).
|
||||||
|
|
||||||
|
query:iter_captures(node, bufnr, start_row, end_row)
|
||||||
|
*query:iter_captures()*
|
||||||
|
Iterate over all captures from all matches inside a `node`.
|
||||||
|
`bufnr` is needed if the query contains predicates, then the caller
|
||||||
|
must ensure to use a freshly parsed tree consistent with the current
|
||||||
|
text of the buffer. `start_row` and `end_row` can be used to limit
|
||||||
|
matches inside a row range (this is typically used with root node
|
||||||
|
as the node, i e to get syntax highlight matches in the current
|
||||||
|
viewport)
|
||||||
|
|
||||||
|
The iterator returns two values, a numeric id identifying the capture
|
||||||
|
and the captured node. The following example shows how to get captures
|
||||||
|
by name:
|
||||||
|
>
|
||||||
|
for id, node in query:iter_captures(tree:root(), bufnr, first, last) do
|
||||||
|
local name = query.captures[id] -- name of the capture in the query
|
||||||
|
-- typically useful info about the node:
|
||||||
|
local type = node:type() -- type of the captured node
|
||||||
|
local row1, col1, row2, col2 = node:range() -- range of the capture
|
||||||
|
... use the info here ...
|
||||||
|
end
|
||||||
|
<
|
||||||
|
query:iter_matches(node, bufnr, start_row, end_row)
|
||||||
|
*query:iter_matches()*
|
||||||
|
Iterate over all matches within a node. The arguments are the same as
|
||||||
|
for |query:iter_captures()| but the iterated values are different:
|
||||||
|
an (1-based) index of the pattern in the query, and a table mapping
|
||||||
|
capture indices to nodes. If the query has more than one pattern
|
||||||
|
the capture table might be sparse, and e.g. `pairs` should be used and not
|
||||||
|
`ipairs`. Here an example iterating over all captures in
|
||||||
|
every match:
|
||||||
|
>
|
||||||
|
for pattern, match in cquery:iter_matches(tree:root(), bufnr, first, last) do
|
||||||
|
for id,node in pairs(match) do
|
||||||
|
local name = query.captures[id]
|
||||||
|
-- `node` was captured by the `name` capture in the match
|
||||||
|
... use the info here ...
|
||||||
|
end
|
||||||
|
end
|
||||||
|
>
|
||||||
|
Treesitter syntax highlighting (WIP) *lua-treesitter-highlight*
|
||||||
|
|
||||||
|
NOTE: This is a partially implemented feature, and not usable as a default
|
||||||
|
solution yet. What is documented here is a temporary interface indented
|
||||||
|
for those who want to experiment with this feature and contribute to
|
||||||
|
its development.
|
||||||
|
|
||||||
|
Highlights are defined in the same query format as in the tree-sitter highlight
|
||||||
|
crate, which some limitations and additions. Set a highlight query for a
|
||||||
|
buffer with this code: >
|
||||||
|
|
||||||
|
local query = [[
|
||||||
|
"for" @keyword
|
||||||
|
"if" @keyword
|
||||||
|
"return" @keyword
|
||||||
|
|
||||||
|
(string_literal) @string
|
||||||
|
(number_literal) @number
|
||||||
|
(comment) @comment
|
||||||
|
|
||||||
|
(preproc_function_def name: (identifier) @function)
|
||||||
|
|
||||||
|
; ... more definitions
|
||||||
|
]]
|
||||||
|
|
||||||
|
highlighter = vim.treesitter.TSHighlighter.new(query, bufnr, lang)
|
||||||
|
-- alternatively, to use the current buffer and its filetype:
|
||||||
|
-- highlighter = vim.treesitter.TSHighlighter.new(query)
|
||||||
|
|
||||||
|
-- Don't recreate the highlighter for the same buffer, instead
|
||||||
|
-- modify the query like this:
|
||||||
|
local query2 = [[ ... ]]
|
||||||
|
highlighter:set_query(query2)
|
||||||
|
|
||||||
|
As mentioned above the supported predicate is currently only `eq?`. `match?`
|
||||||
|
predicates behave like matching always fails. As an addition a capture which
|
||||||
|
begin with an upper-case letter like `@WarningMsg` will map directly to this
|
||||||
|
highlight group, if defined. Also if the predicate begins with upper-case and
|
||||||
|
contains a dot only the part before the first will be interpreted as the
|
||||||
|
highlight group. As an example, this warns of a binary expression with two
|
||||||
|
identical identifiers, highlighting both as |hl-WarningMsg|: >
|
||||||
|
|
||||||
|
((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right)
|
||||||
|
(eq? @WarningMsg.left @WarningMsg.right))
|
||||||
|
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
VIM *lua-builtin*
|
VIM *lua-builtin*
|
||||||
|
|
||||||
|
|
|
@ -12,9 +12,13 @@ function Parser:parse()
|
||||||
if self.valid then
|
if self.valid then
|
||||||
return self.tree
|
return self.tree
|
||||||
end
|
end
|
||||||
self.tree = self._parser:parse_buf(self.bufnr)
|
local changes
|
||||||
|
self.tree, changes = self._parser:parse_buf(self.bufnr)
|
||||||
self.valid = true
|
self.valid = true
|
||||||
return self.tree
|
for _, cb in ipairs(self.change_cbs) do
|
||||||
|
cb(changes)
|
||||||
|
end
|
||||||
|
return self.tree, changes
|
||||||
end
|
end
|
||||||
|
|
||||||
function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size)
|
function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size)
|
||||||
|
@ -26,17 +30,28 @@ function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_
|
||||||
self.valid = false
|
self.valid = false
|
||||||
end
|
end
|
||||||
|
|
||||||
local module = {
|
local M = {
|
||||||
add_language=vim._ts_add_language,
|
add_language=vim._ts_add_language,
|
||||||
inspect_language=vim._ts_inspect_language,
|
inspect_language=vim._ts_inspect_language,
|
||||||
|
parse_query = vim._ts_parse_query,
|
||||||
}
|
}
|
||||||
|
|
||||||
function module.create_parser(bufnr, ft, id)
|
setmetatable(M, {
|
||||||
|
__index = function (t, k)
|
||||||
|
if k == "TSHighlighter" then
|
||||||
|
t[k] = require'vim.tshighlighter'
|
||||||
|
return t[k]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
})
|
||||||
|
|
||||||
|
function M.create_parser(bufnr, ft, id)
|
||||||
if bufnr == 0 then
|
if bufnr == 0 then
|
||||||
bufnr = a.nvim_get_current_buf()
|
bufnr = a.nvim_get_current_buf()
|
||||||
end
|
end
|
||||||
local self = setmetatable({bufnr=bufnr, valid=false}, Parser)
|
local self = setmetatable({bufnr=bufnr, lang=ft, valid=false}, Parser)
|
||||||
self._parser = vim._create_ts_parser(ft)
|
self._parser = vim._create_ts_parser(ft)
|
||||||
|
self.change_cbs = {}
|
||||||
self:parse()
|
self:parse()
|
||||||
-- TODO(bfredl): use weakref to self, so that the parser is free'd is no plugin is
|
-- TODO(bfredl): use weakref to self, so that the parser is free'd is no plugin is
|
||||||
-- using it.
|
-- using it.
|
||||||
|
@ -55,7 +70,7 @@ function module.create_parser(bufnr, ft, id)
|
||||||
return self
|
return self
|
||||||
end
|
end
|
||||||
|
|
||||||
function module.get_parser(bufnr, ft)
|
function M.get_parser(bufnr, ft, cb)
|
||||||
if bufnr == nil or bufnr == 0 then
|
if bufnr == nil or bufnr == 0 then
|
||||||
bufnr = a.nvim_get_current_buf()
|
bufnr = a.nvim_get_current_buf()
|
||||||
end
|
end
|
||||||
|
@ -65,9 +80,98 @@ function module.get_parser(bufnr, ft)
|
||||||
local id = tostring(bufnr)..'_'..ft
|
local id = tostring(bufnr)..'_'..ft
|
||||||
|
|
||||||
if parsers[id] == nil then
|
if parsers[id] == nil then
|
||||||
parsers[id] = module.create_parser(bufnr, ft, id)
|
parsers[id] = M.create_parser(bufnr, ft, id)
|
||||||
|
end
|
||||||
|
if cb ~= nil then
|
||||||
|
table.insert(parsers[id].change_cbs, cb)
|
||||||
end
|
end
|
||||||
return parsers[id]
|
return parsers[id]
|
||||||
end
|
end
|
||||||
|
|
||||||
return module
|
-- query: pattern matching on trees
|
||||||
|
-- predicate matching is implemented in lua
|
||||||
|
local Query = {}
|
||||||
|
Query.__index = Query
|
||||||
|
|
||||||
|
function M.parse_query(lang, query)
|
||||||
|
local self = setmetatable({}, Query)
|
||||||
|
self.query = vim._ts_parse_query(lang, query)
|
||||||
|
self.info = self.query:inspect()
|
||||||
|
self.captures = self.info.captures
|
||||||
|
return self
|
||||||
|
end
|
||||||
|
|
||||||
|
local function get_node_text(node, bufnr)
|
||||||
|
local start_row, start_col, end_row, end_col = node:range()
|
||||||
|
if start_row ~= end_row then
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
local line = a.nvim_buf_get_lines(bufnr, start_row, start_row+1, true)[1]
|
||||||
|
return string.sub(line, start_col+1, end_col)
|
||||||
|
end
|
||||||
|
|
||||||
|
local function match_preds(match, preds, bufnr)
|
||||||
|
for _, pred in pairs(preds) do
|
||||||
|
if pred[1] == "eq?" then
|
||||||
|
local node = match[pred[2]]
|
||||||
|
local node_text = get_node_text(node, bufnr)
|
||||||
|
|
||||||
|
local str
|
||||||
|
if type(pred[3]) == "string" then
|
||||||
|
-- (eq? @aa "foo")
|
||||||
|
str = pred[3]
|
||||||
|
else
|
||||||
|
-- (eq? @aa @bb)
|
||||||
|
str = get_node_text(match[pred[3]], bufnr)
|
||||||
|
end
|
||||||
|
|
||||||
|
if node_text ~= str or str == nil then
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
else
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
|
|
||||||
|
function Query:iter_captures(node, bufnr, start, stop)
|
||||||
|
if bufnr == 0 then
|
||||||
|
bufnr = vim.api.nvim_get_current_buf()
|
||||||
|
end
|
||||||
|
local raw_iter = node:_rawquery(self.query,true,start,stop)
|
||||||
|
local function iter()
|
||||||
|
local capture, captured_node, match = raw_iter()
|
||||||
|
if match ~= nil then
|
||||||
|
local preds = self.info.patterns[match.pattern]
|
||||||
|
local active = match_preds(match, preds, bufnr)
|
||||||
|
match.active = active
|
||||||
|
if not active then
|
||||||
|
return iter() -- tail call: try next match
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return capture, captured_node
|
||||||
|
end
|
||||||
|
return iter
|
||||||
|
end
|
||||||
|
|
||||||
|
function Query:iter_matches(node, bufnr, start, stop)
|
||||||
|
if bufnr == 0 then
|
||||||
|
bufnr = vim.api.nvim_get_current_buf()
|
||||||
|
end
|
||||||
|
local raw_iter = node:_rawquery(self.query,false,start,stop)
|
||||||
|
local function iter()
|
||||||
|
local pattern, match = raw_iter()
|
||||||
|
if match ~= nil then
|
||||||
|
local preds = self.info.patterns[pattern]
|
||||||
|
local active = (not preds) or match_preds(match, preds, bufnr)
|
||||||
|
if not active then
|
||||||
|
return iter() -- tail call: try next match
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return pattern, match
|
||||||
|
end
|
||||||
|
return iter
|
||||||
|
end
|
||||||
|
|
||||||
|
return M
|
||||||
|
|
|
@ -0,0 +1,142 @@
|
||||||
|
local a = vim.api
|
||||||
|
|
||||||
|
-- support reload for quick experimentation
|
||||||
|
local TSHighlighter = rawget(vim.treesitter, 'TSHighlighter') or {}
|
||||||
|
TSHighlighter.__index = TSHighlighter
|
||||||
|
|
||||||
|
-- These are conventions defined by tree-sitter, though it
|
||||||
|
-- needs to be user extensible also.
|
||||||
|
-- TODO(bfredl): this is very much incomplete, we will need to
|
||||||
|
-- go through a few tree-sitter provided queries and decide
|
||||||
|
-- on translations that makes the most sense.
|
||||||
|
TSHighlighter.hl_map = {
|
||||||
|
keyword="Keyword",
|
||||||
|
string="String",
|
||||||
|
type="Type",
|
||||||
|
comment="Comment",
|
||||||
|
constant="Constant",
|
||||||
|
operator="Operator",
|
||||||
|
number="Number",
|
||||||
|
label="Label",
|
||||||
|
["function"]="Function",
|
||||||
|
["function.special"]="Function",
|
||||||
|
}
|
||||||
|
|
||||||
|
function TSHighlighter.new(query, bufnr, ft)
|
||||||
|
local self = setmetatable({}, TSHighlighter)
|
||||||
|
self.parser = vim.treesitter.get_parser(bufnr, ft, function(...) self:on_change(...) end)
|
||||||
|
self.buf = self.parser.bufnr
|
||||||
|
-- TODO(bfredl): perhaps on_start should be called uncondionally, instead for only on mod?
|
||||||
|
local tree = self.parser:parse()
|
||||||
|
self.root = tree:root()
|
||||||
|
self:set_query(query)
|
||||||
|
self.edit_count = 0
|
||||||
|
self.redraw_count = 0
|
||||||
|
self.line_count = {}
|
||||||
|
a.nvim_buf_set_option(self.buf, "syntax", "")
|
||||||
|
a.nvim__buf_set_luahl(self.buf, {
|
||||||
|
on_start=function(...) return self:on_start(...) end,
|
||||||
|
on_window=function(...) return self:on_window(...) end,
|
||||||
|
on_line=function(...) return self:on_line(...) end,
|
||||||
|
})
|
||||||
|
|
||||||
|
-- Tricky: if syntax hasn't been enabled, we need to reload color scheme
|
||||||
|
-- but use synload.vim rather than syntax.vim to not enable
|
||||||
|
-- syntax FileType autocmds. Later on we should integrate with the
|
||||||
|
-- `:syntax` and `set syntax=...` machinery properly.
|
||||||
|
if vim.g.syntax_on ~= 1 then
|
||||||
|
vim.api.nvim_command("runtime! syntax/synload.vim")
|
||||||
|
end
|
||||||
|
return self
|
||||||
|
end
|
||||||
|
|
||||||
|
function TSHighlighter:set_query(query)
|
||||||
|
if type(query) == "string" then
|
||||||
|
query = vim.treesitter.parse_query(self.parser.lang, query)
|
||||||
|
end
|
||||||
|
self.query = query
|
||||||
|
|
||||||
|
self.id_map = {}
|
||||||
|
for i, capture in ipairs(self.query.captures) do
|
||||||
|
local hl = 0
|
||||||
|
local firstc = string.sub(capture, 1, 1)
|
||||||
|
local hl_group = self.hl_map[capture]
|
||||||
|
if firstc ~= string.lower(firstc) then
|
||||||
|
hl_group = vim.split(capture, '.', true)[1]
|
||||||
|
end
|
||||||
|
if hl_group then
|
||||||
|
hl = a.nvim_get_hl_id_by_name(hl_group)
|
||||||
|
end
|
||||||
|
self.id_map[i] = hl
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function TSHighlighter:on_change(changes)
|
||||||
|
for _, ch in ipairs(changes or {}) do
|
||||||
|
a.nvim__buf_redraw_range(self.buf, ch[1], ch[3]+1)
|
||||||
|
end
|
||||||
|
self.edit_count = self.edit_count + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function TSHighlighter:on_start(_, _buf, _tick)
|
||||||
|
local tree = self.parser:parse()
|
||||||
|
self.root = tree:root()
|
||||||
|
end
|
||||||
|
|
||||||
|
function TSHighlighter:on_window(_, _win, _buf, _topline, botline)
|
||||||
|
self.iter = nil
|
||||||
|
self.active_nodes = {}
|
||||||
|
self.nextrow = 0
|
||||||
|
self.botline = botline
|
||||||
|
self.redraw_count = self.redraw_count + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function TSHighlighter:on_line(_, _win, buf, line)
|
||||||
|
if self.iter == nil then
|
||||||
|
self.iter = self.query:iter_captures(self.root,buf,line,self.botline)
|
||||||
|
end
|
||||||
|
while line >= self.nextrow do
|
||||||
|
local capture, node, match = self.iter()
|
||||||
|
local active = true
|
||||||
|
if capture == nil then
|
||||||
|
break
|
||||||
|
end
|
||||||
|
if match ~= nil then
|
||||||
|
active = self:run_pred(match)
|
||||||
|
match.active = active
|
||||||
|
end
|
||||||
|
local start_row, start_col, end_row, end_col = node:range()
|
||||||
|
local hl = self.id_map[capture]
|
||||||
|
if hl > 0 and active then
|
||||||
|
if start_row == line and end_row == line then
|
||||||
|
a.nvim__put_attr(hl, start_col, end_col)
|
||||||
|
elseif end_row >= line then
|
||||||
|
-- TODO(bfredl): this is quite messy. Togheter with multiline bufhl we should support
|
||||||
|
-- luahl generating multiline highlights (and other kinds of annotations)
|
||||||
|
self.active_nodes[{hl=hl, start_row=start_row, start_col=start_col, end_row=end_row, end_col=end_col}] = true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if start_row > line then
|
||||||
|
self.nextrow = start_row
|
||||||
|
end
|
||||||
|
end
|
||||||
|
for node,_ in pairs(self.active_nodes) do
|
||||||
|
if node.start_row <= line and node.end_row >= line then
|
||||||
|
local start_col, end_col = node.start_col, node.end_col
|
||||||
|
if node.start_row < line then
|
||||||
|
start_col = 0
|
||||||
|
end
|
||||||
|
if node.end_row > line then
|
||||||
|
end_col = 9000
|
||||||
|
end
|
||||||
|
a.nvim__put_attr(node.hl, start_col, end_col)
|
||||||
|
end
|
||||||
|
if node.end_row <= line then
|
||||||
|
self.active_nodes[node] = nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
self.line_count[line] = (self.line_count[line] or 0) + 1
|
||||||
|
--return tostring(self.line_count[line])
|
||||||
|
end
|
||||||
|
|
||||||
|
return TSHighlighter
|
|
@ -169,21 +169,21 @@ Boolean nvim_buf_attach(uint64_t channel_id,
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
cb.on_lines = v->data.luaref;
|
cb.on_lines = v->data.luaref;
|
||||||
v->data.integer = LUA_NOREF;
|
v->data.luaref = LUA_NOREF;
|
||||||
} else if (is_lua && strequal("on_changedtick", k.data)) {
|
} else if (is_lua && strequal("on_changedtick", k.data)) {
|
||||||
if (v->type != kObjectTypeLuaRef) {
|
if (v->type != kObjectTypeLuaRef) {
|
||||||
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
cb.on_changedtick = v->data.luaref;
|
cb.on_changedtick = v->data.luaref;
|
||||||
v->data.integer = LUA_NOREF;
|
v->data.luaref = LUA_NOREF;
|
||||||
} else if (is_lua && strequal("on_detach", k.data)) {
|
} else if (is_lua && strequal("on_detach", k.data)) {
|
||||||
if (v->type != kObjectTypeLuaRef) {
|
if (v->type != kObjectTypeLuaRef) {
|
||||||
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
cb.on_detach = v->data.luaref;
|
cb.on_detach = v->data.luaref;
|
||||||
v->data.integer = LUA_NOREF;
|
v->data.luaref = LUA_NOREF;
|
||||||
} else if (is_lua && strequal("utf_sizes", k.data)) {
|
} else if (is_lua && strequal("utf_sizes", k.data)) {
|
||||||
if (v->type != kObjectTypeBoolean) {
|
if (v->type != kObjectTypeBoolean) {
|
||||||
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
|
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
|
||||||
|
@ -231,6 +231,90 @@ Boolean nvim_buf_detach(uint64_t channel_id,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void buf_clear_luahl(buf_T *buf, bool force)
|
||||||
|
{
|
||||||
|
if (buf->b_luahl || force) {
|
||||||
|
executor_free_luaref(buf->b_luahl_start);
|
||||||
|
executor_free_luaref(buf->b_luahl_window);
|
||||||
|
executor_free_luaref(buf->b_luahl_line);
|
||||||
|
executor_free_luaref(buf->b_luahl_end);
|
||||||
|
}
|
||||||
|
buf->b_luahl_start = LUA_NOREF;
|
||||||
|
buf->b_luahl_window = LUA_NOREF;
|
||||||
|
buf->b_luahl_line = LUA_NOREF;
|
||||||
|
buf->b_luahl_end = LUA_NOREF;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unstabilized interface for defining syntax hl in lua.
|
||||||
|
///
|
||||||
|
/// This is not yet safe for general use, lua callbacks will need to
|
||||||
|
/// be restricted, like textlock and probably other stuff.
|
||||||
|
///
|
||||||
|
/// The API on_line/nvim__put_attr is quite raw and not intended to be the
|
||||||
|
/// final shape. Ideally this should operate on chunks larger than a single
|
||||||
|
/// line to reduce interpreter overhead, and generate annotation objects
|
||||||
|
/// (bufhl/virttext) on the fly but using the same representation.
|
||||||
|
void nvim__buf_set_luahl(uint64_t channel_id, Buffer buffer,
|
||||||
|
DictionaryOf(LuaRef) opts, Error *err)
|
||||||
|
FUNC_API_LUA_ONLY
|
||||||
|
{
|
||||||
|
buf_T *buf = find_buffer_by_handle(buffer, err);
|
||||||
|
|
||||||
|
if (!buf) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
redraw_buf_later(buf, NOT_VALID);
|
||||||
|
buf_clear_luahl(buf, false);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < opts.size; i++) {
|
||||||
|
String k = opts.items[i].key;
|
||||||
|
Object *v = &opts.items[i].value;
|
||||||
|
if (strequal("on_start", k.data)) {
|
||||||
|
if (v->type != kObjectTypeLuaRef) {
|
||||||
|
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
buf->b_luahl_start = v->data.luaref;
|
||||||
|
v->data.luaref = LUA_NOREF;
|
||||||
|
} else if (strequal("on_window", k.data)) {
|
||||||
|
if (v->type != kObjectTypeLuaRef) {
|
||||||
|
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
buf->b_luahl_window = v->data.luaref;
|
||||||
|
v->data.luaref = LUA_NOREF;
|
||||||
|
} else if (strequal("on_line", k.data)) {
|
||||||
|
if (v->type != kObjectTypeLuaRef) {
|
||||||
|
api_set_error(err, kErrorTypeValidation, "callback is not a function");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
buf->b_luahl_line = v->data.luaref;
|
||||||
|
v->data.luaref = LUA_NOREF;
|
||||||
|
} else {
|
||||||
|
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buf->b_luahl = true;
|
||||||
|
return;
|
||||||
|
error:
|
||||||
|
buf_clear_luahl(buf, true);
|
||||||
|
buf->b_luahl = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void nvim__buf_redraw_range(Buffer buffer, Integer first, Integer last,
|
||||||
|
Error *err)
|
||||||
|
FUNC_API_LUA_ONLY
|
||||||
|
{
|
||||||
|
buf_T *buf = find_buffer_by_handle(buffer, err);
|
||||||
|
if (!buf) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
redraw_buf_range_later(buf, (linenr_T)first+1, (linenr_T)last);
|
||||||
|
}
|
||||||
|
|
||||||
/// Sets a buffer line
|
/// Sets a buffer line
|
||||||
///
|
///
|
||||||
/// @deprecated use nvim_buf_set_lines instead.
|
/// @deprecated use nvim_buf_set_lines instead.
|
||||||
|
@ -1112,7 +1196,6 @@ Array nvim_buf_get_extmarks(Buffer buffer, Integer ns_id, Object start,
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
limit = v->data.integer;
|
limit = v->data.integer;
|
||||||
v->data.integer = LUA_NOREF;
|
|
||||||
} else {
|
} else {
|
||||||
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
|
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
|
||||||
return rv;
|
return rv;
|
||||||
|
|
|
@ -60,6 +60,12 @@
|
||||||
#define ADD(array, item) \
|
#define ADD(array, item) \
|
||||||
kv_push(array, item)
|
kv_push(array, item)
|
||||||
|
|
||||||
|
#define FIXED_TEMP_ARRAY(name, fixsize) \
|
||||||
|
Array name = ARRAY_DICT_INIT; \
|
||||||
|
Object name##__items[fixsize]; \
|
||||||
|
args.size = fixsize; \
|
||||||
|
args.items = name##__items; \
|
||||||
|
|
||||||
#define STATIC_CSTR_AS_STRING(s) ((String) {.data = s, .size = sizeof(s) - 1})
|
#define STATIC_CSTR_AS_STRING(s) ((String) {.data = s, .size = sizeof(s) - 1})
|
||||||
|
|
||||||
/// Create a new String instance, putting data in allocated memory
|
/// Create a new String instance, putting data in allocated memory
|
||||||
|
|
|
@ -189,6 +189,15 @@ Dictionary nvim_get_hl_by_id(Integer hl_id, Boolean rgb, Error *err)
|
||||||
return hl_get_attr_by_id(attrcode, rgb, err);
|
return hl_get_attr_by_id(attrcode, rgb, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Gets a highlight group by name
|
||||||
|
///
|
||||||
|
/// similar to |hlID()|, but allocates a new ID if not present.
|
||||||
|
Integer nvim_get_hl_id_by_name(String name)
|
||||||
|
FUNC_API_SINCE(7)
|
||||||
|
{
|
||||||
|
return syn_check_group((const char_u *)name.data, (int)name.size);
|
||||||
|
}
|
||||||
|
|
||||||
/// Sends input-keys to Nvim, subject to various quirks controlled by `mode`
|
/// Sends input-keys to Nvim, subject to various quirks controlled by `mode`
|
||||||
/// flags. This is a blocking call, unlike |nvim_input()|.
|
/// flags. This is a blocking call, unlike |nvim_input()|.
|
||||||
///
|
///
|
||||||
|
@ -2546,3 +2555,27 @@ Array nvim__inspect_cell(Integer grid, Integer row, Integer col, Error *err)
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set attrs in nvim__buf_set_lua_hl callbacks
|
||||||
|
///
|
||||||
|
/// TODO(bfredl): This is rather pedestrian. The final
|
||||||
|
/// interface should probably be derived from a reformed
|
||||||
|
/// bufhl/virttext interface with full support for multi-line
|
||||||
|
/// ranges etc
|
||||||
|
void nvim__put_attr(Integer id, Integer c0, Integer c1)
|
||||||
|
FUNC_API_LUA_ONLY
|
||||||
|
{
|
||||||
|
if (!lua_attr_active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (id == 0 || syn_get_final_id((int)id) == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int attr = syn_id2attr((int)id);
|
||||||
|
c0 = MAX(c0, 0);
|
||||||
|
c1 = MIN(c1, (Integer)lua_attr_bufsize);
|
||||||
|
for (Integer c = c0; c < c1; c++) {
|
||||||
|
lua_attr_buf[c] = (sattr_T)hl_combine_attr(lua_attr_buf[c], (int)attr);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
|
@ -832,6 +832,12 @@ struct file_buffer {
|
||||||
// The number for times the current line has been flushed in the memline.
|
// The number for times the current line has been flushed in the memline.
|
||||||
int flush_count;
|
int flush_count;
|
||||||
|
|
||||||
|
bool b_luahl;
|
||||||
|
LuaRef b_luahl_start;
|
||||||
|
LuaRef b_luahl_window;
|
||||||
|
LuaRef b_luahl_line;
|
||||||
|
LuaRef b_luahl_end;
|
||||||
|
|
||||||
int b_diff_failed; // internal diff failed for this buffer
|
int b_diff_failed; // internal diff failed for this buffer
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -157,7 +157,7 @@ void buf_updates_unregister_all(buf_T *buf)
|
||||||
args.items[0] = BUFFER_OBJ(buf->handle);
|
args.items[0] = BUFFER_OBJ(buf->handle);
|
||||||
|
|
||||||
textlock++;
|
textlock++;
|
||||||
executor_exec_lua_cb(cb.on_detach, "detach", args, false);
|
executor_exec_lua_cb(cb.on_detach, "detach", args, false, NULL);
|
||||||
textlock--;
|
textlock--;
|
||||||
}
|
}
|
||||||
free_update_callbacks(cb);
|
free_update_callbacks(cb);
|
||||||
|
@ -265,7 +265,7 @@ void buf_updates_send_changes(buf_T *buf,
|
||||||
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
|
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
|
||||||
}
|
}
|
||||||
textlock++;
|
textlock++;
|
||||||
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
|
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true, NULL);
|
||||||
textlock--;
|
textlock--;
|
||||||
|
|
||||||
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
|
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
|
||||||
|
@ -293,10 +293,7 @@ void buf_updates_changedtick(buf_T *buf)
|
||||||
BufUpdateCallbacks cb = kv_A(buf->update_callbacks, i);
|
BufUpdateCallbacks cb = kv_A(buf->update_callbacks, i);
|
||||||
bool keep = true;
|
bool keep = true;
|
||||||
if (cb.on_changedtick != LUA_NOREF) {
|
if (cb.on_changedtick != LUA_NOREF) {
|
||||||
Array args = ARRAY_DICT_INIT;
|
FIXED_TEMP_ARRAY(args, 2);
|
||||||
Object items[2];
|
|
||||||
args.size = 2;
|
|
||||||
args.items = items;
|
|
||||||
|
|
||||||
// the first argument is always the buffer handle
|
// the first argument is always the buffer handle
|
||||||
args.items[0] = BUFFER_OBJ(buf->handle);
|
args.items[0] = BUFFER_OBJ(buf->handle);
|
||||||
|
@ -306,7 +303,7 @@ void buf_updates_changedtick(buf_T *buf)
|
||||||
|
|
||||||
textlock++;
|
textlock++;
|
||||||
Object res = executor_exec_lua_cb(cb.on_changedtick, "changedtick",
|
Object res = executor_exec_lua_cb(cb.on_changedtick, "changedtick",
|
||||||
args, true);
|
args, true, NULL);
|
||||||
textlock--;
|
textlock--;
|
||||||
|
|
||||||
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
|
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
|
||||||
|
|
|
@ -211,6 +211,8 @@
|
||||||
# define FUNC_API_NOEXPORT
|
# define FUNC_API_NOEXPORT
|
||||||
/// API function not exposed in VimL/eval.
|
/// API function not exposed in VimL/eval.
|
||||||
# define FUNC_API_REMOTE_ONLY
|
# define FUNC_API_REMOTE_ONLY
|
||||||
|
/// API function not exposed in VimL/remote.
|
||||||
|
# define FUNC_API_LUA_ONLY
|
||||||
/// API function introduced at the given API level.
|
/// API function introduced at the given API level.
|
||||||
# define FUNC_API_SINCE(X)
|
# define FUNC_API_SINCE(X)
|
||||||
/// API function deprecated since the given API level.
|
/// API function deprecated since the given API level.
|
||||||
|
|
|
@ -42,6 +42,7 @@ local c_proto = Ct(
|
||||||
(fill * Cg((P('FUNC_API_FAST') * Cc(true)), 'fast') ^ -1) *
|
(fill * Cg((P('FUNC_API_FAST') * Cc(true)), 'fast') ^ -1) *
|
||||||
(fill * Cg((P('FUNC_API_NOEXPORT') * Cc(true)), 'noexport') ^ -1) *
|
(fill * Cg((P('FUNC_API_NOEXPORT') * Cc(true)), 'noexport') ^ -1) *
|
||||||
(fill * Cg((P('FUNC_API_REMOTE_ONLY') * Cc(true)), 'remote_only') ^ -1) *
|
(fill * Cg((P('FUNC_API_REMOTE_ONLY') * Cc(true)), 'remote_only') ^ -1) *
|
||||||
|
(fill * Cg((P('FUNC_API_LUA_ONLY') * Cc(true)), 'lua_only') ^ -1) *
|
||||||
(fill * Cg((P('FUNC_API_REMOTE_IMPL') * Cc(true)), 'remote_impl') ^ -1) *
|
(fill * Cg((P('FUNC_API_REMOTE_IMPL') * Cc(true)), 'remote_impl') ^ -1) *
|
||||||
(fill * Cg((P('FUNC_API_BRIDGE_IMPL') * Cc(true)), 'bridge_impl') ^ -1) *
|
(fill * Cg((P('FUNC_API_BRIDGE_IMPL') * Cc(true)), 'bridge_impl') ^ -1) *
|
||||||
(fill * Cg((P('FUNC_API_COMPOSITOR_IMPL') * Cc(true)), 'compositor_impl') ^ -1) *
|
(fill * Cg((P('FUNC_API_COMPOSITOR_IMPL') * Cc(true)), 'compositor_impl') ^ -1) *
|
||||||
|
|
|
@ -192,7 +192,7 @@ end
|
||||||
-- the real API.
|
-- the real API.
|
||||||
for i = 1, #functions do
|
for i = 1, #functions do
|
||||||
local fn = functions[i]
|
local fn = functions[i]
|
||||||
if fn.impl_name == nil then
|
if fn.impl_name == nil and not fn.lua_only then
|
||||||
local args = {}
|
local args = {}
|
||||||
|
|
||||||
output:write('Object handle_'..fn.name..'(uint64_t channel_id, Array args, Error *error)')
|
output:write('Object handle_'..fn.name..'(uint64_t channel_id, Array args, Error *error)')
|
||||||
|
@ -310,12 +310,13 @@ void msgpack_rpc_init_method_table(void)
|
||||||
|
|
||||||
for i = 1, #functions do
|
for i = 1, #functions do
|
||||||
local fn = functions[i]
|
local fn = functions[i]
|
||||||
output:write(' msgpack_rpc_add_method_handler('..
|
if not fn.lua_only then
|
||||||
'(String) {.data = "'..fn.name..'", '..
|
output:write(' msgpack_rpc_add_method_handler('..
|
||||||
'.size = sizeof("'..fn.name..'") - 1}, '..
|
'(String) {.data = "'..fn.name..'", '..
|
||||||
'(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
|
'.size = sizeof("'..fn.name..'") - 1}, '..
|
||||||
', .fast = '..tostring(fn.fast)..'});\n')
|
'(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
|
||||||
|
', .fast = '..tostring(fn.fast)..'});\n')
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
output:write('\n}\n\n')
|
output:write('\n}\n\n')
|
||||||
|
|
|
@ -25,7 +25,7 @@ local gperfpipe = io.open(funcsfname .. '.gperf', 'wb')
|
||||||
local funcs = require('eval').funcs
|
local funcs = require('eval').funcs
|
||||||
local metadata = mpack.unpack(io.open(metadata_file, 'rb'):read("*all"))
|
local metadata = mpack.unpack(io.open(metadata_file, 'rb'):read("*all"))
|
||||||
for _,fun in ipairs(metadata) do
|
for _,fun in ipairs(metadata) do
|
||||||
if not fun.remote_only then
|
if not (fun.remote_only or fun.lua_only) then
|
||||||
funcs[fun.name] = {
|
funcs[fun.name] = {
|
||||||
args=#fun.parameters,
|
args=#fun.parameters,
|
||||||
func='api_wrapper',
|
func='api_wrapper',
|
||||||
|
|
|
@ -126,6 +126,13 @@ typedef off_t off_T;
|
||||||
*/
|
*/
|
||||||
EXTERN int mod_mask INIT(= 0x0); /* current key modifiers */
|
EXTERN int mod_mask INIT(= 0x0); /* current key modifiers */
|
||||||
|
|
||||||
|
|
||||||
|
// TODO(bfredl): for the final interface this should find a more suitable
|
||||||
|
// location.
|
||||||
|
EXTERN sattr_T *lua_attr_buf INIT(= NULL);
|
||||||
|
EXTERN size_t lua_attr_bufsize INIT(= 0);
|
||||||
|
EXTERN bool lua_attr_active INIT(= false);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Cmdline_row is the row where the command line starts, just below the
|
* Cmdline_row is the row where the command line starts, just below the
|
||||||
* last window.
|
* last window.
|
||||||
|
|
|
@ -835,7 +835,7 @@ Object executor_exec_lua_api(const String str, const Array args, Error *err)
|
||||||
}
|
}
|
||||||
|
|
||||||
Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
|
Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
|
||||||
bool retval)
|
bool retval, Error *err)
|
||||||
{
|
{
|
||||||
lua_State *const lstate = nlua_enter();
|
lua_State *const lstate = nlua_enter();
|
||||||
nlua_pushref(lstate, ref);
|
nlua_pushref(lstate, ref);
|
||||||
|
@ -845,16 +845,24 @@ Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lua_pcall(lstate, (int)args.size+1, retval ? 1 : 0, 0)) {
|
if (lua_pcall(lstate, (int)args.size+1, retval ? 1 : 0, 0)) {
|
||||||
// TODO(bfredl): callbacks:s might not always be msg-safe, for instance
|
// if err is passed, the caller will deal with the error.
|
||||||
// lua callbacks for redraw events. Later on let the caller deal with the
|
if (err) {
|
||||||
// error instead.
|
size_t len;
|
||||||
nlua_error(lstate, _("Error executing lua callback: %.*s"));
|
const char *errstr = lua_tolstring(lstate, -1, &len);
|
||||||
|
api_set_error(err, kErrorTypeException,
|
||||||
|
"Error executing lua: %.*s", (int)len, errstr);
|
||||||
|
} else {
|
||||||
|
nlua_error(lstate, _("Error executing lua callback: %.*s"));
|
||||||
|
}
|
||||||
return NIL;
|
return NIL;
|
||||||
}
|
}
|
||||||
Error err = ERROR_INIT;
|
|
||||||
|
|
||||||
if (retval) {
|
if (retval) {
|
||||||
return nlua_pop_Object(lstate, false, &err);
|
Error dummy = ERROR_INIT;
|
||||||
|
if (err == NULL) {
|
||||||
|
err = &dummy;
|
||||||
|
}
|
||||||
|
return nlua_pop_Object(lstate, false, err);
|
||||||
} else {
|
} else {
|
||||||
return NIL;
|
return NIL;
|
||||||
}
|
}
|
||||||
|
@ -1007,4 +1015,7 @@ static void nlua_add_treesitter(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
|
||||||
|
|
||||||
lua_pushcfunction(lstate, tslua_inspect_lang);
|
lua_pushcfunction(lstate, tslua_inspect_lang);
|
||||||
lua_setfield(lstate, -2, "_ts_inspect_language");
|
lua_setfield(lstate, -2, "_ts_inspect_language");
|
||||||
|
|
||||||
|
lua_pushcfunction(lstate, ts_lua_parse_query);
|
||||||
|
lua_setfield(lstate, -2, "_ts_parse_query");
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,11 @@ typedef struct {
|
||||||
TSTree *tree; // internal tree, used for editing/reparsing
|
TSTree *tree; // internal tree, used for editing/reparsing
|
||||||
} TSLua_parser;
|
} TSLua_parser;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TSQueryCursor *cursor;
|
||||||
|
int predicated_match;
|
||||||
|
} TSLua_cursor;
|
||||||
|
|
||||||
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
||||||
# include "lua/treesitter.c.generated.h"
|
# include "lua/treesitter.c.generated.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -66,6 +71,20 @@ static struct luaL_Reg node_meta[] = {
|
||||||
{ "descendant_for_range", node_descendant_for_range },
|
{ "descendant_for_range", node_descendant_for_range },
|
||||||
{ "named_descendant_for_range", node_named_descendant_for_range },
|
{ "named_descendant_for_range", node_named_descendant_for_range },
|
||||||
{ "parent", node_parent },
|
{ "parent", node_parent },
|
||||||
|
{ "_rawquery", node_rawquery },
|
||||||
|
{ NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct luaL_Reg query_meta[] = {
|
||||||
|
{ "__gc", query_gc },
|
||||||
|
{ "__tostring", query_tostring },
|
||||||
|
{ "inspect", query_inspect },
|
||||||
|
{ NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
// cursor is not exposed, but still needs garbage collection
|
||||||
|
static struct luaL_Reg querycursor_meta[] = {
|
||||||
|
{ "__gc", querycursor_gc },
|
||||||
{ NULL, NULL }
|
{ NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -96,6 +115,8 @@ void tslua_init(lua_State *L)
|
||||||
build_meta(L, "treesitter_parser", parser_meta);
|
build_meta(L, "treesitter_parser", parser_meta);
|
||||||
build_meta(L, "treesitter_tree", tree_meta);
|
build_meta(L, "treesitter_tree", tree_meta);
|
||||||
build_meta(L, "treesitter_node", node_meta);
|
build_meta(L, "treesitter_node", node_meta);
|
||||||
|
build_meta(L, "treesitter_query", query_meta);
|
||||||
|
build_meta(L, "treesitter_querycursor", querycursor_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int tslua_register_lang(lua_State *L)
|
int tslua_register_lang(lua_State *L)
|
||||||
|
@ -276,13 +297,33 @@ static int parser_parse_buf(lua_State *L)
|
||||||
}
|
}
|
||||||
TSInput input = { payload, input_cb, TSInputEncodingUTF8 };
|
TSInput input = { payload, input_cb, TSInputEncodingUTF8 };
|
||||||
TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input);
|
TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input);
|
||||||
|
|
||||||
|
uint32_t n_ranges = 0;
|
||||||
|
TSRange *changed = p->tree ? ts_tree_get_changed_ranges(p->tree, new_tree,
|
||||||
|
&n_ranges) : NULL;
|
||||||
if (p->tree) {
|
if (p->tree) {
|
||||||
ts_tree_delete(p->tree);
|
ts_tree_delete(p->tree);
|
||||||
}
|
}
|
||||||
p->tree = new_tree;
|
p->tree = new_tree;
|
||||||
|
|
||||||
tslua_push_tree(L, p->tree);
|
tslua_push_tree(L, p->tree);
|
||||||
return 1;
|
|
||||||
|
lua_createtable(L, n_ranges, 0);
|
||||||
|
for (size_t i = 0; i < n_ranges; i++) {
|
||||||
|
lua_createtable(L, 4, 0);
|
||||||
|
lua_pushinteger(L, changed[i].start_point.row);
|
||||||
|
lua_rawseti(L, -2, 1);
|
||||||
|
lua_pushinteger(L, changed[i].start_point.column);
|
||||||
|
lua_rawseti(L, -2, 2);
|
||||||
|
lua_pushinteger(L, changed[i].end_point.row);
|
||||||
|
lua_rawseti(L, -2, 3);
|
||||||
|
lua_pushinteger(L, changed[i].end_point.column);
|
||||||
|
lua_rawseti(L, -2, 4);
|
||||||
|
|
||||||
|
lua_rawseti(L, -2, i+1);
|
||||||
|
}
|
||||||
|
xfree(changed);
|
||||||
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int parser_tree(lua_State *L)
|
static int parser_tree(lua_State *L)
|
||||||
|
@ -383,7 +424,7 @@ static int tree_root(lua_State *L)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
TSNode root = ts_tree_root_node(tree);
|
TSNode root = ts_tree_root_node(tree);
|
||||||
push_node(L, root);
|
push_node(L, root, 1);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -394,18 +435,19 @@ static int tree_root(lua_State *L)
|
||||||
/// top of stack must either be the tree this node belongs to or another node
|
/// top of stack must either be the tree this node belongs to or another node
|
||||||
/// of the same tree! This value is not popped. Can only be called inside a
|
/// of the same tree! This value is not popped. Can only be called inside a
|
||||||
/// cfunction with the tslua environment.
|
/// cfunction with the tslua environment.
|
||||||
static void push_node(lua_State *L, TSNode node)
|
static void push_node(lua_State *L, TSNode node, int uindex)
|
||||||
{
|
{
|
||||||
|
assert(uindex > 0 || uindex < -LUA_MINSTACK);
|
||||||
if (ts_node_is_null(node)) {
|
if (ts_node_is_null(node)) {
|
||||||
lua_pushnil(L); // [src, nil]
|
lua_pushnil(L); // [nil]
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [src, udata]
|
TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [udata]
|
||||||
*ud = node;
|
*ud = node;
|
||||||
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [src, udata, meta]
|
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [udata, meta]
|
||||||
lua_setmetatable(L, -2); // [src, udata]
|
lua_setmetatable(L, -2); // [udata]
|
||||||
lua_getfenv(L, -2); // [src, udata, reftable]
|
lua_getfenv(L, uindex); // [udata, reftable]
|
||||||
lua_setfenv(L, -2); // [src, udata]
|
lua_setfenv(L, -2); // [udata]
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool node_check(lua_State *L, TSNode *res)
|
static bool node_check(lua_State *L, TSNode *res)
|
||||||
|
@ -586,8 +628,7 @@ static int node_child(lua_State *L)
|
||||||
long num = lua_tointeger(L, 2);
|
long num = lua_tointeger(L, 2);
|
||||||
TSNode child = ts_node_child(node, (uint32_t)num);
|
TSNode child = ts_node_child(node, (uint32_t)num);
|
||||||
|
|
||||||
lua_pushvalue(L, 1);
|
push_node(L, child, 1);
|
||||||
push_node(L, child);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -600,8 +641,7 @@ static int node_named_child(lua_State *L)
|
||||||
long num = lua_tointeger(L, 2);
|
long num = lua_tointeger(L, 2);
|
||||||
TSNode child = ts_node_named_child(node, (uint32_t)num);
|
TSNode child = ts_node_named_child(node, (uint32_t)num);
|
||||||
|
|
||||||
lua_pushvalue(L, 1);
|
push_node(L, child, 1);
|
||||||
push_node(L, child);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -617,8 +657,7 @@ static int node_descendant_for_range(lua_State *L)
|
||||||
(uint32_t)lua_tointeger(L, 5) };
|
(uint32_t)lua_tointeger(L, 5) };
|
||||||
TSNode child = ts_node_descendant_for_point_range(node, start, end);
|
TSNode child = ts_node_descendant_for_point_range(node, start, end);
|
||||||
|
|
||||||
lua_pushvalue(L, 1);
|
push_node(L, child, 1);
|
||||||
push_node(L, child);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -634,8 +673,7 @@ static int node_named_descendant_for_range(lua_State *L)
|
||||||
(uint32_t)lua_tointeger(L, 5) };
|
(uint32_t)lua_tointeger(L, 5) };
|
||||||
TSNode child = ts_node_named_descendant_for_point_range(node, start, end);
|
TSNode child = ts_node_named_descendant_for_point_range(node, start, end);
|
||||||
|
|
||||||
lua_pushvalue(L, 1);
|
push_node(L, child, 1);
|
||||||
push_node(L, child);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -646,7 +684,254 @@ static int node_parent(lua_State *L)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
TSNode parent = ts_node_parent(node);
|
TSNode parent = ts_node_parent(node);
|
||||||
push_node(L, parent);
|
push_node(L, parent, 1);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// assumes the match table being on top of the stack
|
||||||
|
static void set_match(lua_State *L, TSQueryMatch *match, int nodeidx)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < match->capture_count; i++) {
|
||||||
|
push_node(L, match->captures[i].node, nodeidx);
|
||||||
|
lua_rawseti(L, -2, match->captures[i].index+1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int query_next_match(lua_State *L)
|
||||||
|
{
|
||||||
|
TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
|
||||||
|
TSQueryCursor *cursor = ud->cursor;
|
||||||
|
|
||||||
|
TSQuery *query = query_check(L, lua_upvalueindex(3));
|
||||||
|
TSQueryMatch match;
|
||||||
|
if (ts_query_cursor_next_match(cursor, &match)) {
|
||||||
|
lua_pushinteger(L, match.pattern_index+1); // [index]
|
||||||
|
lua_createtable(L, ts_query_capture_count(query), 2); // [index, match]
|
||||||
|
set_match(L, &match, lua_upvalueindex(2));
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int query_next_capture(lua_State *L)
|
||||||
|
{
|
||||||
|
TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
|
||||||
|
TSQueryCursor *cursor = ud->cursor;
|
||||||
|
|
||||||
|
TSQuery *query = query_check(L, lua_upvalueindex(3));
|
||||||
|
|
||||||
|
if (ud->predicated_match > -1) {
|
||||||
|
lua_getfield(L, lua_upvalueindex(4), "active");
|
||||||
|
bool active = lua_toboolean(L, -1);
|
||||||
|
lua_pop(L, 1);
|
||||||
|
if (!active) {
|
||||||
|
ts_query_cursor_remove_match(cursor, ud->predicated_match);
|
||||||
|
}
|
||||||
|
ud->predicated_match = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
TSQueryMatch match;
|
||||||
|
uint32_t capture_index;
|
||||||
|
if (ts_query_cursor_next_capture(cursor, &match, &capture_index)) {
|
||||||
|
TSQueryCapture capture = match.captures[capture_index];
|
||||||
|
|
||||||
|
lua_pushinteger(L, capture.index+1); // [index]
|
||||||
|
push_node(L, capture.node, lua_upvalueindex(2)); // [index, node]
|
||||||
|
|
||||||
|
uint32_t n_pred;
|
||||||
|
ts_query_predicates_for_pattern(query, match.pattern_index, &n_pred);
|
||||||
|
if (n_pred > 0 && capture_index == 0) {
|
||||||
|
lua_pushvalue(L, lua_upvalueindex(4)); // [index, node, match]
|
||||||
|
set_match(L, &match, lua_upvalueindex(2));
|
||||||
|
lua_pushinteger(L, match.pattern_index+1);
|
||||||
|
lua_setfield(L, -2, "pattern");
|
||||||
|
|
||||||
|
if (match.capture_count > 1) {
|
||||||
|
ud->predicated_match = match.id;
|
||||||
|
lua_pushboolean(L, false);
|
||||||
|
lua_setfield(L, -2, "active");
|
||||||
|
}
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int node_rawquery(lua_State *L)
|
||||||
|
{
|
||||||
|
TSNode node;
|
||||||
|
if (!node_check(L, &node)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
TSQuery *query = query_check(L, 2);
|
||||||
|
// TODO(bfredl): these are expensive allegedly,
|
||||||
|
// use a reuse list later on?
|
||||||
|
TSQueryCursor *cursor = ts_query_cursor_new();
|
||||||
|
ts_query_cursor_exec(cursor, query, node);
|
||||||
|
|
||||||
|
bool captures = lua_toboolean(L, 3);
|
||||||
|
|
||||||
|
if (lua_gettop(L) >= 4) {
|
||||||
|
int start = luaL_checkinteger(L, 4);
|
||||||
|
int end = lua_gettop(L) >= 5 ? luaL_checkinteger(L, 5) : MAXLNUM;
|
||||||
|
ts_query_cursor_set_point_range(cursor,
|
||||||
|
(TSPoint){ start, 0 }, (TSPoint){ end, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
|
TSLua_cursor *ud = lua_newuserdata(L, sizeof(*ud)); // [udata]
|
||||||
|
ud->cursor = cursor;
|
||||||
|
ud->predicated_match = -1;
|
||||||
|
|
||||||
|
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_querycursor");
|
||||||
|
lua_setmetatable(L, -2); // [udata]
|
||||||
|
lua_pushvalue(L, 1); // [udata, node]
|
||||||
|
|
||||||
|
// include query separately, as to keep a ref to it for gc
|
||||||
|
lua_pushvalue(L, 2); // [udata, node, query]
|
||||||
|
|
||||||
|
if (captures) {
|
||||||
|
// placeholder for match state
|
||||||
|
lua_createtable(L, ts_query_capture_count(query), 2); // [u, n, q, match]
|
||||||
|
lua_pushcclosure(L, query_next_capture, 4); // [closure]
|
||||||
|
} else {
|
||||||
|
lua_pushcclosure(L, query_next_match, 3); // [closure]
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int querycursor_gc(lua_State *L)
|
||||||
|
{
|
||||||
|
TSLua_cursor *ud = luaL_checkudata(L, 1, "treesitter_querycursor");
|
||||||
|
ts_query_cursor_delete(ud->cursor);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Query methods
|
||||||
|
|
||||||
|
int ts_lua_parse_query(lua_State *L)
|
||||||
|
{
|
||||||
|
if (lua_gettop(L) < 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2)) {
|
||||||
|
return luaL_error(L, "string expected");
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *lang_name = lua_tostring(L, 1);
|
||||||
|
TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name);
|
||||||
|
if (!lang) {
|
||||||
|
return luaL_error(L, "no such language: %s", lang_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t len;
|
||||||
|
const char *src = lua_tolstring(L, 2, &len);
|
||||||
|
|
||||||
|
uint32_t error_offset;
|
||||||
|
TSQueryError error_type;
|
||||||
|
TSQuery *query = ts_query_new(lang, src, len, &error_offset, &error_type);
|
||||||
|
|
||||||
|
if (!query) {
|
||||||
|
return luaL_error(L, "query: %s at position %d",
|
||||||
|
query_err_string(error_type), (int)error_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
TSQuery **ud = lua_newuserdata(L, sizeof(TSQuery *)); // [udata]
|
||||||
|
*ud = query;
|
||||||
|
lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_query"); // [udata, meta]
|
||||||
|
lua_setmetatable(L, -2); // [udata]
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static const char *query_err_string(TSQueryError err) {
|
||||||
|
switch (err) {
|
||||||
|
case TSQueryErrorSyntax: return "invalid syntax";
|
||||||
|
case TSQueryErrorNodeType: return "invalid node type";
|
||||||
|
case TSQueryErrorField: return "invalid field";
|
||||||
|
case TSQueryErrorCapture: return "invalid capture";
|
||||||
|
default: return "error";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static TSQuery *query_check(lua_State *L, int index)
|
||||||
|
{
|
||||||
|
TSQuery **ud = luaL_checkudata(L, index, "treesitter_query");
|
||||||
|
return *ud;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int query_gc(lua_State *L)
|
||||||
|
{
|
||||||
|
TSQuery *query = query_check(L, 1);
|
||||||
|
if (!query) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ts_query_delete(query);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int query_tostring(lua_State *L)
|
||||||
|
{
|
||||||
|
lua_pushstring(L, "<query>");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int query_inspect(lua_State *L)
|
||||||
|
{
|
||||||
|
TSQuery *query = query_check(L, 1);
|
||||||
|
if (!query) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t n_pat = ts_query_pattern_count(query);
|
||||||
|
lua_createtable(L, 0, 2); // [retval]
|
||||||
|
lua_createtable(L, n_pat, 1); // [retval, patterns]
|
||||||
|
for (size_t i = 0; i < n_pat; i++) {
|
||||||
|
uint32_t len;
|
||||||
|
const TSQueryPredicateStep *step = ts_query_predicates_for_pattern(query,
|
||||||
|
i, &len);
|
||||||
|
if (len == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
lua_createtable(L, len/4, 1); // [retval, patterns, pat]
|
||||||
|
lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
|
||||||
|
int nextpred = 1;
|
||||||
|
int nextitem = 1;
|
||||||
|
for (size_t k = 0; k < len; k++) {
|
||||||
|
if (step[k].type == TSQueryPredicateStepTypeDone) {
|
||||||
|
lua_rawseti(L, -2, nextpred++); // [retval, patterns, pat]
|
||||||
|
lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
|
||||||
|
nextitem = 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (step[k].type == TSQueryPredicateStepTypeString) {
|
||||||
|
uint32_t strlen;
|
||||||
|
const char *str = ts_query_string_value_for_id(query, step[k].value_id,
|
||||||
|
&strlen);
|
||||||
|
lua_pushlstring(L, str, strlen); // [retval, patterns, pat, pred, item]
|
||||||
|
} else if (step[k].type == TSQueryPredicateStepTypeCapture) {
|
||||||
|
lua_pushnumber(L, step[k].value_id+1); // [..., pat, pred, item]
|
||||||
|
} else {
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
lua_rawseti(L, -2, nextitem++); // [retval, patterns, pat, pred]
|
||||||
|
}
|
||||||
|
// last predicate should have ended with TypeDone
|
||||||
|
lua_pop(L, 1); // [retval, patters, pat]
|
||||||
|
lua_rawseti(L, -2, i+1); // [retval, patterns]
|
||||||
|
}
|
||||||
|
lua_setfield(L, -2, "patterns"); // [retval]
|
||||||
|
|
||||||
|
uint32_t n_captures = ts_query_capture_count(query);
|
||||||
|
lua_createtable(L, n_captures, 0); // [retval, captures]
|
||||||
|
for (size_t i = 0; i < n_captures; i++) {
|
||||||
|
uint32_t strlen;
|
||||||
|
const char *str = ts_query_capture_name_for_id(query, i, &strlen);
|
||||||
|
lua_pushlstring(L, str, strlen); // [retval, captures, capture]
|
||||||
|
lua_rawseti(L, -2, i+1);
|
||||||
|
}
|
||||||
|
lua_setfield(L, -2, "captures"); // [retval]
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
|
@ -116,6 +116,8 @@
|
||||||
#include "nvim/window.h"
|
#include "nvim/window.h"
|
||||||
#include "nvim/os/time.h"
|
#include "nvim/os/time.h"
|
||||||
#include "nvim/api/private/helpers.h"
|
#include "nvim/api/private/helpers.h"
|
||||||
|
#include "nvim/api/vim.h"
|
||||||
|
#include "nvim/lua/executor.h"
|
||||||
|
|
||||||
#define MB_FILLER_CHAR '<' /* character used when a double-width character
|
#define MB_FILLER_CHAR '<' /* character used when a double-width character
|
||||||
* doesn't fit. */
|
* doesn't fit. */
|
||||||
|
@ -232,6 +234,22 @@ void redraw_buf_line_later(buf_T *buf, linenr_T line)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void redraw_buf_range_later(buf_T *buf, linenr_T firstline, linenr_T lastline)
|
||||||
|
{
|
||||||
|
FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
|
||||||
|
if (wp->w_buffer == buf
|
||||||
|
&& lastline >= wp->w_topline && firstline < wp->w_botline) {
|
||||||
|
if (wp->w_redraw_top == 0 || wp->w_redraw_top > firstline) {
|
||||||
|
wp->w_redraw_top = firstline;
|
||||||
|
}
|
||||||
|
if (wp->w_redraw_bot == 0 || wp->w_redraw_bot < lastline) {
|
||||||
|
wp->w_redraw_bot = lastline;
|
||||||
|
}
|
||||||
|
redraw_win_later(wp, VALID);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Changed something in the current window, at buffer line "lnum", that
|
* Changed something in the current window, at buffer line "lnum", that
|
||||||
* requires that line and possibly other lines to be redrawn.
|
* requires that line and possibly other lines to be redrawn.
|
||||||
|
@ -477,6 +495,19 @@ int update_screen(int type)
|
||||||
if (wwp == wp && syntax_present(wp)) {
|
if (wwp == wp && syntax_present(wp)) {
|
||||||
syn_stack_apply_changes(wp->w_buffer);
|
syn_stack_apply_changes(wp->w_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
buf_T *buf = wp->w_buffer;
|
||||||
|
if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
|
||||||
|
Error err = ERROR_INIT;
|
||||||
|
FIXED_TEMP_ARRAY(args, 2);
|
||||||
|
args.items[0] = BUFFER_OBJ(buf->handle);
|
||||||
|
args.items[1] = INTEGER_OBJ(display_tick);
|
||||||
|
executor_exec_lua_cb(buf->b_luahl_start, "start", args, false, &err);
|
||||||
|
if (ERROR_SET(&err)) {
|
||||||
|
ELOG("error in luahl start: %s", err.msg);
|
||||||
|
api_clear_error(&err);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1181,7 +1212,27 @@ static void win_update(win_T *wp)
|
||||||
idx = 0; /* first entry in w_lines[].wl_size */
|
idx = 0; /* first entry in w_lines[].wl_size */
|
||||||
row = 0;
|
row = 0;
|
||||||
srow = 0;
|
srow = 0;
|
||||||
lnum = wp->w_topline; /* first line shown in window */
|
lnum = wp->w_topline; // first line shown in window
|
||||||
|
|
||||||
|
if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
|
||||||
|
Error err = ERROR_INIT;
|
||||||
|
FIXED_TEMP_ARRAY(args, 4);
|
||||||
|
linenr_T knownmax = ((wp->w_valid & VALID_BOTLINE)
|
||||||
|
? wp->w_botline
|
||||||
|
: (wp->w_topline + wp->w_height_inner));
|
||||||
|
args.items[0] = WINDOW_OBJ(wp->handle);
|
||||||
|
args.items[1] = BUFFER_OBJ(buf->handle);
|
||||||
|
args.items[2] = INTEGER_OBJ(wp->w_topline-1);
|
||||||
|
args.items[3] = INTEGER_OBJ(knownmax);
|
||||||
|
// TODO(bfredl): we could allow this callback to change mod_top, mod_bot.
|
||||||
|
// For now the "start" callback is expected to use nvim__buf_redraw_range.
|
||||||
|
executor_exec_lua_cb(buf->b_luahl_window, "window", args, false, &err);
|
||||||
|
if (ERROR_SET(&err)) {
|
||||||
|
ELOG("error in luahl window: %s", err.msg);
|
||||||
|
api_clear_error(&err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (;; ) {
|
for (;; ) {
|
||||||
/* stop updating when reached the end of the window (check for _past_
|
/* stop updating when reached the end of the window (check for _past_
|
||||||
* the end of the window is at the end of the loop) */
|
* the end of the window is at the end of the loop) */
|
||||||
|
@ -2229,6 +2280,8 @@ win_line (
|
||||||
|
|
||||||
row = startrow;
|
row = startrow;
|
||||||
|
|
||||||
|
char *luatext = NULL;
|
||||||
|
|
||||||
if (!number_only) {
|
if (!number_only) {
|
||||||
// To speed up the loop below, set extra_check when there is linebreak,
|
// To speed up the loop below, set extra_check when there is linebreak,
|
||||||
// trailing white space and/or syntax processing to be done.
|
// trailing white space and/or syntax processing to be done.
|
||||||
|
@ -2454,6 +2507,41 @@ win_line (
|
||||||
line = ml_get_buf(wp->w_buffer, lnum, FALSE);
|
line = ml_get_buf(wp->w_buffer, lnum, FALSE);
|
||||||
ptr = line;
|
ptr = line;
|
||||||
|
|
||||||
|
buf_T *buf = wp->w_buffer;
|
||||||
|
if (buf->b_luahl && buf->b_luahl_line != LUA_NOREF) {
|
||||||
|
size_t size = STRLEN(line);
|
||||||
|
if (lua_attr_bufsize < size) {
|
||||||
|
xfree(lua_attr_buf);
|
||||||
|
lua_attr_buf = xcalloc(size, sizeof(*lua_attr_buf));
|
||||||
|
lua_attr_bufsize = size;
|
||||||
|
} else if (lua_attr_buf) {
|
||||||
|
memset(lua_attr_buf, 0, size * sizeof(*lua_attr_buf));
|
||||||
|
}
|
||||||
|
Error err = ERROR_INIT;
|
||||||
|
// TODO(bfredl): build a macro for the "static array" pattern
|
||||||
|
// in buf_updates_send_changes?
|
||||||
|
FIXED_TEMP_ARRAY(args, 3);
|
||||||
|
args.items[0] = WINDOW_OBJ(wp->handle);
|
||||||
|
args.items[1] = BUFFER_OBJ(buf->handle);
|
||||||
|
args.items[2] = INTEGER_OBJ(lnum-1);
|
||||||
|
lua_attr_active = true;
|
||||||
|
extra_check = true;
|
||||||
|
Object o = executor_exec_lua_cb(buf->b_luahl_line, "line",
|
||||||
|
args, true, &err);
|
||||||
|
lua_attr_active = false;
|
||||||
|
if (o.type == kObjectTypeString) {
|
||||||
|
// TODO(bfredl): this is a bit of a hack. A final API should use an
|
||||||
|
// "unified" interface where luahl can add both bufhl and virttext
|
||||||
|
luatext = o.data.string.data;
|
||||||
|
do_virttext = true;
|
||||||
|
} else if (ERROR_SET(&err)) {
|
||||||
|
ELOG("error in luahl line: %s", err.msg);
|
||||||
|
luatext = err.msg;
|
||||||
|
do_virttext = true;
|
||||||
|
api_clear_error(&err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (has_spell && !number_only) {
|
if (has_spell && !number_only) {
|
||||||
// For checking first word with a capital skip white space.
|
// For checking first word with a capital skip white space.
|
||||||
if (cap_col == 0) {
|
if (cap_col == 0) {
|
||||||
|
@ -3429,6 +3517,10 @@ win_line (
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (buf->b_luahl && v > 0 && v < (long)lua_attr_bufsize+1) {
|
||||||
|
char_attr = hl_combine_attr(char_attr, lua_attr_buf[v-1]);
|
||||||
|
}
|
||||||
|
|
||||||
if (wp->w_buffer->terminal) {
|
if (wp->w_buffer->terminal) {
|
||||||
char_attr = hl_combine_attr(term_attrs[vcol], char_attr);
|
char_attr = hl_combine_attr(term_attrs[vcol], char_attr);
|
||||||
}
|
}
|
||||||
|
@ -3917,8 +4009,14 @@ win_line (
|
||||||
int rightmost_vcol = 0;
|
int rightmost_vcol = 0;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
VirtText virt_text = do_virttext ? bufhl_info.line->virt_text
|
VirtText virt_text;
|
||||||
: (VirtText)KV_INITIAL_VALUE;
|
if (luatext) {
|
||||||
|
virt_text = (VirtText)KV_INITIAL_VALUE;
|
||||||
|
kv_push(virt_text, ((VirtTextChunk){ .text = luatext, .hl_id = 0 }));
|
||||||
|
} else {
|
||||||
|
virt_text = do_virttext ? bufhl_info.line->virt_text
|
||||||
|
: (VirtText)KV_INITIAL_VALUE;
|
||||||
|
}
|
||||||
size_t virt_pos = 0;
|
size_t virt_pos = 0;
|
||||||
LineState s = LINE_STATE((char_u *)"");
|
LineState s = LINE_STATE((char_u *)"");
|
||||||
int virt_attr = 0;
|
int virt_attr = 0;
|
||||||
|
@ -4319,6 +4417,7 @@ win_line (
|
||||||
}
|
}
|
||||||
|
|
||||||
xfree(p_extra_free);
|
xfree(p_extra_free);
|
||||||
|
xfree(luatext);
|
||||||
return row;
|
return row;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,19 @@ extern "C" {
|
||||||
/* Section - ABI Versioning */
|
/* Section - ABI Versioning */
|
||||||
/****************************/
|
/****************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The latest ABI version that is supported by the current version of the
|
||||||
|
* library. When Languages are generated by the Tree-sitter CLI, they are
|
||||||
|
* assigned an ABI version number that corresponds to the current CLI version.
|
||||||
|
* The Tree-sitter library is generally backwards-compatible with languages
|
||||||
|
* generated using older CLI versions, but is not forwards-compatible.
|
||||||
|
*/
|
||||||
#define TREE_SITTER_LANGUAGE_VERSION 11
|
#define TREE_SITTER_LANGUAGE_VERSION 11
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The earliest ABI version that is supported by the current version of the
|
||||||
|
* library.
|
||||||
|
*/
|
||||||
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
|
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
|
||||||
|
|
||||||
/*******************/
|
/*******************/
|
||||||
|
@ -26,6 +38,8 @@ typedef uint16_t TSFieldId;
|
||||||
typedef struct TSLanguage TSLanguage;
|
typedef struct TSLanguage TSLanguage;
|
||||||
typedef struct TSParser TSParser;
|
typedef struct TSParser TSParser;
|
||||||
typedef struct TSTree TSTree;
|
typedef struct TSTree TSTree;
|
||||||
|
typedef struct TSQuery TSQuery;
|
||||||
|
typedef struct TSQueryCursor TSQueryCursor;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
TSInputEncodingUTF8,
|
TSInputEncodingUTF8,
|
||||||
|
@ -87,6 +101,37 @@ typedef struct {
|
||||||
uint32_t context[2];
|
uint32_t context[2];
|
||||||
} TSTreeCursor;
|
} TSTreeCursor;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TSNode node;
|
||||||
|
uint32_t index;
|
||||||
|
} TSQueryCapture;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint32_t id;
|
||||||
|
uint16_t pattern_index;
|
||||||
|
uint16_t capture_count;
|
||||||
|
const TSQueryCapture *captures;
|
||||||
|
} TSQueryMatch;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
TSQueryPredicateStepTypeDone,
|
||||||
|
TSQueryPredicateStepTypeCapture,
|
||||||
|
TSQueryPredicateStepTypeString,
|
||||||
|
} TSQueryPredicateStepType;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TSQueryPredicateStepType type;
|
||||||
|
uint32_t value_id;
|
||||||
|
} TSQueryPredicateStep;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
TSQueryErrorNone = 0,
|
||||||
|
TSQueryErrorSyntax,
|
||||||
|
TSQueryErrorNodeType,
|
||||||
|
TSQueryErrorField,
|
||||||
|
TSQueryErrorCapture,
|
||||||
|
} TSQueryError;
|
||||||
|
|
||||||
/********************/
|
/********************/
|
||||||
/* Section - Parser */
|
/* Section - Parser */
|
||||||
/********************/
|
/********************/
|
||||||
|
@ -119,7 +164,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language);
|
||||||
const TSLanguage *ts_parser_language(const TSParser *self);
|
const TSLanguage *ts_parser_language(const TSParser *self);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the spans of text that the parser should include when parsing.
|
* Set the ranges of text that the parser should include when parsing.
|
||||||
*
|
*
|
||||||
* By default, the parser will always include entire documents. This function
|
* By default, the parser will always include entire documents. This function
|
||||||
* allows you to parse only a *portion* of a document but still return a syntax
|
* allows you to parse only a *portion* of a document but still return a syntax
|
||||||
|
@ -226,14 +271,16 @@ TSTree *ts_parser_parse_string_encoding(
|
||||||
* by default, it will resume where it left off on the next call to
|
* by default, it will resume where it left off on the next call to
|
||||||
* `ts_parser_parse` or other parsing functions. If you don't want to resume,
|
* `ts_parser_parse` or other parsing functions. If you don't want to resume,
|
||||||
* and instead intend to use this parser to parse some other document, you must
|
* and instead intend to use this parser to parse some other document, you must
|
||||||
* call this `ts_parser_reset` first.
|
* call `ts_parser_reset` first.
|
||||||
*/
|
*/
|
||||||
void ts_parser_reset(TSParser *self);
|
void ts_parser_reset(TSParser *self);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the maximum duration in microseconds that parsing should be allowed to
|
* Set the maximum duration in microseconds that parsing should be allowed to
|
||||||
* take before halting. If parsing takes longer than this, it will halt early,
|
* take before halting.
|
||||||
* returning NULL. See `ts_parser_parse` for more information.
|
*
|
||||||
|
* If parsing takes longer than this, it will halt early, returning NULL.
|
||||||
|
* See `ts_parser_parse` for more information.
|
||||||
*/
|
*/
|
||||||
void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
|
void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
|
||||||
|
|
||||||
|
@ -243,10 +290,11 @@ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
|
||||||
uint64_t ts_parser_timeout_micros(const TSParser *self);
|
uint64_t ts_parser_timeout_micros(const TSParser *self);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the parser's current cancellation flag pointer. If a non-null pointer is
|
* Set the parser's current cancellation flag pointer.
|
||||||
* assigned, then the parser will periodically read from this pointer during
|
*
|
||||||
* parsing. If it reads a non-zero value, it will halt early, returning NULL.
|
* If a non-null pointer is assigned, then the parser will periodically read
|
||||||
* See `ts_parser_parse` for more information.
|
* from this pointer during parsing. If it reads a non-zero value, it will
|
||||||
|
* halt early, returning NULL. See `ts_parser_parse` for more information.
|
||||||
*/
|
*/
|
||||||
void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag);
|
void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag);
|
||||||
|
|
||||||
|
@ -322,22 +370,22 @@ const TSLanguage *ts_tree_language(const TSTree *);
|
||||||
void ts_tree_edit(TSTree *self, const TSInputEdit *edit);
|
void ts_tree_edit(TSTree *self, const TSInputEdit *edit);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compare a new syntax tree to a previous syntax tree representing the same
|
* Compare an old edited syntax tree to a new syntax tree representing the same
|
||||||
* document, returning an array of ranges whose syntactic structure has changed.
|
* document, returning an array of ranges whose syntactic structure has changed.
|
||||||
*
|
*
|
||||||
* For this to work correctly, the old syntax tree must have been edited such
|
* For this to work correctly, the old syntax tree must have been edited such
|
||||||
* that its ranges match up to the new tree. Generally, you'll want to call
|
* that its ranges match up to the new tree. Generally, you'll want to call
|
||||||
* this function right after calling one of the `ts_parser_parse` functions,
|
* this function right after calling one of the `ts_parser_parse` functions.
|
||||||
* passing in the new tree that was returned from `ts_parser_parse` and the old
|
* You need to pass the old tree that was passed to parse, as well as the new
|
||||||
* tree that was passed as a parameter.
|
* tree that was returned from that function.
|
||||||
*
|
*
|
||||||
* The returned array is allocated using `malloc` and the caller is responsible
|
* The returned array is allocated using `malloc` and the caller is responsible
|
||||||
* for freeing it using `free`. The length of the array will be written to the
|
* for freeing it using `free`. The length of the array will be written to the
|
||||||
* given `length` pointer.
|
* given `length` pointer.
|
||||||
*/
|
*/
|
||||||
TSRange *ts_tree_get_changed_ranges(
|
TSRange *ts_tree_get_changed_ranges(
|
||||||
const TSTree *self,
|
|
||||||
const TSTree *old_tree,
|
const TSTree *old_tree,
|
||||||
|
const TSTree *new_tree,
|
||||||
uint32_t *length
|
uint32_t *length
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -409,8 +457,8 @@ bool ts_node_is_named(TSNode);
|
||||||
bool ts_node_is_missing(TSNode);
|
bool ts_node_is_missing(TSNode);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if the node is *missing*. Missing nodes are inserted by the parser in
|
* Check if the node is *extra*. Extra nodes represent things like comments,
|
||||||
* order to recover from certain kinds of syntax errors.
|
* which are not required the grammar, but can appear anywhere.
|
||||||
*/
|
*/
|
||||||
bool ts_node_is_extra(TSNode);
|
bool ts_node_is_extra(TSNode);
|
||||||
|
|
||||||
|
@ -542,7 +590,7 @@ TSTreeCursor ts_tree_cursor_new(TSNode);
|
||||||
void ts_tree_cursor_delete(TSTreeCursor *);
|
void ts_tree_cursor_delete(TSTreeCursor *);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Re-initialize a tree cursor to start at a different ndoe.
|
* Re-initialize a tree cursor to start at a different node.
|
||||||
*/
|
*/
|
||||||
void ts_tree_cursor_reset(TSTreeCursor *, TSNode);
|
void ts_tree_cursor_reset(TSTreeCursor *, TSNode);
|
||||||
|
|
||||||
|
@ -584,7 +632,7 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *);
|
||||||
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
|
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Move the cursor to the first schild of its current node.
|
* Move the cursor to the first child of its current node.
|
||||||
*
|
*
|
||||||
* This returns `true` if the cursor successfully moved, and returns `false`
|
* This returns `true` if the cursor successfully moved, and returns `false`
|
||||||
* if there were no children.
|
* if there were no children.
|
||||||
|
@ -592,7 +640,7 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
|
||||||
bool ts_tree_cursor_goto_first_child(TSTreeCursor *);
|
bool ts_tree_cursor_goto_first_child(TSTreeCursor *);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Move the cursor to the first schild of its current node that extends beyond
|
* Move the cursor to the first child of its current node that extends beyond
|
||||||
* the given byte offset.
|
* the given byte offset.
|
||||||
*
|
*
|
||||||
* This returns the index of the child node if one was found, and returns -1
|
* This returns the index of the child node if one was found, and returns -1
|
||||||
|
@ -602,6 +650,156 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t);
|
||||||
|
|
||||||
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *);
|
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *);
|
||||||
|
|
||||||
|
/*******************/
|
||||||
|
/* Section - Query */
|
||||||
|
/*******************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new query from a string containing one or more S-expression
|
||||||
|
* patterns. The query is associated with a particular language, and can
|
||||||
|
* only be run on syntax nodes parsed with that language.
|
||||||
|
*
|
||||||
|
* If all of the given patterns are valid, this returns a `TSQuery`.
|
||||||
|
* If a pattern is invalid, this returns `NULL`, and provides two pieces
|
||||||
|
* of information about the problem:
|
||||||
|
* 1. The byte offset of the error is written to the `error_offset` parameter.
|
||||||
|
* 2. The type of error is written to the `error_type` parameter.
|
||||||
|
*/
|
||||||
|
TSQuery *ts_query_new(
|
||||||
|
const TSLanguage *language,
|
||||||
|
const char *source,
|
||||||
|
uint32_t source_len,
|
||||||
|
uint32_t *error_offset,
|
||||||
|
TSQueryError *error_type
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete a query, freeing all of the memory that it used.
|
||||||
|
*/
|
||||||
|
void ts_query_delete(TSQuery *);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the number of patterns, captures, or string literals in the query.
|
||||||
|
*/
|
||||||
|
uint32_t ts_query_pattern_count(const TSQuery *);
|
||||||
|
uint32_t ts_query_capture_count(const TSQuery *);
|
||||||
|
uint32_t ts_query_string_count(const TSQuery *);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the byte offset where the given pattern starts in the query's source.
|
||||||
|
*
|
||||||
|
* This can be useful when combining queries by concatenating their source
|
||||||
|
* code strings.
|
||||||
|
*/
|
||||||
|
uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get all of the predicates for the given pattern in the query.
|
||||||
|
*
|
||||||
|
* The predicates are represented as a single array of steps. There are three
|
||||||
|
* types of steps in this array, which correspond to the three legal values for
|
||||||
|
* the `type` field:
|
||||||
|
* - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names
|
||||||
|
* of captures. Their `value_id` can be used with the
|
||||||
|
* `ts_query_capture_name_for_id` function to obtain the name of the capture.
|
||||||
|
* - `TSQueryPredicateStepTypeString` - Steps with this type represent literal
|
||||||
|
* strings. Their `value_id` can be used with the
|
||||||
|
* `ts_query_string_value_for_id` function to obtain their string value.
|
||||||
|
* - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels*
|
||||||
|
* that represent the end of an individual predicate. If a pattern has two
|
||||||
|
* predicates, then there will be two steps with this `type` in the array.
|
||||||
|
*/
|
||||||
|
const TSQueryPredicateStep *ts_query_predicates_for_pattern(
|
||||||
|
const TSQuery *self,
|
||||||
|
uint32_t pattern_index,
|
||||||
|
uint32_t *length
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the name and length of one of the query's captures, or one of the
|
||||||
|
* query's string literals. Each capture and string is associated with a
|
||||||
|
* numeric id based on the order that it appeared in the query's source.
|
||||||
|
*/
|
||||||
|
const char *ts_query_capture_name_for_id(
|
||||||
|
const TSQuery *,
|
||||||
|
uint32_t id,
|
||||||
|
uint32_t *length
|
||||||
|
);
|
||||||
|
const char *ts_query_string_value_for_id(
|
||||||
|
const TSQuery *,
|
||||||
|
uint32_t id,
|
||||||
|
uint32_t *length
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Disable a certain capture within a query. This prevents the capture
|
||||||
|
* from being returned in matches, and also avoids any resource usage
|
||||||
|
* associated with recording the capture.
|
||||||
|
*/
|
||||||
|
void ts_query_disable_capture(TSQuery *, const char *, uint32_t);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new cursor for executing a given query.
|
||||||
|
*
|
||||||
|
* The cursor stores the state that is needed to iteratively search
|
||||||
|
* for matches. To use the query cursor, first call `ts_query_cursor_exec`
|
||||||
|
* to start running a given query on a given syntax node. Then, there are
|
||||||
|
* two options for consuming the results of the query:
|
||||||
|
* 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the
|
||||||
|
* the *matches* in the order that they were found. Each match contains the
|
||||||
|
* index of the pattern that matched, and an array of captures. Because
|
||||||
|
* multiple patterns can match the same set of nodes, one match may contain
|
||||||
|
* captures that appear *before* some of the captures from a previous match.
|
||||||
|
* 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the
|
||||||
|
* individual *captures* in the order that they appear. This is useful if
|
||||||
|
* don't care about which pattern matched, and just want a single ordered
|
||||||
|
* sequence of captures.
|
||||||
|
*
|
||||||
|
* If you don't care about consuming all of the results, you can stop calling
|
||||||
|
* `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point.
|
||||||
|
* You can then start executing another query on another node by calling
|
||||||
|
* `ts_query_cursor_exec` again.
|
||||||
|
*/
|
||||||
|
TSQueryCursor *ts_query_cursor_new(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete a query cursor, freeing all of the memory that it used.
|
||||||
|
*/
|
||||||
|
void ts_query_cursor_delete(TSQueryCursor *);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start running a given query on a given node.
|
||||||
|
*/
|
||||||
|
void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the range of bytes or (row, column) positions in which the query
|
||||||
|
* will be executed.
|
||||||
|
*/
|
||||||
|
void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t);
|
||||||
|
void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance to the next match of the currently running query.
|
||||||
|
*
|
||||||
|
* If there is a match, write it to `*match` and return `true`.
|
||||||
|
* Otherwise, return `false`.
|
||||||
|
*/
|
||||||
|
bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match);
|
||||||
|
void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance to the next capture of the currently running query.
|
||||||
|
*
|
||||||
|
* If there is a capture, write its match to `*match` and its index within
|
||||||
|
* the matche's capture list to `*capture_index`. Otherwise, return `false`.
|
||||||
|
*/
|
||||||
|
bool ts_query_cursor_next_capture(
|
||||||
|
TSQueryCursor *,
|
||||||
|
TSQueryMatch *match,
|
||||||
|
uint32_t *capture_index
|
||||||
|
);
|
||||||
|
|
||||||
/**********************/
|
/**********************/
|
||||||
/* Section - Language */
|
/* Section - Language */
|
||||||
/**********************/
|
/**********************/
|
||||||
|
@ -619,7 +817,12 @@ const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
|
||||||
/**
|
/**
|
||||||
* Get the numerical id for the given node type string.
|
* Get the numerical id for the given node type string.
|
||||||
*/
|
*/
|
||||||
TSSymbol ts_language_symbol_for_name(const TSLanguage *, const char *);
|
TSSymbol ts_language_symbol_for_name(
|
||||||
|
const TSLanguage *self,
|
||||||
|
const char *string,
|
||||||
|
uint32_t length,
|
||||||
|
bool is_named
|
||||||
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the number of distinct field names in the language.
|
* Get the number of distinct field names in the language.
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
#ifndef TREE_SITTER_BITS_H_
|
||||||
|
#define TREE_SITTER_BITS_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
static inline uint32_t bitmask_for_index(uint16_t id) {
|
||||||
|
return (1u << (31 - id));
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
|
||||||
|
#include <intrin.h>
|
||||||
|
|
||||||
|
static inline uint32_t count_leading_zeros(uint32_t x) {
|
||||||
|
if (x == 0) return 32;
|
||||||
|
uint32_t result;
|
||||||
|
_BitScanReverse(&result, x);
|
||||||
|
return 31 - result;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
static inline uint32_t count_leading_zeros(uint32_t x) {
|
||||||
|
if (x == 0) return 32;
|
||||||
|
return __builtin_clz(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif // TREE_SITTER_BITS_H_
|
|
@ -3,8 +3,28 @@
|
||||||
#include "./error_costs.h"
|
#include "./error_costs.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
void ts_language_table_entry(const TSLanguage *self, TSStateId state,
|
uint32_t ts_language_symbol_count(const TSLanguage *self) {
|
||||||
TSSymbol symbol, TableEntry *result) {
|
return self->symbol_count + self->alias_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t ts_language_version(const TSLanguage *self) {
|
||||||
|
return self->version;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t ts_language_field_count(const TSLanguage *self) {
|
||||||
|
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
|
||||||
|
return self->field_count;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ts_language_table_entry(
|
||||||
|
const TSLanguage *self,
|
||||||
|
TSStateId state,
|
||||||
|
TSSymbol symbol,
|
||||||
|
TableEntry *result
|
||||||
|
) {
|
||||||
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
|
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
|
||||||
result->action_count = 0;
|
result->action_count = 0;
|
||||||
result->is_reusable = false;
|
result->is_reusable = false;
|
||||||
|
@ -19,48 +39,72 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t ts_language_symbol_count(const TSLanguage *language) {
|
TSSymbolMetadata ts_language_symbol_metadata(
|
||||||
return language->symbol_count + language->alias_count;
|
const TSLanguage *self,
|
||||||
}
|
TSSymbol symbol
|
||||||
|
) {
|
||||||
uint32_t ts_language_version(const TSLanguage *language) {
|
|
||||||
return language->version;
|
|
||||||
}
|
|
||||||
|
|
||||||
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) {
|
|
||||||
if (symbol == ts_builtin_sym_error) {
|
if (symbol == ts_builtin_sym_error) {
|
||||||
return (TSSymbolMetadata){.visible = true, .named = true};
|
return (TSSymbolMetadata){.visible = true, .named = true};
|
||||||
} else if (symbol == ts_builtin_sym_error_repeat) {
|
} else if (symbol == ts_builtin_sym_error_repeat) {
|
||||||
return (TSSymbolMetadata){.visible = false, .named = false};
|
return (TSSymbolMetadata){.visible = false, .named = false};
|
||||||
} else {
|
} else {
|
||||||
return language->symbol_metadata[symbol];
|
return self->symbol_metadata[symbol];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *ts_language_symbol_name(const TSLanguage *language, TSSymbol symbol) {
|
TSSymbol ts_language_public_symbol(
|
||||||
|
const TSLanguage *self,
|
||||||
|
TSSymbol symbol
|
||||||
|
) {
|
||||||
|
if (symbol == ts_builtin_sym_error) return symbol;
|
||||||
|
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
|
||||||
|
return self->public_symbol_map[symbol];
|
||||||
|
} else {
|
||||||
|
return symbol;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *ts_language_symbol_name(
|
||||||
|
const TSLanguage *self,
|
||||||
|
TSSymbol symbol
|
||||||
|
) {
|
||||||
if (symbol == ts_builtin_sym_error) {
|
if (symbol == ts_builtin_sym_error) {
|
||||||
return "ERROR";
|
return "ERROR";
|
||||||
} else if (symbol == ts_builtin_sym_error_repeat) {
|
} else if (symbol == ts_builtin_sym_error_repeat) {
|
||||||
return "_ERROR";
|
return "_ERROR";
|
||||||
} else {
|
} else {
|
||||||
return language->symbol_names[symbol];
|
return self->symbol_names[symbol];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TSSymbol ts_language_symbol_for_name(const TSLanguage *self, const char *name) {
|
TSSymbol ts_language_symbol_for_name(
|
||||||
if (!strcmp(name, "ERROR")) return ts_builtin_sym_error;
|
const TSLanguage *self,
|
||||||
|
const char *string,
|
||||||
|
uint32_t length,
|
||||||
|
bool is_named
|
||||||
|
) {
|
||||||
|
if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error;
|
||||||
uint32_t count = ts_language_symbol_count(self);
|
uint32_t count = ts_language_symbol_count(self);
|
||||||
for (TSSymbol i = 0; i < count; i++) {
|
for (TSSymbol i = 0; i < count; i++) {
|
||||||
if (!strcmp(self->symbol_names[i], name)) {
|
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i);
|
||||||
return i;
|
if (!metadata.visible || metadata.named != is_named) continue;
|
||||||
|
const char *symbol_name = self->symbol_names[i];
|
||||||
|
if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
|
||||||
|
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
|
||||||
|
return self->public_symbol_map[i];
|
||||||
|
} else {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol) {
|
TSSymbolType ts_language_symbol_type(
|
||||||
TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
|
const TSLanguage *self,
|
||||||
|
TSSymbol symbol
|
||||||
|
) {
|
||||||
|
TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol);
|
||||||
if (metadata.named) {
|
if (metadata.named) {
|
||||||
return TSSymbolTypeRegular;
|
return TSSymbolTypeRegular;
|
||||||
} else if (metadata.visible) {
|
} else if (metadata.visible) {
|
||||||
|
@ -70,15 +114,10 @@ TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t ts_language_field_count(const TSLanguage *self) {
|
const char *ts_language_field_name_for_id(
|
||||||
if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
|
const TSLanguage *self,
|
||||||
return self->field_count;
|
TSFieldId id
|
||||||
} else {
|
) {
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id) {
|
|
||||||
uint32_t count = ts_language_field_count(self);
|
uint32_t count = ts_language_field_count(self);
|
||||||
if (count) {
|
if (count) {
|
||||||
return self->field_names[id];
|
return self->field_names[id];
|
||||||
|
@ -96,7 +135,8 @@ TSFieldId ts_language_field_id_for_name(
|
||||||
for (TSSymbol i = 1; i < count + 1; i++) {
|
for (TSSymbol i = 1; i < count + 1; i++) {
|
||||||
switch (strncmp(name, self->field_names[i], name_length)) {
|
switch (strncmp(name, self->field_names[i], name_length)) {
|
||||||
case 0:
|
case 0:
|
||||||
return i;
|
if (self->field_names[i][name_length] == 0) return i;
|
||||||
|
break;
|
||||||
case -1:
|
case -1:
|
||||||
return 0;
|
return 0;
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -10,6 +10,7 @@ extern "C" {
|
||||||
|
|
||||||
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
|
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
|
||||||
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
|
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
|
||||||
|
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
|
||||||
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
|
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -22,6 +23,8 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry
|
||||||
|
|
||||||
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
|
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
|
||||||
|
|
||||||
|
TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol);
|
||||||
|
|
||||||
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
|
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
|
||||||
return 0 < symbol && symbol < self->external_token_count + 1;
|
return 0 < symbol && symbol < self->external_token_count + 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,26 +2,58 @@
|
||||||
#include "./lexer.h"
|
#include "./lexer.h"
|
||||||
#include "./subtree.h"
|
#include "./subtree.h"
|
||||||
#include "./length.h"
|
#include "./length.h"
|
||||||
#include "./utf16.h"
|
#include "./unicode.h"
|
||||||
#include "utf8proc.h"
|
|
||||||
|
|
||||||
#define LOG(...) \
|
#define LOG(message, character) \
|
||||||
if (self->logger.log) { \
|
if (self->logger.log) { \
|
||||||
snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
|
snprintf( \
|
||||||
self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
|
self->debug_buffer, \
|
||||||
|
TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
|
||||||
|
32 <= character && character < 127 ? \
|
||||||
|
message " character:'%c'" : \
|
||||||
|
message " character:%d", \
|
||||||
|
character \
|
||||||
|
); \
|
||||||
|
self->logger.log( \
|
||||||
|
self->logger.payload, \
|
||||||
|
TSLogTypeLex, \
|
||||||
|
self->debug_buffer \
|
||||||
|
); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LOG_CHARACTER(message, character) \
|
|
||||||
LOG( \
|
|
||||||
32 <= character && character < 127 ? \
|
|
||||||
message " character:'%c'" : \
|
|
||||||
message " character:%d", character \
|
|
||||||
)
|
|
||||||
|
|
||||||
static const char empty_chunk[3] = { 0, 0 };
|
|
||||||
|
|
||||||
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
|
static const int32_t BYTE_ORDER_MARK = 0xFEFF;
|
||||||
|
|
||||||
|
static const TSRange DEFAULT_RANGE = {
|
||||||
|
.start_point = {
|
||||||
|
.row = 0,
|
||||||
|
.column = 0,
|
||||||
|
},
|
||||||
|
.end_point = {
|
||||||
|
.row = UINT32_MAX,
|
||||||
|
.column = UINT32_MAX,
|
||||||
|
},
|
||||||
|
.start_byte = 0,
|
||||||
|
.end_byte = UINT32_MAX
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check if the lexer has reached EOF. This state is stored
|
||||||
|
// by setting the lexer's `current_included_range_index` such that
|
||||||
|
// it has consumed all of its available ranges.
|
||||||
|
static bool ts_lexer__eof(const TSLexer *_self) {
|
||||||
|
Lexer *self = (Lexer *)_self;
|
||||||
|
return self->current_included_range_index == self->included_range_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clear the currently stored chunk of source code, because the lexer's
|
||||||
|
// position has changed.
|
||||||
|
static void ts_lexer__clear_chunk(Lexer *self) {
|
||||||
|
self->chunk = NULL;
|
||||||
|
self->chunk_size = 0;
|
||||||
|
self->chunk_start = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call the lexer's input callback to obtain a new chunk of source code
|
||||||
|
// for the current position.
|
||||||
static void ts_lexer__get_chunk(Lexer *self) {
|
static void ts_lexer__get_chunk(Lexer *self) {
|
||||||
self->chunk_start = self->current_position.bytes;
|
self->chunk_start = self->current_position.bytes;
|
||||||
self->chunk = self->input.read(
|
self->chunk = self->input.read(
|
||||||
|
@ -30,15 +62,15 @@ static void ts_lexer__get_chunk(Lexer *self) {
|
||||||
self->current_position.extent,
|
self->current_position.extent,
|
||||||
&self->chunk_size
|
&self->chunk_size
|
||||||
);
|
);
|
||||||
if (!self->chunk_size) self->chunk = empty_chunk;
|
if (!self->chunk_size) {
|
||||||
|
self->current_included_range_index = self->included_range_count;
|
||||||
|
self->chunk = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef utf8proc_ssize_t (*DecodeFunction)(
|
// Decode the next unicode character in the current chunk of source code.
|
||||||
const utf8proc_uint8_t *,
|
// This assumes that the lexer has already retrieved a chunk of source
|
||||||
utf8proc_ssize_t,
|
// code that spans the current position.
|
||||||
utf8proc_int32_t *
|
|
||||||
);
|
|
||||||
|
|
||||||
static void ts_lexer__get_lookahead(Lexer *self) {
|
static void ts_lexer__get_lookahead(Lexer *self) {
|
||||||
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
|
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
|
||||||
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
|
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
|
||||||
|
@ -50,29 +82,37 @@ static void ts_lexer__get_lookahead(Lexer *self) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
DecodeFunction decode =
|
UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
|
||||||
self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate;
|
? ts_decode_utf8
|
||||||
|
: ts_decode_utf16;
|
||||||
|
|
||||||
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
|
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
|
||||||
|
|
||||||
// If this chunk ended in the middle of a multi-byte character,
|
// If this chunk ended in the middle of a multi-byte character,
|
||||||
// try again with a fresh chunk.
|
// try again with a fresh chunk.
|
||||||
if (self->data.lookahead == -1 && size < 4) {
|
if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
|
||||||
ts_lexer__get_chunk(self);
|
ts_lexer__get_chunk(self);
|
||||||
chunk = (const uint8_t *)self->chunk;
|
chunk = (const uint8_t *)self->chunk;
|
||||||
size = self->chunk_size;
|
size = self->chunk_size;
|
||||||
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
|
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (self->data.lookahead == -1) {
|
if (self->data.lookahead == TS_DECODE_ERROR) {
|
||||||
self->lookahead_size = 1;
|
self->lookahead_size = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ts_lexer__advance(TSLexer *payload, bool skip) {
|
// Advance to the next character in the source code, retrieving a new
|
||||||
Lexer *self = (Lexer *)payload;
|
// chunk of source code if needed.
|
||||||
if (self->chunk == empty_chunk)
|
static void ts_lexer__advance(TSLexer *_self, bool skip) {
|
||||||
return;
|
Lexer *self = (Lexer *)_self;
|
||||||
|
if (!self->chunk) return;
|
||||||
|
|
||||||
|
if (skip) {
|
||||||
|
LOG("skip", self->data.lookahead);
|
||||||
|
} else {
|
||||||
|
LOG("consume", self->data.lookahead);
|
||||||
|
}
|
||||||
|
|
||||||
if (self->lookahead_size) {
|
if (self->lookahead_size) {
|
||||||
self->current_position.bytes += self->lookahead_size;
|
self->current_position.bytes += self->lookahead_size;
|
||||||
|
@ -84,53 +124,65 @@ static void ts_lexer__advance(TSLexer *payload, bool skip) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
|
const TSRange *current_range = NULL;
|
||||||
if (self->current_position.bytes == current_range->end_byte) {
|
if (self->current_included_range_index < self->included_range_count) {
|
||||||
self->current_included_range_index++;
|
current_range = &self->included_ranges[self->current_included_range_index];
|
||||||
if (self->current_included_range_index == self->included_range_count) {
|
if (self->current_position.bytes == current_range->end_byte) {
|
||||||
self->data.lookahead = '\0';
|
self->current_included_range_index++;
|
||||||
self->lookahead_size = 1;
|
if (self->current_included_range_index < self->included_range_count) {
|
||||||
return;
|
current_range++;
|
||||||
} else {
|
self->current_position = (Length) {
|
||||||
current_range++;
|
current_range->start_byte,
|
||||||
self->current_position = (Length) {
|
current_range->start_point,
|
||||||
current_range->start_byte,
|
};
|
||||||
current_range->start_point,
|
} else {
|
||||||
};
|
current_range = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (skip) {
|
if (skip) self->token_start_position = self->current_position;
|
||||||
LOG_CHARACTER("skip", self->data.lookahead);
|
|
||||||
self->token_start_position = self->current_position;
|
if (current_range) {
|
||||||
|
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
|
||||||
|
ts_lexer__get_chunk(self);
|
||||||
|
}
|
||||||
|
ts_lexer__get_lookahead(self);
|
||||||
} else {
|
} else {
|
||||||
LOG_CHARACTER("consume", self->data.lookahead);
|
ts_lexer__clear_chunk(self);
|
||||||
}
|
self->data.lookahead = '\0';
|
||||||
|
self->lookahead_size = 1;
|
||||||
if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
|
|
||||||
ts_lexer__get_chunk(self);
|
|
||||||
}
|
|
||||||
|
|
||||||
ts_lexer__get_lookahead(self);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ts_lexer__mark_end(TSLexer *payload) {
|
|
||||||
Lexer *self = (Lexer *)payload;
|
|
||||||
TSRange *current_included_range = &self->included_ranges[self->current_included_range_index];
|
|
||||||
if (self->current_included_range_index > 0 &&
|
|
||||||
self->current_position.bytes == current_included_range->start_byte) {
|
|
||||||
TSRange *previous_included_range = current_included_range - 1;
|
|
||||||
self->token_end_position = (Length) {
|
|
||||||
previous_included_range->end_byte,
|
|
||||||
previous_included_range->end_point,
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
self->token_end_position = self->current_position;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t ts_lexer__get_column(TSLexer *payload) {
|
// Mark that a token match has completed. This can be called multiple
|
||||||
Lexer *self = (Lexer *)payload;
|
// times if a longer match is found later.
|
||||||
|
static void ts_lexer__mark_end(TSLexer *_self) {
|
||||||
|
Lexer *self = (Lexer *)_self;
|
||||||
|
if (!ts_lexer__eof(&self->data)) {
|
||||||
|
// If the lexer is right at the beginning of included range,
|
||||||
|
// then the token should be considered to end at the *end* of the
|
||||||
|
// previous included range, rather than here.
|
||||||
|
TSRange *current_included_range = &self->included_ranges[
|
||||||
|
self->current_included_range_index
|
||||||
|
];
|
||||||
|
if (
|
||||||
|
self->current_included_range_index > 0 &&
|
||||||
|
self->current_position.bytes == current_included_range->start_byte
|
||||||
|
) {
|
||||||
|
TSRange *previous_included_range = current_included_range - 1;
|
||||||
|
self->token_end_position = (Length) {
|
||||||
|
previous_included_range->end_byte,
|
||||||
|
previous_included_range->end_point,
|
||||||
|
};
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self->token_end_position = self->current_position;
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint32_t ts_lexer__get_column(TSLexer *_self) {
|
||||||
|
Lexer *self = (Lexer *)_self;
|
||||||
uint32_t goal_byte = self->current_position.bytes;
|
uint32_t goal_byte = self->current_position.bytes;
|
||||||
|
|
||||||
self->current_position.bytes -= self->current_position.extent.column;
|
self->current_position.bytes -= self->current_position.extent.column;
|
||||||
|
@ -142,67 +194,69 @@ static uint32_t ts_lexer__get_column(TSLexer *payload) {
|
||||||
|
|
||||||
uint32_t result = 0;
|
uint32_t result = 0;
|
||||||
while (self->current_position.bytes < goal_byte) {
|
while (self->current_position.bytes < goal_byte) {
|
||||||
ts_lexer__advance(payload, false);
|
ts_lexer__advance(&self->data, false);
|
||||||
result++;
|
result++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ts_lexer__is_at_included_range_start(TSLexer *payload) {
|
// Is the lexer at a boundary between two disjoint included ranges of
|
||||||
const Lexer *self = (const Lexer *)payload;
|
// source code? This is exposed as an API because some languages' external
|
||||||
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
|
// scanners need to perform custom actions at these bounaries.
|
||||||
return self->current_position.bytes == current_range->start_byte;
|
static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
|
||||||
|
const Lexer *self = (const Lexer *)_self;
|
||||||
|
if (self->current_included_range_index < self->included_range_count) {
|
||||||
|
TSRange *current_range = &self->included_ranges[self->current_included_range_index];
|
||||||
|
return self->current_position.bytes == current_range->start_byte;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The lexer's methods are stored as a struct field so that generated
|
|
||||||
// parsers can call them without needing to be linked against this library.
|
|
||||||
|
|
||||||
void ts_lexer_init(Lexer *self) {
|
void ts_lexer_init(Lexer *self) {
|
||||||
*self = (Lexer) {
|
*self = (Lexer) {
|
||||||
.data = {
|
.data = {
|
||||||
|
// The lexer's methods are stored as struct fields so that generated
|
||||||
|
// parsers can call them without needing to be linked against this
|
||||||
|
// library.
|
||||||
.advance = ts_lexer__advance,
|
.advance = ts_lexer__advance,
|
||||||
.mark_end = ts_lexer__mark_end,
|
.mark_end = ts_lexer__mark_end,
|
||||||
.get_column = ts_lexer__get_column,
|
.get_column = ts_lexer__get_column,
|
||||||
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
|
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
|
||||||
|
.eof = ts_lexer__eof,
|
||||||
.lookahead = 0,
|
.lookahead = 0,
|
||||||
.result_symbol = 0,
|
.result_symbol = 0,
|
||||||
},
|
},
|
||||||
.chunk = NULL,
|
.chunk = NULL,
|
||||||
|
.chunk_size = 0,
|
||||||
.chunk_start = 0,
|
.chunk_start = 0,
|
||||||
.current_position = {UINT32_MAX, {0, 0}},
|
.current_position = {0, {0, 0}},
|
||||||
.logger = {
|
.logger = {
|
||||||
.payload = NULL,
|
.payload = NULL,
|
||||||
.log = NULL
|
.log = NULL
|
||||||
},
|
},
|
||||||
|
.included_ranges = NULL,
|
||||||
|
.included_range_count = 0,
|
||||||
.current_included_range_index = 0,
|
.current_included_range_index = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
self->included_ranges = NULL;
|
|
||||||
ts_lexer_set_included_ranges(self, NULL, 0);
|
ts_lexer_set_included_ranges(self, NULL, 0);
|
||||||
ts_lexer_reset(self, length_zero());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_lexer_delete(Lexer *self) {
|
void ts_lexer_delete(Lexer *self) {
|
||||||
ts_free(self->included_ranges);
|
ts_free(self->included_ranges);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_lexer_set_input(Lexer *self, TSInput input) {
|
|
||||||
self->input = input;
|
|
||||||
self->data.lookahead = 0;
|
|
||||||
self->lookahead_size = 0;
|
|
||||||
self->chunk = 0;
|
|
||||||
self->chunk_start = 0;
|
|
||||||
self->chunk_size = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ts_lexer_goto(Lexer *self, Length position) {
|
static void ts_lexer_goto(Lexer *self, Length position) {
|
||||||
|
self->current_position = position;
|
||||||
bool found_included_range = false;
|
bool found_included_range = false;
|
||||||
|
|
||||||
|
// Move to the first valid position at or after the given position.
|
||||||
for (unsigned i = 0; i < self->included_range_count; i++) {
|
for (unsigned i = 0; i < self->included_range_count; i++) {
|
||||||
TSRange *included_range = &self->included_ranges[i];
|
TSRange *included_range = &self->included_ranges[i];
|
||||||
if (included_range->end_byte > position.bytes) {
|
if (included_range->end_byte > position.bytes) {
|
||||||
if (included_range->start_byte > position.bytes) {
|
if (included_range->start_byte > position.bytes) {
|
||||||
position = (Length) {
|
self->current_position = (Length) {
|
||||||
.bytes = included_range->start_byte,
|
.bytes = included_range->start_byte,
|
||||||
.extent = included_range->start_point,
|
.extent = included_range->start_point,
|
||||||
};
|
};
|
||||||
|
@ -214,46 +268,61 @@ static void ts_lexer_goto(Lexer *self, Length position) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!found_included_range) {
|
if (found_included_range) {
|
||||||
|
// If the current position is outside of the current chunk of text,
|
||||||
|
// then clear out the current chunk of text.
|
||||||
|
if (self->chunk && (
|
||||||
|
position.bytes < self->chunk_start ||
|
||||||
|
position.bytes >= self->chunk_start + self->chunk_size
|
||||||
|
)) {
|
||||||
|
ts_lexer__clear_chunk(self);
|
||||||
|
}
|
||||||
|
|
||||||
|
self->lookahead_size = 0;
|
||||||
|
self->data.lookahead = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the given position is beyond any of included ranges, move to the EOF
|
||||||
|
// state - past the end of the included ranges.
|
||||||
|
else {
|
||||||
|
self->current_included_range_index = self->included_range_count;
|
||||||
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
|
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
|
||||||
position = (Length) {
|
self->current_position = (Length) {
|
||||||
.bytes = last_included_range->end_byte,
|
.bytes = last_included_range->end_byte,
|
||||||
.extent = last_included_range->end_point,
|
.extent = last_included_range->end_point,
|
||||||
};
|
};
|
||||||
self->chunk = empty_chunk;
|
ts_lexer__clear_chunk(self);
|
||||||
self->chunk_start = position.bytes;
|
self->lookahead_size = 1;
|
||||||
self->chunk_size = 2;
|
self->data.lookahead = '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
self->token_start_position = position;
|
|
||||||
self->token_end_position = LENGTH_UNDEFINED;
|
|
||||||
self->current_position = position;
|
|
||||||
|
|
||||||
if (self->chunk && (position.bytes < self->chunk_start ||
|
|
||||||
position.bytes >= self->chunk_start + self->chunk_size)) {
|
|
||||||
self->chunk = 0;
|
|
||||||
self->chunk_start = 0;
|
|
||||||
self->chunk_size = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
self->lookahead_size = 0;
|
|
||||||
self->data.lookahead = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ts_lexer_set_input(Lexer *self, TSInput input) {
|
||||||
|
self->input = input;
|
||||||
|
ts_lexer__clear_chunk(self);
|
||||||
|
ts_lexer_goto(self, self->current_position);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move the lexer to the given position. This doesn't do any work
|
||||||
|
// if the parser is already at the given position.
|
||||||
void ts_lexer_reset(Lexer *self, Length position) {
|
void ts_lexer_reset(Lexer *self, Length position) {
|
||||||
if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position);
|
if (position.bytes != self->current_position.bytes) {
|
||||||
|
ts_lexer_goto(self, position);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_lexer_start(Lexer *self) {
|
void ts_lexer_start(Lexer *self) {
|
||||||
self->token_start_position = self->current_position;
|
self->token_start_position = self->current_position;
|
||||||
self->token_end_position = LENGTH_UNDEFINED;
|
self->token_end_position = LENGTH_UNDEFINED;
|
||||||
self->data.result_symbol = 0;
|
self->data.result_symbol = 0;
|
||||||
if (!self->chunk) ts_lexer__get_chunk(self);
|
if (!ts_lexer__eof(&self->data)) {
|
||||||
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
|
if (!self->chunk_size) ts_lexer__get_chunk(self);
|
||||||
if (
|
if (!self->lookahead_size) ts_lexer__get_lookahead(self);
|
||||||
self->current_position.bytes == 0 &&
|
if (
|
||||||
self->data.lookahead == BYTE_ORDER_MARK
|
self->current_position.bytes == 0 &&
|
||||||
) ts_lexer__advance((TSLexer *)self, true);
|
self->data.lookahead == BYTE_ORDER_MARK
|
||||||
|
) ts_lexer__advance(&self->data, true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
|
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
|
||||||
|
@ -267,7 +336,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
|
||||||
// the character decoding algorithm may have looked at the following byte.
|
// the character decoding algorithm may have looked at the following byte.
|
||||||
// Therefore, the next byte *after* the current (invalid) character
|
// Therefore, the next byte *after* the current (invalid) character
|
||||||
// affects the interpretation of the current character.
|
// affects the interpretation of the current character.
|
||||||
if (self->data.lookahead == -1) {
|
if (self->data.lookahead == TS_DECODE_ERROR) {
|
||||||
current_lookahead_end_byte++;
|
current_lookahead_end_byte++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -277,8 +346,8 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_lexer_advance_to_end(Lexer *self) {
|
void ts_lexer_advance_to_end(Lexer *self) {
|
||||||
while (self->data.lookahead != 0) {
|
while (self->chunk) {
|
||||||
ts_lexer__advance((TSLexer *)self, false);
|
ts_lexer__advance(&self->data, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -286,30 +355,19 @@ void ts_lexer_mark_end(Lexer *self) {
|
||||||
ts_lexer__mark_end(&self->data);
|
ts_lexer__mark_end(&self->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const TSRange DEFAULT_RANGES[] = {
|
void ts_lexer_set_included_ranges(
|
||||||
{
|
Lexer *self,
|
||||||
.start_point = {
|
const TSRange *ranges,
|
||||||
.row = 0,
|
uint32_t count
|
||||||
.column = 0,
|
) {
|
||||||
},
|
if (count == 0 || !ranges) {
|
||||||
.end_point = {
|
ranges = &DEFAULT_RANGE;
|
||||||
.row = UINT32_MAX,
|
|
||||||
.column = UINT32_MAX,
|
|
||||||
},
|
|
||||||
.start_byte = 0,
|
|
||||||
.end_byte = UINT32_MAX
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count) {
|
|
||||||
if (!ranges) {
|
|
||||||
ranges = DEFAULT_RANGES;
|
|
||||||
count = 1;
|
count = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t sz = count * sizeof(TSRange);
|
size_t size = count * sizeof(TSRange);
|
||||||
self->included_ranges = ts_realloc(self->included_ranges, sz);
|
self->included_ranges = ts_realloc(self->included_ranges, size);
|
||||||
memcpy(self->included_ranges, ranges, sz);
|
memcpy(self->included_ranges, ranges, size);
|
||||||
self->included_range_count = count;
|
self->included_range_count = count;
|
||||||
ts_lexer_goto(self, self->current_position);
|
ts_lexer_goto(self, self->current_position);
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,7 @@ typedef struct {
|
||||||
Length token_start_position;
|
Length token_start_position;
|
||||||
Length token_end_position;
|
Length token_end_position;
|
||||||
|
|
||||||
TSRange * included_ranges;
|
TSRange *included_ranges;
|
||||||
size_t included_range_count;
|
size_t included_range_count;
|
||||||
size_t current_included_range_index;
|
size_t current_included_range_index;
|
||||||
|
|
||||||
|
|
|
@ -2,19 +2,16 @@
|
||||||
//
|
//
|
||||||
// The following directories must be added to the include path:
|
// The following directories must be added to the include path:
|
||||||
// - include
|
// - include
|
||||||
// - utf8proc
|
|
||||||
|
|
||||||
#define _POSIX_C_SOURCE 200112L
|
#define _POSIX_C_SOURCE 200112L
|
||||||
#define UTF8PROC_STATIC
|
|
||||||
|
|
||||||
#include "./get_changed_ranges.c"
|
#include "./get_changed_ranges.c"
|
||||||
#include "./language.c"
|
#include "./language.c"
|
||||||
#include "./lexer.c"
|
#include "./lexer.c"
|
||||||
#include "./node.c"
|
#include "./node.c"
|
||||||
#include "./parser.c"
|
#include "./parser.c"
|
||||||
|
#include "./query.c"
|
||||||
#include "./stack.c"
|
#include "./stack.c"
|
||||||
#include "./subtree.c"
|
#include "./subtree.c"
|
||||||
#include "./tree_cursor.c"
|
#include "./tree_cursor.c"
|
||||||
#include "./tree.c"
|
#include "./tree.c"
|
||||||
#include "./utf16.c"
|
|
||||||
#include "utf8proc.c"
|
|
||||||
|
|
|
@ -415,13 +415,15 @@ TSPoint ts_node_end_point(TSNode self) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TSSymbol ts_node_symbol(TSNode self) {
|
TSSymbol ts_node_symbol(TSNode self) {
|
||||||
return ts_node__alias(&self)
|
TSSymbol symbol = ts_node__alias(&self);
|
||||||
? ts_node__alias(&self)
|
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
|
||||||
: ts_subtree_symbol(ts_node__subtree(self));
|
return ts_language_public_symbol(self.tree->language, symbol);
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *ts_node_type(TSNode self) {
|
const char *ts_node_type(TSNode self) {
|
||||||
return ts_language_symbol_name(self.tree->language, ts_node_symbol(self));
|
TSSymbol symbol = ts_node__alias(&self);
|
||||||
|
if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
|
||||||
|
return ts_language_symbol_name(self.tree->language, symbol);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *ts_node_string(TSNode self) {
|
char *ts_node_string(TSNode self) {
|
||||||
|
|
|
@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
|
||||||
Length start_position = ts_stack_position(self->stack, version);
|
Length start_position = ts_stack_position(self->stack, version);
|
||||||
Subtree external_token = ts_stack_last_external_token(self->stack, version);
|
Subtree external_token = ts_stack_last_external_token(self->stack, version);
|
||||||
TSLexMode lex_mode = self->language->lex_modes[parse_state];
|
TSLexMode lex_mode = self->language->lex_modes[parse_state];
|
||||||
|
if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
|
||||||
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
|
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
|
||||||
self->language,
|
self->language,
|
||||||
lex_mode.external_lex_state
|
lex_mode.external_lex_state
|
||||||
|
@ -438,7 +439,7 @@ static Subtree ts_parser__lex(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (self->lexer.current_position.bytes == error_end_position.bytes) {
|
if (self->lexer.current_position.bytes == error_end_position.bytes) {
|
||||||
if (self->lexer.data.lookahead == 0) {
|
if (self->lexer.data.eof(&self->lexer.data)) {
|
||||||
self->lexer.data.result_symbol = ts_builtin_sym_error;
|
self->lexer.data.result_symbol = ts_builtin_sym_error;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
|
||||||
uint32_t count,
|
uint32_t count,
|
||||||
int dynamic_precedence,
|
int dynamic_precedence,
|
||||||
uint16_t production_id,
|
uint16_t production_id,
|
||||||
bool fragile
|
bool is_fragile,
|
||||||
|
bool is_extra
|
||||||
) {
|
) {
|
||||||
uint32_t initial_version_count = ts_stack_version_count(self->stack);
|
uint32_t initial_version_count = ts_stack_version_count(self->stack);
|
||||||
uint32_t removed_version_count = 0;
|
uint32_t removed_version_count = 0;
|
||||||
|
@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
|
||||||
|
|
||||||
TSStateId state = ts_stack_state(self->stack, slice_version);
|
TSStateId state = ts_stack_state(self->stack, slice_version);
|
||||||
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
|
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
|
||||||
if (fragile || pop.size > 1 || initial_version_count > 1) {
|
if (is_extra) parent.ptr->extra = true;
|
||||||
|
if (is_fragile || pop.size > 1 || initial_version_count > 1) {
|
||||||
parent.ptr->fragile_left = true;
|
parent.ptr->fragile_left = true;
|
||||||
parent.ptr->fragile_right = true;
|
parent.ptr->fragile_right = true;
|
||||||
parent.ptr->parse_state = TS_TREE_STATE_NONE;
|
parent.ptr->parse_state = TS_TREE_STATE_NONE;
|
||||||
|
@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
|
||||||
reduction_version = ts_parser__reduce(
|
reduction_version = ts_parser__reduce(
|
||||||
self, version, action.symbol, action.count,
|
self, version, action.symbol, action.count,
|
||||||
action.dynamic_precedence, action.production_id,
|
action.dynamic_precedence, action.production_id,
|
||||||
true
|
true, false
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
|
||||||
// Otherwise, re-run the lexer.
|
// Otherwise, re-run the lexer.
|
||||||
if (!lookahead.ptr) {
|
if (!lookahead.ptr) {
|
||||||
lookahead = ts_parser__lex(self, version, state);
|
lookahead = ts_parser__lex(self, version, state);
|
||||||
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
|
if (lookahead.ptr) {
|
||||||
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
|
ts_parser__set_cached_token(self, position, last_external_token, lookahead);
|
||||||
|
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
// When parsing a non-terminal extra, a null lookahead indicates the
|
||||||
|
// end of the rule. The reduction is stored in the EOF table entry.
|
||||||
|
// After the reduction, the lexer needs to be run again.
|
||||||
|
else {
|
||||||
|
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
|
||||||
|
|
||||||
case TSParseActionTypeReduce: {
|
case TSParseActionTypeReduce: {
|
||||||
bool is_fragile = table_entry.action_count > 1;
|
bool is_fragile = table_entry.action_count > 1;
|
||||||
|
bool is_extra = lookahead.ptr == NULL;
|
||||||
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
|
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
|
||||||
StackVersion reduction_version = ts_parser__reduce(
|
StackVersion reduction_version = ts_parser__reduce(
|
||||||
self, version, action.params.symbol, action.params.child_count,
|
self, version, action.params.symbol, action.params.child_count,
|
||||||
action.params.dynamic_precedence, action.params.production_id,
|
action.params.dynamic_precedence, action.params.production_id,
|
||||||
is_fragile
|
is_fragile, is_extra
|
||||||
);
|
);
|
||||||
if (reduction_version != STACK_VERSION_NONE) {
|
if (reduction_version != STACK_VERSION_NONE) {
|
||||||
last_reduction_version = reduction_version;
|
last_reduction_version = reduction_version;
|
||||||
|
@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
|
||||||
ts_stack_renumber_version(self->stack, last_reduction_version, version);
|
ts_stack_renumber_version(self->stack, last_reduction_version, version);
|
||||||
LOG_STACK();
|
LOG_STACK();
|
||||||
state = ts_stack_state(self->stack, version);
|
state = ts_stack_state(self->stack, version);
|
||||||
|
|
||||||
|
// At the end of a non-terminal extra rule, the lexer will return a
|
||||||
|
// null subtree, because the parser needs to perform a fixed reduction
|
||||||
|
// regardless of the lookahead node. After performing that reduction,
|
||||||
|
// (and completing the non-terminal extra rule) run the lexer again based
|
||||||
|
// on the current parse state.
|
||||||
|
if (!lookahead.ptr) {
|
||||||
|
lookahead = ts_parser__lex(self, version, state);
|
||||||
|
}
|
||||||
ts_language_table_entry(
|
ts_language_table_entry(
|
||||||
self->language,
|
self->language,
|
||||||
state,
|
state,
|
||||||
|
@ -1655,6 +1677,7 @@ TSParser *ts_parser_new(void) {
|
||||||
void ts_parser_delete(TSParser *self) {
|
void ts_parser_delete(TSParser *self) {
|
||||||
if (!self) return;
|
if (!self) return;
|
||||||
|
|
||||||
|
ts_parser_set_language(self, NULL);
|
||||||
ts_stack_delete(self->stack);
|
ts_stack_delete(self->stack);
|
||||||
if (self->reduce_actions.contents) {
|
if (self->reduce_actions.contents) {
|
||||||
array_delete(&self->reduce_actions);
|
array_delete(&self->reduce_actions);
|
||||||
|
@ -1670,7 +1693,6 @@ void ts_parser_delete(TSParser *self) {
|
||||||
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
|
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
|
||||||
ts_subtree_pool_delete(&self->tree_pool);
|
ts_subtree_pool_delete(&self->tree_pool);
|
||||||
reusable_node_delete(&self->reusable_node);
|
reusable_node_delete(&self->reusable_node);
|
||||||
ts_parser_set_language(self, NULL);
|
|
||||||
ts_free(self);
|
ts_free(self);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1695,6 +1717,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language) {
|
||||||
}
|
}
|
||||||
|
|
||||||
self->language = language;
|
self->language = language;
|
||||||
|
ts_parser_reset(self);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1747,7 +1770,7 @@ const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count)
|
||||||
}
|
}
|
||||||
|
|
||||||
void ts_parser_reset(TSParser *self) {
|
void ts_parser_reset(TSParser *self) {
|
||||||
if (self->language->external_scanner.deserialize) {
|
if (self->language && self->language->external_scanner.deserialize) {
|
||||||
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
|
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,8 @@ struct TSLexer {
|
||||||
void (*advance)(TSLexer *, bool);
|
void (*advance)(TSLexer *, bool);
|
||||||
void (*mark_end)(TSLexer *);
|
void (*mark_end)(TSLexer *);
|
||||||
uint32_t (*get_column)(TSLexer *);
|
uint32_t (*get_column)(TSLexer *);
|
||||||
bool (*is_at_included_range_start)(TSLexer *);
|
bool (*is_at_included_range_start)(const TSLexer *);
|
||||||
|
bool (*eof)(const TSLexer *);
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
|
@ -117,6 +118,7 @@ struct TSLanguage {
|
||||||
uint32_t large_state_count;
|
uint32_t large_state_count;
|
||||||
const uint16_t *small_parse_table;
|
const uint16_t *small_parse_table;
|
||||||
const uint32_t *small_parse_table_map;
|
const uint32_t *small_parse_table_map;
|
||||||
|
const TSSymbol *public_symbol_map;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -126,6 +128,7 @@ struct TSLanguage {
|
||||||
#define START_LEXER() \
|
#define START_LEXER() \
|
||||||
bool result = false; \
|
bool result = false; \
|
||||||
bool skip = false; \
|
bool skip = false; \
|
||||||
|
bool eof = false; \
|
||||||
int32_t lookahead; \
|
int32_t lookahead; \
|
||||||
goto start; \
|
goto start; \
|
||||||
next_state: \
|
next_state: \
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#include "tree_sitter/api.h"
|
#include "tree_sitter/api.h"
|
||||||
|
|
||||||
|
#define POINT_ZERO ((TSPoint) {0, 0})
|
||||||
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
|
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
|
||||||
|
|
||||||
static inline TSPoint point__new(unsigned row, unsigned column) {
|
static inline TSPoint point__new(unsigned row, unsigned column) {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -18,18 +18,9 @@ typedef struct {
|
||||||
Length new_end;
|
Length new_end;
|
||||||
} Edit;
|
} Edit;
|
||||||
|
|
||||||
#ifdef TREE_SITTER_TEST
|
|
||||||
|
|
||||||
#define TS_MAX_INLINE_TREE_LENGTH 2
|
|
||||||
#define TS_MAX_TREE_POOL_SIZE 0
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
|
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
|
||||||
#define TS_MAX_TREE_POOL_SIZE 32
|
#define TS_MAX_TREE_POOL_SIZE 32
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}};
|
static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}};
|
||||||
|
|
||||||
// ExternalScannerState
|
// ExternalScannerState
|
||||||
|
@ -775,10 +766,10 @@ Subtree ts_subtree_last_external_token(Subtree tree) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
|
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
|
||||||
if (c == 0)
|
|
||||||
return snprintf(s, n, "EOF");
|
|
||||||
if (c == -1)
|
if (c == -1)
|
||||||
return snprintf(s, n, "INVALID");
|
return snprintf(s, n, "INVALID");
|
||||||
|
else if (c == '\0')
|
||||||
|
return snprintf(s, n, "'\\0'");
|
||||||
else if (c == '\n')
|
else if (c == '\n')
|
||||||
return snprintf(s, n, "'\\n'");
|
return snprintf(s, n, "'\\n'");
|
||||||
else if (c == '\t')
|
else if (c == '\t')
|
||||||
|
|
|
@ -84,12 +84,10 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
|
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
|
||||||
TSRange *result;
|
|
||||||
TreeCursor cursor1 = {NULL, array_new()};
|
TreeCursor cursor1 = {NULL, array_new()};
|
||||||
TreeCursor cursor2 = {NULL, array_new()};
|
TreeCursor cursor2 = {NULL, array_new()};
|
||||||
TSNode root = ts_tree_root_node(self);
|
ts_tree_cursor_init(&cursor1, ts_tree_root_node(self));
|
||||||
ts_tree_cursor_init(&cursor1, root);
|
ts_tree_cursor_init(&cursor2, ts_tree_root_node(other));
|
||||||
ts_tree_cursor_init(&cursor2, root);
|
|
||||||
|
|
||||||
TSRangeArray included_range_differences = array_new();
|
TSRangeArray included_range_differences = array_new();
|
||||||
ts_range_array_get_changed_ranges(
|
ts_range_array_get_changed_ranges(
|
||||||
|
@ -98,6 +96,7 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin
|
||||||
&included_range_differences
|
&included_range_differences
|
||||||
);
|
);
|
||||||
|
|
||||||
|
TSRange *result;
|
||||||
*count = ts_subtree_get_changed_ranges(
|
*count = ts_subtree_get_changed_ranges(
|
||||||
&self->root, &other->root, &cursor1, &cursor2,
|
&self->root, &other->root, &cursor1, &cursor2,
|
||||||
self->language, &included_range_differences, &result
|
self->language, &included_range_differences, &result
|
||||||
|
|
|
@ -244,6 +244,72 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TSFieldId ts_tree_cursor_current_status(
|
||||||
|
const TSTreeCursor *_self,
|
||||||
|
bool *can_have_later_siblings,
|
||||||
|
bool *can_have_later_siblings_with_this_field
|
||||||
|
) {
|
||||||
|
const TreeCursor *self = (const TreeCursor *)_self;
|
||||||
|
TSFieldId result = 0;
|
||||||
|
*can_have_later_siblings = false;
|
||||||
|
*can_have_later_siblings_with_this_field = false;
|
||||||
|
|
||||||
|
// Walk up the tree, visiting the current node and its invisible ancestors,
|
||||||
|
// because fields can refer to nodes through invisible *wrapper* nodes,
|
||||||
|
for (unsigned i = self->stack.size - 1; i > 0; i--) {
|
||||||
|
TreeCursorEntry *entry = &self->stack.contents[i];
|
||||||
|
TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
|
||||||
|
|
||||||
|
// Stop walking up when a visible ancestor is found.
|
||||||
|
if (i != self->stack.size - 1) {
|
||||||
|
if (ts_subtree_visible(*entry->subtree)) break;
|
||||||
|
const TSSymbol *alias_sequence = ts_language_alias_sequence(
|
||||||
|
self->tree->language,
|
||||||
|
parent_entry->subtree->ptr->production_id
|
||||||
|
);
|
||||||
|
if (alias_sequence && alias_sequence[entry->structural_child_index]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) {
|
||||||
|
*can_have_later_siblings = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ts_subtree_extra(*entry->subtree)) break;
|
||||||
|
|
||||||
|
const TSFieldMapEntry *field_map, *field_map_end;
|
||||||
|
ts_language_field_map(
|
||||||
|
self->tree->language,
|
||||||
|
parent_entry->subtree->ptr->production_id,
|
||||||
|
&field_map, &field_map_end
|
||||||
|
);
|
||||||
|
|
||||||
|
// Look for a field name associated with the current node.
|
||||||
|
if (!result) {
|
||||||
|
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
|
||||||
|
if (!i->inherited && i->child_index == entry->structural_child_index) {
|
||||||
|
result = i->field_id;
|
||||||
|
*can_have_later_siblings_with_this_field = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine if there other later siblings with the same field name.
|
||||||
|
if (result) {
|
||||||
|
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
|
||||||
|
if (i->field_id == result && i->child_index > entry->structural_child_index) {
|
||||||
|
*can_have_later_siblings_with_this_field = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
|
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
|
||||||
const TreeCursor *self = (const TreeCursor *)_self;
|
const TreeCursor *self = (const TreeCursor *)_self;
|
||||||
|
|
||||||
|
@ -264,19 +330,18 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ts_subtree_extra(*entry->subtree)) break;
|
||||||
|
|
||||||
const TSFieldMapEntry *field_map, *field_map_end;
|
const TSFieldMapEntry *field_map, *field_map_end;
|
||||||
ts_language_field_map(
|
ts_language_field_map(
|
||||||
self->tree->language,
|
self->tree->language,
|
||||||
parent_entry->subtree->ptr->production_id,
|
parent_entry->subtree->ptr->production_id,
|
||||||
&field_map, &field_map_end
|
&field_map, &field_map_end
|
||||||
);
|
);
|
||||||
|
for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
|
||||||
while (field_map < field_map_end) {
|
if (!i->inherited && i->child_index == entry->structural_child_index) {
|
||||||
if (
|
return i->field_id;
|
||||||
!field_map->inherited &&
|
}
|
||||||
field_map->child_index == entry->structural_child_index
|
|
||||||
) return field_map->field_id;
|
|
||||||
field_map++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -16,5 +16,6 @@ typedef struct {
|
||||||
} TreeCursor;
|
} TreeCursor;
|
||||||
|
|
||||||
void ts_tree_cursor_init(TreeCursor *, TSNode);
|
void ts_tree_cursor_init(TreeCursor *, TSNode);
|
||||||
|
TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *);
|
||||||
|
|
||||||
#endif // TREE_SITTER_TREE_CURSOR_H_
|
#endif // TREE_SITTER_TREE_CURSOR_H_
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
#ifndef TREE_SITTER_UNICODE_H_
|
||||||
|
#define TREE_SITTER_UNICODE_H_
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define U_EXPORT
|
||||||
|
#define U_EXPORT2
|
||||||
|
#include "./unicode/utf8.h"
|
||||||
|
#include "./unicode/utf16.h"
|
||||||
|
|
||||||
|
static const int32_t TS_DECODE_ERROR = U_SENTINEL;
|
||||||
|
|
||||||
|
// These functions read one unicode code point from the given string,
|
||||||
|
// returning the number of bytes consumed.
|
||||||
|
typedef uint32_t (*UnicodeDecodeFunction)(
|
||||||
|
const uint8_t *string,
|
||||||
|
uint32_t length,
|
||||||
|
int32_t *code_point
|
||||||
|
);
|
||||||
|
|
||||||
|
static inline uint32_t ts_decode_utf8(
|
||||||
|
const uint8_t *string,
|
||||||
|
uint32_t length,
|
||||||
|
int32_t *code_point
|
||||||
|
) {
|
||||||
|
uint32_t i = 0;
|
||||||
|
U8_NEXT(string, i, length, *code_point);
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint32_t ts_decode_utf16(
|
||||||
|
const uint8_t *string,
|
||||||
|
uint32_t length,
|
||||||
|
int32_t *code_point
|
||||||
|
) {
|
||||||
|
uint32_t i = 0;
|
||||||
|
U16_NEXT(((uint16_t *)string), i, length, *code_point);
|
||||||
|
return i * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // TREE_SITTER_UNICODE_H_
|
|
@ -0,0 +1 @@
|
||||||
|
552b01f61127d30d6589aa4bf99468224979b661
|
|
@ -0,0 +1,414 @@
|
||||||
|
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
|
||||||
|
|
||||||
|
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
|
||||||
|
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
|
a copy of the Unicode data files and any associated documentation
|
||||||
|
(the "Data Files") or Unicode software and any associated documentation
|
||||||
|
(the "Software") to deal in the Data Files or Software
|
||||||
|
without restriction, including without limitation the rights to use,
|
||||||
|
copy, modify, merge, publish, distribute, and/or sell copies of
|
||||||
|
the Data Files or Software, and to permit persons to whom the Data Files
|
||||||
|
or Software are furnished to do so, provided that either
|
||||||
|
(a) this copyright and permission notice appear with all copies
|
||||||
|
of the Data Files or Software, or
|
||||||
|
(b) this copyright and permission notice appear in associated
|
||||||
|
Documentation.
|
||||||
|
|
||||||
|
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||||
|
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||||
|
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||||
|
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||||
|
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||||
|
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||||
|
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||||
|
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||||
|
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||||
|
|
||||||
|
Except as contained in this notice, the name of a copyright holder
|
||||||
|
shall not be used in advertising or otherwise to promote the sale,
|
||||||
|
use or other dealings in these Data Files or Software without prior
|
||||||
|
written authorization of the copyright holder.
|
||||||
|
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Third-Party Software Licenses
|
||||||
|
|
||||||
|
This section contains third-party software notices and/or additional
|
||||||
|
terms for licensed third-party software components included within ICU
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
1. ICU License - ICU 1.8.1 to ICU 57.1
|
||||||
|
|
||||||
|
COPYRIGHT AND PERMISSION NOTICE
|
||||||
|
|
||||||
|
Copyright (c) 1995-2016 International Business Machines Corporation and others
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
|
a copy of this software and associated documentation files (the
|
||||||
|
"Software"), to deal in the Software without restriction, including
|
||||||
|
without limitation the rights to use, copy, modify, merge, publish,
|
||||||
|
distribute, and/or sell copies of the Software, and to permit persons
|
||||||
|
to whom the Software is furnished to do so, provided that the above
|
||||||
|
copyright notice(s) and this permission notice appear in all copies of
|
||||||
|
the Software and that both the above copyright notice(s) and this
|
||||||
|
permission notice appear in supporting documentation.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
||||||
|
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
||||||
|
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
|
||||||
|
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
|
||||||
|
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
|
||||||
|
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||||
|
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
|
Except as contained in this notice, the name of a copyright holder
|
||||||
|
shall not be used in advertising or otherwise to promote the sale, use
|
||||||
|
or other dealings in this Software without prior written authorization
|
||||||
|
of the copyright holder.
|
||||||
|
|
||||||
|
All trademarks and registered trademarks mentioned herein are the
|
||||||
|
property of their respective owners.
|
||||||
|
|
||||||
|
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
|
||||||
|
|
||||||
|
# The Google Chrome software developed by Google is licensed under
|
||||||
|
# the BSD license. Other software included in this distribution is
|
||||||
|
# provided under other licenses, as set forth below.
|
||||||
|
#
|
||||||
|
# The BSD License
|
||||||
|
# http://opensource.org/licenses/bsd-license.php
|
||||||
|
# Copyright (C) 2006-2008, Google Inc.
|
||||||
|
#
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
# Redistributions in binary form must reproduce the above
|
||||||
|
# copyright notice, this list of conditions and the following
|
||||||
|
# disclaimer in the documentation and/or other materials provided with
|
||||||
|
# the distribution.
|
||||||
|
# Neither the name of Google Inc. nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from
|
||||||
|
# this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||||
|
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||||
|
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# The word list in cjdict.txt are generated by combining three word lists
|
||||||
|
# listed below with further processing for compound word breaking. The
|
||||||
|
# frequency is generated with an iterative training against Google web
|
||||||
|
# corpora.
|
||||||
|
#
|
||||||
|
# * Libtabe (Chinese)
|
||||||
|
# - https://sourceforge.net/project/?group_id=1519
|
||||||
|
# - Its license terms and conditions are shown below.
|
||||||
|
#
|
||||||
|
# * IPADIC (Japanese)
|
||||||
|
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
|
||||||
|
# - Its license terms and conditions are shown below.
|
||||||
|
#
|
||||||
|
# ---------COPYING.libtabe ---- BEGIN--------------------
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Copyright (c) 1999 TaBE Project.
|
||||||
|
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
|
||||||
|
# * All rights reserved.
|
||||||
|
# *
|
||||||
|
# * Redistribution and use in source and binary forms, with or without
|
||||||
|
# * modification, are permitted provided that the following conditions
|
||||||
|
# * are met:
|
||||||
|
# *
|
||||||
|
# * . Redistributions of source code must retain the above copyright
|
||||||
|
# * notice, this list of conditions and the following disclaimer.
|
||||||
|
# * . Redistributions in binary form must reproduce the above copyright
|
||||||
|
# * notice, this list of conditions and the following disclaimer in
|
||||||
|
# * the documentation and/or other materials provided with the
|
||||||
|
# * distribution.
|
||||||
|
# * . Neither the name of the TaBE Project nor the names of its
|
||||||
|
# * contributors may be used to endorse or promote products derived
|
||||||
|
# * from this software without specific prior written permission.
|
||||||
|
# *
|
||||||
|
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||||
|
# * OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
# */
|
||||||
|
#
|
||||||
|
# /*
|
||||||
|
# * Copyright (c) 1999 Computer Systems and Communication Lab,
|
||||||
|
# * Institute of Information Science, Academia
|
||||||
|
# * Sinica. All rights reserved.
|
||||||
|
# *
|
||||||
|
# * Redistribution and use in source and binary forms, with or without
|
||||||
|
# * modification, are permitted provided that the following conditions
|
||||||
|
# * are met:
|
||||||
|
# *
|
||||||
|
# * . Redistributions of source code must retain the above copyright
|
||||||
|
# * notice, this list of conditions and the following disclaimer.
|
||||||
|
# * . Redistributions in binary form must reproduce the above copyright
|
||||||
|
# * notice, this list of conditions and the following disclaimer in
|
||||||
|
# * the documentation and/or other materials provided with the
|
||||||
|
# * distribution.
|
||||||
|
# * . Neither the name of the Computer Systems and Communication Lab
|
||||||
|
# * nor the names of its contributors may be used to endorse or
|
||||||
|
# * promote products derived from this software without specific
|
||||||
|
# * prior written permission.
|
||||||
|
# *
|
||||||
|
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||||
|
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||||
|
# * OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
# */
|
||||||
|
#
|
||||||
|
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
|
||||||
|
# University of Illinois
|
||||||
|
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
|
||||||
|
#
|
||||||
|
# ---------------COPYING.libtabe-----END--------------------------------
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# ---------------COPYING.ipadic-----BEGIN-------------------------------
|
||||||
|
#
|
||||||
|
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
|
||||||
|
# and Technology. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Use, reproduction, and distribution of this software is permitted.
|
||||||
|
# Any copy of this software, whether in its original form or modified,
|
||||||
|
# must include both the above copyright notice and the following
|
||||||
|
# paragraphs.
|
||||||
|
#
|
||||||
|
# Nara Institute of Science and Technology (NAIST),
|
||||||
|
# the copyright holders, disclaims all warranties with regard to this
|
||||||
|
# software, including all implied warranties of merchantability and
|
||||||
|
# fitness, in no event shall NAIST be liable for
|
||||||
|
# any special, indirect or consequential damages or any damages
|
||||||
|
# whatsoever resulting from loss of use, data or profits, whether in an
|
||||||
|
# action of contract, negligence or other tortuous action, arising out
|
||||||
|
# of or in connection with the use or performance of this software.
|
||||||
|
#
|
||||||
|
# A large portion of the dictionary entries
|
||||||
|
# originate from ICOT Free Software. The following conditions for ICOT
|
||||||
|
# Free Software applies to the current dictionary as well.
|
||||||
|
#
|
||||||
|
# Each User may also freely distribute the Program, whether in its
|
||||||
|
# original form or modified, to any third party or parties, PROVIDED
|
||||||
|
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
|
||||||
|
# on, or be attached to, the Program, which is distributed substantially
|
||||||
|
# in the same form as set out herein and that such intended
|
||||||
|
# distribution, if actually made, will neither violate or otherwise
|
||||||
|
# contravene any of the laws and regulations of the countries having
|
||||||
|
# jurisdiction over the User or the intended distribution itself.
|
||||||
|
#
|
||||||
|
# NO WARRANTY
|
||||||
|
#
|
||||||
|
# The program was produced on an experimental basis in the course of the
|
||||||
|
# research and development conducted during the project and is provided
|
||||||
|
# to users as so produced on an experimental basis. Accordingly, the
|
||||||
|
# program is provided without any warranty whatsoever, whether express,
|
||||||
|
# implied, statutory or otherwise. The term "warranty" used herein
|
||||||
|
# includes, but is not limited to, any warranty of the quality,
|
||||||
|
# performance, merchantability and fitness for a particular purpose of
|
||||||
|
# the program and the nonexistence of any infringement or violation of
|
||||||
|
# any right of any third party.
|
||||||
|
#
|
||||||
|
# Each user of the program will agree and understand, and be deemed to
|
||||||
|
# have agreed and understood, that there is no warranty whatsoever for
|
||||||
|
# the program and, accordingly, the entire risk arising from or
|
||||||
|
# otherwise connected with the program is assumed by the user.
|
||||||
|
#
|
||||||
|
# Therefore, neither ICOT, the copyright holder, or any other
|
||||||
|
# organization that participated in or was otherwise related to the
|
||||||
|
# development of the program and their respective officials, directors,
|
||||||
|
# officers and other employees shall be held liable for any and all
|
||||||
|
# damages, including, without limitation, general, special, incidental
|
||||||
|
# and consequential damages, arising out of or otherwise in connection
|
||||||
|
# with the use or inability to use the program or any product, material
|
||||||
|
# or result produced or otherwise obtained by using the program,
|
||||||
|
# regardless of whether they have been advised of, or otherwise had
|
||||||
|
# knowledge of, the possibility of such damages at any time during the
|
||||||
|
# project or thereafter. Each user will be deemed to have agreed to the
|
||||||
|
# foregoing by his or her commencement of use of the program. The term
|
||||||
|
# "use" as used herein includes, but is not limited to, the use,
|
||||||
|
# modification, copying and distribution of the program and the
|
||||||
|
# production of secondary products from the program.
|
||||||
|
#
|
||||||
|
# In the case where the program, whether in its original form or
|
||||||
|
# modified, was distributed or delivered to or received by a user from
|
||||||
|
# any person, organization or entity other than ICOT, unless it makes or
|
||||||
|
# grants independently of ICOT any specific warranty to the user in
|
||||||
|
# writing, such person, organization or entity, will also be exempted
|
||||||
|
# from and not be held liable to the user for any such damages as noted
|
||||||
|
# above as far as the program is concerned.
|
||||||
|
#
|
||||||
|
# ---------------COPYING.ipadic-----END----------------------------------
|
||||||
|
|
||||||
|
3. Lao Word Break Dictionary Data (laodict.txt)
|
||||||
|
|
||||||
|
# Copyright (c) 2013 International Business Machines Corporation
|
||||||
|
# and others. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Project: http://code.google.com/p/lao-dictionary/
|
||||||
|
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
|
||||||
|
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
|
||||||
|
# (copied below)
|
||||||
|
#
|
||||||
|
# This file is derived from the above dictionary, with slight
|
||||||
|
# modifications.
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Redistributions of source code must retain the above copyright notice, this
|
||||||
|
# list of conditions and the following disclaimer. Redistributions in
|
||||||
|
# binary form must reproduce the above copyright notice, this list of
|
||||||
|
# conditions and the following disclaimer in the documentation and/or
|
||||||
|
# other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||||
|
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||||
|
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||||
|
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||||
|
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||||
|
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||||
|
# OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
4. Burmese Word Break Dictionary Data (burmesedict.txt)
|
||||||
|
|
||||||
|
# Copyright (c) 2014 International Business Machines Corporation
|
||||||
|
# and others. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# This list is part of a project hosted at:
|
||||||
|
# github.com/kanyawtech/myanmar-karen-word-lists
|
||||||
|
#
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
# Copyright (c) 2013, LeRoy Benjamin Sharon
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions
|
||||||
|
# are met: Redistributions of source code must retain the above
|
||||||
|
# copyright notice, this list of conditions and the following
|
||||||
|
# disclaimer. Redistributions in binary form must reproduce the
|
||||||
|
# above copyright notice, this list of conditions and the following
|
||||||
|
# disclaimer in the documentation and/or other materials provided
|
||||||
|
# with the distribution.
|
||||||
|
#
|
||||||
|
# Neither the name Myanmar Karen Word Lists, nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived
|
||||||
|
# from this software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||||
|
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||||
|
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
|
||||||
|
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
|
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||||
|
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
|
||||||
|
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
# SUCH DAMAGE.
|
||||||
|
# --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
5. Time Zone Database
|
||||||
|
|
||||||
|
ICU uses the public domain data and code derived from Time Zone
|
||||||
|
Database for its time zone support. The ownership of the TZ database
|
||||||
|
is explained in BCP 175: Procedure for Maintaining the Time Zone
|
||||||
|
Database section 7.
|
||||||
|
|
||||||
|
# 7. Database Ownership
|
||||||
|
#
|
||||||
|
# The TZ database itself is not an IETF Contribution or an IETF
|
||||||
|
# document. Rather it is a pre-existing and regularly updated work
|
||||||
|
# that is in the public domain, and is intended to remain in the
|
||||||
|
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
|
||||||
|
# not apply to the TZ Database or contributions that individuals make
|
||||||
|
# to it. Should any claims be made and substantiated against the TZ
|
||||||
|
# Database, the organization that is providing the IANA
|
||||||
|
# Considerations defined in this RFC, under the memorandum of
|
||||||
|
# understanding with the IETF, currently ICANN, may act in accordance
|
||||||
|
# with all competent court orders. No ownership claims will be made
|
||||||
|
# by ICANN or the IETF Trust on the database or the code. Any person
|
||||||
|
# making a contribution to the database or code waives all rights to
|
||||||
|
# future claims in that contribution or in the TZ Database.
|
||||||
|
|
||||||
|
6. Google double-conversion
|
||||||
|
|
||||||
|
Copyright 2006-2011, the V8 project authors. All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following
|
||||||
|
disclaimer in the documentation and/or other materials provided
|
||||||
|
with the distribution.
|
||||||
|
* Neither the name of Google Inc. nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,29 @@
|
||||||
|
# ICU Parts
|
||||||
|
|
||||||
|
This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
|
||||||
|
|
||||||
|
### License
|
||||||
|
|
||||||
|
The license for these files is contained in the `LICENSE` file within this directory.
|
||||||
|
|
||||||
|
### Contents
|
||||||
|
|
||||||
|
* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
|
||||||
|
* `utf8.h`
|
||||||
|
* `utf16.h`
|
||||||
|
* `umachine.h`
|
||||||
|
* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
|
||||||
|
* `ptypes.h`
|
||||||
|
* `urename.h`
|
||||||
|
* `utf.h`
|
||||||
|
* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
|
||||||
|
* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
|
||||||
|
* `README.md` - This text file.
|
||||||
|
|
||||||
|
### Updating ICU
|
||||||
|
|
||||||
|
To incorporate changes from the upstream `icu` repository:
|
||||||
|
|
||||||
|
* Update `ICU_SHA` with the new Git SHA.
|
||||||
|
* Update `LICENSE` with the license text from the directory mentioned above.
|
||||||
|
* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.
|
|
@ -0,0 +1 @@
|
||||||
|
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
|
|
@ -0,0 +1,448 @@
|
||||||
|
// © 2016 and later: Unicode, Inc. and others.
|
||||||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||||||
|
/*
|
||||||
|
******************************************************************************
|
||||||
|
*
|
||||||
|
* Copyright (C) 1999-2015, International Business Machines
|
||||||
|
* Corporation and others. All Rights Reserved.
|
||||||
|
*
|
||||||
|
******************************************************************************
|
||||||
|
* file name: umachine.h
|
||||||
|
* encoding: UTF-8
|
||||||
|
* tab size: 8 (not used)
|
||||||
|
* indentation:4
|
||||||
|
*
|
||||||
|
* created on: 1999sep13
|
||||||
|
* created by: Markus W. Scherer
|
||||||
|
*
|
||||||
|
* This file defines basic types and constants for ICU to be
|
||||||
|
* platform-independent. umachine.h and utf.h are included into
|
||||||
|
* utypes.h to provide all the general definitions for ICU.
|
||||||
|
* All of these definitions used to be in utypes.h before
|
||||||
|
* the UTF-handling macros made this unmaintainable.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __UMACHINE_H__
|
||||||
|
#define __UMACHINE_H__
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \file
|
||||||
|
* \brief Basic types and constants for UTF
|
||||||
|
*
|
||||||
|
* <h2> Basic types and constants for UTF </h2>
|
||||||
|
* This file defines basic types and constants for utf.h to be
|
||||||
|
* platform-independent. umachine.h and utf.h are included into
|
||||||
|
* utypes.h to provide all the general definitions for ICU.
|
||||||
|
* All of these definitions used to be in utypes.h before
|
||||||
|
* the UTF-handling macros made this unmaintainable.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
/*==========================================================================*/
|
||||||
|
/* Include platform-dependent definitions */
|
||||||
|
/* which are contained in the platform-specific file platform.h */
|
||||||
|
/*==========================================================================*/
|
||||||
|
|
||||||
|
#include "./ptypes.h" /* platform.h is included in ptypes.h */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ANSI C headers:
|
||||||
|
* stddef.h defines wchar_t
|
||||||
|
*/
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/*==========================================================================*/
|
||||||
|
/* For C wrappers, we use the symbol U_STABLE. */
|
||||||
|
/* This works properly if the includer is C or C++. */
|
||||||
|
/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
|
||||||
|
/*==========================================================================*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_CFUNC
|
||||||
|
* This is used in a declaration of a library private ICU C function.
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_CDECL_BEGIN
|
||||||
|
* This is used to begin a declaration of a library private ICU C API.
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_CDECL_END
|
||||||
|
* This is used to end a declaration of a library private ICU C API
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
# define U_CFUNC extern "C"
|
||||||
|
# define U_CDECL_BEGIN extern "C" {
|
||||||
|
# define U_CDECL_END }
|
||||||
|
#else
|
||||||
|
# define U_CFUNC extern
|
||||||
|
# define U_CDECL_BEGIN
|
||||||
|
# define U_CDECL_END
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef U_ATTRIBUTE_DEPRECATED
|
||||||
|
/**
|
||||||
|
* \def U_ATTRIBUTE_DEPRECATED
|
||||||
|
* This is used for GCC specific attributes
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#if U_GCC_MAJOR_MINOR >= 302
|
||||||
|
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
|
||||||
|
/**
|
||||||
|
* \def U_ATTRIBUTE_DEPRECATED
|
||||||
|
* This is used for Visual C++ specific attributes
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
|
||||||
|
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
|
||||||
|
#else
|
||||||
|
# define U_ATTRIBUTE_DEPRECATED
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
|
||||||
|
#define U_CAPI U_CFUNC U_EXPORT
|
||||||
|
/** This is used to declare a function as a stable public ICU C API*/
|
||||||
|
#define U_STABLE U_CAPI
|
||||||
|
/** This is used to declare a function as a draft public ICU C API */
|
||||||
|
#define U_DRAFT U_CAPI
|
||||||
|
/** This is used to declare a function as a deprecated public ICU C API */
|
||||||
|
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
|
||||||
|
/** This is used to declare a function as an obsolete public ICU C API */
|
||||||
|
#define U_OBSOLETE U_CAPI
|
||||||
|
/** This is used to declare a function as an internal ICU C API */
|
||||||
|
#define U_INTERNAL U_CAPI
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_OVERRIDE
|
||||||
|
* Defined to the C++11 "override" keyword if available.
|
||||||
|
* Denotes a class or member which is an override of the base class.
|
||||||
|
* May result in an error if it applied to something not an override.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#ifndef U_OVERRIDE
|
||||||
|
#define U_OVERRIDE override
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_FINAL
|
||||||
|
* Defined to the C++11 "final" keyword if available.
|
||||||
|
* Denotes a class or member which may not be overridden in subclasses.
|
||||||
|
* May result in an error if subclasses attempt to override.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
|
||||||
|
#define U_FINAL final
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Before ICU 65, function-like, multi-statement ICU macros were just defined as
|
||||||
|
// series of statements wrapped in { } blocks and the caller could choose to
|
||||||
|
// either treat them as if they were actual functions and end the invocation
|
||||||
|
// with a trailing ; creating an empty statement after the block or else omit
|
||||||
|
// this trailing ; using the knowledge that the macro would expand to { }.
|
||||||
|
//
|
||||||
|
// But doing so doesn't work well with macros that look like functions and
|
||||||
|
// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
|
||||||
|
// switches to the standard solution of wrapping such macros in do { } while.
|
||||||
|
//
|
||||||
|
// This will however break existing code that depends on being able to invoke
|
||||||
|
// these macros without a trailing ; so to be able to remain compatible with
|
||||||
|
// such code the wrapper is itself defined as macros so that it's possible to
|
||||||
|
// build ICU 65 and later with the old macro behaviour, like this:
|
||||||
|
//
|
||||||
|
// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
|
||||||
|
// runConfigureICU ...
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def UPRV_BLOCK_MACRO_BEGIN
|
||||||
|
* Defined as the "do" keyword by default.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#ifndef UPRV_BLOCK_MACRO_BEGIN
|
||||||
|
#define UPRV_BLOCK_MACRO_BEGIN do
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def UPRV_BLOCK_MACRO_END
|
||||||
|
* Defined as "while (FALSE)" by default.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#ifndef UPRV_BLOCK_MACRO_END
|
||||||
|
#define UPRV_BLOCK_MACRO_END while (FALSE)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*==========================================================================*/
|
||||||
|
/* limits for int32_t etc., like in POSIX inttypes.h */
|
||||||
|
/*==========================================================================*/
|
||||||
|
|
||||||
|
#ifndef INT8_MIN
|
||||||
|
/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT8_MIN ((int8_t)(-128))
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MIN
|
||||||
|
/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT16_MIN ((int16_t)(-32767-1))
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MIN
|
||||||
|
/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT32_MIN ((int32_t)(-2147483647-1))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef INT8_MAX
|
||||||
|
/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT8_MAX ((int8_t)(127))
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MAX
|
||||||
|
/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT16_MAX ((int16_t)(32767))
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MAX
|
||||||
|
/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
|
||||||
|
# define INT32_MAX ((int32_t)(2147483647))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef UINT8_MAX
|
||||||
|
/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
|
||||||
|
# define UINT8_MAX ((uint8_t)(255U))
|
||||||
|
#endif
|
||||||
|
#ifndef UINT16_MAX
|
||||||
|
/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
|
||||||
|
# define UINT16_MAX ((uint16_t)(65535U))
|
||||||
|
#endif
|
||||||
|
#ifndef UINT32_MAX
|
||||||
|
/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
|
||||||
|
# define UINT32_MAX ((uint32_t)(4294967295U))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(U_INT64_T_UNAVAILABLE)
|
||||||
|
# error int64_t is required for decimal format and rule-based number format.
|
||||||
|
#else
|
||||||
|
# ifndef INT64_C
|
||||||
|
/**
|
||||||
|
* Provides a platform independent way to specify a signed 64-bit integer constant.
|
||||||
|
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
|
||||||
|
* @stable ICU 2.8
|
||||||
|
*/
|
||||||
|
# define INT64_C(c) c ## LL
|
||||||
|
# endif
|
||||||
|
# ifndef UINT64_C
|
||||||
|
/**
|
||||||
|
* Provides a platform independent way to specify an unsigned 64-bit integer constant.
|
||||||
|
* note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
|
||||||
|
* @stable ICU 2.8
|
||||||
|
*/
|
||||||
|
# define UINT64_C(c) c ## ULL
|
||||||
|
# endif
|
||||||
|
# ifndef U_INT64_MIN
|
||||||
|
/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
|
||||||
|
# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
|
||||||
|
# endif
|
||||||
|
# ifndef U_INT64_MAX
|
||||||
|
/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
|
||||||
|
# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
|
||||||
|
# endif
|
||||||
|
# ifndef U_UINT64_MAX
|
||||||
|
/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
|
||||||
|
# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*==========================================================================*/
|
||||||
|
/* Boolean data type */
|
||||||
|
/*==========================================================================*/
|
||||||
|
|
||||||
|
/** The ICU boolean type @stable ICU 2.0 */
|
||||||
|
typedef int8_t UBool;
|
||||||
|
|
||||||
|
#ifndef TRUE
|
||||||
|
/** The TRUE value of a UBool @stable ICU 2.0 */
|
||||||
|
# define TRUE 1
|
||||||
|
#endif
|
||||||
|
#ifndef FALSE
|
||||||
|
/** The FALSE value of a UBool @stable ICU 2.0 */
|
||||||
|
# define FALSE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*==========================================================================*/
|
||||||
|
/* Unicode data types */
|
||||||
|
/*==========================================================================*/
|
||||||
|
|
||||||
|
/* wchar_t-related definitions -------------------------------------------- */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* \def U_WCHAR_IS_UTF16
|
||||||
|
* Defined if wchar_t uses UTF-16.
|
||||||
|
*
|
||||||
|
* @stable ICU 2.0
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* \def U_WCHAR_IS_UTF32
|
||||||
|
* Defined if wchar_t uses UTF-32.
|
||||||
|
*
|
||||||
|
* @stable ICU 2.0
|
||||||
|
*/
|
||||||
|
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
|
||||||
|
# ifdef __STDC_ISO_10646__
|
||||||
|
# if (U_SIZEOF_WCHAR_T==2)
|
||||||
|
# define U_WCHAR_IS_UTF16
|
||||||
|
# elif (U_SIZEOF_WCHAR_T==4)
|
||||||
|
# define U_WCHAR_IS_UTF32
|
||||||
|
# endif
|
||||||
|
# elif defined __UCS2__
|
||||||
|
# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
|
||||||
|
# define U_WCHAR_IS_UTF16
|
||||||
|
# endif
|
||||||
|
# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
|
||||||
|
# if (U_SIZEOF_WCHAR_T==4)
|
||||||
|
# define U_WCHAR_IS_UTF32
|
||||||
|
# endif
|
||||||
|
# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
|
||||||
|
# define U_WCHAR_IS_UTF32
|
||||||
|
# elif U_PLATFORM_HAS_WIN32_API
|
||||||
|
# define U_WCHAR_IS_UTF16
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* UChar and UChar32 definitions -------------------------------------------- */
|
||||||
|
|
||||||
|
/** Number of bytes in a UChar. @stable ICU 2.0 */
|
||||||
|
#define U_SIZEOF_UCHAR 2
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \def U_CHAR16_IS_TYPEDEF
|
||||||
|
* If 1, then char16_t is a typedef and not a real type (yet)
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11)
|
||||||
|
// for AIX, uchar.h needs to be included
|
||||||
|
# include <uchar.h>
|
||||||
|
# define U_CHAR16_IS_TYPEDEF 1
|
||||||
|
#elif defined(_MSC_VER) && (_MSC_VER < 1900)
|
||||||
|
// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
|
||||||
|
// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
|
||||||
|
# define U_CHAR16_IS_TYPEDEF 1
|
||||||
|
#else
|
||||||
|
# define U_CHAR16_IS_TYPEDEF 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \var UChar
|
||||||
|
*
|
||||||
|
* The base type for UTF-16 code units and pointers.
|
||||||
|
* Unsigned 16-bit integer.
|
||||||
|
* Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
|
||||||
|
*
|
||||||
|
* UChar is configurable by defining the macro UCHAR_TYPE
|
||||||
|
* on the preprocessor or compiler command line:
|
||||||
|
* -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
|
||||||
|
* (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
|
||||||
|
* This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
|
||||||
|
*
|
||||||
|
* The default is UChar=char16_t.
|
||||||
|
*
|
||||||
|
* C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
|
||||||
|
*
|
||||||
|
* In C, char16_t is a simple typedef of uint_least16_t.
|
||||||
|
* ICU requires uint_least16_t=uint16_t for data memory mapping.
|
||||||
|
* On macOS, char16_t is not available because the uchar.h standard header is missing.
|
||||||
|
*
|
||||||
|
* @stable ICU 4.4
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
// #if 1 is normal. UChar defaults to char16_t in C++.
|
||||||
|
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
|
||||||
|
// The intltest Makefile #defines UCHAR_TYPE=char16_t,
|
||||||
|
// so we only #define it to uint16_t if it is undefined so far.
|
||||||
|
#elif !defined(UCHAR_TYPE)
|
||||||
|
# define UCHAR_TYPE uint16_t
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
|
||||||
|
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
|
||||||
|
// Inside the ICU library code, never configurable.
|
||||||
|
typedef char16_t UChar;
|
||||||
|
#elif defined(UCHAR_TYPE)
|
||||||
|
typedef UCHAR_TYPE UChar;
|
||||||
|
#elif defined(__cplusplus)
|
||||||
|
typedef char16_t UChar;
|
||||||
|
#else
|
||||||
|
typedef uint16_t UChar;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \var OldUChar
|
||||||
|
* Default ICU 58 definition of UChar.
|
||||||
|
* A base type for UTF-16 code units and pointers.
|
||||||
|
* Unsigned 16-bit integer.
|
||||||
|
*
|
||||||
|
* Define OldUChar to be wchar_t if that is 16 bits wide.
|
||||||
|
* If wchar_t is not 16 bits wide, then define UChar to be uint16_t.
|
||||||
|
*
|
||||||
|
* This makes the definition of OldUChar platform-dependent
|
||||||
|
* but allows direct string type compatibility with platforms with
|
||||||
|
* 16-bit wchar_t types.
|
||||||
|
*
|
||||||
|
* This is how UChar was defined in ICU 58, for transition convenience.
|
||||||
|
* Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
|
||||||
|
* The current UChar responds to UCHAR_TYPE but OldUChar does not.
|
||||||
|
*
|
||||||
|
* @stable ICU 59
|
||||||
|
*/
|
||||||
|
#if U_SIZEOF_WCHAR_T==2
|
||||||
|
typedef wchar_t OldUChar;
|
||||||
|
#elif defined(__CHAR16_TYPE__)
|
||||||
|
typedef __CHAR16_TYPE__ OldUChar;
|
||||||
|
#else
|
||||||
|
typedef uint16_t OldUChar;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Define UChar32 as a type for single Unicode code points.
|
||||||
|
* UChar32 is a signed 32-bit integer (same as int32_t).
|
||||||
|
*
|
||||||
|
* The Unicode code point range is 0..0x10ffff.
|
||||||
|
* All other values (negative or >=0x110000) are illegal as Unicode code points.
|
||||||
|
* They may be used as sentinel values to indicate "done", "error"
|
||||||
|
* or similar non-code point conditions.
|
||||||
|
*
|
||||||
|
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
|
||||||
|
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
|
||||||
|
* or else to be uint32_t.
|
||||||
|
* That is, the definition of UChar32 was platform-dependent.
|
||||||
|
*
|
||||||
|
* @see U_SENTINEL
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
typedef int32_t UChar32;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This value is intended for sentinel values for APIs that
|
||||||
|
* (take or) return single code points (UChar32).
|
||||||
|
* It is outside of the Unicode code point range 0..0x10ffff.
|
||||||
|
*
|
||||||
|
* For example, a "done" or "error" value in a new API
|
||||||
|
* could be indicated with U_SENTINEL.
|
||||||
|
*
|
||||||
|
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
|
||||||
|
* values, mostly 0xffff.
|
||||||
|
* Those may need to be distinguished from
|
||||||
|
* actual U+ffff text contents by calling functions like
|
||||||
|
* CharacterIterator::hasNext() or UnicodeString::length().
|
||||||
|
*
|
||||||
|
* @return -1
|
||||||
|
* @see UChar32
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U_SENTINEL (-1)
|
||||||
|
|
||||||
|
#include "./urename.h"
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1 @@
|
||||||
|
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
|
|
@ -0,0 +1 @@
|
||||||
|
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
|
|
@ -0,0 +1,733 @@
|
||||||
|
// © 2016 and later: Unicode, Inc. and others.
|
||||||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||||||
|
/*
|
||||||
|
*******************************************************************************
|
||||||
|
*
|
||||||
|
* Copyright (C) 1999-2012, International Business Machines
|
||||||
|
* Corporation and others. All Rights Reserved.
|
||||||
|
*
|
||||||
|
*******************************************************************************
|
||||||
|
* file name: utf16.h
|
||||||
|
* encoding: UTF-8
|
||||||
|
* tab size: 8 (not used)
|
||||||
|
* indentation:4
|
||||||
|
*
|
||||||
|
* created on: 1999sep09
|
||||||
|
* created by: Markus W. Scherer
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \file
|
||||||
|
* \brief C API: 16-bit Unicode handling macros
|
||||||
|
*
|
||||||
|
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
|
||||||
|
*
|
||||||
|
* For more information see utf.h and the ICU User Guide Strings chapter
|
||||||
|
* (http://userguide.icu-project.org/strings).
|
||||||
|
*
|
||||||
|
* <em>Usage:</em>
|
||||||
|
* ICU coding guidelines for if() statements should be followed when using these macros.
|
||||||
|
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||||
|
* bodies and all macro statements should be terminated with semicolon.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __UTF16_H__
|
||||||
|
#define __UTF16_H__
|
||||||
|
|
||||||
|
#include "./umachine.h"
|
||||||
|
#ifndef __UTF_H__
|
||||||
|
# include "./utf.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* single-code point definitions -------------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does this code unit alone encode a code point (BMP, not a surrogate)?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this code unit a lead surrogate (U+d800..U+dbff)?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this code unit a surrogate (U+d800..U+dfff)?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
|
||||||
|
* is it a lead surrogate?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
|
||||||
|
* is it a trail surrogate?
|
||||||
|
* @param c 16-bit code unit
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 4.2
|
||||||
|
*/
|
||||||
|
#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper constant for U16_GET_SUPPLEMENTARY.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a supplementary code point value (U+10000..U+10ffff)
|
||||||
|
* from its lead and trail surrogates.
|
||||||
|
* The result is undefined if the input values are not
|
||||||
|
* lead and trail surrogates.
|
||||||
|
*
|
||||||
|
* @param lead lead surrogate (U+d800..U+dbff)
|
||||||
|
* @param trail trail surrogate (U+dc00..U+dfff)
|
||||||
|
* @return supplementary code point (U+10000..U+10ffff)
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_GET_SUPPLEMENTARY(lead, trail) \
|
||||||
|
(((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the lead surrogate (0xd800..0xdbff) for a
|
||||||
|
* supplementary code point (0x10000..0x10ffff).
|
||||||
|
* @param supplementary 32-bit code point (U+10000..U+10ffff)
|
||||||
|
* @return lead surrogate (U+d800..U+dbff) for supplementary
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the trail surrogate (0xdc00..0xdfff) for a
|
||||||
|
* supplementary code point (0x10000..0x10ffff).
|
||||||
|
* @param supplementary 32-bit code point (U+10000..U+10ffff)
|
||||||
|
* @return trail surrogate (U+dc00..U+dfff) for supplementary
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
|
||||||
|
* The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
|
||||||
|
* @param c 32-bit code point
|
||||||
|
* @return 1 or 2
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
|
||||||
|
* @return 2
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_MAX_LENGTH 2
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* The offset may point to either the lead or trail surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the adjacent matching surrogate as well.
|
||||||
|
* The result is undefined if the offset points to a single, unpaired surrogate.
|
||||||
|
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_GET
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[i]; \
|
||||||
|
if(U16_IS_SURROGATE(c)) { \
|
||||||
|
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
|
||||||
|
} else { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The offset may point to either the lead or trail surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the adjacent matching surrogate as well.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* If the offset points to a single, unpaired surrogate, then
|
||||||
|
* c is set to that unpaired surrogate.
|
||||||
|
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<=i<length
|
||||||
|
* @param length string length
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_GET_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[i]; \
|
||||||
|
if(U16_IS_SURROGATE(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||||
|
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The offset may point to either the lead or trail surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the adjacent matching surrogate as well.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* If the offset points to a single, unpaired surrogate, then
|
||||||
|
* c is set to U+FFFD.
|
||||||
|
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<=i<length
|
||||||
|
* @param length string length
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_GET_UNSAFE
|
||||||
|
* @stable ICU 60
|
||||||
|
*/
|
||||||
|
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[i]; \
|
||||||
|
if(U16_IS_SURROGATE(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||||
|
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||||
|
} else { \
|
||||||
|
(c)=0xfffd; \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||||
|
} else { \
|
||||||
|
(c)=0xfffd; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/* definitions with forward iteration --------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the following trail surrogate as well.
|
||||||
|
* If the offset points to a trail surrogate, then that itself
|
||||||
|
* will be returned as the code point.
|
||||||
|
* The result is undefined if the offset points to a single, unpaired lead surrogate.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_NEXT
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[(i)++]; \
|
||||||
|
if(U16_IS_LEAD(c)) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the following trail surrogate as well.
|
||||||
|
* If the offset points to a trail surrogate or
|
||||||
|
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset, must be i<length
|
||||||
|
* @param length string length
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_NEXT_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[(i)++]; \
|
||||||
|
if(U16_IS_LEAD(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
|
||||||
|
++(i); \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead surrogate unit
|
||||||
|
* for a supplementary code point, in which case the macro will read
|
||||||
|
* the following trail surrogate as well.
|
||||||
|
* If the offset points to a trail surrogate or
|
||||||
|
* to a single, unpaired lead surrogate, then c is set to U+FFFD.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset, must be i<length
|
||||||
|
* @param length string length
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_NEXT_UNSAFE
|
||||||
|
* @stable ICU 60
|
||||||
|
*/
|
||||||
|
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[(i)++]; \
|
||||||
|
if(U16_IS_SURROGATE(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
|
||||||
|
++(i); \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||||
|
} else { \
|
||||||
|
(c)=0xfffd; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||||
|
* The offset points to the current end of the string contents
|
||||||
|
* and is advanced (post-increment).
|
||||||
|
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
|
||||||
|
* Otherwise, the result is undefined.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string buffer
|
||||||
|
* @param i string offset
|
||||||
|
* @param c code point to append
|
||||||
|
* @see U16_APPEND
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if((uint32_t)(c)<=0xffff) { \
|
||||||
|
(s)[(i)++]=(uint16_t)(c); \
|
||||||
|
} else { \
|
||||||
|
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
|
||||||
|
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||||
|
* The offset points to the current end of the string contents
|
||||||
|
* and is advanced (post-increment).
|
||||||
|
* "Safe" macro, checks for a valid code point.
|
||||||
|
* If a surrogate pair is written, checks for sufficient space in the string.
|
||||||
|
* If the code point is not valid or a trail surrogate does not fit,
|
||||||
|
* then isError is set to TRUE.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string buffer
|
||||||
|
* @param i string offset, must be i<capacity
|
||||||
|
* @param capacity size of the string buffer
|
||||||
|
* @param c code point to append
|
||||||
|
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
|
||||||
|
* @see U16_APPEND_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if((uint32_t)(c)<=0xffff) { \
|
||||||
|
(s)[(i)++]=(uint16_t)(c); \
|
||||||
|
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
|
||||||
|
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
|
||||||
|
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
|
||||||
|
} else /* c>0x10ffff or not enough space */ { \
|
||||||
|
(isError)=TRUE; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the next.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U16_FWD_1
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_LEAD((s)[(i)++])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the next.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset, must be i<length
|
||||||
|
* @param length string length
|
||||||
|
* @see U16_FWD_1_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the n-th next one,
|
||||||
|
* i.e., move forward by n code points.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U16_FWD_N
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0) { \
|
||||||
|
U16_FWD_1_UNSAFE(s, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the n-th next one,
|
||||||
|
* i.e., move forward by n code points.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i int32_t string offset, must be i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U16_FWD_N_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
|
||||||
|
U16_FWD_1(s, i, length); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary
|
||||||
|
* at the start of a code point.
|
||||||
|
* If the offset points to the trail surrogate of a surrogate pair,
|
||||||
|
* then the offset is decremented.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U16_SET_CP_START
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_TRAIL((s)[i])) { \
|
||||||
|
--(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary
|
||||||
|
* at the start of a code point.
|
||||||
|
* If the offset points to the trail surrogate of a surrogate pair,
|
||||||
|
* then the offset is decremented.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<=i
|
||||||
|
* @see U16_SET_CP_START_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
|
||||||
|
--(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/* definitions with backward iteration -------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a trail surrogate unit
|
||||||
|
* for a supplementary code point, then the macro will read
|
||||||
|
* the preceding lead surrogate as well.
|
||||||
|
* If the offset is behind a lead surrogate, then that itself
|
||||||
|
* will be returned as the code point.
|
||||||
|
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_PREV
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[--(i)]; \
|
||||||
|
if(U16_IS_TRAIL(c)) { \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a trail surrogate unit
|
||||||
|
* for a supplementary code point, then the macro will read
|
||||||
|
* the preceding lead surrogate as well.
|
||||||
|
* If the offset is behind a lead surrogate or behind a single, unpaired
|
||||||
|
* trail surrogate, then c is set to that unpaired surrogate.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<i
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_PREV_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[--(i)]; \
|
||||||
|
if(U16_IS_TRAIL(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||||
|
--(i); \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a trail surrogate unit
|
||||||
|
* for a supplementary code point, then the macro will read
|
||||||
|
* the preceding lead surrogate as well.
|
||||||
|
* If the offset is behind a lead surrogate or behind a single, unpaired
|
||||||
|
* trail surrogate, then c is set to U+FFFD.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<i
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U16_PREV_UNSAFE
|
||||||
|
* @stable ICU 60
|
||||||
|
*/
|
||||||
|
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(s)[--(i)]; \
|
||||||
|
if(U16_IS_SURROGATE(c)) { \
|
||||||
|
uint16_t __c2; \
|
||||||
|
if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||||
|
--(i); \
|
||||||
|
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||||
|
} else { \
|
||||||
|
(c)=0xfffd; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U16_BACK_1
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_TRAIL((s)[--(i)])) { \
|
||||||
|
--(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start starting string offset (usually 0)
|
||||||
|
* @param i string offset, must be start<i
|
||||||
|
* @see U16_BACK_1_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
|
||||||
|
--(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the n-th one before it,
|
||||||
|
* i.e., move backward by n code points.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U16_BACK_N
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0) { \
|
||||||
|
U16_BACK_1_UNSAFE(s, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the n-th one before it,
|
||||||
|
* i.e., move backward by n code points.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start start of string
|
||||||
|
* @param i string offset, must be start<i
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U16_BACK_N_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0 && (i)>(start)) { \
|
||||||
|
U16_BACK_1(s, start, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary after a code point.
|
||||||
|
* If the offset is behind the lead surrogate of a surrogate pair,
|
||||||
|
* then the offset is incremented.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U16_SET_CP_LIMIT
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U16_IS_LEAD((s)[(i)-1])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary after a code point.
|
||||||
|
* If the offset is behind the lead surrogate of a surrogate pair,
|
||||||
|
* then the offset is incremented.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const UChar * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, start<=i<=length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @see U16_SET_CP_LIMIT_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,881 @@
|
||||||
|
// © 2016 and later: Unicode, Inc. and others.
|
||||||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||||||
|
/*
|
||||||
|
*******************************************************************************
|
||||||
|
*
|
||||||
|
* Copyright (C) 1999-2015, International Business Machines
|
||||||
|
* Corporation and others. All Rights Reserved.
|
||||||
|
*
|
||||||
|
*******************************************************************************
|
||||||
|
* file name: utf8.h
|
||||||
|
* encoding: UTF-8
|
||||||
|
* tab size: 8 (not used)
|
||||||
|
* indentation:4
|
||||||
|
*
|
||||||
|
* created on: 1999sep13
|
||||||
|
* created by: Markus W. Scherer
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \file
|
||||||
|
* \brief C API: 8-bit Unicode handling macros
|
||||||
|
*
|
||||||
|
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
|
||||||
|
*
|
||||||
|
* For more information see utf.h and the ICU User Guide Strings chapter
|
||||||
|
* (http://userguide.icu-project.org/strings).
|
||||||
|
*
|
||||||
|
* <em>Usage:</em>
|
||||||
|
* ICU coding guidelines for if() statements should be followed when using these macros.
|
||||||
|
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||||
|
* bodies and all macro statements should be terminated with semicolon.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __UTF8_H__
|
||||||
|
#define __UTF8_H__
|
||||||
|
|
||||||
|
#include "./umachine.h"
|
||||||
|
#ifndef __UTF_H__
|
||||||
|
# include "./utf.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* internal definitions ----------------------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the trail bytes for a UTF-8 lead byte.
|
||||||
|
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
|
||||||
|
* leadByte might be evaluated multiple times.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is called by public macros in this file and thus must remain stable.
|
||||||
|
*
|
||||||
|
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_COUNT_TRAIL_BYTES(leadByte) \
|
||||||
|
(U8_IS_LEAD(leadByte) ? \
|
||||||
|
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
|
||||||
|
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
|
||||||
|
* leadByte might be evaluated multiple times.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is called by public macros in this file and thus must remain stable.
|
||||||
|
*
|
||||||
|
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
|
||||||
|
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is called by public macros in this file and thus must remain stable.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
|
||||||
|
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
|
||||||
|
* Lead byte E0..EF bits 3..0 are used as byte index,
|
||||||
|
* first trail byte bits 7..5 are used as bit index into that byte.
|
||||||
|
* @see U8_IS_VALID_LEAD3_AND_T1
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal 3-byte UTF-8 validity check.
|
||||||
|
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
|
||||||
|
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
|
||||||
|
* First trail byte bits 7..4 are used as byte index,
|
||||||
|
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
|
||||||
|
* @see U8_IS_VALID_LEAD4_AND_T1
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal 4-byte UTF-8 validity check.
|
||||||
|
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function for handling "next code point" with error-checking.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
|
||||||
|
* file and thus must remain stable, and should not be hidden when other internal
|
||||||
|
* functions are hidden (otherwise public macros would fail to compile).
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
U_STABLE UChar32 U_EXPORT2
|
||||||
|
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function for handling "append code point" with error-checking.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
|
||||||
|
* file and thus must remain stable, and should not be hidden when other internal
|
||||||
|
* functions are hidden (otherwise public macros would fail to compile).
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
U_STABLE int32_t U_EXPORT2
|
||||||
|
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function for handling "previous code point" with error-checking.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
|
||||||
|
* file and thus must remain stable, and should not be hidden when other internal
|
||||||
|
* functions are hidden (otherwise public macros would fail to compile).
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
U_STABLE UChar32 U_EXPORT2
|
||||||
|
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function for handling "skip backward one code point" with error-checking.
|
||||||
|
*
|
||||||
|
* This is internal since it is not meant to be called directly by external clients;
|
||||||
|
* however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
|
||||||
|
* file and thus must remain stable, and should not be hidden when other internal
|
||||||
|
* functions are hidden (otherwise public macros would fail to compile).
|
||||||
|
* @internal
|
||||||
|
*/
|
||||||
|
U_STABLE int32_t U_EXPORT2
|
||||||
|
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||||
|
|
||||||
|
/* single-code point definitions -------------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
|
||||||
|
* @param c 8-bit code unit (byte)
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
|
||||||
|
* @param c 8-bit code unit (byte)
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
|
||||||
|
// 0x32=0xf4-0xc2
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
|
||||||
|
* @param c 8-bit code unit (byte)
|
||||||
|
* @return TRUE or FALSE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* How many code units (bytes) are used for the UTF-8 encoding
|
||||||
|
* of this Unicode code point?
|
||||||
|
* @param c 32-bit code point
|
||||||
|
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_LENGTH(c) \
|
||||||
|
((uint32_t)(c)<=0x7f ? 1 : \
|
||||||
|
((uint32_t)(c)<=0x7ff ? 2 : \
|
||||||
|
((uint32_t)(c)<=0xd7ff ? 3 : \
|
||||||
|
((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
|
||||||
|
((uint32_t)(c)<=0xffff ? 3 : 4)\
|
||||||
|
) \
|
||||||
|
) \
|
||||||
|
) \
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
|
||||||
|
* @return 4
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_MAX_LENGTH 4
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* The offset may point to either the lead byte or one of the trail bytes
|
||||||
|
* for a code point, in which case the macro will read all of the bytes
|
||||||
|
* for the code point.
|
||||||
|
* The result is undefined if the offset points to an illegal UTF-8
|
||||||
|
* byte sequence.
|
||||||
|
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U8_GET
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t _u8_get_unsafe_index=(int32_t)(i); \
|
||||||
|
U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
|
||||||
|
U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* The offset may point to either the lead byte or one of the trail bytes
|
||||||
|
* for a code point, in which case the macro will read all of the bytes
|
||||||
|
* for the code point.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* If the offset points to an illegal UTF-8 byte sequence, then
|
||||||
|
* c is set to a negative value.
|
||||||
|
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset
|
||||||
|
* @param i int32_t string offset, must be start<=i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param c output UChar32 variable, set to <0 in case of an error
|
||||||
|
* @see U8_GET_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t _u8_get_index=(i); \
|
||||||
|
U8_SET_CP_START(s, start, _u8_get_index); \
|
||||||
|
U8_NEXT(s, _u8_get_index, length, c); \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a random-access offset,
|
||||||
|
* without changing the offset.
|
||||||
|
* The offset may point to either the lead byte or one of the trail bytes
|
||||||
|
* for a code point, in which case the macro will read all of the bytes
|
||||||
|
* for the code point.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* If the offset points to an illegal UTF-8 byte sequence, then
|
||||||
|
* c is set to U+FFFD.
|
||||||
|
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
|
||||||
|
*
|
||||||
|
* This macro does not distinguish between a real U+FFFD in the text
|
||||||
|
* and U+FFFD returned for an ill-formed sequence.
|
||||||
|
* Use U8_GET() if that distinction is important.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset
|
||||||
|
* @param i int32_t string offset, must be start<=i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||||
|
* @see U8_GET
|
||||||
|
* @stable ICU 51
|
||||||
|
*/
|
||||||
|
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t _u8_get_index=(i); \
|
||||||
|
U8_SET_CP_START(s, start, _u8_get_index); \
|
||||||
|
U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/* definitions with forward iteration --------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead byte of a multi-byte sequence,
|
||||||
|
* in which case the macro will read the whole sequence.
|
||||||
|
* The result is undefined if the offset points to a trail byte
|
||||||
|
* or an illegal UTF-8 sequence.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U8_NEXT
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(uint8_t)(s)[(i)++]; \
|
||||||
|
if(!U8_IS_SINGLE(c)) { \
|
||||||
|
if((c)<0xe0) { \
|
||||||
|
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
|
||||||
|
} else if((c)<0xf0) { \
|
||||||
|
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||||
|
(c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
|
||||||
|
(i)+=2; \
|
||||||
|
} else { \
|
||||||
|
(c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
|
||||||
|
(i)+=3; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead byte of a multi-byte sequence,
|
||||||
|
* in which case the macro will read the whole sequence.
|
||||||
|
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
|
||||||
|
* c is set to a negative value.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i int32_t string offset, must be i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param c output UChar32 variable, set to <0 in case of an error
|
||||||
|
* @see U8_NEXT_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a code point from a string at a code point boundary offset,
|
||||||
|
* and advance the offset to the next code point boundary.
|
||||||
|
* (Post-incrementing forward iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* The offset may point to the lead byte of a multi-byte sequence,
|
||||||
|
* in which case the macro will read the whole sequence.
|
||||||
|
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
|
||||||
|
* c is set to U+FFFD.
|
||||||
|
*
|
||||||
|
* This macro does not distinguish between a real U+FFFD in the text
|
||||||
|
* and U+FFFD returned for an ill-formed sequence.
|
||||||
|
* Use U8_NEXT() if that distinction is important.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i int32_t string offset, must be i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||||
|
* @see U8_NEXT
|
||||||
|
* @stable ICU 51
|
||||||
|
*/
|
||||||
|
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
|
||||||
|
|
||||||
|
/** @internal */
|
||||||
|
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(uint8_t)(s)[(i)++]; \
|
||||||
|
if(!U8_IS_SINGLE(c)) { \
|
||||||
|
uint8_t __t = 0; \
|
||||||
|
if((i)!=(length) && \
|
||||||
|
/* fetch/validate/assemble all but last trail byte */ \
|
||||||
|
((c)>=0xe0 ? \
|
||||||
|
((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
|
||||||
|
U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
|
||||||
|
(__t&=0x3f, 1) \
|
||||||
|
: /* U+10000..U+10FFFF */ \
|
||||||
|
((c)-=0xf0)<=4 && \
|
||||||
|
U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
|
||||||
|
((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
|
||||||
|
(__t=(s)[i]-0x80)<=0x3f) && \
|
||||||
|
/* valid second-to-last trail byte */ \
|
||||||
|
((c)=((c)<<6)|__t, ++(i)!=(length)) \
|
||||||
|
: /* U+0080..U+07FF */ \
|
||||||
|
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
|
||||||
|
/* last trail byte */ \
|
||||||
|
(__t=(s)[i]-0x80)<=0x3f && \
|
||||||
|
((c)=((c)<<6)|__t, ++(i), 1)) { \
|
||||||
|
} else { \
|
||||||
|
(c)=(sub); /* ill-formed*/ \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a code point to a string, overwriting 1 to 4 bytes.
|
||||||
|
* The offset points to the current end of the string contents
|
||||||
|
* and is advanced (post-increment).
|
||||||
|
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
|
||||||
|
* Otherwise, the result is undefined.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string buffer
|
||||||
|
* @param i string offset
|
||||||
|
* @param c code point to append
|
||||||
|
* @see U8_APPEND
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
uint32_t __uc=(c); \
|
||||||
|
if(__uc<=0x7f) { \
|
||||||
|
(s)[(i)++]=(uint8_t)__uc; \
|
||||||
|
} else { \
|
||||||
|
if(__uc<=0x7ff) { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
|
||||||
|
} else { \
|
||||||
|
if(__uc<=0xffff) { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
|
||||||
|
} else { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
|
||||||
|
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
|
||||||
|
} \
|
||||||
|
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
|
||||||
|
} \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a code point to a string, overwriting 1 to 4 bytes.
|
||||||
|
* The offset points to the current end of the string contents
|
||||||
|
* and is advanced (post-increment).
|
||||||
|
* "Safe" macro, checks for a valid code point.
|
||||||
|
* If a non-ASCII code point is written, checks for sufficient space in the string.
|
||||||
|
* If the code point is not valid or trail bytes do not fit,
|
||||||
|
* then isError is set to TRUE.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string buffer
|
||||||
|
* @param i int32_t string offset, must be i<capacity
|
||||||
|
* @param capacity int32_t size of the string buffer
|
||||||
|
* @param c UChar32 code point to append
|
||||||
|
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
|
||||||
|
* @see U8_APPEND_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
uint32_t __uc=(c); \
|
||||||
|
if(__uc<=0x7f) { \
|
||||||
|
(s)[(i)++]=(uint8_t)__uc; \
|
||||||
|
} else if(__uc<=0x7ff && (i)+1<(capacity)) { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
|
||||||
|
} else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
|
||||||
|
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
|
||||||
|
} else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
|
||||||
|
(s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
|
||||||
|
(s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
|
||||||
|
(s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
|
||||||
|
} else { \
|
||||||
|
(isError)=TRUE; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the next.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U8_FWD_1
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the next.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i int32_t string offset, must be i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @see U8_FWD_1_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
uint8_t __b=(s)[(i)++]; \
|
||||||
|
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
|
||||||
|
uint8_t __t1=(s)[i]; \
|
||||||
|
if((0xe0<=__b && __b<0xf0)) { \
|
||||||
|
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
|
||||||
|
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} else if(__b<0xe0) { \
|
||||||
|
if(U8_IS_TRAIL(__t1)) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} else /* c>=0xf0 */ { \
|
||||||
|
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
|
||||||
|
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
|
||||||
|
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||||
|
++(i); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the n-th next one,
|
||||||
|
* i.e., move forward by n code points.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U8_FWD_N
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0) { \
|
||||||
|
U8_FWD_1_UNSAFE(s, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the string offset from one code point boundary to the n-th next one,
|
||||||
|
* i.e., move forward by n code points.
|
||||||
|
* (Post-incrementing iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i int32_t string offset, must be i<length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U8_FWD_N_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
|
||||||
|
U8_FWD_1(s, i, length); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary
|
||||||
|
* at the start of a code point.
|
||||||
|
* If the offset points to a UTF-8 trail byte,
|
||||||
|
* then the offset is moved backward to the corresponding lead byte.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U8_SET_CP_START
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
while(U8_IS_TRAIL((s)[i])) { --(i); } \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary
|
||||||
|
* at the start of a code point.
|
||||||
|
* If the offset points to a UTF-8 trail byte,
|
||||||
|
* then the offset is moved backward to the corresponding lead byte.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
*
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, must be start<=i
|
||||||
|
* @see U8_SET_CP_START_UNSAFE
|
||||||
|
* @see U8_TRUNCATE_IF_INCOMPLETE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U8_IS_TRAIL((s)[(i)])) { \
|
||||||
|
(i)=utf8_back1SafeBody(s, start, (i)); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the string ends with a UTF-8 byte sequence that is valid so far
|
||||||
|
* but incomplete, then reduce the length of the string to end before
|
||||||
|
* the lead byte of that incomplete sequence.
|
||||||
|
* For example, if the string ends with E1 80, the length is reduced by 2.
|
||||||
|
*
|
||||||
|
* In all other cases (the string ends with a complete sequence, or it is not
|
||||||
|
* possible for any further trail byte to extend the trailing sequence)
|
||||||
|
* the length remains unchanged.
|
||||||
|
*
|
||||||
|
* Useful for processing text split across multiple buffers
|
||||||
|
* (save the incomplete sequence for later)
|
||||||
|
* and for optimizing iteration
|
||||||
|
* (check for string length only once per character).
|
||||||
|
*
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
* Unlike U8_SET_CP_START(), this macro never reads s[length].
|
||||||
|
*
|
||||||
|
* (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param length int32_t string length (usually start<=length)
|
||||||
|
* @see U8_SET_CP_START
|
||||||
|
* @stable ICU 61
|
||||||
|
*/
|
||||||
|
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if((length)>(start)) { \
|
||||||
|
uint8_t __b1=s[(length)-1]; \
|
||||||
|
if(U8_IS_SINGLE(__b1)) { \
|
||||||
|
/* common ASCII character */ \
|
||||||
|
} else if(U8_IS_LEAD(__b1)) { \
|
||||||
|
--(length); \
|
||||||
|
} else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
|
||||||
|
uint8_t __b2=s[(length)-2]; \
|
||||||
|
if(0xe0<=__b2 && __b2<=0xf4) { \
|
||||||
|
if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
|
||||||
|
U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
|
||||||
|
(length)-=2; \
|
||||||
|
} \
|
||||||
|
} else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
|
||||||
|
uint8_t __b3=s[(length)-3]; \
|
||||||
|
if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
|
||||||
|
(length)-=3; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/* definitions with backward iteration -------------------------------------- */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||||
|
* the whole sequence.
|
||||||
|
* If the offset is behind a lead byte, then that itself
|
||||||
|
* will be returned as the code point.
|
||||||
|
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param c output UChar32 variable
|
||||||
|
* @see U8_PREV
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(uint8_t)(s)[--(i)]; \
|
||||||
|
if(U8_IS_TRAIL(c)) { \
|
||||||
|
uint8_t __b, __count=1, __shift=6; \
|
||||||
|
\
|
||||||
|
/* c is a trail byte */ \
|
||||||
|
(c)&=0x3f; \
|
||||||
|
for(;;) { \
|
||||||
|
__b=(s)[--(i)]; \
|
||||||
|
if(__b>=0xc0) { \
|
||||||
|
U8_MASK_LEAD_BYTE(__b, __count); \
|
||||||
|
(c)|=(UChar32)__b<<__shift; \
|
||||||
|
break; \
|
||||||
|
} else { \
|
||||||
|
(c)|=(UChar32)(__b&0x3f)<<__shift; \
|
||||||
|
++__count; \
|
||||||
|
__shift+=6; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||||
|
* the whole sequence.
|
||||||
|
* If the offset is behind a lead byte, then that itself
|
||||||
|
* will be returned as the code point.
|
||||||
|
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, must be start<i
|
||||||
|
* @param c output UChar32 variable, set to <0 in case of an error
|
||||||
|
* @see U8_PREV_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(uint8_t)(s)[--(i)]; \
|
||||||
|
if(!U8_IS_SINGLE(c)) { \
|
||||||
|
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one
|
||||||
|
* and get the code point between them.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||||
|
* the whole sequence.
|
||||||
|
* If the offset is behind a lead byte, then that itself
|
||||||
|
* will be returned as the code point.
|
||||||
|
* If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
|
||||||
|
*
|
||||||
|
* This macro does not distinguish between a real U+FFFD in the text
|
||||||
|
* and U+FFFD returned for an ill-formed sequence.
|
||||||
|
* Use U8_PREV() if that distinction is important.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, must be start<i
|
||||||
|
* @param c output UChar32 variable, set to U+FFFD in case of an error
|
||||||
|
* @see U8_PREV
|
||||||
|
* @stable ICU 51
|
||||||
|
*/
|
||||||
|
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
(c)=(uint8_t)(s)[--(i)]; \
|
||||||
|
if(!U8_IS_SINGLE(c)) { \
|
||||||
|
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U8_BACK_1
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
while(U8_IS_TRAIL((s)[--(i)])) {} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the previous one.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, must be start<i
|
||||||
|
* @see U8_BACK_1_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if(U8_IS_TRAIL((s)[--(i)])) { \
|
||||||
|
(i)=utf8_back1SafeBody(s, start, (i)); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the n-th one before it,
|
||||||
|
* i.e., move backward by n code points.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U8_BACK_N
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0) { \
|
||||||
|
U8_BACK_1_UNSAFE(s, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move the string offset from one code point boundary to the n-th one before it,
|
||||||
|
* i.e., move backward by n code points.
|
||||||
|
* (Pre-decrementing backward iteration.)
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t index of the start of the string
|
||||||
|
* @param i int32_t string offset, must be start<i
|
||||||
|
* @param n number of code points to skip
|
||||||
|
* @see U8_BACK_N_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
int32_t __N=(n); \
|
||||||
|
while(__N>0 && (i)>(start)) { \
|
||||||
|
U8_BACK_1(s, start, i); \
|
||||||
|
--__N; \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary after a code point.
|
||||||
|
* If the offset is behind a partial multi-byte sequence,
|
||||||
|
* then the offset is incremented to behind the whole sequence.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param i string offset
|
||||||
|
* @see U8_SET_CP_LIMIT
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
U8_BACK_1_UNSAFE(s, i); \
|
||||||
|
U8_FWD_1_UNSAFE(s, i); \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adjust a random-access offset to a code point boundary after a code point.
|
||||||
|
* If the offset is behind a partial multi-byte sequence,
|
||||||
|
* then the offset is incremented to behind the whole sequence.
|
||||||
|
* Otherwise, it is not modified.
|
||||||
|
* The input offset may be the same as the string length.
|
||||||
|
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||||
|
*
|
||||||
|
* The length can be negative for a NUL-terminated string.
|
||||||
|
*
|
||||||
|
* @param s const uint8_t * string
|
||||||
|
* @param start int32_t starting string offset (usually 0)
|
||||||
|
* @param i int32_t string offset, must be start<=i<=length
|
||||||
|
* @param length int32_t string length
|
||||||
|
* @see U8_SET_CP_LIMIT_UNSAFE
|
||||||
|
* @stable ICU 2.4
|
||||||
|
*/
|
||||||
|
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
|
||||||
|
if((start)<(i) && ((i)<(length) || (length)<0)) { \
|
||||||
|
U8_BACK_1(s, start, i); \
|
||||||
|
U8_FWD_1(s, i, length); \
|
||||||
|
} \
|
||||||
|
} UPRV_BLOCK_MACRO_END
|
||||||
|
|
||||||
|
#endif
|
|
@ -4,6 +4,9 @@ local Screen = require('test.functional.ui.screen')
|
||||||
local eq, eval = helpers.eq, helpers.eval
|
local eq, eval = helpers.eq, helpers.eval
|
||||||
local command = helpers.command
|
local command = helpers.command
|
||||||
local meths = helpers.meths
|
local meths = helpers.meths
|
||||||
|
local funcs = helpers.funcs
|
||||||
|
local pcall_err = helpers.pcall_err
|
||||||
|
local ok = helpers.ok
|
||||||
|
|
||||||
describe('API: highlight',function()
|
describe('API: highlight',function()
|
||||||
local expected_rgb = {
|
local expected_rgb = {
|
||||||
|
@ -110,4 +113,20 @@ describe('API: highlight',function()
|
||||||
meths.get_hl_by_name('cursorline', 0));
|
meths.get_hl_by_name('cursorline', 0));
|
||||||
|
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it('nvim_get_hl_id_by_name', function()
|
||||||
|
-- precondition: use a hl group that does not yet exist
|
||||||
|
eq('Invalid highlight name: Shrubbery', pcall_err(meths.get_hl_by_name, "Shrubbery", true))
|
||||||
|
eq(0, funcs.hlID("Shrubbery"))
|
||||||
|
|
||||||
|
local hl_id = meths.get_hl_id_by_name("Shrubbery")
|
||||||
|
ok(hl_id > 0)
|
||||||
|
eq(hl_id, funcs.hlID("Shrubbery"))
|
||||||
|
|
||||||
|
command('hi Shrubbery guifg=#888888 guibg=#888888')
|
||||||
|
eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
|
||||||
|
meths.get_hl_by_id(hl_id, true))
|
||||||
|
eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
|
||||||
|
meths.get_hl_by_name("Shrubbery", true))
|
||||||
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
-- Test suite for testing interactions with API bindings
|
-- Test suite for testing interactions with API bindings
|
||||||
local helpers = require('test.functional.helpers')(after_each)
|
local helpers = require('test.functional.helpers')(after_each)
|
||||||
|
local Screen = require('test.functional.ui.screen')
|
||||||
|
|
||||||
local clear = helpers.clear
|
local clear = helpers.clear
|
||||||
local eq = helpers.eq
|
local eq = helpers.eq
|
||||||
|
@ -26,122 +27,371 @@ describe('treesitter API', function()
|
||||||
pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')"))
|
pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')"))
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
end)
|
||||||
|
|
||||||
|
describe('treesitter API with C parser', function()
|
||||||
local ts_path = os.getenv("TREE_SITTER_DIR")
|
local ts_path = os.getenv("TREE_SITTER_DIR")
|
||||||
|
|
||||||
describe('with C parser', function()
|
-- The tests after this requires an actual parser
|
||||||
if ts_path == nil then
|
if ts_path == nil then
|
||||||
it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
|
it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
before_each(function()
|
before_each(function()
|
||||||
local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
|
local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
|
||||||
exec_lua([[
|
exec_lua([[
|
||||||
local path = ...
|
local path = ...
|
||||||
vim.treesitter.add_language(path,'c')
|
vim.treesitter.add_language(path,'c')
|
||||||
]], path)
|
]], path)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
it('parses buffer', function()
|
it('parses buffer', function()
|
||||||
insert([[
|
insert([[
|
||||||
int main() {
|
int main() {
|
||||||
int x = 3;
|
int x = 3;
|
||||||
}]])
|
}]])
|
||||||
|
|
||||||
exec_lua([[
|
exec_lua([[
|
||||||
parser = vim.treesitter.get_parser(0, "c")
|
parser = vim.treesitter.get_parser(0, "c")
|
||||||
tree = parser:parse()
|
tree = parser:parse()
|
||||||
root = tree:root()
|
root = tree:root()
|
||||||
lang = vim.treesitter.inspect_language('c')
|
lang = vim.treesitter.inspect_language('c')
|
||||||
]])
|
]])
|
||||||
|
|
||||||
eq("<tree>", exec_lua("return tostring(tree)"))
|
eq("<tree>", exec_lua("return tostring(tree)"))
|
||||||
eq("<node translation_unit>", exec_lua("return tostring(root)"))
|
eq("<node translation_unit>", exec_lua("return tostring(root)"))
|
||||||
eq({0,0,3,0}, exec_lua("return {root:range()}"))
|
eq({0,0,3,0}, exec_lua("return {root:range()}"))
|
||||||
|
|
||||||
eq(1, exec_lua("return root:child_count()"))
|
eq(1, exec_lua("return root:child_count()"))
|
||||||
exec_lua("child = root:child(0)")
|
exec_lua("child = root:child(0)")
|
||||||
eq("<node function_definition>", exec_lua("return tostring(child)"))
|
eq("<node function_definition>", exec_lua("return tostring(child)"))
|
||||||
eq({0,0,2,1}, exec_lua("return {child:range()}"))
|
eq({0,0,2,1}, exec_lua("return {child:range()}"))
|
||||||
|
|
||||||
eq("function_definition", exec_lua("return child:type()"))
|
eq("function_definition", exec_lua("return child:type()"))
|
||||||
eq(true, exec_lua("return child:named()"))
|
eq(true, exec_lua("return child:named()"))
|
||||||
eq("number", type(exec_lua("return child:symbol()")))
|
eq("number", type(exec_lua("return child:symbol()")))
|
||||||
eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
|
eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
|
||||||
|
|
||||||
exec_lua("anon = root:descendant_for_range(0,8,0,9)")
|
exec_lua("anon = root:descendant_for_range(0,8,0,9)")
|
||||||
eq("(", exec_lua("return anon:type()"))
|
eq("(", exec_lua("return anon:type()"))
|
||||||
eq(false, exec_lua("return anon:named()"))
|
eq(false, exec_lua("return anon:named()"))
|
||||||
eq("number", type(exec_lua("return anon:symbol()")))
|
eq("number", type(exec_lua("return anon:symbol()")))
|
||||||
eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
|
eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
|
||||||
|
|
||||||
exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
|
exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
|
||||||
eq("<node declaration>", exec_lua("return tostring(descendant)"))
|
eq("<node declaration>", exec_lua("return tostring(descendant)"))
|
||||||
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
|
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
|
||||||
eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
|
eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
|
||||||
|
|
||||||
eq(true, exec_lua("return child == child"))
|
eq(true, exec_lua("return child == child"))
|
||||||
-- separate lua object, but represents same node
|
-- separate lua object, but represents same node
|
||||||
eq(true, exec_lua("return child == root:child(0)"))
|
eq(true, exec_lua("return child == root:child(0)"))
|
||||||
eq(false, exec_lua("return child == descendant2"))
|
eq(false, exec_lua("return child == descendant2"))
|
||||||
eq(false, exec_lua("return child == nil"))
|
eq(false, exec_lua("return child == nil"))
|
||||||
eq(false, exec_lua("return child == tree"))
|
eq(false, exec_lua("return child == tree"))
|
||||||
|
|
||||||
feed("2G7|ay")
|
feed("2G7|ay")
|
||||||
exec_lua([[
|
exec_lua([[
|
||||||
tree2 = parser:parse()
|
tree2 = parser:parse()
|
||||||
root2 = tree2:root()
|
root2 = tree2:root()
|
||||||
descendant2 = root2:descendant_for_range(1,2,1,13)
|
descendant2 = root2:descendant_for_range(1,2,1,13)
|
||||||
]])
|
]])
|
||||||
eq(false, exec_lua("return tree2 == tree1"))
|
eq(false, exec_lua("return tree2 == tree1"))
|
||||||
eq(false, exec_lua("return root2 == root"))
|
eq(false, exec_lua("return root2 == root"))
|
||||||
eq("<node declaration>", exec_lua("return tostring(descendant2)"))
|
eq("<node declaration>", exec_lua("return tostring(descendant2)"))
|
||||||
eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
|
eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
|
||||||
|
|
||||||
-- orginal tree did not change
|
-- orginal tree did not change
|
||||||
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
|
eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
|
||||||
|
|
||||||
-- unchanged buffer: return the same tree
|
-- unchanged buffer: return the same tree
|
||||||
eq(true, exec_lua("return parser:parse() == tree2"))
|
eq(true, exec_lua("return parser:parse() == tree2"))
|
||||||
end)
|
end)
|
||||||
|
|
||||||
it('inspects language', function()
|
local test_text = [[
|
||||||
local keys, fields, symbols = unpack(exec_lua([[
|
void ui_refresh(void)
|
||||||
local lang = vim.treesitter.inspect_language('c')
|
{
|
||||||
local keys, symbols = {}, {}
|
int width = INT_MAX, height = INT_MAX;
|
||||||
for k,_ in pairs(lang) do
|
bool ext_widgets[kUIExtCount];
|
||||||
keys[k] = true
|
for (UIExtension i = 0; (int)i < kUIExtCount; i++) {
|
||||||
end
|
ext_widgets[i] = true;
|
||||||
|
}
|
||||||
|
|
||||||
-- symbols array can have "holes" and is thus not a valid msgpack array
|
bool inclusive = ui_override();
|
||||||
-- but we don't care about the numbers here (checked in the parser test)
|
for (size_t i = 0; i < ui_count; i++) {
|
||||||
for _, v in pairs(lang.symbols) do
|
UI *ui = uis[i];
|
||||||
table.insert(symbols, v)
|
width = MIN(ui->width, width);
|
||||||
end
|
height = MIN(ui->height, height);
|
||||||
return {keys, lang.fields, symbols}
|
foo = BAR(ui->bazaar, bazaar);
|
||||||
]]))
|
for (UIExtension j = 0; (int)j < kUIExtCount; j++) {
|
||||||
|
ext_widgets[j] &= (ui->ui_ext[j] || inclusive);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}]]
|
||||||
|
|
||||||
eq({fields=true, symbols=true}, keys)
|
local query = [[
|
||||||
|
((call_expression function: (identifier) @minfunc (argument_list (identifier) @min_id)) (eq? @minfunc "MIN"))
|
||||||
|
"for" @keyword
|
||||||
|
(primitive_type) @type
|
||||||
|
(field_expression argument: (identifier) @fieldarg)
|
||||||
|
]]
|
||||||
|
|
||||||
local fset = {}
|
it('support query and iter by capture', function()
|
||||||
for _,f in pairs(fields) do
|
insert(test_text)
|
||||||
eq("string", type(f))
|
|
||||||
fset[f] = true
|
local res = exec_lua([[
|
||||||
|
cquery = vim.treesitter.parse_query("c", ...)
|
||||||
|
parser = vim.treesitter.get_parser(0, "c")
|
||||||
|
tree = parser:parse()
|
||||||
|
res = {}
|
||||||
|
for cid, node in cquery:iter_captures(tree:root(), 0, 7, 14) do
|
||||||
|
-- can't transmit node over RPC. just check the name and range
|
||||||
|
table.insert(res, {cquery.captures[cid], node:type(), node:range()})
|
||||||
|
end
|
||||||
|
return res
|
||||||
|
]], query)
|
||||||
|
|
||||||
|
eq({
|
||||||
|
{ "type", "primitive_type", 8, 2, 8, 6 },
|
||||||
|
{ "keyword", "for", 9, 2, 9, 5 },
|
||||||
|
{ "type", "primitive_type", 9, 7, 9, 13 },
|
||||||
|
{ "minfunc", "identifier", 11, 12, 11, 15 },
|
||||||
|
{ "fieldarg", "identifier", 11, 16, 11, 18 },
|
||||||
|
{ "min_id", "identifier", 11, 27, 11, 32 },
|
||||||
|
{ "minfunc", "identifier", 12, 13, 12, 16 },
|
||||||
|
{ "fieldarg", "identifier", 12, 17, 12, 19 },
|
||||||
|
{ "min_id", "identifier", 12, 29, 12, 35 },
|
||||||
|
{ "fieldarg", "identifier", 13, 14, 13, 16 }
|
||||||
|
}, res)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it('support query and iter by match', function()
|
||||||
|
insert(test_text)
|
||||||
|
|
||||||
|
local res = exec_lua([[
|
||||||
|
cquery = vim.treesitter.parse_query("c", ...)
|
||||||
|
parser = vim.treesitter.get_parser(0, "c")
|
||||||
|
tree = parser:parse()
|
||||||
|
res = {}
|
||||||
|
for pattern, match in cquery:iter_matches(tree:root(), 0, 7, 14) do
|
||||||
|
-- can't transmit node over RPC. just check the name and range
|
||||||
|
local mrepr = {}
|
||||||
|
for cid,node in pairs(match) do
|
||||||
|
table.insert(mrepr, {cquery.captures[cid], node:type(), node:range()})
|
||||||
end
|
end
|
||||||
eq(true, fset["directive"])
|
table.insert(res, {pattern, mrepr})
|
||||||
eq(true, fset["initializer"])
|
end
|
||||||
|
return res
|
||||||
|
]], query)
|
||||||
|
|
||||||
local has_named, has_anonymous
|
eq({
|
||||||
for _,s in pairs(symbols) do
|
{ 3, { { "type", "primitive_type", 8, 2, 8, 6 } } },
|
||||||
eq("string", type(s[1]))
|
{ 2, { { "keyword", "for", 9, 2, 9, 5 } } },
|
||||||
eq("boolean", type(s[2]))
|
{ 3, { { "type", "primitive_type", 9, 7, 9, 13 } } },
|
||||||
if s[1] == "for_statement" and s[2] == true then
|
{ 4, { { "fieldarg", "identifier", 11, 16, 11, 18 } } },
|
||||||
has_named = true
|
{ 1, { { "minfunc", "identifier", 11, 12, 11, 15 }, { "min_id", "identifier", 11, 27, 11, 32 } } },
|
||||||
elseif s[1] == "|=" and s[2] == false then
|
{ 4, { { "fieldarg", "identifier", 12, 17, 12, 19 } } },
|
||||||
has_anonymous = true
|
{ 1, { { "minfunc", "identifier", 12, 13, 12, 16 }, { "min_id", "identifier", 12, 29, 12, 35 } } },
|
||||||
end
|
{ 4, { { "fieldarg", "identifier", 13, 14, 13, 16 } } }
|
||||||
|
}, res)
|
||||||
|
end)
|
||||||
|
|
||||||
|
it('supports highlighting', function()
|
||||||
|
local hl_text = [[
|
||||||
|
/// Schedule Lua callback on main loop's event queue
|
||||||
|
static int nlua_schedule(lua_State *const lstate)
|
||||||
|
{
|
||||||
|
if (lua_type(lstate, 1) != LUA_TFUNCTION
|
||||||
|
|| lstate != lstate) {
|
||||||
|
lua_pushliteral(lstate, "vim.schedule: expected function");
|
||||||
|
return lua_error(lstate);
|
||||||
|
}
|
||||||
|
|
||||||
|
LuaRef cb = nlua_ref(lstate, 1);
|
||||||
|
|
||||||
|
multiqueue_put(main_loop.events, nlua_schedule_event,
|
||||||
|
1, (void *)(ptrdiff_t)cb);
|
||||||
|
return 0;
|
||||||
|
}]]
|
||||||
|
|
||||||
|
local hl_query = [[
|
||||||
|
(ERROR) @ErrorMsg
|
||||||
|
|
||||||
|
"if" @keyword
|
||||||
|
"else" @keyword
|
||||||
|
"for" @keyword
|
||||||
|
"return" @keyword
|
||||||
|
|
||||||
|
"const" @type
|
||||||
|
"static" @type
|
||||||
|
"struct" @type
|
||||||
|
"enum" @type
|
||||||
|
"extern" @type
|
||||||
|
|
||||||
|
(string_literal) @string
|
||||||
|
|
||||||
|
(number_literal) @number
|
||||||
|
(char_literal) @string
|
||||||
|
|
||||||
|
; TODO(bfredl): overlapping matches are unreliable,
|
||||||
|
; we need a proper priority mechanism
|
||||||
|
;(type_identifier) @type
|
||||||
|
((type_identifier) @Special (eq? @Special "LuaRef"))
|
||||||
|
|
||||||
|
(primitive_type) @type
|
||||||
|
(sized_type_specifier) @type
|
||||||
|
|
||||||
|
((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) (eq? @WarningMsg.left @WarningMsg.right))
|
||||||
|
|
||||||
|
(comment) @comment
|
||||||
|
]]
|
||||||
|
|
||||||
|
local screen = Screen.new(65, 18)
|
||||||
|
screen:attach()
|
||||||
|
screen:set_default_attr_ids({
|
||||||
|
[1] = {bold = true, foreground = Screen.colors.Blue1},
|
||||||
|
[2] = {foreground = Screen.colors.Blue1},
|
||||||
|
[3] = {bold = true, foreground = Screen.colors.SeaGreen4},
|
||||||
|
[4] = {bold = true, foreground = Screen.colors.Brown},
|
||||||
|
[5] = {foreground = Screen.colors.Magenta},
|
||||||
|
[6] = {foreground = Screen.colors.Red},
|
||||||
|
[7] = {foreground = Screen.colors.SlateBlue},
|
||||||
|
[8] = {foreground = Screen.colors.Grey100, background = Screen.colors.Red},
|
||||||
|
[9] = {foreground = Screen.colors.Magenta, background = Screen.colors.Red},
|
||||||
|
[10] = {foreground = Screen.colors.Red, background = Screen.colors.Red},
|
||||||
|
|
||||||
|
})
|
||||||
|
|
||||||
|
insert(hl_text)
|
||||||
|
screen:expect{grid=[[
|
||||||
|
/// Schedule Lua callback on main loop's event queue |
|
||||||
|
static int nlua_schedule(lua_State *const lstate) |
|
||||||
|
{ |
|
||||||
|
if (lua_type(lstate, 1) != LUA_TFUNCTION |
|
||||||
|
|| lstate != lstate) { |
|
||||||
|
lua_pushliteral(lstate, "vim.schedule: expected function"); |
|
||||||
|
return lua_error(lstate); |
|
||||||
|
} |
|
||||||
|
|
|
||||||
|
LuaRef cb = nlua_ref(lstate, 1); |
|
||||||
|
|
|
||||||
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
|
||||||
|
1, (void *)(ptrdiff_t)cb); |
|
||||||
|
return 0; |
|
||||||
|
^} |
|
||||||
|
{1:~ }|
|
||||||
|
{1:~ }|
|
||||||
|
|
|
||||||
|
]]}
|
||||||
|
|
||||||
|
exec_lua([[
|
||||||
|
local TSHighlighter = vim.treesitter.TSHighlighter
|
||||||
|
local query = ...
|
||||||
|
test_hl = TSHighlighter.new(query, 0, "c")
|
||||||
|
]], hl_query)
|
||||||
|
screen:expect{grid=[[
|
||||||
|
{2:/// Schedule Lua callback on main loop's event queue} |
|
||||||
|
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
|
||||||
|
{ |
|
||||||
|
{4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
|
||||||
|
|| {6:lstate} != {6:lstate}) { |
|
||||||
|
lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
|
||||||
|
{4:return} lua_error(lstate); |
|
||||||
|
} |
|
||||||
|
|
|
||||||
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
||||||
|
|
|
||||||
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
|
||||||
|
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
|
||||||
|
{4:return} {5:0}; |
|
||||||
|
^} |
|
||||||
|
{1:~ }|
|
||||||
|
{1:~ }|
|
||||||
|
|
|
||||||
|
]]}
|
||||||
|
|
||||||
|
feed('7Go*/<esc>')
|
||||||
|
screen:expect{grid=[[
|
||||||
|
{2:/// Schedule Lua callback on main loop's event queue} |
|
||||||
|
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
|
||||||
|
{ |
|
||||||
|
{4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
|
||||||
|
|| {6:lstate} != {6:lstate}) { |
|
||||||
|
lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
|
||||||
|
{4:return} lua_error(lstate); |
|
||||||
|
{8:*^/} |
|
||||||
|
} |
|
||||||
|
|
|
||||||
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
||||||
|
|
|
||||||
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
|
||||||
|
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
|
||||||
|
{4:return} {5:0}; |
|
||||||
|
} |
|
||||||
|
{1:~ }|
|
||||||
|
|
|
||||||
|
]]}
|
||||||
|
|
||||||
|
feed('3Go/*<esc>')
|
||||||
|
screen:expect{grid=[[
|
||||||
|
{2:/// Schedule Lua callback on main loop's event queue} |
|
||||||
|
{3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
|
||||||
|
{ |
|
||||||
|
{2:/^*} |
|
||||||
|
{2: if (lua_type(lstate, 1) != LUA_TFUNCTION} |
|
||||||
|
{2: || lstate != lstate) {} |
|
||||||
|
{2: lua_pushliteral(lstate, "vim.schedule: expected function");} |
|
||||||
|
{2: return lua_error(lstate);} |
|
||||||
|
{2:*/} |
|
||||||
|
} |
|
||||||
|
|
|
||||||
|
{7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
|
||||||
|
|
|
||||||
|
multiqueue_put(main_loop.events, nlua_schedule_event, |
|
||||||
|
{5:1}, ({3:void} *)(ptrdiff_t)cb); |
|
||||||
|
{4:return} {5:0}; |
|
||||||
|
{8:}} |
|
||||||
|
|
|
||||||
|
]]}
|
||||||
|
end)
|
||||||
|
|
||||||
|
it('inspects language', function()
|
||||||
|
local keys, fields, symbols = unpack(exec_lua([[
|
||||||
|
local lang = vim.treesitter.inspect_language('c')
|
||||||
|
local keys, symbols = {}, {}
|
||||||
|
for k,_ in pairs(lang) do
|
||||||
|
keys[k] = true
|
||||||
end
|
end
|
||||||
eq({true,true}, {has_named,has_anonymous})
|
|
||||||
end)
|
-- symbols array can have "holes" and is thus not a valid msgpack array
|
||||||
|
-- but we don't care about the numbers here (checked in the parser test)
|
||||||
|
for _, v in pairs(lang.symbols) do
|
||||||
|
table.insert(symbols, v)
|
||||||
|
end
|
||||||
|
return {keys, lang.fields, symbols}
|
||||||
|
]]))
|
||||||
|
|
||||||
|
eq({fields=true, symbols=true}, keys)
|
||||||
|
|
||||||
|
local fset = {}
|
||||||
|
for _,f in pairs(fields) do
|
||||||
|
eq("string", type(f))
|
||||||
|
fset[f] = true
|
||||||
|
end
|
||||||
|
eq(true, fset["directive"])
|
||||||
|
eq(true, fset["initializer"])
|
||||||
|
|
||||||
|
local has_named, has_anonymous
|
||||||
|
for _,s in pairs(symbols) do
|
||||||
|
eq("string", type(s[1]))
|
||||||
|
eq("boolean", type(s[2]))
|
||||||
|
if s[1] == "for_statement" and s[2] == true then
|
||||||
|
has_named = true
|
||||||
|
elseif s[1] == "|=" and s[2] == false then
|
||||||
|
has_anonymous = true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
eq({true,true}, {has_named,has_anonymous})
|
||||||
end)
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
Loading…
Reference in New Issue