Compare commits

...

10 commits

Author SHA1 Message Date
Amaan Qureshi
64cfbc213f
feat: rewrite the scanner in C (#40)
* feat: rewrite the scanner in C
* chore: update manifests & docs
2023-06-19 18:05:11 -04:00
Emilia Simmons
081179c52b fix: parse continued latex env as paragraph (close #38) 2022-10-21 23:23:29 -04:00
Emilia Simmons
eb1e080361 fix: tabs not included in regexes (close #36) 2022-09-12 12:12:18 -04:00
Emilia Simmons
698bb1a343 fix: freeze on error recovery (#34) 2022-08-16 11:52:06 -04:00
Emilia Simmons
76a70a8b35 chore: version bump 2022-08-12 12:08:38 -04:00
Emilia Simmons
a2c1946d78 chore: add more fields to tests 2022-06-28 19:36:39 -04:00
Emilia Simmons
bc8a040492 chore: add fields to tests 2022-06-27 11:07:56 -04:00
Emilia Simmons
031031fe6c fix: add alias to (expr) for block end name 2022-06-27 11:06:00 -04:00
Emilia Simmons
428fd9aeb9 fix: add (expr) to dynamic block end (#30) 2022-06-27 11:03:17 -04:00
Emilia Simmons
53a61b6b62 feat!: add (checkbox) in list 2022-06-27 10:47:50 -04:00
14 changed files with 78094 additions and 66515 deletions

View file

@ -1,7 +1,7 @@
[package]
name = "tree-sitter-org"
description = "org grammar for the tree-sitter parsing library"
version = "1.0.1"
version = "1.3.3"
keywords = ["incremental", "parsing", "org"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/milisims/tree-sitter-org"

View file

@ -23,7 +23,7 @@ Like in many regex systems, `*/+` is read as "0/1 or more", and `?` is 0 or 1.
## Example
``` org
```org
#+TITLE: Example
Some *marked up* words
@ -32,9 +32,9 @@ Some *marked up* words
<2020-06-07 Sun>
- list a
- [ ] list a
- [ ] list b
- [-] list a
- [ ] list b
- [x] list b
- list a
** Subsection :tag:
@ -43,6 +43,7 @@ Text
```
Parses as:
```
(document [0, 0] - [16, 0]
body: (body [0, 0] - [4, 0]
@ -75,24 +76,23 @@ Parses as:
(expr [7, 9] - [7, 10])))
(listitem [8, 2] - [11, 0]
bullet: (bullet [8, 2] - [8, 3])
contents: (paragraph [8, 4] - [9, 0]
(expr [8, 4] - [8, 5])
(expr [8, 6] - [8, 7])
checkbox: (checkbox [8, 4] - [8, 7]
status: (expr [8, 5] - [8, 6]))
contents: (paragraph [8, 8] - [9, 0]
(expr [8, 8] - [8, 12])
(expr [8, 13] - [8, 14]))
contents: (list [9, 0] - [11, 0]
(listitem [9, 4] - [10, 0]
bullet: (bullet [9, 4] - [9, 5])
contents: (paragraph [9, 6] - [10, 0]
(expr [9, 6] - [9, 7])
(expr [9, 8] - [9, 9])
checkbox: (checkbox [9, 6] - [9, 9])
contents: (paragraph [9, 10] - [10, 0]
(expr [9, 10] - [9, 14])
(expr [9, 15] - [9, 16])))
(listitem [10, 4] - [11, 0]
bullet: (bullet [10, 4] - [10, 5])
contents: (paragraph [10, 6] - [11, 0]
(expr [10, 6] - [10, 7])
(expr [10, 8] - [10, 9])
checkbox: (checkbox [10, 6] - [10, 9]
status: (expr [10, 7] - [10, 8]))
contents: (paragraph [10, 10] - [11, 0]
(expr [10, 10] - [10, 14])
(expr [10, 15] - [10, 16])))))
(listitem [11, 2] - [12, 0]
@ -118,13 +118,13 @@ For manual install, use `make`.
For neovim, using `nvim-treesitter/nvim-treesitter`, add to your configuration:
``` lua
```lua
local parser_config = require "nvim-treesitter.parsers".get_parser_configs()
parser_config.org = {
install_info = {
url = 'https://github.com/milisims/tree-sitter-org',
revision = 'main',
files = { 'src/parser.c', 'src/scanner.cc' },
files = { 'src/parser.c', 'src/scanner.c' },
},
filetype = 'org',
}

View file

@ -9,7 +9,7 @@
"sources": [
"src/parser.c",
"bindings/node/binding.cc",
"src/scanner.cc"
"src/scanner.c"
],
"cflags_c": [
"-std=c99",

View file

@ -2,7 +2,7 @@ fn main() {
let src_dir = std::path::Path::new("src");
let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config.include(src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
@ -10,29 +10,10 @@ fn main() {
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
// If your language uses an external scanner written in C,
// then include this block of code:
/*
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
// If your language uses an external scanner written in C++,
// then include this block of code:
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
}

File diff suppressed because it is too large Load diff

View file

@ -172,12 +172,12 @@ org_grammar = {
date: $ => /\p{N}{1,4}-\p{N}{1,4}-\p{N}{1,4}/,
_ts_element: $ => choice(
field('day', alias(/\p{L}[^\]>\p{Z}\n\r]*/, $.day)),
field('day', alias(/\p{L}[^\]>\p{Z}\t\n\r]*/, $.day)),
field('time', alias(/\p{N}?\p{N}[:.]\p{N}\p{N}( ?\p{L}{1,2})?/, $.time)),
field('duration', alias(/\p{N}?\p{N}[:.]\p{N}\p{N}( ?\p{L}{1,2})?-\p{N}?\p{N}[:.]\p{N}\p{N}( ?\p{L}{1,2})?/, $.duration)),
field('repeat', alias(/[.+]?\+\p{N}+\p{L}/, $.repeat)),
field('delay', alias(/--?\p{N}+\p{L}/, $.delay)),
alias(prec(-1, /[^\[<\]>\p{Z}\n\r]+/), $.expr),
alias(prec(-1, /[^\[<\]>\p{Z}\t\n\r]+/), $.expr),
),
paragraph: $ => seq(optional($._directive_list), $._multiline_text),
@ -186,7 +186,7 @@ org_grammar = {
optional($._directive_list),
seq(
caseInsensitive('[fn:'),
field('label', alias(/[^\p{Z}\n\r\]]+/, $.expr)),
field('label', alias(/[^\p{Z}\t\n\r\]]+/, $.expr)),
']',
),
field('description', alias($._multiline_text, $.description))
@ -222,7 +222,7 @@ org_grammar = {
$._nl,
optional(field('contents', $.contents)),
caseInsensitive('#+end_'),
$._immediate_expr,
field('end_name',alias($._immediate_expr, $.expr)),
$._eol,
),
@ -234,6 +234,7 @@ org_grammar = {
$._nl,
optional(field('contents', $.contents)),
caseInsensitive('#+end:'),
optional(field('end_name', $.expr)),
$._eol,
),
@ -246,12 +247,22 @@ org_grammar = {
listitem: $ => seq(
field('bullet', $.bullet),
optional(field('checkbox', $.checkbox)),
choice(
$._eof,
field('contents', $._body_contents),
),
),
checkbox: $ => choice(
'[ ]',
seq(
token(prec('non-immediate', '[')),
field('status', alias($._checkbox_status_expr, $.expr)),
token.immediate(prec('special', ']')),
),
),
table: $ => prec.right(seq(
optional($._directive_list),
repeat1(choice($.row, $.hr)),
@ -294,14 +305,12 @@ org_grammar = {
token.immediate('}'),
),
seq(
caseInsensitive('\\['),
$._nl,
token(seq(caseInsensitive('\\['), choice('\n', '\r'))),
optional(field('contents', $.contents)),
caseInsensitive('\\]'),
),
seq(
caseInsensitive('\\('),
$._nl,
token(seq(caseInsensitive('\\('), choice('\n', '\r'))),
optional(field('contents', $.contents)),
caseInsensitive('\\)'),
),
@ -324,6 +333,8 @@ org_grammar = {
_immediate_expr: $ => repeat1(expr('immediate', token.immediate)),
_noc_expr: $ => repeat1(expr('immediate', token.immediate, ':')),
_checkbox_status_expr: $ => expr('immediate', token.immediate, ']'),
_ts_expr: $ => seq(
expr('non-immediate', token, '>]'),
repeat(expr('immediate', token.immediate, '>]'))
@ -343,7 +354,10 @@ function expr(pr, tfunc, skip = '') {
...asciiSymbols.filter(c => !skip.includes(c)).map(c => tfunc(prec(pr, c))),
alias(tfunc(prec(pr, /\p{L}+/)), 'str'),
alias(tfunc(prec(pr, /\p{N}+/)), 'num'),
alias(tfunc(prec(pr, /[^\p{Z}\p{L}\p{N}\n\r]/)), 'sym'),
alias(tfunc(prec(pr, /[^\p{Z}\p{L}\p{N}\t\n\r]/)), 'sym'),
// for checkboxes: ugly, but makes them work..
// alias(tfunc(prec(pr, 'x')), 'str'),
// alias(tfunc(prec(pr, 'X')), 'str'),
)
}

View file

@ -1,6 +1,6 @@
{
"name": "tree-sitter-org",
"version": "1.0.1",
"version": "1.3.3",
"description": "Org grammar for tree-sitter",
"main": "bindings/node",
"keywords": [

View file

@ -61,10 +61,10 @@
(bullet) @OrgListBullet
; Get different colors for different statuses as follows
(listitem . (bullet) . (paragraph . (expr "[" "str" @OrgCheckDone "]") @OrgCheckbox (#match? @OrgCheckbox "^\[[xX]\]$")))
(listitem . (bullet) . (paragraph . (expr "[" "-" @OrgCheckInProgress "]") @OrgCheckbox (#eq? @OrgCheckbox "[-]")))
(listitem . (bullet) . (paragraph . (expr "[") @OrgCheckbox.left (#eq? @OrgCheckbox.left "[") . (expr "]") @OrgCheckbox.right (#eq? @OrgCheckbox.right "]")))
; (listitem . (bullet) . (paragraph (expr ":" ":") @OrgListDescriptionSeparator (#eq? @OrgListDescriptionSeparator "::"))) -- matches multiple, requires a special search.
(checkbox) @OrgCheckbox
(checkbox status: (expr "-") @OrgCheckInProgress)
(checkbox status: (expr "str") @OrgCheckDone (#any-of? @OrgCheckDone "x" "X"))
(checkbox status: (expr) @Error (#not-any-of? @Error "x" "X" "-"))
; If you want the ruler one color and the separators a different color,
; something like this would do it:

View file

@ -833,7 +833,7 @@
"type": "ALIAS",
"content": {
"type": "PATTERN",
"value": "\\p{L}[^\\]>\\p{Z}\\n\\r]*"
"value": "\\p{L}[^\\]>\\p{Z}\\t\\n\\r]*"
},
"named": true,
"value": "day"
@ -898,7 +898,7 @@
"value": -1,
"content": {
"type": "PATTERN",
"value": "[^\\[<\\]>\\p{Z}\\n\\r]+"
"value": "[^\\[<\\]>\\p{Z}\\t\\n\\r]+"
}
},
"named": true,
@ -961,7 +961,7 @@
"type": "ALIAS",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\n\\r\\]]+"
"value": "[^\\p{Z}\\t\\n\\r\\]]+"
},
"named": true,
"value": "expr"
@ -1250,8 +1250,17 @@
"value": "#+end_"
},
{
"type": "SYMBOL",
"name": "_immediate_expr"
"type": "FIELD",
"name": "end_name",
"content": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_immediate_expr"
},
"named": true,
"value": "expr"
}
},
{
"type": "SYMBOL",
@ -1331,6 +1340,22 @@
"named": false,
"value": "#+end:"
},
{
"type": "CHOICE",
"members": [
{
"type": "FIELD",
"name": "end_name",
"content": {
"type": "SYMBOL",
"name": "expr"
}
},
{
"type": "BLANK"
}
]
},
{
"type": "SYMBOL",
"name": "_eol"
@ -1405,6 +1430,22 @@
"name": "bullet"
}
},
{
"type": "CHOICE",
"members": [
{
"type": "FIELD",
"name": "checkbox",
"content": {
"type": "SYMBOL",
"name": "checkbox"
}
},
{
"type": "BLANK"
}
]
},
{
"type": "CHOICE",
"members": [
@ -1424,6 +1465,55 @@
}
]
},
"checkbox": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "[ ]"
},
{
"type": "SEQ",
"members": [
{
"type": "TOKEN",
"content": {
"type": "PREC",
"value": "non-immediate",
"content": {
"type": "STRING",
"value": "["
}
}
},
{
"type": "FIELD",
"name": "status",
"content": {
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_checkbox_status_expr"
},
"named": true,
"value": "expr"
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "special",
"content": {
"type": "STRING",
"value": "]"
}
}
}
]
}
]
},
"table": {
"type": "PREC_RIGHT",
"value": 0,
@ -1730,17 +1820,34 @@
"type": "SEQ",
"members": [
{
"type": "ALIAS",
"type": "TOKEN",
"content": {
"type": "PATTERN",
"value": "\\\\\\["
},
"named": false,
"value": "\\["
},
{
"type": "SYMBOL",
"name": "_nl"
"type": "SEQ",
"members": [
{
"type": "ALIAS",
"content": {
"type": "PATTERN",
"value": "\\\\\\["
},
"named": false,
"value": "\\["
},
{
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "\n"
},
{
"type": "STRING",
"value": "\r"
}
]
}
]
}
},
{
"type": "CHOICE",
@ -1773,17 +1880,34 @@
"type": "SEQ",
"members": [
{
"type": "ALIAS",
"type": "TOKEN",
"content": {
"type": "PATTERN",
"value": "\\\\\\("
},
"named": false,
"value": "\\("
},
{
"type": "SYMBOL",
"name": "_nl"
"type": "SEQ",
"members": [
{
"type": "ALIAS",
"content": {
"type": "PATTERN",
"value": "\\\\\\("
},
"named": false,
"value": "\\("
},
{
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "\n"
},
{
"type": "STRING",
"value": "\r"
}
]
}
]
}
},
{
"type": "CHOICE",
@ -2317,7 +2441,7 @@
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
@ -2714,7 +2838,7 @@
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
@ -2724,6 +2848,400 @@
]
}
},
"_checkbox_status_expr": {
"type": "CHOICE",
"members": [
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "!"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "\""
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "#"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "$"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "%"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "&"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "'"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "("
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": ")"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "*"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "+"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": ","
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "-"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "."
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "/"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": ":"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": ";"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "<"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "="
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": ">"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "?"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "@"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "["
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "\\"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "^"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "_"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "`"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "{"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "|"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "}"
}
}
},
{
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "STRING",
"value": "~"
}
}
},
{
"type": "ALIAS",
"content": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "\\p{L}+"
}
}
},
"named": false,
"value": "str"
},
{
"type": "ALIAS",
"content": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "\\p{N}+"
}
}
},
"named": false,
"value": "num"
},
{
"type": "ALIAS",
"content": {
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PREC",
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
"named": false,
"value": "sym"
}
]
},
"_ts_expr": {
"type": "SEQ",
"members": [
@ -3101,7 +3619,7 @@
"value": "non-immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
@ -3486,7 +4004,7 @@
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
@ -3897,7 +4415,7 @@
"value": "non-immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},
@ -4304,7 +4822,7 @@
"value": "immediate",
"content": {
"type": "PATTERN",
"value": "[^\\p{Z}\\p{L}\\p{N}\\n\\r]"
"value": "[^\\p{Z}\\p{L}\\p{N}\\t\\n\\r]"
}
}
},

View file

@ -23,6 +23,16 @@
}
]
},
"end_name": {
"multiple": false,
"required": true,
"types": [
{
"type": "expr",
"named": true
}
]
},
"name": {
"multiple": false,
"required": true,
@ -119,6 +129,22 @@
}
}
},
{
"type": "checkbox",
"named": true,
"fields": {
"status": {
"multiple": false,
"required": false,
"types": [
{
"type": "expr",
"named": true
}
]
}
}
},
{
"type": "comment",
"named": true,
@ -276,6 +302,16 @@
}
]
},
"end_name": {
"multiple": false,
"required": false,
"types": [
{
"type": "expr",
"named": true
}
]
},
"name": {
"multiple": false,
"required": true,
@ -523,6 +559,16 @@
}
]
},
"checkbox": {
"multiple": false,
"required": false,
"types": [
{
"type": "checkbox",
"named": true
}
]
},
"contents": {
"multiple": true,
"required": false,
@ -1052,6 +1098,10 @@
"type": "[",
"named": false
},
{
"type": "[ ]",
"named": false
},
{
"type": "[%%",
"named": false
@ -1064,18 +1114,10 @@
"type": "\\",
"named": false
},
{
"type": "\\(",
"named": false
},
{
"type": "\\)",
"named": false
},
{
"type": "\\[",
"named": false
},
{
"type": "\\]",
"named": false

141901
src/parser.c

File diff suppressed because it is too large Load diff

342
src/scanner.c Normal file
View file

@ -0,0 +1,342 @@
#include <assert.h>
#include <stdio.h>
#include <tree_sitter/parser.h>
#include <wctype.h>
/* Larger of two values. Each argument may be evaluated more than once. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * Minimal growable-array helpers. `vec` must be a pointer to a struct with
 * `len`, `cap` and `data` members. The statement-like macros are wrapped in
 * `do { ... } while (0)` so that each one expands to exactly one statement
 * and can be used safely in an unbraced `if`/`else` body.
 */

/* Reallocate the storage of `vec` so it can hold `_cap` elements. */
#define VEC_RESIZE(vec, _cap) \
    do { \
        (vec)->data = realloc((vec)->data, (_cap) * sizeof((vec)->data[0])); \
        assert((vec)->data != NULL); \
        (vec)->cap = (_cap); \
    } while (0)

/* Append `el`, growing the storage (at least 16 slots, else doubling) when full. */
#define VEC_PUSH(vec, el) \
    do { \
        if ((vec)->cap == (vec)->len) { \
            VEC_RESIZE((vec), MAX(16, (vec)->len * 2)); \
        } \
        (vec)->data[(vec)->len++] = (el); \
    } while (0)

/* Drop the last element. No emptiness check is performed. */
#define VEC_POP(vec) ((vec)->len--)

/* The last element. Only meaningful while `len` is non-zero. */
#define VEC_BACK(vec) ((vec)->data[(vec)->len - 1])

/* Release the element storage (the struct itself is left alone). */
#define VEC_FREE(vec) \
    do { \
        if ((vec)->data != NULL) \
            free((vec)->data); \
    } while (0)

/* Forget all elements but keep the allocated storage. */
#define VEC_CLEAR(vec) \
    do { \
        (vec)->len = 0; \
    } while (0)
// External tokens produced by this scanner. The values are used both as
// indices into `valid_symbols` and as the value stored in
// `lexer->result_symbol`.
// NOTE(review): presumably the order must match the `externals` list of the
// grammar - confirm before reordering.
enum TokenType {
// A bullet was found at a deeper indentation than the innermost open list.
LISTSTART,
// The innermost list is closed (see dedent) or the input ended inside a list.
LISTEND,
// Another item with the same bullet kind follows at the same indentation.
LISTITEMEND,
// A bullet of the current list's kind at the current list's indentation.
BULLET,
// A run of '*' at column 0 followed by whitespace; opens a section.
HLSTARS,
// A headline at the same or a shallower level follows, or the input ended.
SECTIONEND,
// End of input when neither LISTEND nor SECTIONEND is valid.
ENDOFFILE,
};
// Kinds of list bullets recognised by getbullet(). Every kind requires a
// whitespace character right after the marker. The numeric values are stored
// in `bullet_stack` and written out by serialize().
typedef enum {
// No bullet found; also the sentinel at the bottom of `bullet_stack`.
NOTABULLET,
// "-"
DASH,
// "+"
PLUS,
// "*"
STAR,
// one ASCII lowercase letter followed by "."
LOWERDOT,
// one ASCII uppercase letter followed by "."
UPPERDOT,
// one ASCII lowercase letter followed by ")"
LOWERPAREN,
// one ASCII uppercase letter followed by ")"
UPPERPAREN,
// one or more ASCII digits followed by "."
NUMDOT,
// one or more ASCII digits followed by ")"
NUMPAREN,
} Bullet;
// Growable array of int16_t values, manipulated through the VEC_* macros.
typedef struct {
// Number of elements currently stored.
uint32_t len;
// Number of elements `data` has room for.
uint32_t cap;
// Element storage; NULL until the first VEC_PUSH/VEC_RESIZE.
int16_t *data;
} stack;
// Persistent scanner state. Each stack keeps a sentinel at index 0 that
// deserialize() installs and serialize() does not write out.
typedef struct {
// Indentation of every open list; sentinel -1.
stack *indent_length_stack;
// Bullet kind of every open list; sentinel NOTABULLET.
stack *bullet_stack;
// Star count of every open headline; sentinel 0.
stack *section_stack;
} Scanner;
/*
 * Move the lexer forward by one character with the `skip` flag cleared
 * (per the tree-sitter lexer API the character then counts as token text).
 */
static inline void advance(TSLexer *lexer) {
    const bool skip_char = false;
    lexer->advance(lexer, skip_char);
}
/*
 * Move the lexer forward by one character with the `skip` flag set
 * (per the tree-sitter lexer API the character is then treated as
 * whitespace preceding the token).
 */
static inline void skip(TSLexer *lexer) {
    const bool skip_char = true;
    lexer->advance(lexer, skip_char);
}
/*
 * Write the scanner state into `buffer` and return the number of bytes used.
 *
 * Layout: one count byte N, then N indent lengths, then N bullet kinds, then
 * the open section levels until the buffer is full. The sentinel stored at
 * index 0 of each stack is not written; deserialize() re-creates it.
 *
 * N is capped at UINT8_MAX and exactly N indent and bullet entries are
 * written. Previously the count byte was capped but every entry was still
 * emitted, so with more than 255 nested lists deserialize() split the bytes
 * at the wrong offsets.
 *
 * Every value is narrowed from int16_t to a single char.
 */
unsigned serialize(Scanner *scanner, char *buffer) {
    const stack *indents = scanner->indent_length_stack;
    const stack *bullets = scanner->bullet_stack;
    const stack *sections = scanner->section_stack;
    size_t i = 0;

    size_t indent_count = indents->len > 0 ? indents->len - 1 : 0;
    if (indent_count > UINT8_MAX)
        indent_count = UINT8_MAX;
    buffer[i++] = (char)indent_count;

    /* Index 0 holds the sentinel, so real entries start at 1. */
    for (uint32_t iter = 1;
         iter <= indent_count && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
         ++iter) {
        buffer[i++] = (char)indents->data[iter];
    }

    for (uint32_t iter = 1;
         iter <= indent_count && iter < bullets->len &&
         i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
         ++iter) {
        buffer[i++] = (char)bullets->data[iter];
    }

    for (uint32_t iter = 1;
         iter < sections->len && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
         ++iter) {
        buffer[i++] = (char)sections->data[iter];
    }

    return (unsigned)i;
}
/*
 * Rebuild the scanner state from `length` bytes previously produced by
 * serialize(). The stacks are always reset to their sentinels first
 * (section 0, indent -1, bullet NOTABULLET); with `length == 0` that is all
 * that happens and `buffer` is not touched.
 *
 * Expected layout: count byte N, N indent lengths, N bullet kinds, then
 * section levels up to `length`.
 */
void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->section_stack);
    VEC_PUSH(scanner->section_stack, 0);
    VEC_CLEAR(scanner->indent_length_stack);
    VEC_PUSH(scanner->indent_length_stack, -1);
    VEC_CLEAR(scanner->bullet_stack);
    VEC_PUSH(scanner->bullet_stack, NOTABULLET);

    if (length == 0)
        return;

    size_t i = 0;
    size_t indent_count = (uint8_t)buffer[i++];

    /*
     * Never trust the count byte beyond what the buffer can actually hold:
     * N indents plus N bullets must fit in the `length - 1` remaining bytes.
     * This keeps both reads in bounds and the two list stacks the same size.
     */
    const size_t max_pairs = ((size_t)length - 1) / 2;
    if (indent_count > max_pairs)
        indent_count = max_pairs;

    for (; i <= indent_count; i++) {
        VEC_PUSH(scanner->indent_length_stack, buffer[i]);
    }
    for (; i <= 2 * indent_count; i++) {
        VEC_PUSH(scanner->bullet_stack, buffer[i]);
    }
    for (; i < length; i++) {
        VEC_PUSH(scanner->section_stack, buffer[i]);
    }
}
/*
 * Close the innermost list: report LISTEND and discard the top entry of both
 * the bullet stack and the indent stack. Always returns true.
 */
static bool dedent(Scanner *scanner, TSLexer *lexer) {
    lexer->result_symbol = LISTEND;
    VEC_POP(scanner->bullet_stack);
    VEC_POP(scanner->indent_length_stack);
    return true;
}
/*
 * True when every external token (LISTSTART through ENDOFFILE) is marked
 * valid at once, which is how tree-sitter signals error recovery.
 */
static bool in_error_recovery(const bool *valid_symbols) {
    for (int symbol = LISTSTART; symbol <= ENDOFFILE; symbol++) {
        if (!valid_symbols[symbol])
            return false;
    }
    return true;
}
/*
 * Finish an ordered bullet whose counter (letter or digits) has already been
 * consumed: expects "." or ")" and then a whitespace character.
 */
static Bullet bullet_after_counter(TSLexer *lexer, Bullet dot_kind,
                                   Bullet paren_kind) {
    Bullet candidate;
    if (lexer->lookahead == '.') {
        candidate = dot_kind;
    } else if (lexer->lookahead == ')') {
        candidate = paren_kind;
    } else {
        return NOTABULLET;
    }
    advance(lexer);
    return iswspace(lexer->lookahead) ? candidate : NOTABULLET;
}

/*
 * Try to read a list bullet at the current position and return its kind, or
 * NOTABULLET. The lexer is advanced over the characters that were examined
 * even when no bullet is recognised; the whitespace that must follow the
 * marker is only peeked at, never consumed.
 */
Bullet getbullet(TSLexer *lexer) {
    const int32_t first = lexer->lookahead;

    /* Unordered bullets: a single marker character. */
    Bullet plain = NOTABULLET;
    switch (first) {
    case '-':
        plain = DASH;
        break;
    case '+':
        plain = PLUS;
        break;
    case '*':
        plain = STAR;
        break;
    default:
        break;
    }
    if (plain != NOTABULLET) {
        advance(lexer);
        return iswspace(lexer->lookahead) ? plain : NOTABULLET;
    }

    /* Ordered bullets: one letter, or a run of digits, then "." or ")". */
    if ('a' <= first && first <= 'z') {
        advance(lexer);
        return bullet_after_counter(lexer, LOWERDOT, LOWERPAREN);
    }
    if ('A' <= first && first <= 'Z') {
        advance(lexer);
        return bullet_after_counter(lexer, UPPERDOT, UPPERPAREN);
    }
    if ('0' <= first && first <= '9') {
        do {
            advance(lexer);
        } while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
        return bullet_after_counter(lexer, NUMDOT, NUMPAREN);
    }

    return NOTABULLET;
}
// Main scanning routine. Looks at the upcoming input and the set of tokens
// the parser would accept (`valid_symbols`) and tries to produce one external
// token. Returns true with `lexer->result_symbol` set on success, false when
// no external token applies. May push to or pop from the scanner's stacks.
bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
// Produce nothing while the parser is recovering from an error.
if (in_error_recovery(valid_symbols))
return false;
// - Section ends
// Measure the indentation of the current line: a space counts 1, a tab 8.
int16_t indent_length = 0;
// Mark the token end before anything is consumed; mark_end is only called
// again in the "Col=0 star" branch (before the stars) and for BULLET.
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
// End of input: close whatever is open, preferring lists over sections.
// NOTE(review): LISTEND is reported here without popping the list stacks,
// unlike dedent() - confirm this is intentional.
if (valid_symbols[LISTEND]) {
lexer->result_symbol = LISTEND;
} else if (valid_symbols[SECTIONEND]) {
lexer->result_symbol = SECTIONEND;
} else if (valid_symbols[ENDOFFILE]) {
lexer->result_symbol = ENDOFFILE;
} else
return false;
return true;
} else {
break;
}
skip(lexer);
}
// - Listitem ends
// Listend -> end of a line, looking for:
// 1. dedent
// 2. same indent, but not the bullet kind of the current list
// 3. two eols
int16_t newlines = 0;
if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
return dedent(scanner, lexer);
} else if (lexer->lookahead == '\n') {
// A second newline (blank line) ends the list.
if (++newlines > 1)
return dedent(scanner, lexer);
// Indentation is counted from the start of the new line.
indent_length = 0;
} else {
break;
}
skip(lexer);
}
if (indent_length < VEC_BACK(scanner->indent_length_stack)) {
return dedent(scanner, lexer);
} else if (indent_length == VEC_BACK(scanner->indent_length_stack)) {
// Same indentation: a matching bullet ends only the item, anything else
// ends the whole list. getbullet() advances the lexer while checking.
if (getbullet(lexer) == VEC_BACK(scanner->bullet_stack)) {
lexer->result_symbol = LISTITEMEND;
return true;
}
return dedent(scanner, lexer);
}
// Deeper indentation falls through to the checks below.
}
// - Col=0 star
if (indent_length == 0 && lexer->lookahead == '*') {
lexer->mark_end(lexer);
// Count the run of stars; `stars` starts at 1, so `stars > 0` below always
// holds.
int16_t stars = 1;
skip(lexer);
while (lexer->lookahead == '*') {
stars++;
skip(lexer);
}
// A headline no deeper than the innermost open section closes that section;
// otherwise it opens a new one. Either way whitespace must follow the stars.
if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) &&
stars > 0 && stars <= VEC_BACK(scanner->section_stack)) {
VEC_POP(scanner->section_stack);
lexer->result_symbol = SECTIONEND;
return true;
} else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) {
VEC_PUSH(scanner->section_stack, stars);
lexer->result_symbol = HLSTARS;
return true;
}
return false;
}
// - Liststart and bullets
// Only attempted when no newline was skipped in the list-end scan above.
if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) {
Bullet bullet = getbullet(lexer);
if (valid_symbols[BULLET] &&
bullet == VEC_BACK(scanner->bullet_stack) &&
indent_length == VEC_BACK(scanner->indent_length_stack)) {
// Bullet of the current list: extend the token end past what getbullet()
// consumed.
lexer->mark_end(lexer);
lexer->result_symbol = BULLET;
return true;
} else if (valid_symbols[LISTSTART] && bullet != NOTABULLET &&
indent_length > VEC_BACK(scanner->indent_length_stack)) {
// Any bullet at a deeper indentation opens a new (nested) list; remember
// its indentation and bullet kind.
VEC_PUSH(scanner->indent_length_stack, indent_length);
VEC_PUSH(scanner->bullet_stack, bullet);
lexer->result_symbol = LISTSTART;
return true;
}
}
return false; // default
}
/*
 * Allocate a scanner with three empty stacks and initialise them to their
 * sentinel-only state via deserialize(). Ownership passes to the caller and
 * is released by tree_sitter_org_external_scanner_destroy().
 *
 * Allocation failures are caught with assert(), matching VEC_RESIZE, rather
 * than dereferencing a NULL pointer.
 */
void *tree_sitter_org_external_scanner_create(void) {
    Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner));
    assert(scanner != NULL);

    scanner->indent_length_stack = (stack *)calloc(1, sizeof(stack));
    scanner->bullet_stack = (stack *)calloc(1, sizeof(stack));
    scanner->section_stack = (stack *)calloc(1, sizeof(stack));
    assert(scanner->indent_length_stack != NULL);
    assert(scanner->bullet_stack != NULL);
    assert(scanner->section_stack != NULL);

    /* A zero-length buffer just installs the sentinels. */
    deserialize(scanner, NULL, 0);
    return scanner;
}
/* Tree-sitter entry point: forwards to scan() with the payload as Scanner. */
bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer,
                                           const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
unsigned tree_sitter_org_external_scanner_serialize(void *payload,
char *buffer) {
Scanner *scanner = (Scanner *)payload;
return serialize(scanner, buffer);
}
void tree_sitter_org_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
Scanner *scanner = (Scanner *)payload;
deserialize(scanner, buffer, length);
}
void tree_sitter_org_external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
VEC_FREE(scanner->indent_length_stack);
VEC_FREE(scanner->bullet_stack);
VEC_FREE(scanner->section_stack);
free(scanner->indent_length_stack);
free(scanner->bullet_stack);
free(scanner->section_stack);
free(scanner);
}

View file

@ -1,279 +0,0 @@
#include <tree_sitter/parser.h>
#include <vector>
#include <cwctype>
namespace {
using std::vector;
using std::iswspace;
// External tokens produced by the scanner. The values index `valid_symbols`
// and are stored in `lexer->result_symbol`.
// NOTE(review): presumably the order must match the `externals` list of the
// grammar - confirm before reordering.
enum TokenType {
// A bullet was found at a deeper indentation than the innermost open list.
LISTSTART,
// The innermost list is closed, or the input ended inside a list.
LISTEND,
// Another item with the same bullet kind follows at the same indentation.
LISTITEMEND,
// A bullet of the current list's kind at the current list's indentation.
BULLET,
// A run of '*' at column 0 followed by whitespace; opens a section.
HLSTARS,
// A headline at the same or a shallower level follows, or the input ended.
SECTIONEND,
// End of input when neither LISTEND nor SECTIONEND is valid.
ENDOFFILE,
};
// Kinds of list bullets recognised by Scanner::getbullet(). Every kind
// requires a whitespace character right after the marker. The numeric values
// are stored in `bullet_stack` and written out by Scanner::serialize().
enum Bullet {
// No bullet found; also the sentinel at the bottom of `bullet_stack`.
NOTABULLET,
// "-"
DASH,
// "+"
PLUS,
// "*"
STAR,
// one ASCII lowercase letter followed by "."
LOWERDOT,
// one ASCII uppercase letter followed by "."
UPPERDOT,
// one ASCII lowercase letter followed by ")"
LOWERPAREN,
// one ASCII uppercase letter followed by ")"
UPPERPAREN,
// one or more ASCII digits followed by "."
NUMDOT,
// one or more ASCII digits followed by ")"
NUMPAREN,
};
struct Scanner {
vector<int16_t> indent_length_stack;
vector<int16_t> bullet_stack;
vector<int16_t> section_stack;
Scanner() {
deserialize(NULL, 0);
}
unsigned serialize(char *buffer) {
size_t i = 0;
size_t indent_count = indent_length_stack.size() - 1;
if (indent_count > UINT8_MAX) indent_count = UINT8_MAX;
buffer[i++] = indent_count;
vector<int16_t>::iterator
iter = indent_length_stack.begin() + 1,
end = indent_length_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = bullet_stack.begin() + 1;
end = bullet_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = section_stack.begin() + 1;
end = section_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
return i;
}
void deserialize(const char *buffer, unsigned length) {
section_stack.clear();
section_stack.push_back(0);
indent_length_stack.clear();
indent_length_stack.push_back(-1);
bullet_stack.clear();
bullet_stack.push_back(NOTABULLET);
if (length == 0) return;
size_t i = 0;
size_t indent_count = (uint8_t)buffer[i++];
for (; i <= indent_count ; i++) indent_length_stack.push_back(buffer[i]);
for (; i <= 2 * indent_count; i++) bullet_stack.push_back(buffer[i]);
for (; i < length ; i++) section_stack.push_back(buffer[i]);
}
void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
bool dedent(TSLexer *lexer) {
indent_length_stack.pop_back();
bullet_stack.pop_back();
lexer->result_symbol = LISTEND;
return true;
}
Bullet getbullet(TSLexer *lexer) {
if (lexer->lookahead == '-') {
advance(lexer);
if (iswspace(lexer->lookahead)) return DASH;
} else if (lexer->lookahead == '+') {
advance(lexer);
if (iswspace(lexer->lookahead)) return PLUS;
} else if (lexer->lookahead == '*') {
advance(lexer);
if (iswspace(lexer->lookahead)) return STAR;
} else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return LOWERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return LOWERPAREN;
}
} else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return UPPERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return UPPERPAREN;
}
} else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') {
do {
advance(lexer);
} while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return NUMDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return NUMPAREN;
}
}
return NOTABULLET;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
// - Section ends
int16_t indent_length = 0;
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
if (valid_symbols[LISTEND]) { lexer->result_symbol = LISTEND; }
else if (valid_symbols[SECTIONEND]) { lexer->result_symbol = SECTIONEND; }
else if (valid_symbols[ENDOFFILE]) { lexer->result_symbol = ENDOFFILE; }
else return false;
return true;
} else {
break;
}
skip(lexer);
}
// - Listiem ends
// Listend -> end of a line, looking for:
// 1. dedent
// 2. same indent, not a bullet
// 3. two eols
int16_t newlines = 0;
if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
return dedent(lexer);
} else if (lexer->lookahead == '\n') {
if (++newlines > 1) return dedent(lexer);
indent_length = 0;
} else {
break;
}
skip(lexer);
}
if (indent_length < indent_length_stack.back()) {
return dedent(lexer);
} else if (indent_length == indent_length_stack.back()) {
if (getbullet(lexer) == bullet_stack.back()) {
lexer->result_symbol = LISTITEMEND;
return true;
}
return dedent(lexer);
}
}
// - Col=0 star
if (indent_length == 0 && lexer->lookahead == '*') {
lexer->mark_end(lexer);
int16_t stars = 1;
skip(lexer);
while (lexer->lookahead == '*') {
stars++;
skip(lexer);
}
if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) && stars > 0 && stars <= section_stack.back()) {
section_stack.pop_back();
lexer->result_symbol = SECTIONEND;
return true;
} else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) {
section_stack.push_back(stars);
lexer->result_symbol = HLSTARS;
return true;
}
return false;
}
// - Liststart and bullets
if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) {
Bullet bullet = getbullet(lexer);
if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
lexer->mark_end(lexer);
lexer->result_symbol = BULLET;
return true;
} else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
indent_length_stack.push_back(indent_length);
bullet_stack.push_back(bullet);
lexer->result_symbol = LISTSTART;
return true;
}
}
return false; // default
}
};
}
extern "C" {
void *tree_sitter_org_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
void tree_sitter_org_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}

View file

@ -123,6 +123,7 @@ struct TSLanguage {
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
const TSStateId *primary_state_ids;
};
/*