Initial commit

This commit is contained in:
Emilia Simmons 2021-04-11 01:07:45 -04:00
commit 8d6ef83961
4 changed files with 1771 additions and 0 deletions

7
.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
Cargo.lock
package-lock.json
node_modules
build
*.log
/examples/*/
/target/

1005
corpus/basic.tst Normal file

File diff suppressed because it is too large Load diff

434
grammar.js Normal file
View file

@ -0,0 +1,434 @@
// Tree-sitter grammar definition for Org documents.
//
// Newlines are significant: only the space character is listed in `extras`,
// so every rule that spans a line break consumes `$._eol` / `$._nl` itself.
// List structure, headline stars, section ends and markup openers are
// produced by the external scanner (see `externals`).
//
// Declared with `const` so that loading this file does not create an
// implicit global (which throws a ReferenceError under strict mode).
const org_grammar = {
  // EXTERNALS, INLINE =================================== {{{1
  name: 'org',
  extras: _ => [' '],  // Treat newlines explicitly
  // Tokens produced by the external scanner. The order of this list must
  // stay in sync with the `TokenType` enum in src/scanner.cc.
  externals: $ => [
    $._liststart,
    $._listend,
    $._listitemend,
    $._bullet,
    $.stars,
    $._sectionend,
    $._markup,
  ],
  // inline: $ => [$._word, $._numbers, $._junk],
  // inline: $ => [ $._activeStart, $._activeEnd, $._inactiveStart, $._inactiveEnd,
  // $._tsSeparator, $._ymd, $._dayname,],

  // PRECEDENCES, CONFLICT =============================== {{{1
  // Named precedence levels, each list ordered from highest to lowest.
  precedences: _ => [
    ['section', 'element', 'paragraph', 'textelement'],
    ['plan', 'textelement'],
    ['fn_definition', 'footnote'],
  ],
  // Ambiguities the parser is allowed to resolve at runtime: a markup
  // delimiter may turn out to be plain text or the start of a markup span.
  conflicts: $ => [
    [$._text, $.bold],
    [$._text, $.italic],
    [$._text, $.underline],
    [$._text, $.strikethrough],
    [$._text, $.code],
    [$._text, $.verbatim],
    [$.item],
    [$._lastitem],
  ],

  rules: {
    // DOCUMENT ============================================ {{{1
    // A document is an optional leading body followed by top-level sections.
    document: $ => seq(
      optional($.body),
      repeat($.section),
    ),

    // SECTIONS, BODY, PARAGRAPH =========================== {{{1
    // A section is a headline, optional planning line, optional property
    // drawer, optional body and nested sections, closed by the external
    // `_sectionend` token.
    section: $ => prec.dynamic(1, prec('section',
      seq(
        $.headline, $._eol,
        optional(seq(
          optional(seq($.plan, $._eol)),
          optional(seq($.property_drawer, $._eol)),
          optional($.body),
          repeat($.section),
        )),
        $._sectionend,
      ))),

    // `_eol` additionally accepts '\0'; `_nl` is a real line break only.
    _eol: _ => choice('\0', '\n', '\r'),
    _nl: _ => choice('\n', '\r'),

    // Either only blank lines, or elements/paragraphs separated by any
    // number of blank lines.
    body: $ => choice(
      repeat1($._eol),
      seq(
        repeat($._eol),
        repeat1(seq(
          choice(
            $._element,
            $.paragraph
          ),
          repeat($._eol),
        )),
      )),

    // One or more consecutive non-empty lines of text elements.
    paragraph: $ => prec.right('paragraph',
      repeat1(seq(
        repeat1($._textelement),
        $._eol)
      )),

    // ELEMENT AND TEXTELEMENT ============================= {{{1
    // Block-level constructs that may appear in a body.
    _element: $ => choice(
      $.drawer,
      $.comment,
      $.fndef,
      $.directive,
      $.list,
      $.block,
      $.dynamic_block,
      // $.table,
    ),

    // Inline constructs that may appear within a line of text.
    _textelement: $ => prec('textelement',
      choice(
        $._text,
        $.timestamp,
        $.footnote,
        $.link,
        $.bold,
        $.code,
        $.italic,
        $.verbatim,
        $.underline,
        $.strikethrough,
        // $.subscript
        // $.superscript
        // $.latexfragment
      )),

    // HEADLINES =========================================== {{{1
    headline: $ => seq(
      $.stars,
      $.item,
      optional($._taglist),
    ),

    item: $ => repeat1(choice($._text, ':')),

    // `:tag1:tag2:` at the end of a headline.
    _taglist: $ => prec.dynamic(1,  // otherwise just item
      seq(':',
        repeat1(seq(
          $.tag,
          token.immediate(':')
        )))),

    tag: _ => token.immediate(/[\p{L}\p{N}_@#%]+/),

    // `:NAME:` where NAME is one or more characters that are not Unicode
    // separators, line breaks or colons. The previous pattern, /:\p{Z}*:/,
    // only matched colons enclosing *separator* characters, so a real
    // property name such as `:ID:` could never match. Line breaks are
    // excluded explicitly because \p{Z} does not cover '\n' / '\r'.
    _propertyName: _ => /:[^\p{Z}\n\r:]+:/,

    property_drawer: $ => seq(
      ':PROPERTIES:', $._eol,
      repeat(prec.right(seq(optional($.property), repeat1($._eol)))),
      ':END:',
    ),

    property: $ => seq(
      $._propertyName,
      repeat($._text),
    ),

    // PLANNING ============================================ {{{1
    _scheduled: _ => 'SCHEDULED:',
    _deadline: _ => 'DEADLINE:',
    _closed: _ => 'CLOSED:',

    // A planning line: any mix of bare timestamps and keyword entries.
    plan: $ => repeat1(prec('plan',
      choice(
        $.timestamp,
        $.scheduled,
        $.deadline,
        $.closed,
      ))),

    scheduled: $ => seq($._scheduled, $.timestamp),
    deadline: $ => seq($._deadline, $.timestamp),

    // CLOSED only accepts inactive timestamps (or ranges of them).
    closed: $ => seq(
      $._closed,
      alias(choice(
        $._inactiveTimestamp,
        $._inactiveTimestampRange,
      ), $.timestamp),
    ),

    // TIMESTAMP =========================================== {{{1
    _activeStart: _ => '<',
    _activeEnd: _ => '>',
    _inactiveStart: _ => '[',
    _inactiveEnd: _ => ']',
    _tsSeparator: _ => '--',
    _ymd: _ => /\p{N}{1,4}-\p{N}{1,2}-\p{N}{1,4}/,
    time: _ => /\p{N}?\p{N}:\p{N}\p{N}/,
    repeater: _ => /[.+]?\+\p{N}+\p{L}/,
    delay: _ => /--?\p{N}+\p{L}/,

    // Date digits, optionally followed by a run of letters (e.g. a day name).
    date: $ => seq($._ymd, optional(/\p{L}+/)),

    timestamp: $ => choice(
      $._activeTimestamp,
      $._activeTimestampRange,
      $._inactiveTimestamp,
      $._inactiveTimestampRange,
    ),

    _activeTimestamp: $ => seq(
      $._activeStart,
      $.date,
      optional($.time),
      optional($.repeater),
      optional($.delay),
      $._activeEnd,
    ),

    _inactiveTimestamp: $ => seq(
      $._inactiveStart,
      $.date,
      optional($.time),
      optional($.repeater),
      optional($.delay),
      $._inactiveEnd,
    ),

    // Either `<ts>--<ts>` or a single timestamp with a `time-time` span.
    _activeTimestampRange: $ => choice(
      seq(
        alias($._activeTimestamp, $.timestamp),
        $._tsSeparator,
        alias($._activeTimestamp, $.timestamp)),
      seq(
        $._activeStart,
        $.date,
        $.time, '-', $.time,
        optional($.repeater),
        optional($.delay),
        $._activeEnd,
      )
    ),

    // NOTE(review): unlike `_activeTimestampRange`, the two halves of
    // `[ts]--[ts]` are not aliased to `$.timestamp` here, so the resulting
    // tree is flat. Confirm whether that asymmetry is intended before
    // changing it -- aliasing would alter the parse tree.
    _inactiveTimestampRange: $ => choice(
      seq($._inactiveTimestamp, $._tsSeparator, $._inactiveTimestamp),
      seq(
        $._inactiveStart,
        $.date,
        $.time, '-', $.time,
        optional($.repeater),
        optional($.delay),
        $._inactiveEnd,
      )
    ),

    // MARKUP ============================================== {{{1
    // Each rule is built by make_markup(); code and verbatim only contain
    // plain text, the others may nest further text elements.
    bold: make_markup('*'),
    italic: make_markup('/'),
    underline: make_markup('_'),
    strikethrough: make_markup('+'),
    code: make_markup('~', true),
    verbatim: make_markup('=', true),

    // LINK ================================================ {{{1
    _linkstart: _ => '[[',
    _linksep: _ => '][',
    _linkend: _ => ']]',

    // `[[uri][description]]` or `[[description]]`.
    link: $ => seq(
      $._linkstart,
      optional(seq(field('uri', $.linktext), $._linksep)),
      field('description', $.linktext),
      $._linkend,
    ),

    linktext: _ => /[^\]]*/,

    // FOOTNOTE ============================================ {{{1
    _fn_label: _ => /[^\p{Z}\[\]]+/,
    _fn: _ => '[fn:',

    // Footnote definition: `[fn:label]` followed by a paragraph.
    fndef: $ => prec('fn_definition',
      seq(
        $._fn,
        $._fn_label,
        ']',
        $.paragraph,
      )),

    // Footnote reference: `[fn:label]` or `[fn:label:text]` / `[fn::text]`.
    footnote: $ => prec('footnote',
      seq(
        $._fn,
        choice(
          $._fn_label,
          seq(optional($._fn_label), ':', repeat1($._fn_label)),
        ),
        ']',
      )),

    // DIRECTIVE =========================================== {{{1
    // `#+name: value` on a single line.
    directive: $ => seq(
      '#+',
      token.immediate(/[^\p{Z}:]+/),  // name
      token.immediate(':'),
      repeat($._text),
      $._eol,
    ),

    // COMMENTS ============================================ {{{1
    // Consecutive lines starting with "# " form one comment node.
    comment: $ => prec.right(repeat1(seq(
      '# ', repeat($._text), $._eol
    ))),

    // DRAWER ============================================== {{{1
    // `:NAME:` line, optional body, then `:END:`.
    drawer: $ => seq(
      ':',
      token.immediate(/[\p{L}\p{N}\p{Pd}\p{Pc}]+/),
      token.immediate(':'),
      $._eol,
      optional($.body),
      ':END:',
      $._eol,
    ),

    // BLOCK =============================================== {{{1
    block: $ => seq(
      '#+BEGIN_',
      alias($._name, $.name),
      optional($.parameters),
      $._nl,
      alias(
        repeat(seq(
          repeat($._textonly),
          $._nl,
        )),
        $.contents),
      '#+END_', $._name,
      repeat($._junk),  // FIXME
      $._eol,
    ),

    // \p{Z} does not include newlines, so '\n' and '\r' are excluded
    // explicitly to keep the name on one line.
    _name: _ => token.immediate(/[^\p{Z}\n\r]+/),

    // DYNAMIC BLOCK ======================================= {{{1
    dynamic_block: $ => prec(1, seq(  // FIXME why is this precedence required?
      '#+BEGIN:',
      optional(alias($._text, $.name)),
      optional($.parameters),
      // optional(alias(repeat1(/\S+/), $.parameters)),
      $._eol,
      alias(repeat(seq(
        repeat($._textonly),
        $._nl,
      )), $.contents),
      '#+END:',
      repeat($._junk),  // FIXME
      $._eol,
    )),

    parameters: $ => repeat1($._text),

    // LISTS =============================================== {{{1
    // `_liststart`, `_bullet`, `_listitemend` and `_listend` all come from
    // the external scanner. The final item is a separate rule because it is
    // terminated by `_listend` instead of `_listitemend`.
    list: $ => seq(
      $._liststart,
      repeat(seq($.listitem, optional($._eol))),
      alias($._lastitem, $.listitem),
    ),

    listitem: $ => seq(
      $._bullet,
      optional($._checkbox),
      optional($._itemtag),
      optional($._itemtext),
      $._listitemend,
      $._eol,
    ),

    _lastitem: $ => seq(
      $._bullet,
      optional($._checkbox),
      optional($._itemtag),
      optional($._itemtext),
      $._listend,
      optional($._eol),
    ),

    _checkbox: _ => /\[[ xX-]\]/,
    _itemtag: $ => seq(repeat($._textelement), '::'),

    // First line of text, then continuation lines (each preceded by one or
    // two line ends) that are either more text or a nested list.
    _itemtext: $ => seq(
      repeat1($._textelement),
      repeat(seq(
        $._eol,
        optional($._eol),
        choice(repeat1($._textelement), $.list)
      )),
    ),

    // TEXT ================================================ {{{1
    // TODO: inline word/numbers/junk. Causes precedence issues
    // A repeat would also be nice.
    _textonly: $ => choice($._word,
      $._numbers,
      $._junk,
    ),

    _text: $ => choice(
      $._word,
      $._numbers,
      $._junk,
      $._activeStart,  // Causes conflicts, so they get marked as text.
      $._inactiveStart,
      seq($._markup, '*'),
      seq($._markup, '/'),
      seq($._markup, '_'),
      seq($._markup, '+'),
      seq($._markup, '~'),
      seq($._markup, '='),
      '#',  // comment collision
    ),

    _word: _ => /\p{L}+/,
    _numbers: _ => /\p{N}+/,
    _junk: _ => /[^\p{Z}\p{L}\p{N}]/,
  }
};
// Build the rule for one inline markup span (bold, italic, ...). {{{1
//
// delim    - the single-character delimiter that opens and closes the span.
// textonly - when true the span may only contain `$._text`; otherwise it may
//            contain any `$._textelement` (and therefore nested markup).
//
// The span starts with the external `_markup` token followed by `delim`,
// may continue across line ends, and is closed by an immediately adjacent
// `delim`.
function make_markup(delim, textonly = false) {
  return $ => {
    const content = textonly ? $._text : $._textelement;
    const firstLine = repeat1(content);
    const continuationLines = repeat(seq($._eol, repeat1(content)));
    return prec.left(seq(
      $._markup,
      delim,
      firstLine,
      continuationLines,
      token.immediate(delim),
    ));
  };
}
// }}}
// Export the grammar built from the definition above. `grammar` is not
// defined in this file; presumably it is the global DSL function supplied by
// the tree-sitter CLI when it loads grammar.js.
module.exports = grammar(org_grammar);

325
src/scanner.cc Normal file
View file

@ -0,0 +1,325 @@
#include <tree_sitter/parser.h>
#include <vector>
#include <cwctype>
#include <cstring>
#include <cassert>
#include <stdio.h>
#include <iostream>
namespace {
using std::vector;
using std::iswspace;
// External tokens emitted by this scanner. {{{1
// The numeric values index into `valid_symbols` and are written to
// `lexer->result_symbol`; they follow the same order as the `externals` list
// in grammar.js, so the two must be kept in sync.
enum TokenType {
  LISTSTART   = 0,  // $._liststart
  LISTEND     = 1,  // $._listend
  LISTITEMEND = 2,  // $._listitemend
  BULLET      = 3,  // $._bullet
  HLSTARS     = 4,  // $.stars
  SECTIONEND  = 5,  // $._sectionend
  MARKUP      = 6   // $._markup
};
// Kinds of list bullet recognised by Scanner::getbullet(). {{{1
// NOTABULLET doubles as the bottom-of-stack sentinel in Scanner::bullet_stack.
enum Bullet {
  NOTABULLET = 0,  // anything else
  DASH       = 1,  // "- "
  PLUS       = 2,  // "+ "
  STAR       = 3,  // "* "
  LOWERDOT   = 4,  // "a. "
  UPPERDOT   = 5,  // "A. "
  LOWERPAREN = 6,  // "a) "
  UPPERPAREN = 7,  // "A) "
  NUMDOT     = 8,  // "1. "
  NUMPAREN   = 9   // "1) "
};
struct Scanner { // {{{1
vector<int16_t> indent_length_stack;
vector<int16_t> bullet_stack;
vector<int16_t> section_stack;
Scanner() {
deserialize(NULL, 0);
}
unsigned serialize(char *buffer) { // {{{1
size_t i = 0;
size_t indent_count = indent_length_stack.size() - 1;
if (indent_count > UINT8_MAX) indent_count = UINT8_MAX;
buffer[i++] = indent_count;
vector<int16_t>::iterator
iter = indent_length_stack.begin() + 1,
end = indent_length_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = bullet_stack.begin() + 1;
end = bullet_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = section_stack.begin() + 1;
end = section_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
return i;
}
void deserialize(const char *buffer, unsigned length) { // {{{1
section_stack.clear();
section_stack.push_back(0);
indent_length_stack.clear();
indent_length_stack.push_back(-1);
bullet_stack.clear();
bullet_stack.push_back(NOTABULLET);
if (length == 0) return;
size_t i = 0;
size_t indent_count = (uint8_t)buffer[i++];
for (; i <= indent_count ; i++) indent_length_stack.push_back(buffer[i]);
for (; i <= 2 * indent_count; i++) bullet_stack.push_back(buffer[i]);
for (; i < length ; i++) section_stack.push_back(buffer[i]);
}
void advance(TSLexer *lexer) { // {{{1
lexer->advance(lexer, false);
}
void skip(TSLexer *lexer) { // {{{1
lexer->advance(lexer, true);
}
bool dedent(TSLexer *lexer) { // {{{1
indent_length_stack.pop_back();
bullet_stack.pop_back();
lexer->result_symbol = LISTEND;
// std::cout << " == Dedent~" << std::endl;
return true;
}
Bullet getbullet(TSLexer *lexer) { // {{{1
if (lexer->lookahead == '-') {
skip(lexer);
if (iswspace(lexer->lookahead)) return DASH;
} else if (lexer->lookahead == '+') {
skip(lexer);
if (iswspace(lexer->lookahead)) return PLUS;
} else if (lexer->lookahead == '*') {
skip(lexer);
if (iswspace(lexer->lookahead)) return STAR;
} else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') {
skip(lexer);
if (lexer->lookahead == '.') {
skip(lexer);
if (iswspace(lexer->lookahead)) return LOWERDOT;
} else if (lexer->lookahead == ')') {
skip(lexer);
if (iswspace(lexer->lookahead)) return LOWERPAREN;
}
} else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') {
skip(lexer);
if (lexer->lookahead == '.') {
skip(lexer);
if (iswspace(lexer->lookahead)) return UPPERDOT;
} else if (lexer->lookahead == ')') {
skip(lexer);
if (iswspace(lexer->lookahead)) return UPPERPAREN;
}
} else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') {
do {
skip(lexer);
} while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
if (lexer->lookahead == '.') {
skip(lexer);
if (iswspace(lexer->lookahead)) return NUMDOT;
} else if (lexer->lookahead == ')') {
skip(lexer);
if (iswspace(lexer->lookahead)) return NUMPAREN;
}
}
return NOTABULLET;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) { // {{{1
// std::cout << " == " << valid_symbols[LISTSTART] << ", " << valid_symbols[LISTEND] << ", " << valid_symbols[LISTITEMEND] << ", " << valid_symbols[BULLET] << ", " << valid_symbols[HLSTARS] << ", " << valid_symbols[SECTIONEND] << ", " << valid_symbols[MARKUP] << std::endl;
if (valid_symbols[SECTIONEND] && lexer->lookahead == '\0' && section_stack.back() > 0) {
lexer->result_symbol = SECTIONEND;
section_stack.pop_back();
return true;
}
int16_t indent_length = 0;
// - Listiem ends {{{1
// Listend -> end of a line, looking for:
// 1. dedent
// 2. same indent, not a bullet
// 3. three eols
if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
int16_t newlines = 0;
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ' && newlines > 0) {
indent_length++;
} else if (lexer->lookahead == '\t' && newlines > 0) {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
return dedent(lexer);
} else if (lexer->lookahead == '\n') {
if (++newlines > 2) return dedent(lexer);
indent_length = 0;
} else {
break;
}
skip(lexer);
}
if (newlines == 0) return false;
if (indent_length < indent_length_stack.back()) {
return dedent(lexer);
} else if (indent_length == indent_length_stack.back()) {
if (getbullet(lexer) == bullet_stack.back()) {
// std::cout << " == Item end~" << std::endl;
lexer->result_symbol = LISTITEMEND;
return true;
}
return dedent(lexer);
}
return false;
}
// - Count whitespace {{{1
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\n') {
return false;
} else {
break;
}
skip(lexer);
}
// std::cout << " == indent: " << indent_length << " next: '" << char(lexer->lookahead) << "'" << std::endl;
// - Col=0 star {{{1
if (indent_length == 0 && lexer->lookahead == '*') {
lexer->mark_end(lexer);
int16_t stars = 1;
skip(lexer);
while (lexer->lookahead == '*') {
stars++;
skip(lexer);
}
if (valid_symbols[SECTIONEND] && stars <= section_stack.back()) {
section_stack.pop_back();
lexer->result_symbol = SECTIONEND;
// std::cout << " == Section End~" << std::endl;
return true;
} else if (valid_symbols[HLSTARS] && lexer->lookahead == ' ' || lexer->lookahead == '\t') {
section_stack.push_back(stars);
lexer->mark_end(lexer);
lexer->result_symbol = HLSTARS;
// std::cout << " == Stars~" << std::endl;
return true;
} else if (valid_symbols[MARKUP] && stars == 1 && (!iswspace(lexer->lookahead) && lexer->lookahead != '\0')) {
lexer->result_symbol = MARKUP;
// std::cout << " == Bold~" << std::endl;
return true;
}
return false;
}
// - Liststart and bullets {{{1
if (valid_symbols[LISTSTART] || valid_symbols[BULLET]) {
bool plus = lexer->lookahead == '+'; // requires special treatment, like *
Bullet bullet = getbullet(lexer);
// std::cout << " == bullet: " << bullet << " back indent: " << indent_length_stack.back() << std::endl;
// std::cout << " == il gt back: " << (indent_length > indent_length_stack.back()) << std::endl;
if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
lexer->mark_end(lexer);
lexer->result_symbol = BULLET;
// std::cout << " == Bullet~" << std::endl;
return true;
} else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
indent_length_stack.push_back(indent_length);
bullet_stack.push_back(bullet);
lexer->result_symbol = LISTSTART;
// std::cout << " == Liststart~" << std::endl;
return true;
} else if (valid_symbols[MARKUP] && bullet == NOTABULLET && plus) {
lexer->result_symbol = MARKUP;
// std::cout << " == Markup~" << std::endl;
return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
}
}
// - Markup {{{1
if (valid_symbols[MARKUP] && (indent_length > 0 || lexer->get_column(lexer) == 0)
&& (lexer->lookahead == '*'
|| lexer->lookahead == '/'
|| lexer->lookahead == '_'
|| lexer->lookahead == '+'
|| lexer->lookahead == '~'
|| lexer->lookahead == '=')) {
lexer->mark_end(lexer);
skip(lexer);
lexer->result_symbol = MARKUP;
// std::cout << " == Markup~" << std::endl;
return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
}
// - Default {{{1
// std::cout << " == False~" << std::endl;
return false;
}
};
}
extern "C" { // {{{1
void *tree_sitter_org_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
void tree_sitter_org_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}