From 8d6ef8396174236e09edae7070a3883d0e8e0bca Mon Sep 17 00:00:00 2001 From: Emilia Simmons Date: Sun, 11 Apr 2021 01:07:45 -0400 Subject: [PATCH] Initial commit --- .gitignore | 7 + corpus/basic.tst | 1005 ++++++++++++++++++++++++++++++++++++++++++++++ grammar.js | 434 ++++++++++++++++++++ src/scanner.cc | 325 +++++++++++++++ 4 files changed, 1771 insertions(+) create mode 100644 .gitignore create mode 100644 corpus/basic.tst create mode 100644 grammar.js create mode 100644 src/scanner.cc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c997138 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +Cargo.lock +package-lock.json +node_modules +build +*.log +/examples/*/ +/target/ diff --git a/corpus/basic.tst b/corpus/basic.tst new file mode 100644 index 0000000..a3c8f6d --- /dev/null +++ b/corpus/basic.tst @@ -0,0 +1,1005 @@ +============== +Headlines.1a - No eols +============== +* l1 +---------- + +(document (section (headline (stars) (item)))) + +============== +Headlines.1b - pre eol +============== + +* l1 +---------- + +(document (body) (section (headline (stars) (item)))) + +============== +Headlines.1c - Post eols (body) +============== +* l1 + + +---------- + +(document (section (headline (stars) (item)) (body))) + +============== +Headlines.1d - More eols +============== + +* l1 + + +---------- + +(document (body) (section (headline (stars) (item)) (body))) + +============== +Headlines.2 - level 2 +============== +** l2 + +---------- + +(document (section (headline (stars) (item)))) + +============== +Headlines.3 - Two sections +============== +* l1 +* l1 + +---------- + +(document + (section (headline (stars) (item))) + (section (headline (stars) (item))) + ) + +============== +Headlines.3a - Two sections, eol +============== +* l1 + +* l1 +---------- + +(document + (section (headline (stars) (item)) (body)) + (section (headline (stars) (item))) + ) + +============== +Headlines.4 - Subsection +============== +* l1 +** l2 + +---------- + 
+(document + (section + (headline (stars) (item)) + (section (headline (stars) (item)))) + ) + +============== +Headlines.4a - Subsection eols +============== +* l1 + +** l2 + +---------- + +(document + (section + (headline (stars) (item)) + (body) + (section (headline (stars) (item))) + )) + +============== +Headlines.5 - Subsection & continued section +============== +* l1 +** l2 +* l1 +---------- + +(document + (section + (headline (stars) (item)) + (section (headline (stars) (item)))) + (section (headline (stars) (item))) + ) + +============== +Headlines.6 - Top high level section +============== +*** l3 +* l1 +---------- + +(document + (section (headline (stars) (item))) + (section (headline (stars) (item))) + ) + +============== +Headlines.7a - Item/tag conflict (:) +============== +* a: b +---------- + +(document (section (headline (stars) (item)))) + +============== +Headlines.7b - Item/tag conflict (:) +============== +* a: b: +---------- + +(document (section (headline (stars) (item)))) + +============== +Headlines.8 - Tag +============== +* a :b: +---------- + +(document (section (headline (stars) (item) (tag)))) + +========== +Body.1 +========== +a +---------- + +(document (body (paragraph))) + +========== +Body.2 +========== + +# a +---------- + +(document (body (comment))) + +========== +Body.3 +========== +a + +---------- + +(document (body (paragraph))) + +========== +Body.4 +========== + +a + +---------- + +(document (body (paragraph))) + +========== +Body.5 +========== +a +---------- + +(document (body (paragraph))) + +========== +Body.6 +========== +a + +a +---------- + +(document (body (paragraph) (paragraph))) + +========== +Body.7 +========== + +a + +# a +---------- + +(document (body (paragraph) (comment))) + +========== +Body.8 +========== +# a + +a + +---------- + +(document (body (comment) (paragraph))) + +========== +Body.9 +========== + +a + +a + +---------- + +(document (body (paragraph) (paragraph))) + +========== +Paragraph.1 
+========== + +a + +---------- + +(document (body (paragraph))) + +========== +Paragraph.2 +========== +a + +a +---------- + +(document (body (paragraph) (paragraph))) + + +========== +Paragraph.3a +========== +a +a + +---------- + +(document (body (paragraph))) + +========== +Paragraph.3b +========== +a +a + +a + +---------- + +(document (body (paragraph) (paragraph))) + +========== +Paragraph.4 +========== +* headline +words +words +words + +words +words + +---------- + +(document + (section + (headline (stars) (item)) + (body (paragraph) (paragraph)) + )) + +========== +Timestamp.1 +========== +<1111-11-11 day> + +---------- + +(document + (body + (paragraph + (timestamp (date))) + )) + +========== +Timestamp.2 +========== +<1111-11-11 day +1h> + +---------- + +(document + (body + (paragraph + (timestamp (date) (repeater))) + )) + +========== +Timestamp.3 +========== +<1111-11-11 day -1d> + +---------- + +(document + (body + (paragraph + (timestamp (date) (delay))) + )) + +========== +Timestamp.4 +========== +<1111-11-11 day +1w -1m> + +---------- + +(document + (body + (paragraph + (timestamp (date) (repeater) (delay))) + )) + +========== +Timestamp.5 +========== +<1111-11-11 day 11:11> + +---------- + +(document + (body + (paragraph + (timestamp (date) (time))) + )) + +========== +Timestamp.6 +========== +<1111-11-11 day 11:11-11:11> + +---------- + +(document + (body + (paragraph + (timestamp (date) (time) (time))) + )) + +========== +Timestamp.7 +========== +<1111-11-11 day 11:11>--<1111-11-11 day 11:11 +1d> + +---------- + +(document + (body + (paragraph + (timestamp + (timestamp (date) (time)) + (timestamp (date) (time) (repeater))) + ))) + +=============== +Timestamp.8 - Junk +=============== +[b] +--------------- + +(document + (body + (paragraph) + )) + +=============== +Timestamp.9 - Junk +=============== + +--------------- + +(document + (body + (paragraph) + )) + +========== +Plan +========== +* headline +[1111-11-11 Day] + +---------- + +(document + 
(section + (headline (stars) (item)) + (plan (timestamp (date))) + )) + +========== +Scheduled +========== +* headline +SCHEDULED: <1111-11-11 Day> + +---------- + +(document + (section + (headline (stars) (item)) + (plan (scheduled (timestamp (date)))) + )) + +================= +Multiple plan +================= +* headline +DEADLINE: <1111-11-11 Day> <1111-11-11 Day> CLOSED: [1111-11-11 Day] + +----------------- + +(document + (section + (headline (stars) (item)) + (plan + (deadline (timestamp (date))) + (timestamp (date)) + (closed (timestamp (date)))) + )) + +========== +Drawer.1 +========== +:name: +:END: + +---------- + +(document (body (drawer))) + +========== +Drawer.2 +========== +:name: +a +:END: + +---------- + +(document (body (drawer (body (paragraph))))) + +========== +Drawer.3 +========== +:name: +a + +a +:END: + +---------- + +(document (body (drawer (body (paragraph) (paragraph))))) + +========== +Drawer.4 +========== +:name: +:name: +a +:END: +a +:END: + +---------- + +(document + (body + (drawer + (body + (drawer (body (paragraph))) + (paragraph)) + ))) + +========== +Block.1 - +========== +#+BEGIN_A +#+END_B +---------- + +(document (body (block (name)))) + +========== +Block.2 - +========== +#+BEGIN_SRC ABC +a +#+END_ABC +---------- + +(document (body (block (name) (parameters) (contents)))) + +================= +DynamicBlock.1 - +================= +#+BEGIN: a b +#+END: +---------- + +(document (body (dynamic_block (name) (parameters)))) + +================= +DynamicBlock.2 - +================= +#+BEGIN: a +c +#+END: +---------- + +(document (body (dynamic_block (name) (contents)))) + +========== +Link.1 - Description only +========== +[[link]] +---------- + +(document + (body + (paragraph (link (linktext))) + )) + +========== +Link.2 - Complete +========== +[[uri][link]] +---------- + +(document + (body + (paragraph (link (linktext) (linktext))) + )) + +========== +Link.3 - Junk +========== +[not [a link]] +---------- + +(document (body 
(paragraph))) + +========== +Footnote.1 +========== +a [fn:b] +---------- + +(document (body (paragraph (footnote)))) + +========== +Footnote.2 +========== +inline def [fn:name:definition] +---------- + +(document (body (paragraph (footnote)))) + +========== +Footnote.3 +========== +[fn:name] definition +words +---------- + +(document (body (fndef (paragraph)))) + +========== +Comment.1 +========== +# Comment +---------- + +(document + (body + (comment) + )) + +========== +Comment.2 +========== +text +# Comment +---------- + +(document + (body + (paragraph) + (comment) + )) + +========== +Comment.3 +========== +# Comment +text +---------- + +(document + (body + (comment) + (paragraph) + )) + +========== +Comment.4 +========== +text +# Comment +text + +# Comment +# Comment +---------- + +(document + (body + (paragraph) + (comment) + (paragraph) + (comment) + )) + + +=========== +Markup.1 - Ya basic +=========== +a *b* +a /b/ +a ~b~ +a _b_ +a =b= +a +b+ + +---------- + +(document + (body + (paragraph + (bold) + (italic) + (code) + (underline) + (verbatim) + (strikethrough)) + )) + +=========== +Markup.2a - start of line +=========== +*b* +---------- + +(document (body (paragraph (bold)))) + +=========== +Markup.2b - start of line +=========== +/b/ +---------- + +(document (body (paragraph (italic)))) + +=========== +Markup.2c - start of line +=========== ++b+ +---------- + +(document (body (paragraph (strikethrough)))) + +========== +Markup.3a - Within +========== +a *b /c d/ e* +---------- + +(document (body (paragraph (bold (italic))))) + +========== +Markup.3b - Within +========== +a _b ~c d~ e_ +---------- + +(document (body (paragraph (underline (code))))) + +========== +Markup.3c - Within +========== +a =b +c d+ e= +---------- + +(document (body (paragraph (verbatim)))) + +========== +Markup.4 - Multi +========== ++a /b/ b+ +---------- + +(document (body (paragraph (strikethrough (italic))))) + +=========== +Markup.5 - Junk +=========== + +*b * a + +* b* a + 
+---------- + +(document (body (paragraph)) (section (headline (stars) (item)))) + + +========== +List.1a - Basic: dash [-] +========== + - a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1b - Basic: plus [+] +========== + + a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1c - Basic: star [*] +========== + * a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1d - Basic: count dot [1.] +========== + 1. a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1e - Basic: count paren [1)] +========== + 1) a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1f - Basic: letter dot [a.] +========== + a. a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.1g - Basic: letter paren [a)] +========== + a) a +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.2a - two items +========== + + - a + - a + +---------- + +(document + (body + (list (listitem) (listitem)) + )) + +========== +List.2d - two items +========== + + 1. a + 2. 
a + +---------- + +(document + (body + (list (listitem) (listitem)) + )) + +========== +List.2b - two items +========== + + - a + + - a + +---------- + +(document + (body + (list (listitem) (listitem)) + )) + +========== +List.2c - two lists +========== + + - a + + + - a + +---------- + +(document + (body + (list (listitem)) + (list (listitem)) + )) + +========== +List.3 - sublist +========== + + - a + a + - b + a + - a + +---------- + +(document + (body + (list + (listitem + (list (listitem))) + (listitem) + ))) + +========== +List.4a - multiline item +========== + + - a + b + +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.4b - multiline item +========== + + - a + + b + +---------- + +(document + (body + (list (listitem)) + )) + +========== +List.5 - dedent +========== + + - a + b + - a + +---------- + +(document + (body + (list (listitem)) + (paragraph) + (list (listitem)) + )) + +========== +List.6 - multi dedent +========== + +b + - a + - a +b + +---------- + +(document + (body + (paragraph) + (list + (listitem + (list (listitem)))) + (paragraph) + )) + +========== +List.7a - changing +========== + + - a + + a + +---------- + +(document + (body + (list (listitem)) + (list (listitem)) + )) + +============== +Directive.1 - +============== +#+a: b +---------- + +(document (body (directive))) diff --git a/grammar.js b/grammar.js new file mode 100644 index 0000000..da44eec --- /dev/null +++ b/grammar.js @@ -0,0 +1,434 @@ +org_grammar = { + // EXTERNALS, INLINE =================================== {{{1 + name: 'org', + extras: _ => [' '], // Treat newlines explicitly + + externals: $ => [ + $._liststart, + $._listend, + $._listitemend, + $._bullet, + $.stars, + $._sectionend, + $._markup, + ], + + // inline: $ => [$._word, $._numbers, $._junk], + // inline: $ => [ $._activeStart, $._activeEnd, $._inactiveStart, $._inactiveEnd, + // $._tsSeparator, $._ymd, $._dayname,], + + + // PRECEDENCES, CONFLICT =============================== {{{1 + 
precedences: _ => [ + ['section', 'element', 'paragraph', 'textelement'], + ['plan', 'textelement'], + ['fn_definition', 'footnote'], + ], + + conflicts: $ => [ + [$._text, $.bold], + [$._text, $.italic], + [$._text, $.underline], + [$._text, $.strikethrough], + [$._text, $.code], + [$._text, $.verbatim], + [$.item], + [$._lastitem], + ], + + rules: { + // DOCUMENT, SECTIONS, BODY, & PARAGRAPH =============== {{{1 + + document: $ => seq( + optional($.body), + repeat($.section), + ), + + // SECTIONS, BODY, PARAGRAPH =========================== {{{1 + + section: $ => prec.dynamic(1, prec('section', + seq( + $.headline, $._eol, + optional(seq( + optional(seq($.plan, $._eol)), + optional(seq($.property_drawer, $._eol)), + optional($.body), + repeat($.section), + )), + $._sectionend, + ))), + + _eol: _ => choice('\0', '\n', '\r'), + _nl: _ => choice('\n', '\r'), + + body: $ => choice( + repeat1($._eol), + seq( + repeat($._eol), + repeat1(seq( + choice( + $._element, + $.paragraph + ), + repeat($._eol), + )), + )), + + paragraph: $ => prec.right('paragraph', + repeat1(seq( + repeat1($._textelement), + $._eol) + )), + + // ELEMENT AND TEXTELEMENT ============================= {{{1 + + _element: $ => choice( + $.drawer, + $.comment, + $.fndef, + $.directive, + $.list, + $.block, + $.dynamic_block, + // $.table, + ), + + _textelement: $ => prec('textelement', + choice( + $._text, + $.timestamp, + $.footnote, + $.link, + $.bold, + $.code, + $.italic, + $.verbatim, + $.underline, + $.strikethrough, + // $.subscript + // $.superscript + // $.latexfragment + )), + + // HEADLINES =========================================== {{{1 + + headline: $ => seq( + $.stars, + $.item, + optional($._taglist), + ), + + item: $ => repeat1(choice($._text, ':')), + + _taglist: $ => prec.dynamic(1, // otherwise just item + seq(':', + repeat1(seq( + $.tag, + token.immediate(':') + )))), + + tag: _ => token.immediate(/[\p{L}\p{N}_@#%]+/), + + _propertyName: _ => /:\p{Z}*:/, + + property_drawer: $ => 
seq( + ':PROPERTIES:', $._eol, + repeat(prec.right(seq(optional($.property), repeat1($._eol)))), + ':END:', + ), + + property: $ => seq( + $._propertyName, + repeat($._text), + ), + + // PLANNING ============================================ {{{1 + + _scheduled: _ => 'SCHEDULED:', + _deadline: _ => 'DEADLINE:', + _closed: _ => 'CLOSED:', + + plan: $ => repeat1(prec('plan', + choice( + $.timestamp, + $.scheduled, + $.deadline, + $.closed, + ))), + + scheduled: $ => seq($._scheduled, $.timestamp), + deadline: $ => seq($._deadline, $.timestamp), + closed: $ => seq( + $._closed, + alias(choice( + $._inactiveTimestamp, + $._inactiveTimestampRange, + ), $.timestamp), + ), + + // TIMESTAMP =========================================== {{{1 + + _activeStart: _ => '<', + _activeEnd: _ => '>', + _inactiveStart: _ => '[', + _inactiveEnd: _ => ']', + _tsSeparator: _ => '--', + _ymd: _ => /\p{N}{1,4}-\p{N}{1,2}-\p{N}{1,4}/, + time: _ => /\p{N}?\p{N}:\p{N}\p{N}/, + repeater: _ => /[.+]?\+\p{N}+\p{L}/, + delay: _ => /--?\p{N}+\p{L}/, + + date: $ => seq($._ymd, optional(/\p{L}+/)), + + timestamp: $ => choice( + $._activeTimestamp, + $._activeTimestampRange, + $._inactiveTimestamp, + $._inactiveTimestampRange, + ), + + _activeTimestamp: $ => seq( + $._activeStart, + $.date, + optional($.time), + optional($.repeater), + optional($.delay), + $._activeEnd, + ), + + _inactiveTimestamp: $ => seq( + $._inactiveStart, + $.date, + optional($.time), + optional($.repeater), + optional($.delay), + $._inactiveEnd, + ), + + _activeTimestampRange: $ => choice( + seq( + alias($._activeTimestamp, $.timestamp), + $._tsSeparator, + alias($._activeTimestamp, $.timestamp)), + seq( + $._activeStart, + $.date, + $.time, '-', $.time, + optional($.repeater), + optional($.delay), + $._activeEnd, + ) + ), + + _inactiveTimestampRange: $ => choice( + seq($._inactiveTimestamp, $._tsSeparator, $._inactiveTimestamp), + seq( + $._inactiveStart, + $.date, + $.time, '-', $.time, + optional($.repeater), + 
optional($.delay), + $._inactiveEnd, + ) + ), + + // MARKUP ============================================== {{{1 + + bold: make_markup('*'), + italic: make_markup('/'), + underline: make_markup('_'), + strikethrough: make_markup('+'), + code: make_markup('~', true), + verbatim: make_markup('=', true), + + // LINK ================================================ {{{1 + + _linkstart: _ => '[[', + _linksep: _ => '][', + _linkend: _ => ']]', + + link: $ => seq( + $._linkstart, + optional(seq(field('uri', $.linktext), $._linksep)), + field('description', $.linktext), + $._linkend, + ), + linktext: _ => /[^\]]*/, + + // FOOTNOTE ============================================ {{{1 + + _fn_label: _ => /[^\p{Z}\[\]]+/, + _fn: _ => '[fn:', + + fndef: $ => prec('fn_definition', + seq( + $._fn, + $._fn_label, + ']', + $.paragraph, + )), + + footnote: $ => prec('footnote', + seq( + $._fn, + choice( + $._fn_label, + seq(optional($._fn_label), ':', repeat1($._fn_label)), + ), + ']', + )), + + // DIRECTIVE =========================================== {{{1 + + directive: $ => seq( + '#+', + token.immediate(/[^\p{Z}:]+/), // name + token.immediate(':'), + repeat($._text), + $._eol, + ), + + // COMMENTS ============================================ {{{1 + + comment: $ => prec.right(repeat1(seq( + '# ', repeat($._text), $._eol + ))), + + // DRAWER ============================================== {{{1 + + drawer: $ => seq( + ':', + token.immediate(/[\p{L}\p{N}\p{Pd}\p{Pc}]+/), + token.immediate(':'), + $._eol, + optional($.body), + ':END:', + $._eol, + ), + + // BLOCK =============================================== {{{1 + + block: $ => seq( + '#+BEGIN_', + alias($._name, $.name), + optional($.parameters), + $._nl, + alias( + repeat(seq( + repeat($._textonly), + $._nl, + )), + $.contents), + '#+END_', $._name, // \P{Z} does not match newlines + repeat($._junk), // FIXME + $._eol, + ), + + _name: _ => token.immediate(/[^\p{Z}\n\r]+/), + + // DYNAMIC BLOCK ======================================= 
{{{1 + + dynamic_block: $ => prec(1, seq( // FIXME why is this precedence required? + '#+BEGIN:', + optional(alias($._text, $.name)), + optional($.parameters), + // optional(alias(repeat1(/\S+/), $.parameters)), + $._eol, + alias(repeat(seq( + repeat($._textonly), + $._nl, + )), $.contents), + '#+END:', + repeat($._junk), // FIXME + $._eol, + )), + + parameters: $ => repeat1($._text), + + // LISTS =============================================== {{{1 + + list: $ => seq( + $._liststart, + repeat(seq($.listitem, optional($._eol))), + alias($._lastitem, $.listitem), + ), + + listitem: $ => seq( + $._bullet, + optional($._checkbox), + optional($._itemtag), + optional($._itemtext), + $._listitemend, + $._eol, + ), + + _lastitem: $ => seq( + $._bullet, + optional($._checkbox), + optional($._itemtag), + optional($._itemtext), + $._listend, + optional($._eol), + ), + + _checkbox: _ => /\[[ xX-]\]/, + _itemtag: $ => seq(repeat($._textelement), '::'), + + _itemtext: $ => seq( + repeat1($._textelement), + repeat(seq( + $._eol, + optional($._eol), + choice(repeat1($._textelement), $.list) + )), + ), + + + // TEXT ================================================ {{{1 + + // TODO: inline word/numbers/junk. Causes precedence issues + // A repeat would also be nice. + _textonly: $ => choice($._word, + $._numbers, + $._junk, + ), + + _text: $ => choice( + $._word, + $._numbers, + $._junk, + + $._activeStart, // Causes conflicts, so they get marked as text. + $._inactiveStart, + + seq($._markup, '*'), + seq($._markup, '/'), + seq($._markup, '_'), + seq($._markup, '+'), + seq($._markup, '~'), + seq($._markup, '='), + + '#', // comment collision + ), + + + _word: _ => /\p{L}+/, + _numbers: _ => /\p{N}+/, + _junk: _ => /[^\p{Z}\p{L}\p{N}]/, + + } +}; + +function make_markup(delim, textonly = false) { // {{{1 + return $ => prec.left(seq( + $._markup, + delim, + repeat1(textonly ? $._text : $._textelement), + repeat(seq($._eol, repeat1(textonly ? 
$._text : $._textelement))),
+    token.immediate(delim),
+  ))
+}
+
+// }}}
+
+module.exports = grammar(org_grammar);
diff --git a/src/scanner.cc b/src/scanner.cc
new file mode 100644
index 0000000..00f6cde
--- /dev/null
+++ b/src/scanner.cc
@@ -0,0 +1,354 @@
+#include <tree_sitter/parser.h>
+#include <vector>
+#include <cwctype>
+#include <cstring>
+#include <cassert>
+#include <stdio.h>
+#include <iostream>
+// NOTE(review): the header names above and the template arguments below
+// (vector<int16_t>, static_cast<Scanner *>) were missing from this copy of
+// the patch; they are a best-guess reconstruction, to be confirmed against
+// the original commit.
+
+namespace {
+
+using std::vector;
+using std::iswspace;
+
+// External token kinds, listed in the same order as `externals` in
+// grammar.js.
+enum TokenType { // {{{1
+  LISTSTART,
+  LISTEND,
+  LISTITEMEND,
+  BULLET,
+  HLSTARS,
+  SECTIONEND,
+  MARKUP,
+};
+
+// List bullet kinds returned by getbullet(); NOTABULLET means no match.
+enum Bullet { // {{{1
+  NOTABULLET,
+  DASH,
+  PLUS,
+  STAR,
+  LOWERDOT,
+  UPPERDOT,
+  LOWERPAREN,
+  UPPERPAREN,
+  NUMDOT,
+  NUMPAREN,
+};
+
+// Scanner state. Each open list contributes one indent and one bullet entry;
+// each open section contributes its star count. Element 0 of every stack is
+// a sentinel installed by deserialize().
+struct Scanner { // {{{1
+  vector<int16_t> indent_length_stack;
+  vector<int16_t> bullet_stack;
+  vector<int16_t> section_stack;
+
+  Scanner() {
+    deserialize(NULL, 0);
+  }
+
+  // Stores the state in `buffer` as [indent count][indents][bullets][section
+  // levels], one char per value and without the sentinels. Returns the number
+  // of bytes written.
+  unsigned serialize(char *buffer) { // {{{1
+    size_t i = 0;
+
+    size_t indent_count = indent_length_stack.size() - 1;
+    if (indent_count > UINT8_MAX) indent_count = UINT8_MAX;
+    buffer[i++] = indent_count;
+
+    // Emit exactly `indent_count` entries from each list stack so the layout
+    // still matches the count byte when that count had to be clamped.
+    vector<int16_t>::iterator
+      iter = indent_length_stack.begin() + 1,
+      end = indent_length_stack.begin() + 1 + indent_count;
+
+    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
+      buffer[i++] = *iter;
+    }
+
+    iter = bullet_stack.begin() + 1;
+    end = bullet_stack.begin() + 1 + indent_count;
+    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
+      buffer[i++] = *iter;
+    }
+
+    iter = section_stack.begin() + 1;
+    end = section_stack.end();
+
+    for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
+      buffer[i++] = *iter;
+    }
+
+    return i;
+  }
+
+  // Resets the stacks to their sentinels, then reloads what serialize() wrote.
+  // A zero `length` leaves the scanner in its initial state.
+  void deserialize(const char *buffer, unsigned length) { // {{{1
+    section_stack.clear();
+    section_stack.push_back(0);
+    indent_length_stack.clear();
+    indent_length_stack.push_back(-1);
+    bullet_stack.clear();
+    bullet_stack.push_back(NOTABULLET);
+
+    if (length == 0) return;
+
+    size_t i = 0;
+
+    size_t indent_count = (uint8_t)buffer[i++];
+
+    for (; i <= indent_count    ; i++) indent_length_stack.push_back(buffer[i]);
+    for (; i <= 2 * indent_count; i++) bullet_stack.push_back(buffer[i]);
+    for (; i < length           ; i++) section_stack.push_back(buffer[i]);
+
+  }
+
+  // Advances past the lookahead character, passing skip=false to the lexer.
+  void advance(TSLexer *lexer) { // {{{1
+    lexer->advance(lexer, false);
+  }
+
+  // Advances past the lookahead character, passing skip=true to the lexer.
+  void skip(TSLexer *lexer) { // {{{1
+    lexer->advance(lexer, true);
+  }
+
+  // Pops the innermost list's indent and bullet and reports LISTEND.
+  bool dedent(TSLexer *lexer) { // {{{1
+    indent_length_stack.pop_back();
+    bullet_stack.pop_back();
+    lexer->result_symbol = LISTEND;
+    // std::cout << " == Dedent~" << std::endl;
+    return true;
+  }
+
+  // Recognizes `-`, `+` or `*`, or a single ASCII letter / run of digits plus
+  // `.` or `)`, each followed by whitespace. Characters are consumed with
+  // skip() even when the result is NOTABULLET.
+  Bullet getbullet(TSLexer *lexer) { // {{{1
+    if (lexer->lookahead == '-') {
+      skip(lexer);
+      if (iswspace(lexer->lookahead)) return DASH;
+    } else if (lexer->lookahead == '+') {
+      skip(lexer);
+      if (iswspace(lexer->lookahead)) return PLUS;
+    } else if (lexer->lookahead == '*') {
+      skip(lexer);
+      if (iswspace(lexer->lookahead)) return STAR;
+    } else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') {
+      skip(lexer);
+      if (lexer->lookahead == '.') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return LOWERDOT;
+      } else if (lexer->lookahead == ')') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return LOWERPAREN;
+      }
+    } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') {
+      skip(lexer);
+      if (lexer->lookahead == '.') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return UPPERDOT;
+      } else if (lexer->lookahead == ')') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return UPPERPAREN;
+      }
+    } else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') {
+      do {
+        skip(lexer);
+      } while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
+      if (lexer->lookahead == '.') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return NUMDOT;
+      } else if (lexer->lookahead == ')') {
+        skip(lexer);
+        if (iswspace(lexer->lookahead)) return NUMPAREN;
+      }
+    }
+    return NOTABULLET;
+  }
+
+  // Tries, in order: section end at end of input, list item/list end, then
+  // after counting leading blanks: headline stars at column 0, list start or
+  // bullet, and finally a markup delimiter. Sets lexer->result_symbol and
+  // returns true when a token was recognized.
+  bool scan(TSLexer *lexer, const bool *valid_symbols) { // {{{1
+
+    // std::cout << " == " << valid_symbols[LISTSTART] << ", " << valid_symbols[LISTEND] << ", " << valid_symbols[LISTITEMEND] << ", " << valid_symbols[BULLET] << ", " << valid_symbols[HLSTARS] << ", " << valid_symbols[SECTIONEND] << ", " << valid_symbols[MARKUP] << std::endl;
+    if (valid_symbols[SECTIONEND] && lexer->lookahead == '\0' && section_stack.back() > 0) {
+      lexer->result_symbol = SECTIONEND;
+      section_stack.pop_back();
+      return true;
+    }
+
+    int16_t indent_length = 0;
+    // - Listiem ends {{{1
+    // Listend -> end of a line, looking for:
+    // 1. dedent
+    // 2. same indent, not a bullet
+    // 3. three eols
+    if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
+      int16_t newlines = 0;
+      lexer->mark_end(lexer);
+      for (;;) {
+        if (lexer->lookahead == ' ' && newlines > 0) {
+          indent_length++;
+        } else if (lexer->lookahead == '\t' && newlines > 0) {
+          indent_length += 8;
+        } else if (lexer->lookahead == '\0') {
+          return dedent(lexer);
+        } else if (lexer->lookahead == '\n') {
+          if (++newlines > 2) return dedent(lexer);
+          indent_length = 0;
+        } else {
+          break;
+        }
+        skip(lexer);
+      }
+      if (newlines == 0) return false;
+
+      if (indent_length < indent_length_stack.back()) {
+        return dedent(lexer);
+      } else if (indent_length == indent_length_stack.back()) {
+        if (getbullet(lexer) == bullet_stack.back()) {
+          // std::cout << " == Item end~" << std::endl;
+          lexer->result_symbol = LISTITEMEND;
+          return true;
+        }
+        return dedent(lexer);
+      }
+      return false;
+    }
+
+    // - Count whitespace {{{1
+    lexer->mark_end(lexer);
+    for (;;) {
+      if (lexer->lookahead == ' ') {
+        indent_length++;
+      } else if (lexer->lookahead == '\t') {
+        indent_length += 8;
+      } else if (lexer->lookahead == '\n') {
+        return false;
+      } else {
+        break;
+      }
+      skip(lexer);
+    }
+
+    // std::cout << " == indent: " << indent_length << " next: '" << char(lexer->lookahead) << "'" << std::endl;
+
+    // - Col=0 star {{{1
+    if (indent_length == 0 && lexer->lookahead == '*') {
+      lexer->mark_end(lexer);
+      int16_t stars = 1;
+      skip(lexer);
+      while (lexer->lookahead == '*') {
+        stars++;
+        skip(lexer);
+      }
+
+      if (valid_symbols[SECTIONEND] && stars <= section_stack.back()) {
+        section_stack.pop_back();
+        lexer->result_symbol = SECTIONEND;
+        // std::cout << " == Section End~" << std::endl;
+        return true;
+      } else if (valid_symbols[HLSTARS] && (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
+        // The whitespace test is parenthesized so that HLSTARS is emitted
+        // only when the parser allows it, for a tab as well as for a space.
+        section_stack.push_back(stars);
+        lexer->mark_end(lexer);
+        lexer->result_symbol = HLSTARS;
+        // std::cout << " == Stars~" << std::endl;
+        return true;
+      } else if (valid_symbols[MARKUP] && stars == 1 && (!iswspace(lexer->lookahead) && lexer->lookahead != '\0')) {
+        lexer->result_symbol = MARKUP;
+        // std::cout << " == Bold~" << std::endl;
+        return true;
+      }
+      return false;
+    }
+
+    // - Liststart and bullets {{{1
+
+    if (valid_symbols[LISTSTART] || valid_symbols[BULLET]) {
+
+      bool plus = lexer->lookahead == '+'; // requires special treatment, like *
+      Bullet bullet = getbullet(lexer);
+
+      // std::cout << " == bullet: " << bullet << " back indent: " << indent_length_stack.back() << std::endl;
+      // std::cout << " == il gt back: " << (indent_length > indent_length_stack.back()) << std::endl;
+      if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
+        lexer->mark_end(lexer);
+        lexer->result_symbol = BULLET;
+        // std::cout << " == Bullet~" << std::endl;
+        return true;
+      } else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
+        indent_length_stack.push_back(indent_length);
+        bullet_stack.push_back(bullet);
+        lexer->result_symbol = LISTSTART;
+        // std::cout << " == Liststart~" << std::endl;
+        return true;
+      } else if (valid_symbols[MARKUP] && bullet == NOTABULLET && plus) {
+        lexer->result_symbol = MARKUP;
+        // std::cout << " == Markup~" << std::endl;
+        return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
+      }
+    }
+
+    // - Markup {{{1
+    if (valid_symbols[MARKUP] && (indent_length > 0 || lexer->get_column(lexer) == 0)
+        && (lexer->lookahead == '*'
+          || lexer->lookahead == '/'
+          || lexer->lookahead == '_'
+          || lexer->lookahead == '+'
+          || lexer->lookahead == '~'
+          || lexer->lookahead == '=')) {
+      lexer->mark_end(lexer);
+      skip(lexer);
+      lexer->result_symbol = MARKUP;
+      // std::cout << " == Markup~" << std::endl;
+      return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
+    }
+    // - Default {{{1
+    // std::cout << " == False~" << std::endl;
+    return false;
+  }
+};
+
+}
+
+extern "C" { // {{{1
+
+void *tree_sitter_org_external_scanner_create() {
+  return new Scanner();
+}
+
+bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  return scanner->scan(lexer, valid_symbols);
+}
+
+unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  return scanner->serialize(buffer);
+}
+
+void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  scanner->deserialize(buffer, length);
+}
+
+void tree_sitter_org_external_scanner_destroy(void *payload) {
+  Scanner *scanner = static_cast<Scanner *>(payload);
+  delete scanner;
+}
+
+}