Initial commit

2021-04-11 01:07:45 -04:00 · 2021-04-11 01:07:45 -04:00 · 8d6ef83961
commit 8d6ef83961
4 changed files with 1771 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,7 @@
+Cargo.lock
+package-lock.json
+node_modules
+build
+*.log
+/examples/*/
+/target/
--- a/corpus/basic.tst
+++ b/corpus/basic.tst
--- a/grammar.js
+++ b/grammar.js
@ -0,0 +1,434 @@
+org_grammar = {
+  // EXTERNALS, INLINE =================================== {{{1
+  name: 'org',  // Grammar name
+  extras: _ => [' '],  // Treat newlines explicitly: the space character is the only implicit extra
+
+  externals: $ => [  // Tokens produced by src/scanner.cc, listed in the same order as its TokenType enum
+    $._liststart,
+    $._listend,
+    $._listitemend,
+    $._bullet,
+    $.stars,
+    $._sectionend,
+    $._markup,
+  ],
+
+  // inline: $ => [$._word, $._numbers, $._junk],
+  // inline: $ => [ $._activeStart, $._activeEnd, $._inactiveStart, $._inactiveEnd,
+  //   $._tsSeparator, $._ymd, $._dayname,],
+
+
+  // PRECEDENCES, CONFLICT =============================== {{{1
+  precedences: _ => [  // Named precedence levels used by prec('...') in the rules; each list is ordered highest first
+    ['section', 'element', 'paragraph', 'textelement'],  // NOTE(review): no rule in this file uses prec('element')
+    ['plan', 'textelement'],
+    ['fn_definition', 'footnote'],
+  ],
+
+  conflicts: $ => [  // Conflicts declared explicitly so the generator resolves them at parse time
+    [$._text, $.bold],  // every markup rule opens with $._markup + delimiter, a sequence _text accepts too
+    [$._text, $.italic],
+    [$._text, $.underline],
+    [$._text, $.strikethrough],
+    [$._text, $.code],
+    [$._text, $.verbatim],
+    [$.item],  // self-conflict entry
+    [$._lastitem],  // self-conflict entry
+  ],
+
+  rules: {
+    // DOCUMENT, SECTIONS, BODY, & PARAGRAPH =============== {{{1
+
+    document: $ => seq(  // Root rule: an optional leading body followed by any number of sections
+      optional($.body),
+      repeat($.section),
+    ),
+
+    // SECTIONS, BODY, PARAGRAPH =========================== {{{1
+
+    section: $ => prec.dynamic(1, prec('section',  // A headline line plus everything nested under it
+      seq(
+        $.headline, $._eol,
+        optional(seq(
+          optional(seq($.plan, $._eol)),  // planning line, when present, comes first
+          optional(seq($.property_drawer, $._eol)),  // then the property drawer
+          optional($.body),
+          repeat($.section),  // nested subsections
+        )),
+        $._sectionend,  // external token emitted by the scanner
+      ))),
+
+    _eol: _ => choice('\0', '\n', '\r'),  // line terminator; '\0' presumably stands for end of input — TODO confirm
+    _nl: _ => choice('\n', '\r'),  // same as _eol without the '\0' alternative
+
+    body: $ => choice(  // Either blank lines only, or elements/paragraphs with optional blank lines around them
+      repeat1($._eol),
+      seq(
+        repeat($._eol),
+        repeat1(seq(
+          choice(
+            $._element,
+            $.paragraph
+          ),
+          repeat($._eol),
+        )),
+      )),
+
+    paragraph: $ => prec.right('paragraph',  // One or more consecutive lines of text elements, each ended by _eol
+      repeat1(seq(
+        repeat1($._textelement),
+        $._eol)
+      )),
+
+    // ELEMENT AND TEXTELEMENT ============================= {{{1
+
+    _element: $ => choice(  // Non-paragraph constructs that may appear in a body
+        $.drawer,
+        $.comment,
+        $.fndef,
+        $.directive,
+        $.list,
+        $.block,
+        $.dynamic_block,
+        // $.table,
+      ),
+
+    _textelement: $ => prec('textelement',  // Inline pieces that make up paragraph, list item and markup content
+      choice(
+        $._text,
+        $.timestamp,
+        $.footnote,
+        $.link,
+        $.bold,
+        $.code,
+        $.italic,
+        $.verbatim,
+        $.underline,
+        $.strikethrough,
+        // $.subscript
+        // $.superscript
+        // $.latexfragment
+      )),
+
+    // HEADLINES =========================================== {{{1
+
+    headline: $ => seq(  // Leading stars (external token), the title item, and an optional tag list
+      $.stars,
+      $.item,
+      optional($._taglist),
+    ),
+
+    item: $ => repeat1(choice($._text, ':')),  // title text; bare colons are allowed inside it
+
+    _taglist: $ => prec.dynamic(1,  // otherwise just item
+      seq(':',
+        repeat1(seq(
+          $.tag,
+          token.immediate(':')  // immediate: the colon must directly follow the tag
+        )))),
+
+    tag: _ => token.immediate(/[\p{L}\p{N}_@#%]+/),  // letters, numbers and _ @ # %
+
+    _propertyName:  _ => /:\p{Z}*:/,
+
+    property_drawer: $ => seq(  // ':PROPERTIES:' line, property lines (blank lines allowed), then ':END:'
+      ':PROPERTIES:', $._eol,
+      repeat(prec.right(seq(optional($.property), repeat1($._eol)))),  // each entry is an optional property plus one or more line ends
+      ':END:',
+    ),
+
+    property: $ => seq(  // a property key followed by its value as plain text tokens
+      $._propertyName,
+      repeat($._text),
+    ),
+
+    // PLANNING ============================================ {{{1
+
+    _scheduled:     _ => 'SCHEDULED:',
+    _deadline:      _ => 'DEADLINE:',
+    _closed:        _ => 'CLOSED:',
+
+    plan: $ => repeat1(prec('plan',  // Planning line: one or more bare timestamps or keyword entries
+      choice(
+        $.timestamp,
+        $.scheduled,
+        $.deadline,
+        $.closed,
+      ))),
+
+    scheduled: $ => seq($._scheduled, $.timestamp),
+    deadline: $ => seq($._deadline, $.timestamp),
+    closed: $ => seq(  // CLOSED: accepts inactive (square-bracket) timestamps only
+      $._closed,
+      alias(choice(
+        $._inactiveTimestamp,
+        $._inactiveTimestampRange,
+      ), $.timestamp),  // exposed in the tree as a timestamp node
+    ),
+
+    // TIMESTAMP =========================================== {{{1
+
+    _activeStart:   _ => '<',
+    _activeEnd:     _ => '>',
+    _inactiveStart: _ => '[',
+    _inactiveEnd:   _ => ']',
+    _tsSeparator:   _ => '--',  // joins the two timestamps of a range
+    _ymd:           _ => /\p{N}{1,4}-\p{N}{1,2}-\p{N}{1,4}/,  // three dash-separated digit groups
+    time:           _ => /\p{N}?\p{N}:\p{N}\p{N}/,  // one or two digits, ':', two digits
+    repeater:       _ => /[.+]?\+\p{N}+\p{L}/,  // '+', '++' or '.+', then a number and one letter
+    delay:          _ => /--?\p{N}+\p{L}/,  // '-' or '--', then a number and one letter
+
+    date: $ => seq($._ymd, optional(/\p{L}+/)),  // the optional letters are presumably the day name — TODO confirm
+
+    timestamp: $ => choice(  // Any of the four timestamp shapes defined below
+      $._activeTimestamp,
+      $._activeTimestampRange,
+      $._inactiveTimestamp,
+      $._inactiveTimestampRange,
+    ),
+
+    _activeTimestamp: $ => seq(  // '<' date [time] [repeater] [delay] '>'
+      $._activeStart,
+      $.date,
+      optional($.time),
+      optional($.repeater),
+      optional($.delay),
+      $._activeEnd,
+    ),
+
+    _inactiveTimestamp: $ => seq(  // '[' date [time] [repeater] [delay] ']'
+      $._inactiveStart,
+      $.date,
+      optional($.time),
+      optional($.repeater),
+      optional($.delay),
+      $._inactiveEnd,
+    ),
+
+    _activeTimestampRange: $ => choice(  // Either two timestamps joined by '--', or one timestamp holding a time-time span
+      seq(
+        alias($._activeTimestamp, $.timestamp),
+        $._tsSeparator,
+        alias($._activeTimestamp, $.timestamp)),
+      seq(
+        $._activeStart,
+        $.date,
+        $.time, '-', $.time,
+        optional($.repeater),
+        optional($.delay),
+        $._activeEnd,
+      )
+    ),
+
+    _inactiveTimestampRange: $ => choice(  // Square-bracket counterpart of _activeTimestampRange
+      seq($._inactiveTimestamp, $._tsSeparator, $._inactiveTimestamp),  // NOTE(review): endpoints are not aliased to $.timestamp, unlike the active range — confirm intended
+      seq(
+        $._inactiveStart,
+        $.date,
+        $.time, '-', $.time,
+        optional($.repeater),
+        optional($.delay),
+        $._inactiveEnd,
+      )
+    ),
+
+    // MARKUP ============================================== {{{1
+
+    bold:          make_markup('*'),  // rule callbacks built by make_markup (defined after the grammar object)
+    italic:        make_markup('/'),
+    underline:     make_markup('_'),
+    strikethrough: make_markup('+'),
+    code:          make_markup('~', true),  // textonly: content is limited to $._text
+    verbatim:      make_markup('=', true),  // textonly: content is limited to $._text
+
+    // LINK ================================================ {{{1
+
+    _linkstart:     _ => '[[',
+    _linksep:       _ => '][',
+    _linkend:       _ => ']]',
+
+    link: $ => seq(  // '[[' [uri ']['] description ']]'
+      $._linkstart,
+      optional(seq(field('uri', $.linktext), $._linksep)),
+      field('description', $.linktext),  // a link with a single part stores that part in 'description'
+      $._linkend,
+    ),
+    linktext: _ => /[^\]]*/,  // any run of characters other than ']' (the pattern also accepts an empty run)
+
+    // FOOTNOTE ============================================ {{{1
+
+    _fn_label: _ => /[^\p{Z}\[\]]+/,  // run of characters excluding separators and square brackets
+    _fn: _ => '[fn:',
+
+    fndef: $ => prec('fn_definition',  // Footnote definition: '[fn:' label ']' followed by a paragraph
+      seq(
+        $._fn,
+        $._fn_label,
+        ']',
+        $.paragraph,
+      )),
+
+    footnote: $ => prec('footnote',  // Inline footnote: '[fn:label]', '[fn:label:text]' or '[fn::text]'
+      seq(
+        $._fn,
+        choice(
+          $._fn_label,
+          seq(optional($._fn_label), ':', repeat1($._fn_label)),  // label is optional when inline text follows the ':'
+        ),
+        ']',
+      )),
+
+    // DIRECTIVE =========================================== {{{1
+
+    directive: $ => seq(  // '#+' name ':' text, ended by _eol
+      '#+',
+      token.immediate(/[^\p{Z}:]+/), // name
+      token.immediate(':'),
+      repeat($._text),
+      $._eol,
+    ),
+
+    // COMMENTS ============================================ {{{1
+
+    comment: $ => prec.right(repeat1(seq(  // one or more consecutive lines that start with '# '
+      '# ', repeat($._text), $._eol
+    ))),
+
+    // DRAWER ============================================== {{{1
+
+    drawer: $ => seq(  // ':name:' line, optional body, ':END:' line
+      ':',
+      token.immediate(/[\p{L}\p{N}\p{Pd}\p{Pc}]+/),  // drawer name: letters, numbers, dash and connector punctuation
+      token.immediate(':'),
+      $._eol,
+      optional($.body),
+      ':END:',
+      $._eol,
+    ),
+
+    // BLOCK =============================================== {{{1
+
+    block: $ => seq(  // '#+BEGIN_' name [parameters], content lines, '#+END_' name
+      '#+BEGIN_',
+      alias($._name, $.name),
+      optional($.parameters),
+      $._nl,
+      alias(
+        repeat(seq(
+          repeat($._textonly),
+          $._nl,
+        )),
+        $.contents),  // content lines are exposed as one contents node
+      '#+END_', $._name, // closing name is not required to equal the opening one; \p{Z} does not cover newlines, so _name excludes \n and \r explicitly
+      repeat($._junk), // FIXME
+      $._eol,
+    ),
+
+    _name: _ => token.immediate(/[^\p{Z}\n\r]+/),  // must directly follow the preceding token
+
+    // DYNAMIC BLOCK ======================================= {{{1
+
+    dynamic_block: $ => prec(1, seq( // FIXME why is this precedence required?
+      '#+BEGIN:',
+      optional(alias($._text, $.name)),  // a single text token, exposed as name
+      optional($.parameters),
+      // optional(alias(repeat1(/\S+/), $.parameters)),
+      $._eol,
+      alias(repeat(seq(
+        repeat($._textonly),
+        $._nl,
+      )), $.contents),
+      '#+END:',
+      repeat($._junk), // FIXME
+      $._eol,
+    )),
+
+    parameters: $ => repeat1($._text),  // one or more plain text tokens
+
+    // LISTS =============================================== {{{1
+
+    list: $ => seq(  // _liststart, any number of items, then a final item closed by _listend
+      $._liststart,
+      repeat(seq($.listitem, optional($._eol))),
+      alias($._lastitem, $.listitem),  // the final item appears in the tree as a listitem too
+    ),
+
+    listitem: $ => seq(  // bullet [checkbox] [tag '::'] [text], closed by the scanner's _listitemend
+      $._bullet,
+      optional($._checkbox),
+      optional($._itemtag),
+      optional($._itemtext),
+      $._listitemend,
+      $._eol,
+    ),
+
+    _lastitem: $ => seq(  // same shape as listitem, but closed by _listend with an optional _eol
+      $._bullet,
+      optional($._checkbox),
+      optional($._itemtag),
+      optional($._itemtext),
+      $._listend,
+      optional($._eol),
+    ),
+
+    _checkbox: _ => /\[[ xX-]\]/,  // '[ ]', '[x]', '[X]' or '[-]'
+    _itemtag: $ => seq(repeat($._textelement), '::'),  // text before '::' (may be empty)
+
+    _itemtext: $ => seq(  // first line of text, then continuation lines or nested lists
+      repeat1($._textelement),
+      repeat(seq(
+        $._eol,
+        optional($._eol),  // at most one blank line before a continuation
+        choice(repeat1($._textelement), $.list)
+      )),
+    ),
+
+
+    // TEXT ================================================ {{{1
+
+    // TODO: inline word/numbers/junk. Causes precedence issues
+    // A repeat would also be nice.
+    _textonly: $ => choice($._word,  // raw tokens only; used for block and dynamic block content lines
+      $._numbers,
+      $._junk,
+    ),
+
+    _text: $ => choice(  // raw tokens plus the openers that failed to start a timestamp or markup
+      $._word,
+      $._numbers,
+      $._junk,
+
+      $._activeStart, // Causes conflicts, so they get marked as text.
+      $._inactiveStart,
+
+      seq($._markup, '*'),  // scanner markup token plus delimiter, accepted as plain text
+      seq($._markup, '/'),
+      seq($._markup, '_'),
+      seq($._markup, '+'),
+      seq($._markup, '~'),
+      seq($._markup, '='),
+
+      '#', // comment collision
+    ),
+
+
+    _word:          _ => /\p{L}+/,  // run of letters
+    _numbers:       _ => /\p{N}+/,  // run of numeric characters
+    _junk:          _ => /[^\p{Z}\p{L}\p{N}]/,  // one character that is not a separator, letter or number
+
+  }
+};
+
+function make_markup(delim, textonly = false) {      // {{{1
+  return $ => prec.left(seq(
+    $._markup,
+    delim,
+    repeat1(textonly ? $._text : $._textelement),
+    repeat(seq($._eol, repeat1(textonly ? $._text : $._textelement))),
+    token.immediate(delim),
+  ))
+}
+
+// }}}
+
+module.exports = grammar(org_grammar);
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -0,0 +1,325 @@
+#include <tree_sitter/parser.h>
+#include <vector>
+#include <cwctype>
+#include <cstring>
+#include <cassert>
+#include <stdio.h>
+#include <iostream>
+
+namespace {
+
+using std::vector;
+using std::iswspace;
+
+enum TokenType {                                                       // {{{1
+  LISTSTART,    // $._liststart   (values are positional: same order as `externals` in grammar.js)
+  LISTEND,      // $._listend
+  LISTITEMEND,  // $._listitemend
+  BULLET,       // $._bullet
+  HLSTARS,      // $.stars
+  SECTIONEND,   // $._sectionend
+  MARKUP,       // $._markup
+};
+
+enum Bullet {                                                          // {{{1
+  NOTABULLET,  // getbullet() found no bullet; also the sentinel at the bottom of bullet_stack
+  DASH,        // '-'
+  PLUS,        // '+'
+  STAR,        // '*'
+  LOWERDOT,    // a-z followed by '.'
+  UPPERDOT,    // A-Z followed by '.'
+  LOWERPAREN,  // a-z followed by ')'
+  UPPERPAREN,  // A-Z followed by ')'
+  NUMDOT,      // digits followed by '.'
+  NUMPAREN,    // digits followed by ')'
+};
+
+struct Scanner {                                                       // {{{1
+  vector<int16_t> indent_length_stack;
+  vector<int16_t> bullet_stack;
+  vector<int16_t> section_stack;
+
+  // Starts from the empty state: deserialize() with a zero length resets each
+  // stack to its single sentinel entry and returns without reading the buffer.
+  Scanner() {
+    deserialize(nullptr, 0);
+  }
+
+  // Writes the scanner state into `buffer` and returns the number of bytes
+  // used. Layout, as read back by deserialize():
+  //   [0]            N, the number of list levels (sentinel excluded, max 255)
+  //   [1 .. N]       indent width of each list level
+  //   [N+1 .. 2N]    bullet kind of each list level
+  //   [2N+1 .. end)  star count of each open section (sentinel excluded)
+  // Every value is narrowed to a single byte.
+  unsigned serialize(char *buffer) {                                   // {{{1
+    size_t i = 0;
+
+    // The indent and bullet stacks are pushed and popped together; using the
+    // smaller of the two keeps both runs equally long even if that invariant
+    // is ever broken. An empty stack (sentinel popped) counts as zero levels.
+    size_t level_count = indent_length_stack.empty() ? 0 : indent_length_stack.size() - 1;
+    const size_t bullet_count = bullet_stack.empty() ? 0 : bullet_stack.size() - 1;
+    if (bullet_count < level_count) level_count = bullet_count;
+    if (level_count > UINT8_MAX) level_count = UINT8_MAX;
+    buffer[i++] = static_cast<char>(level_count);
+
+    // Emit exactly `level_count` entries per list stack so the count byte
+    // always describes what follows, including when it was clamped above.
+    for (size_t k = 1; k <= level_count && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++k) {
+      buffer[i++] = static_cast<char>(indent_length_stack[k]);
+    }
+    for (size_t k = 1; k <= level_count && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++k) {
+      buffer[i++] = static_cast<char>(bullet_stack[k]);
+    }
+
+    // Sections fill whatever space is left; index 0 is the sentinel.
+    for (size_t k = 1; k < section_stack.size() && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++k) {
+      buffer[i++] = static_cast<char>(section_stack[k]);
+    }
+
+    return i;
+  }
+
+  // Restores the state written by serialize(). All three stacks are first
+  // reset to their sentinel entry (section 0, indent -1, NOTABULLET); with
+  // `length` == 0 that is the whole job and `buffer` is not touched.
+  // Otherwise byte 0 holds the number of list levels N, bytes 1..N the indent
+  // widths, bytes N+1..2N the bullet kinds, and the rest the section levels.
+  void deserialize(const char *buffer, unsigned length) {              // {{{1
+    section_stack.clear();
+    section_stack.push_back(0);
+    indent_length_stack.clear();
+    indent_length_stack.push_back(-1);
+    bullet_stack.clear();
+    bullet_stack.push_back(NOTABULLET);
+
+    if (length == 0) return;
+
+    size_t i = 0;
+
+    const size_t indent_count = static_cast<uint8_t>(buffer[i++]);
+
+    // Every loop also stops at `length`, so a buffer shorter than its count
+    // byte claims is never read out of bounds.
+    for (; i <= indent_count && i < length; i++) indent_length_stack.push_back(buffer[i]);
+    for (; i <= 2 * indent_count && i < length; i++) bullet_stack.push_back(buffer[i]);
+    for (; i < length; i++) section_stack.push_back(buffer[i]);
+
+    // A truncated buffer can leave more indents than bullets; drop the extras
+    // so the two stacks keep describing the same list levels.
+    while (indent_length_stack.size() > bullet_stack.size()) indent_length_stack.pop_back();
+  }
+
+  // Moves the lexer past the lookahead character, passing `false` as the
+  // lexer's skip flag.
+  void advance(TSLexer *lexer) {                                       // {{{1
+    const bool skip_flag = false;
+    lexer->advance(lexer, skip_flag);
+  }
+
+  // Moves the lexer past the lookahead character, passing `true` as the
+  // lexer's skip flag.
+  void skip(TSLexer *lexer) {                                          // {{{1
+    const bool skip_flag = true;
+    lexer->advance(lexer, skip_flag);
+  }
+
+  // Closes the innermost list: reports LISTEND and drops that list's entry
+  // from both the bullet and the indent stack. Always returns true so callers
+  // can write `return dedent(lexer);`.
+  bool dedent(TSLexer *lexer) {                                        // {{{1
+    lexer->result_symbol = LISTEND;
+    bullet_stack.pop_back();
+    indent_length_stack.pop_back();
+    return true;
+  }
+
+  // Classifies the list bullet starting at the lookahead character.
+  // Recognised forms are '-', '+', '*', or a counter (one lowercase letter,
+  // one uppercase letter, or a run of digits) followed by '.' or ')'. In every
+  // case the character after the bullet must be whitespace. Characters are
+  // consumed with skip() as they are examined, also when the result is
+  // NOTABULLET; the whitespace after the bullet is left unconsumed.
+  Bullet getbullet(TSLexer *lexer) {                                   // {{{1
+    const int32_t first = lexer->lookahead;
+
+    // Single-character bullets.
+    if (first == '-' || first == '+' || first == '*') {
+      skip(lexer);
+      if (!iswspace(lexer->lookahead)) return NOTABULLET;
+      if (first == '-') return DASH;
+      return first == '+' ? PLUS : STAR;
+    }
+
+    // Counter bullets: consume the counter and remember which pair of results
+    // applies to it.
+    Bullet with_dot = NOTABULLET;
+    Bullet with_paren = NOTABULLET;
+    if ('a' <= first && first <= 'z') {
+      skip(lexer);
+      with_dot = LOWERDOT;
+      with_paren = LOWERPAREN;
+    } else if ('A' <= first && first <= 'Z') {
+      skip(lexer);
+      with_dot = UPPERDOT;
+      with_paren = UPPERPAREN;
+    } else if ('0' <= first && first <= '9') {
+      do {
+        skip(lexer);
+      } while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
+      with_dot = NUMDOT;
+      with_paren = NUMPAREN;
+    } else {
+      return NOTABULLET;
+    }
+
+    // The counter must be closed by '.' or ')' and then whitespace.
+    const int32_t closer = lexer->lookahead;
+    if (closer == '.' || closer == ')') {
+      skip(lexer);
+      if (iswspace(lexer->lookahead)) return closer == '.' ? with_dot : with_paren;
+    }
+    return NOTABULLET;
+  }
+
+  // Entry point used by the generated parser whenever an external token may
+  // appear. `valid_symbols` is indexed by TokenType and tells which tokens the
+  // parser can accept here. On success the recognised token is stored in
+  // lexer->result_symbol and true is returned; false means no external token
+  // was recognised.
+  bool scan(TSLexer *lexer, const bool *valid_symbols) {               // {{{1
+
+    // At lookahead '\0' close the innermost open section (one per call). The
+    // sentinel at the bottom of section_stack is 0, so it is never popped.
+    if (valid_symbols[SECTIONEND] && lexer->lookahead == '\0' && section_stack.back() > 0) {
+      lexer->result_symbol = SECTIONEND;
+      section_stack.pop_back();
+      return true;
+    }
+
+    int16_t indent_length = 0;
+    // - Listiem ends                                                     {{{1
+    // Listend -> end of a line, looking for:
+    // 1. dedent
+    // 2. same indent, not a bullet
+    // 3. three eols
+    if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
+      int16_t newlines = 0;
+      // Fix the token end here, before anything below is consumed.
+      lexer->mark_end(lexer);
+      for (;;) {
+        // Indentation is only counted after at least one newline was seen;
+        // a tab counts as 8 columns.
+        if (lexer->lookahead == ' ' && newlines > 0) {
+          indent_length++;
+        } else if (lexer->lookahead == '\t' && newlines > 0) {
+          indent_length += 8;
+        } else if (lexer->lookahead == '\0') {
+          return dedent(lexer);
+        } else if (lexer->lookahead == '\n') {
+          if (++newlines > 2) return dedent(lexer);
+          indent_length = 0;
+        } else {
+          break;
+        }
+        skip(lexer);
+      }
+      // Still on the item's own line: nothing ends here.
+      if (newlines == 0) return false;
+
+      if (indent_length < indent_length_stack.back()) {
+        return dedent(lexer);
+      } else if (indent_length == indent_length_stack.back()) {
+        // Same indent: a bullet of the same kind continues the list, anything
+        // else ends it.
+        if (getbullet(lexer) == bullet_stack.back()) {
+          lexer->result_symbol = LISTITEMEND;
+          return true;
+        }
+        return dedent(lexer);
+      }
+      // Deeper indent: the line belongs to the current item.
+      return false;
+    }
+
+    // - Count whitespace                                                 {{{1
+    lexer->mark_end(lexer);
+    for (;;) {
+      if (lexer->lookahead == ' ') {
+        indent_length++;
+      } else if (lexer->lookahead == '\t') {
+        indent_length += 8;
+      } else if (lexer->lookahead == '\n') {
+        return false;
+      } else {
+        break;
+      }
+      skip(lexer);
+    }
+
+    // - Col=0 star                                                       {{{1
+    if (indent_length == 0 && lexer->lookahead == '*') {
+      lexer->mark_end(lexer);
+      int16_t stars = 1;
+      skip(lexer);
+      while (lexer->lookahead == '*') {
+        stars++;
+        skip(lexer);
+      }
+
+      if (valid_symbols[SECTIONEND] && stars <= section_stack.back()) {
+        // A headline at the same or a shallower level closes the open section.
+        section_stack.pop_back();
+        lexer->result_symbol = SECTIONEND;
+        return true;
+      } else if (valid_symbols[HLSTARS] && (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
+        // The parentheses matter: without them `&&` binds tighter than `||`,
+        // so a tab after the stars opened a section even when the parser could
+        // not accept HLSTARS.
+        section_stack.push_back(stars);
+        lexer->mark_end(lexer);
+        lexer->result_symbol = HLSTARS;
+        return true;
+      } else if (valid_symbols[MARKUP] && stars == 1 && (!iswspace(lexer->lookahead) && lexer->lookahead != '\0')) {
+        // A single star directly followed by text is a markup opener.
+        lexer->result_symbol = MARKUP;
+        return true;
+      }
+      return false;
+    }
+
+    // - Liststart and bullets                                            {{{1
+
+    if (valid_symbols[LISTSTART] || valid_symbols[BULLET]) {
+
+      bool plus = lexer->lookahead == '+'; // requires special treatment, like *
+      Bullet bullet = getbullet(lexer);
+
+      if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
+        // Next item of the current list.
+        lexer->mark_end(lexer);
+        lexer->result_symbol = BULLET;
+        return true;
+      } else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
+        // A bullet indented deeper than the current list opens a new list.
+        indent_length_stack.push_back(indent_length);
+        bullet_stack.push_back(bullet);
+        lexer->result_symbol = LISTSTART;
+        return true;
+      } else if (valid_symbols[MARKUP] && bullet == NOTABULLET && plus) {
+        // getbullet() already consumed the '+', so it is checked as a markup
+        // opener here.
+        lexer->result_symbol = MARKUP;
+        return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
+      }
+    }
+
+    // - Markup                                                           {{{1
+    // A markup opener needs leading whitespace or column 0 before it, and a
+    // character after it that is neither whitespace nor '\0'.
+    if (valid_symbols[MARKUP] && (indent_length > 0 || lexer->get_column(lexer) == 0)
+      && (lexer->lookahead == '*'
+      || lexer->lookahead == '/'
+      || lexer->lookahead == '_'
+      || lexer->lookahead == '+'
+      || lexer->lookahead == '~'
+      || lexer->lookahead == '=')) {
+      lexer->mark_end(lexer);
+      skip(lexer);
+      lexer->result_symbol = MARKUP;
+      return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
+    }
+    // - Default                                                          {{{1
+    return false;
+  }
+};
+
+}
+
+extern "C" {                                                           // {{{1
+
+// C-linkage wrappers around Scanner. In each of them `payload` is the pointer
+// returned by tree_sitter_org_external_scanner_create().
+
+// Allocates a scanner in its initial state.
+void *tree_sitter_org_external_scanner_create() {
+  Scanner *created = new Scanner();
+  return created;
+}
+
+// Forwards to Scanner::scan.
+bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+  return static_cast<Scanner *>(payload)->scan(lexer, valid_symbols);
+}
+
+// Forwards to Scanner::serialize; returns the number of bytes written.
+unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
+  return static_cast<Scanner *>(payload)->serialize(buffer);
+}
+
+// Forwards to Scanner::deserialize.
+void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+  static_cast<Scanner *>(payload)->deserialize(buffer, length);
+}
+
+// Frees the scanner allocated by the create function.
+void tree_sitter_org_external_scanner_destroy(void *payload) {
+  delete static_cast<Scanner *>(payload);
+}
+
+}