Initial commit

2021-04-11 01:07:45 -04:00 · 2021-04-11 01:07:45 -04:00 · 8d6ef83961
commit 8d6ef83961
4 changed files with 1771 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,7 @@
 Cargo.lock
 package-lock.json
 node_modules
 build
 *.log
 /examples/*/
 /target/
--- a/corpus/basic.tst
+++ b/corpus/basic.tst
--- a/grammar.js
+++ b/grammar.js
@ -0,0 +1,434 @@
 // Grammar description for the 'org' language. The object is handed to
 // grammar() at the bottom of this file. Newlines are matched explicitly by
 // the rules (see `extras`), and list / section / markup boundaries come from
 // the external scanner tokens listed in `externals`.
 org_grammar = {
  // EXTERNALS, INLINE =================================== {{{1
  name: 'org',
  extras: _ => [' '],  // Treat newlines explicitly
  // Tokens produced by the external scanner (src/scanner.cc). The order here
  // is the same as the order of the TokenType enum in that file.
  externals: $ => [
    $._liststart,
    $._listend,
    $._listitemend,
    $._bullet,
    $.stars,
    $._sectionend,
    $._markup,
  ],
  // inline: $ => [$._word, $._numbers, $._junk],
  // inline: $ => [ $._activeStart, $._activeEnd, $._inactiveStart, $._inactiveEnd,
  //   $._tsSeparator, $._ymd, $._dayname,],
  // PRECEDENCES, CONFLICT =============================== {{{1
  // Named precedence levels referenced by prec('<name>', ...) in the rules below.
  precedences: _ => [
    ['section', 'element', 'paragraph', 'textelement'],
    ['plan', 'textelement'],
    ['fn_definition', 'footnote'],
  ],
  // Every markup rule starts with the same `$._markup` + delimiter pair that
  // `_text` also accepts, hence one declared conflict per markup kind.
  conflicts: $ => [
    [$._text, $.bold],
    [$._text, $.italic],
    [$._text, $.underline],
    [$._text, $.strikethrough],
    [$._text, $.code],
    [$._text, $.verbatim],
    [$.item],
    [$._lastitem],
  ],
  rules: {
    // DOCUMENT, SECTIONS, BODY, & PARAGRAPH =============== {{{1
    // A document is an optional leading body followed by any number of sections.
    document: $ => seq(
      optional($.body),
      repeat($.section),
    ),
    // SECTIONS, BODY, PARAGRAPH =========================== {{{1
    // A section is a headline line, optional planning line, property drawer,
    // body and nested sections, closed by the scanner's `_sectionend` token.
    section: $ => prec.dynamic(1, prec('section',
      seq(
        $.headline, $._eol,
        optional(seq(
          optional(seq($.plan, $._eol)),
          optional(seq($.property_drawer, $._eol)),
          optional($.body),
          repeat($.section),
        )),
        $._sectionend,
      ))),
    // NOTE(review): `_eol` also accepts '\0', presumably so the last line can
    // end at end-of-input without a newline; `_nl` does not -- confirm.
    _eol: _ => choice('\0', '\n', '\r'),
    _nl: _ => choice('\n', '\r'),
    // Either only blank lines, or elements/paragraphs separated by blank lines.
    body: $ => choice(
      repeat1($._eol),
      seq(
        repeat($._eol),
        repeat1(seq(
          choice(
            $._element,
            $.paragraph
          ),
          repeat($._eol),
        )),
      )),
    // One or more consecutive lines of text elements, each ended by `_eol`.
    paragraph: $ => prec.right('paragraph',
      repeat1(seq(
        repeat1($._textelement),
        $._eol)
      )),
    // ELEMENT AND TEXTELEMENT ============================= {{{1
    // Block-level constructs that may appear in a body next to paragraphs.
    _element: $ => choice(
        $.drawer,
        $.comment,
        $.fndef,
        $.directive,
        $.list,
        $.block,
        $.dynamic_block,
        // $.table,
      ),
    // Inline constructs that make up paragraph and list item text.
    _textelement: $ => prec('textelement',
      choice(
        $._text,
        $.timestamp,
        $.footnote,
        $.link,
        $.bold,
        $.code,
        $.italic,
        $.verbatim,
        $.underline,
        $.strikethrough,
        // $.subscript
        // $.superscript
        // $.latexfragment
      )),
    // HEADLINES =========================================== {{{1
    // Stars (external token), the headline text, then an optional tag list.
    headline: $ => seq(
      $.stars,
      $.item,
      optional($._taglist),
    ),
    item: $ => repeat1(choice($._text, ':')),
    _taglist: $ => prec.dynamic(1,  // otherwise just item
      seq(':',
        repeat1(seq(
          $.tag,
          token.immediate(':')
        )))),
    tag: _ => token.immediate(/[\p{L}\p{N}_@#%]+/),
    // NOTE(review): as written this regex only allows separator characters
    // (\p{Z}) between the two colons, so a name such as `:ID:` would not
    // match. Presumably a negated class was intended -- confirm.
    _propertyName:  _ => /:\p{Z}*:/,
    property_drawer: $ => seq(
      ':PROPERTIES:', $._eol,
      repeat(prec.right(seq(optional($.property), repeat1($._eol)))),
      ':END:',
    ),
    property: $ => seq(
      $._propertyName,
      repeat($._text),
    ),
    // PLANNING ============================================ {{{1
    _scheduled:     _ => 'SCHEDULED:',
    _deadline:      _ => 'DEADLINE:',
    _closed:        _ => 'CLOSED:',
    // A planning line: any mix of bare timestamps and keyword + timestamp entries.
    plan: $ => repeat1(prec('plan',
      choice(
        $.timestamp,
        $.scheduled,
        $.deadline,
        $.closed,
      ))),
    scheduled: $ => seq($._scheduled, $.timestamp),
    deadline: $ => seq($._deadline, $.timestamp),
    // Unlike scheduled/deadline, CLOSED: only accepts the inactive ([...])
    // timestamp forms; they are aliased so the node is still named `timestamp`.
    closed: $ => seq(
      $._closed,
      alias(choice(
        $._inactiveTimestamp,
        $._inactiveTimestampRange,
      ), $.timestamp),
    ),
    // TIMESTAMP =========================================== {{{1
    _activeStart:   _ => '<',
    _activeEnd:     _ => '>',
    _inactiveStart: _ => '[',
    _inactiveEnd:   _ => ']',
    _tsSeparator:   _ => '--',
    _ymd:           _ => /\p{N}{1,4}-\p{N}{1,2}-\p{N}{1,4}/,
    time:           _ => /\p{N}?\p{N}:\p{N}\p{N}/,
    repeater:       _ => /[.+]?\+\p{N}+\p{L}/,
    delay:          _ => /--?\p{N}+\p{L}/,
    // The date digits optionally followed by a run of letters (e.g. a day name).
    date: $ => seq($._ymd, optional(/\p{L}+/)),
    timestamp: $ => choice(
      $._activeTimestamp,
      $._activeTimestampRange,
      $._inactiveTimestamp,
      $._inactiveTimestampRange,
    ),
    _activeTimestamp: $ => seq(
      $._activeStart,
      $.date,
      optional($.time),
      optional($.repeater),
      optional($.delay),
      $._activeEnd,
    ),
    _inactiveTimestamp: $ => seq(
      $._inactiveStart,
      $.date,
      optional($.time),
      optional($.repeater),
      optional($.delay),
      $._inactiveEnd,
    ),
    // A range is either two full timestamps joined by `--`, or one timestamp
    // holding two times joined by `-`.
    _activeTimestampRange: $ => choice(
      seq(
        alias($._activeTimestamp, $.timestamp),
        $._tsSeparator,
        alias($._activeTimestamp, $.timestamp)),
      seq(
        $._activeStart,
        $.date,
        $.time, '-', $.time,
        optional($.repeater),
        optional($.delay),
        $._activeEnd,
      )
    ),
    // NOTE(review): unlike the active variant above, the two endpoints here
    // are not aliased to `timestamp` nodes -- confirm whether that is intended.
    _inactiveTimestampRange: $ => choice(
      seq($._inactiveTimestamp, $._tsSeparator, $._inactiveTimestamp),
      seq(
        $._inactiveStart,
        $.date,
        $.time, '-', $.time,
        optional($.repeater),
        optional($.delay),
        $._inactiveEnd,
      )
    ),
    // MARKUP ============================================== {{{1
    // All six rules are generated by make_markup() (defined after this
    // object). The second argument restricts the contents to `_text` instead
    // of the full `_textelement` set.
    bold:          make_markup('*'),
    italic:        make_markup('/'),
    underline:     make_markup('_'),
    strikethrough: make_markup('+'),
    code:          make_markup('~', true),
    verbatim:      make_markup('=', true),
    // LINK ================================================ {{{1
    _linkstart:     _ => '[[',
    _linksep:       _ => '][',
    _linkend:       _ => ']]',
    // `[[uri][description]]` or `[[description]]`: with a single part, that
    // part is stored in the `description` field.
    link: $ => seq(
      $._linkstart,
      optional(seq(field('uri', $.linktext), $._linksep)),
      field('description', $.linktext),
      $._linkend,
    ),
    linktext: _ => /[^\]]*/,
    // FOOTNOTE ============================================ {{{1
    _fn_label: _ => /[^\p{Z}\[\]]+/,
    _fn: _ => '[fn:',
    // Footnote definition: `[fn:label]` followed by a paragraph.
    fndef: $ => prec('fn_definition',
      seq(
        $._fn,
        $._fn_label,
        ']',
        $.paragraph,
      )),
    // Footnote reference: `[fn:label]`, or `[fn:label:text]` with the label optional.
    footnote: $ => prec('footnote',
      seq(
        $._fn,
        choice(
          $._fn_label,
          seq(optional($._fn_label), ':', repeat1($._fn_label)),
        ),
        ']',
      )),
    // DIRECTIVE =========================================== {{{1
    directive: $ => seq(
      '#+',
      token.immediate(/[^\p{Z}:]+/), // name
      token.immediate(':'),
      repeat($._text),
      $._eol,
    ),
    // COMMENTS ============================================ {{{1
    // Consecutive lines starting with '# ' are grouped into one comment node.
    comment: $ => prec.right(repeat1(seq(
      '# ', repeat($._text), $._eol
    ))),
    // DRAWER ============================================== {{{1
    drawer: $ => seq(
      ':',
      token.immediate(/[\p{L}\p{N}\p{Pd}\p{Pc}]+/),
      token.immediate(':'),
      $._eol,
      optional($.body),
      ':END:',
      $._eol,
    ),
    // BLOCK =============================================== {{{1
    // `#+BEGIN_<name> [parameters]` ... `#+END_<name>`; the lines in between
    // are grouped under a `contents` node.
    block: $ => seq(
      '#+BEGIN_',
      alias($._name, $.name),
      optional($.parameters),
      $._nl,
      alias(
        repeat(seq(
          repeat($._textonly),
          $._nl,
        )),
        $.contents),
      '#+END_', $._name, // \P{Z} does not match newlines
      repeat($._junk), // FIXME
      $._eol,
    ),
    _name: _ => token.immediate(/[^\p{Z}\n\r]+/),
    // DYNAMIC BLOCK ======================================= {{{1
    dynamic_block: $ => prec(1, seq( // FIXME why is this precedence required?
      '#+BEGIN:',
      optional(alias($._text, $.name)),
      optional($.parameters),
      // optional(alias(repeat1(/\S+/), $.parameters)),
      $._eol,
      alias(repeat(seq(
        repeat($._textonly),
        $._nl,
      )), $.contents),
      '#+END:',
      repeat($._junk), // FIXME
      $._eol,
    )),
    parameters: $ => repeat1($._text),
    // LISTS =============================================== {{{1
    // List structure is delimited by scanner tokens: `_liststart` opens a
    // list, `_bullet` starts each item, `_listitemend` closes every item but
    // the last, and `_listend` closes the last item together with the list.
    list: $ => seq(
      $._liststart,
      repeat(seq($.listitem, optional($._eol))),
      alias($._lastitem, $.listitem),
    ),
    listitem: $ => seq(
      $._bullet,
      optional($._checkbox),
      optional($._itemtag),
      optional($._itemtext),
      $._listitemend,
      $._eol,
    ),
    _lastitem: $ => seq(
      $._bullet,
      optional($._checkbox),
      optional($._itemtag),
      optional($._itemtext),
      $._listend,
      optional($._eol),
    ),
    _checkbox: _ => /\[[ xX-]\]/,
    _itemtag: $ => seq(repeat($._textelement), '::'),
    // Item text may continue on following lines (with at most one blank line
    // in between) and may contain nested lists.
    _itemtext: $ => seq(
      repeat1($._textelement),
      repeat(seq(
        $._eol,
        optional($._eol),
        choice(repeat1($._textelement), $.list)
      )),
    ),
    // TEXT ================================================ {{{1
    // TODO: inline word/numbers/junk. Causes precedence issues
    // A repeat would also be nice.
    _textonly: $ => choice($._word,
      $._numbers,
      $._junk,
    ),
    // `_text` is `_textonly` plus the characters that can also open a
    // timestamp, markup span or comment, so they can fall back to plain text.
    _text: $ => choice(
      $._word,
      $._numbers,
      $._junk,
      $._activeStart, // Causes conflicts, so they get marked as text.
      $._inactiveStart,
      seq($._markup, '*'),
      seq($._markup, '/'),
      seq($._markup, '_'),
      seq($._markup, '+'),
      seq($._markup, '~'),
      seq($._markup, '='),
      '#', // comment collision
    ),
    _word:          _ => /\p{L}+/,
    _numbers:       _ => /\p{N}+/,
    // Any single character that is not a separator, letter or number.
    _junk:          _ => /[^\p{Z}\p{L}\p{N}]/,
  }
 };
 // Builds the rule callback for one inline markup kind (bold, italic, ...).
 //   delim    - the delimiter character that opens and closes the span.
 //   textonly - when true the span may only contain `_text`; otherwise any
 //              `_textelement` is allowed.
 // The span opens with the scanner's `_markup` token followed by `delim`, may
 // continue over several lines separated by `_eol`, and closes with `delim`
 // as an immediate token.
 function make_markup(delim, textonly = false) {      // {{{1
  return $ => {
    const piece = textonly ? $._text : $._textelement;
    const first_line = repeat1(piece);
    const more_lines = repeat(seq($._eol, repeat1(piece)));
    return prec.left(seq(
      $._markup,
      delim,
      first_line,
      more_lines,
      token.immediate(delim),
    ));
  };
 }
 // }}}
 // Export the grammar built from the object above. grammar() is not defined
 // in this file; presumably it is the global supplied by the tree-sitter CLI.
 module.exports = grammar(org_grammar);
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -0,0 +1,325 @@
 #include <tree_sitter/parser.h>
 #include <vector>
 #include <cwctype>
 #include <cstring>
 #include <cassert>
 #include <stdio.h>
 #include <iostream>
 namespace {
 using std::vector;
 using std::iswspace;
 // External token ids. The values index the `valid_symbols` array passed to
 // Scanner::scan() and follow the order of the `externals` list in grammar.js.
 enum TokenType {                                                       // {{{1
  LISTSTART   = 0,  // $._liststart
  LISTEND     = 1,  // $._listend
  LISTITEMEND = 2,  // $._listitemend
  BULLET      = 3,  // $._bullet
  HLSTARS     = 4,  // $.stars
  SECTIONEND  = 5,  // $._sectionend
  MARKUP      = 6,  // $._markup
 };
 // Kinds of list bullet recognised by Scanner::getbullet(). NOTABULLET doubles
 // as the sentinel stored at the bottom of the bullet stack.
 enum Bullet {                                                          // {{{1
  NOTABULLET = 0,
  DASH       = 1,  // "-"
  PLUS       = 2,  // "+"
  STAR       = 3,  // "*"
  LOWERDOT   = 4,  // "a."
  UPPERDOT   = 5,  // "A."
  LOWERPAREN = 6,  // "a)"
  UPPERPAREN = 7,  // "A)"
  NUMDOT     = 8,  // "1."
  NUMPAREN   = 9,  // "1)"
 };
 struct Scanner {                                                       // {{{1
  // Indentation of every currently open list, innermost last. Spaces count
  // as 1 and tabs as 8 (see scan()). Index 0 is a sentinel of -1 pushed by
  // deserialize().
  vector<int16_t> indent_length_stack;
  // Bullet kind (a Bullet value) of every open list; pushed and popped
  // together with indent_length_stack. Index 0 is a NOTABULLET sentinel.
  vector<int16_t> bullet_stack;
  // Star count of every open headline section, innermost last. Index 0 is a
  // sentinel of 0.
  vector<int16_t> section_stack;
  // Starts in the empty state: deserialize() with a zero length only installs
  // the three sentinels.
  Scanner() {
    deserialize(NULL, 0);
  }
  // Writes the scanner state into `buffer` and returns the number of bytes
  // used. Layout (one byte per value, sentinels at index 0 are not stored):
  //   [count] [indent * count] [bullet * count] [section ...]
  // `count` is the number of open lists. It is limited so that it fits in one
  // byte and so that both list arrays fit in the buffer in full; otherwise
  // deserialize() would attribute bytes to the wrong stack. Section entries
  // fill whatever room is left.
  unsigned serialize(char *buffer) {                                   // {{{1
    const size_t capacity = TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
    // The two list stacks are pushed/popped together; take the smaller size
    // defensively and drop the sentinel (guarding against an empty stack).
    size_t list_count = indent_length_stack.size() < bullet_stack.size()
      ? indent_length_stack.size()
      : bullet_stack.size();
    if (list_count > 0) list_count--;
    if (list_count > UINT8_MAX) list_count = UINT8_MAX;
    if (list_count > (capacity - 1) / 2) list_count = (capacity - 1) / 2;
    size_t i = 0;
    buffer[i++] = static_cast<char>(list_count);
    for (size_t k = 1; k <= list_count; ++k) {
      buffer[i++] = static_cast<char>(indent_length_stack[k]);
    }
    for (size_t k = 1; k <= list_count; ++k) {
      buffer[i++] = static_cast<char>(bullet_stack[k]);
    }
    for (size_t k = 1; k < section_stack.size() && i < capacity; ++k) {
      buffer[i++] = static_cast<char>(section_stack[k]);
    }
    return static_cast<unsigned>(i);
  }
  // Restores the scanner state from `length` bytes written by serialize().
  // The three stacks are always reset to their sentinels first; a NULL or
  // empty buffer therefore yields the initial state. Expected layout:
  //   [count] [indent * count] [bullet * count] [section ...]
  void deserialize(const char *buffer, unsigned length) {              // {{{1
    section_stack.clear();
    section_stack.push_back(0);
    indent_length_stack.clear();
    indent_length_stack.push_back(-1);
    bullet_stack.clear();
    bullet_stack.push_back(NOTABULLET);
    if (buffer == NULL || length == 0) return;
    size_t i = 0;
    size_t indent_count = static_cast<uint8_t>(buffer[i++]);
    // A buffer too short for the announced list entries is malformed; keep
    // the initial state rather than reading past the end of the buffer.
    if (1 + 2 * indent_count > length) return;
    // Bytes are read as unsigned: every stored value is non-negative, and a
    // plain (possibly signed) char would turn 128..255 into negative numbers.
    for (; i <= indent_count    ; i++) indent_length_stack.push_back(static_cast<uint8_t>(buffer[i]));
    for (; i <= 2 * indent_count; i++) bullet_stack.push_back(static_cast<uint8_t>(buffer[i]));
    for (; i < length           ; i++) section_stack.push_back(static_cast<uint8_t>(buffer[i]));
  }
  // Moves the lexer to the next character by calling its advance callback
  // with `false` as the second argument (per the tree-sitter lexer API this
  // is the "skip" flag, so the character is kept as part of the token).
  void advance(TSLexer *lexer) {                                       // {{{1
    lexer->advance(lexer, false);
  }
  // Moves the lexer to the next character by calling its advance callback
  // with `true` as the second argument (per the tree-sitter lexer API this is
  // the "skip" flag, i.e. the character is treated like whitespace).
  void skip(TSLexer *lexer) {                                          // {{{1
    lexer->advance(lexer, true);
  }
  // Closes the innermost open list: pops its indent and bullet entries and
  // reports a LISTEND token. Returns true when the token was produced.
  // Returns false, leaving the state untouched, when no list is open.
  bool dedent(TSLexer *lexer) {                                        // {{{1
    // Index 0 of each stack is a sentinel, not a list. Popping it would leave
    // the vectors empty and make every later back() call undefined. This can
    // be reached when the parser marks all external tokens as valid (error
    // recovery) although no list has been started.
    if (indent_length_stack.size() <= 1 || bullet_stack.size() <= 1) return false;
    indent_length_stack.pop_back();
    bullet_stack.pop_back();
    lexer->result_symbol = LISTEND;
    // std::cout << " == Dedent~" << std::endl;
    return true;
  }
  // Classifies the list bullet starting at the current lookahead character.
  // Recognised forms: "-", "+", "*", a single ASCII letter or a run of digits
  // followed by "." or ")". In every case the bullet must be followed by a
  // whitespace character. Characters are consumed with skip() while looking;
  // they stay consumed even when the result is NOTABULLET.
  Bullet getbullet(TSLexer *lexer) {                                   // {{{1
    const int32_t head = lexer->lookahead;

    // Unordered bullets: one symbol, then whitespace.
    Bullet symbol = NOTABULLET;
    switch (head) {
      case '-': symbol = DASH; break;
      case '+': symbol = PLUS; break;
      case '*': symbol = STAR; break;
      default: break;
    }
    if (symbol != NOTABULLET) {
      skip(lexer);
      return iswspace(lexer->lookahead) ? symbol : NOTABULLET;
    }

    // Ordered bullets: consume the counter and remember which pair of kinds
    // its terminator selects between.
    Bullet with_dot = NOTABULLET;
    Bullet with_paren = NOTABULLET;
    if ('a' <= head && head <= 'z') {
      with_dot = LOWERDOT;
      with_paren = LOWERPAREN;
      skip(lexer);
    } else if ('A' <= head && head <= 'Z') {
      with_dot = UPPERDOT;
      with_paren = UPPERPAREN;
      skip(lexer);
    } else if ('0' <= head && head <= '9') {
      with_dot = NUMDOT;
      with_paren = NUMPAREN;
      do {
        skip(lexer);
      } while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
    } else {
      return NOTABULLET;
    }

    // The counter must be terminated by "." or ")" and then whitespace.
    Bullet candidate = NOTABULLET;
    if (lexer->lookahead == '.') {
      candidate = with_dot;
    } else if (lexer->lookahead == ')') {
      candidate = with_paren;
    } else {
      return NOTABULLET;
    }
    skip(lexer);
    return iswspace(lexer->lookahead) ? candidate : NOTABULLET;
  }
  // Tries to recognise one external token at the current lexer position.
  //   lexer         - tree-sitter lexer; on success `result_symbol` holds the
  //                   TokenType that was recognised.
  //   valid_symbols - indexed by TokenType; true for every token the parser
  //                   can accept at this point.
  // Returns true when a token was produced, false otherwise. The checks run
  // in this order: section end at end of input, list item / list end,
  // headline stars at indent 0, list start and bullets, inline markup.
  bool scan(TSLexer *lexer, const bool *valid_symbols) {               // {{{1
    // std::cout << " == " << valid_symbols[LISTSTART] << ", " << valid_symbols[LISTEND] << ", " << valid_symbols[LISTITEMEND] << ", " << valid_symbols[BULLET] << ", " << valid_symbols[HLSTARS] << ", " << valid_symbols[SECTIONEND] << ", " << valid_symbols[MARKUP] << std::endl;
    // End of input closes every section that is still open, one per call.
    if (valid_symbols[SECTIONEND] && lexer->lookahead == '\0' && section_stack.back() > 0) {
      lexer->result_symbol = SECTIONEND;
      section_stack.pop_back();
      return true;
    }
    int16_t indent_length = 0;
    // - Listiem ends                                                     {{{1
    // Listend -> end of a line, looking for:
    // 1. dedent
    // 2. same indent, not a bullet
    // 3. three eols
    if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
      int16_t newlines = 0;
      // The token is zero-width: everything examined below is only lookahead.
      lexer->mark_end(lexer);
      for (;;) {
        if (lexer->lookahead == ' ' && newlines > 0) {
          indent_length++;
        } else if (lexer->lookahead == '\t' && newlines > 0) {
          indent_length += 8;
        } else if (lexer->lookahead == '\0') {
          return dedent(lexer);
        } else if (lexer->lookahead == '\n') {
          if (++newlines > 2) return dedent(lexer);
          indent_length = 0;  // indentation is measured on the last line seen
        } else {
          break;
        }
        skip(lexer);
      }
      // Still on the item's own line: the item has not ended yet.
      if (newlines == 0) return false;
      if (indent_length < indent_length_stack.back()) {
        return dedent(lexer);
      } else if (indent_length == indent_length_stack.back()) {
        // Same indent: a matching bullet starts the next item of this list,
        // anything else ends the list.
        if (getbullet(lexer) == bullet_stack.back()) {
          // std::cout << " == Item end~" << std::endl;
          lexer->result_symbol = LISTITEMEND;
          return true;
        }
        return dedent(lexer);
      }
      // Deeper indent: the line continues the current item.
      return false;
    }
    // - Count whitespace                                                 {{{1
    lexer->mark_end(lexer);
    for (;;) {
      if (lexer->lookahead == ' ') {
        indent_length++;
      } else if (lexer->lookahead == '\t') {
        indent_length += 8;
      } else if (lexer->lookahead == '\n') {
        return false;
      } else {
        break;
      }
      skip(lexer);
    }
    // std::cout << " == indent: " << indent_length << " next: '" << char(lexer->lookahead) << "'" << std::endl;
    // - Col=0 star                                                       {{{1
    if (indent_length == 0 && lexer->lookahead == '*') {
      lexer->mark_end(lexer);
      int16_t stars = 1;
      skip(lexer);
      while (lexer->lookahead == '*') {
        stars++;
        skip(lexer);
      }
      if (valid_symbols[SECTIONEND] && stars <= section_stack.back()) {
        // A headline at the same or a shallower level closes the open section.
        section_stack.pop_back();
        lexer->result_symbol = SECTIONEND;
        // std::cout << " == Section End~" << std::endl;
        return true;
      } else if (valid_symbols[HLSTARS] && (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
        // The parentheses matter: `a && b || c` parses as `(a && b) || c`,
        // which would emit HLSTARS (and push a section) for "*<tab>" even
        // when the parser does not accept a headline here.
        section_stack.push_back(stars);
        lexer->mark_end(lexer);
        lexer->result_symbol = HLSTARS;
        // std::cout << " == Stars~" << std::endl;
        return true;
      } else if (valid_symbols[MARKUP] && stars == 1 && (!iswspace(lexer->lookahead) && lexer->lookahead != '\0')) {
        lexer->result_symbol = MARKUP;
        // std::cout << " == Bold~" << std::endl;
        return true;
      }
      return false;
    }
    // - Liststart and bullets                                            {{{1
    if (valid_symbols[LISTSTART] || valid_symbols[BULLET]) {
      bool plus = lexer->lookahead == '+'; // requires special treatment, like *
      Bullet bullet = getbullet(lexer);
      // std::cout << " == bullet: " << bullet << " back indent: " << indent_length_stack.back() << std::endl;
      // std::cout << " == il gt back: " << (indent_length > indent_length_stack.back()) << std::endl;
      if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
        // Same bullet kind at the same indent: next item of the current list.
        lexer->mark_end(lexer);
        lexer->result_symbol = BULLET;
        // std::cout << " == Bullet~" << std::endl;
        return true;
      } else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
        // A bullet at a deeper indent opens a new (possibly nested) list.
        indent_length_stack.push_back(indent_length);
        bullet_stack.push_back(bullet);
        lexer->result_symbol = LISTSTART;
        // std::cout << " == Liststart~" << std::endl;
        return true;
      } else if (valid_symbols[MARKUP] && bullet == NOTABULLET && plus) {
        // "+" directly followed by text is strikethrough markup, not a bullet.
        lexer->result_symbol = MARKUP;
        // std::cout << " == Markup~" << std::endl;
        return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
      }
    }
    // - Markup                                                           {{{1
    // A markup delimiter is accepted after whitespace or at column 0, and
    // only when the character after it is neither whitespace nor end of input.
    if (valid_symbols[MARKUP] && (indent_length > 0 || lexer->get_column(lexer) == 0)
      && (lexer->lookahead == '*'
      || lexer->lookahead == '/'
      || lexer->lookahead == '_'
      || lexer->lookahead == '+'
      || lexer->lookahead == '~'
      || lexer->lookahead == '=')) {
      lexer->mark_end(lexer);
      skip(lexer);
      lexer->result_symbol = MARKUP;
      // std::cout << " == Markup~" << std::endl;
      return (!iswspace(lexer->lookahead) && lexer->lookahead != '\0');
    }
    // - Default                                                          {{{1
    // std::cout << " == False~" << std::endl;
    return false;
  }
 };
 }
 extern "C" {                                                           // {{{1
 void *tree_sitter_org_external_scanner_create() {
  return new Scanner();
 }
 bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->scan(lexer, valid_symbols);
 }
 unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  return scanner->serialize(buffer);
 }
 void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  scanner->deserialize(buffer, length);
 }
 void tree_sitter_org_external_scanner_destroy(void *payload) {
  Scanner *scanner = static_cast<Scanner *>(payload);
  delete scanner;
 }
 }