From 64cfbc213f5a83da17632c95382a5a0a2f3357c1 Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Mon, 19 Jun 2023 18:05:11 -0400 Subject: [PATCH] feat: rewrite the scanner in C (#40) * feat: rewrite the scanner in C * chore: update manifests & docs --- README.md | 7 +- binding.gyp | 2 +- bindings/rust/build.rs | 21 +-- src/scanner.c | 342 +++++++++++++++++++++++++++++++++++++++++ src/scanner.cc | 293 ----------------------------------- 5 files changed, 348 insertions(+), 317 deletions(-) create mode 100644 src/scanner.c delete mode 100644 src/scanner.cc diff --git a/README.md b/README.md index 5888b26..c79dd19 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Like in many regex systems, `*/+` is read as "0/1 or more", and `?` is 0 or 1. ## Example -``` org +```org #+TITLE: Example Some *marked up* words @@ -43,6 +43,7 @@ Text ``` Parses as: + ``` (document [0, 0] - [16, 0] body: (body [0, 0] - [4, 0] @@ -117,13 +118,13 @@ For manual install, use `make`. For neovim, using `nvim-treesitter/nvim-treesitter`, add to your configuration: -``` lua +```lua local parser_config = require "nvim-treesitter.parsers".get_parser_configs() parser_config.org = { install_info = { url = 'https://github.com/milisims/tree-sitter-org', revision = 'main', - files = { 'src/parser.c', 'src/scanner.cc' }, + files = { 'src/parser.c', 'src/scanner.c' }, }, filetype = 'org', } diff --git a/binding.gyp b/binding.gyp index 5494cf5..e729278 100644 --- a/binding.gyp +++ b/binding.gyp @@ -9,7 +9,7 @@ "sources": [ "src/parser.c", "bindings/node/binding.cc", - "src/scanner.cc" + "src/scanner.c" ], "cflags_c": [ "-std=c99", diff --git a/bindings/rust/build.rs b/bindings/rust/build.rs index 618e90a..8851fed 100644 --- a/bindings/rust/build.rs +++ b/bindings/rust/build.rs @@ -2,7 +2,7 @@ fn main() { let src_dir = std::path::Path::new("src"); let mut c_config = cc::Build::new(); - c_config.include(&src_dir); + c_config.include(src_dir); c_config .flag_if_supported("-Wno-unused-parameter") .flag_if_supported("-Wno-unused-but-set-variable") @@ -10,29 +10,10 @@ fn main() { let parser_path = src_dir.join("parser.c"); c_config.file(&parser_path); - // If your language uses an external scanner written in C, - // then include this block of code: - - /* let scanner_path = src_dir.join("scanner.c"); c_config.file(&scanner_path); println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap()); - */ c_config.compile("parser"); println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap()); - - // If your language uses an external scanner written in C++, - // then include this block of code: - - let mut cpp_config = cc::Build::new(); - cpp_config.cpp(true); - cpp_config.include(&src_dir); - cpp_config - .flag_if_supported("-Wno-unused-parameter") - .flag_if_supported("-Wno-unused-but-set-variable"); - let scanner_path = src_dir.join("scanner.cc"); - cpp_config.file(&scanner_path); - cpp_config.compile("scanner"); - println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap()); } diff --git a/src/scanner.c b/src/scanner.c new file mode 100644 index 0000000..f305612 --- /dev/null +++ b/src/scanner.c @@ -0,0 +1,342 @@ +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + { \ + (vec)->data = realloc((vec)->data, (_cap) * sizeof((vec)->data[0])); \ + assert((vec)->data != NULL); \ + (vec)->cap = (_cap); \ + } + +#define VEC_PUSH(vec, el) \ + { \ + if ((vec)->cap == (vec)->len) { \ + VEC_RESIZE((vec), MAX(16, (vec)->len * 2)); \ + } \ + (vec)->data[(vec)->len++] = (el); \ + } + +#define VEC_POP(vec) (vec)->len--; + +#define VEC_BACK(vec) ((vec)->data[(vec)->len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec)->data != NULL) \ + free((vec)->data); \ + } + +#define VEC_CLEAR(vec) \ + { (vec)->len = 0; } + +enum TokenType { + LISTSTART, + LISTEND, + LISTITEMEND, + BULLET, + HLSTARS, + SECTIONEND, + ENDOFFILE, +}; + +typedef enum { + NOTABULLET, + DASH, + PLUS, + STAR, + LOWERDOT, + UPPERDOT, + LOWERPAREN, + UPPERPAREN, + NUMDOT, + NUMPAREN, +} Bullet; + +typedef struct { + uint32_t len; + uint32_t cap; + int16_t *data; +} stack; + +typedef struct { + stack *indent_length_stack; + stack *bullet_stack; + stack *section_stack; +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +unsigned serialize(Scanner *scanner, char *buffer) { + size_t i = 0; + + size_t indent_count = scanner->indent_length_stack->len - 1; + if (indent_count > UINT8_MAX) + indent_count = UINT8_MAX; + buffer[i++] = indent_count; + + int iter = 1; + for (; iter < scanner->indent_length_stack->len && + i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + ++iter) { + buffer[i++] = scanner->indent_length_stack->data[iter]; + } + + iter = 1; + for (; iter < scanner->bullet_stack->len && + i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + ++iter) { + buffer[i++] = scanner->bullet_stack->data[iter]; + } + + iter = 1; + for (; iter < scanner->section_stack->len && + i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + ++iter) { + buffer[i++] = scanner->section_stack->data[iter]; + } + + return i; +} + +void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + VEC_CLEAR(scanner->section_stack); + VEC_PUSH(scanner->section_stack, 0); + VEC_CLEAR(scanner->indent_length_stack); + VEC_PUSH(scanner->indent_length_stack, -1); + VEC_CLEAR(scanner->bullet_stack); + VEC_PUSH(scanner->bullet_stack, NOTABULLET); + + if (length == 0) + return; + + size_t i = 0; + + size_t indent_count = (uint8_t)buffer[i++]; + + for (; i <= indent_count; i++) + VEC_PUSH(scanner->indent_length_stack, buffer[i]); + for (; i <= 2 * indent_count; i++) + VEC_PUSH(scanner->bullet_stack, buffer[i]); + for (; i < length; i++) + VEC_PUSH(scanner->section_stack, buffer[i]); +} + +static bool dedent(Scanner *scanner, TSLexer *lexer) { + VEC_POP(scanner->indent_length_stack); + VEC_POP(scanner->bullet_stack); + lexer->result_symbol = LISTEND; + return true; +} + +static bool in_error_recovery(const bool *valid_symbols) { + return (valid_symbols[LISTSTART] && valid_symbols[LISTEND] && + valid_symbols[LISTITEMEND] && valid_symbols[BULLET] && + valid_symbols[HLSTARS] && valid_symbols[SECTIONEND] && + valid_symbols[ENDOFFILE]); +} + +Bullet getbullet(TSLexer *lexer) { + if (lexer->lookahead == '-') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return DASH; + } else if (lexer->lookahead == '+') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return PLUS; + } else if (lexer->lookahead == '*') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return STAR; + } else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') { + advance(lexer); + if (lexer->lookahead == '.') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return LOWERDOT; + } else if (lexer->lookahead == ')') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return LOWERPAREN; + } + } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') { + advance(lexer); + if (lexer->lookahead == '.') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return UPPERDOT; + } else if (lexer->lookahead == ')') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return UPPERPAREN; + } + } else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') { + do { + advance(lexer); + } while ('0' <= lexer->lookahead && lexer->lookahead <= '9'); + if (lexer->lookahead == '.') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return NUMDOT; + } else if (lexer->lookahead == ')') { + advance(lexer); + if (iswspace(lexer->lookahead)) + return NUMPAREN; + } + } + return NOTABULLET; +} + +bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + if (in_error_recovery(valid_symbols)) + return false; + + // - Section ends + int16_t indent_length = 0; + lexer->mark_end(lexer); + for (;;) { + if (lexer->lookahead == ' ') { + indent_length++; + } else if (lexer->lookahead == '\t') { + indent_length += 8; + } else if (lexer->lookahead == '\0') { + if (valid_symbols[LISTEND]) { + lexer->result_symbol = LISTEND; + } else if (valid_symbols[SECTIONEND]) { + lexer->result_symbol = SECTIONEND; + } else if (valid_symbols[ENDOFFILE]) { + lexer->result_symbol = ENDOFFILE; + } else + return false; + + return true; + } else { + break; + } + skip(lexer); + } + + // - Listiem ends + // Listend -> end of a line, looking for: + // 1. dedent + // 2. same indent, not a bullet + // 3. two eols + int16_t newlines = 0; + if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) { + for (;;) { + if (lexer->lookahead == ' ') { + indent_length++; + } else if (lexer->lookahead == '\t') { + indent_length += 8; + } else if (lexer->lookahead == '\0') { + return dedent(scanner, lexer); + } else if (lexer->lookahead == '\n') { + if (++newlines > 1) + return dedent(scanner, lexer); + indent_length = 0; + } else { + break; + } + skip(lexer); + } + + if (indent_length < VEC_BACK(scanner->indent_length_stack)) { + return dedent(scanner, lexer); + } else if (indent_length == VEC_BACK(scanner->indent_length_stack)) { + if (getbullet(lexer) == VEC_BACK(scanner->bullet_stack)) { + lexer->result_symbol = LISTITEMEND; + return true; + } + return dedent(scanner, lexer); + } + } + + // - Col=0 star + if (indent_length == 0 && lexer->lookahead == '*') { + lexer->mark_end(lexer); + int16_t stars = 1; + skip(lexer); + while (lexer->lookahead == '*') { + stars++; + skip(lexer); + } + + if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) && + stars > 0 && stars <= VEC_BACK(scanner->section_stack)) { + VEC_POP(scanner->section_stack); + lexer->result_symbol = SECTIONEND; + return true; + } else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) { + VEC_PUSH(scanner->section_stack, stars); + lexer->result_symbol = HLSTARS; + return true; + } + return false; + } + + // - Liststart and bullets + if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) { + Bullet bullet = getbullet(lexer); + + if (valid_symbols[BULLET] && + bullet == VEC_BACK(scanner->bullet_stack) && + indent_length == VEC_BACK(scanner->indent_length_stack)) { + lexer->mark_end(lexer); + lexer->result_symbol = BULLET; + return true; + } else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && + indent_length > VEC_BACK(scanner->indent_length_stack)) { + VEC_PUSH(scanner->indent_length_stack, indent_length); + VEC_PUSH(scanner->bullet_stack, bullet); + lexer->result_symbol = LISTSTART; + return true; + } + } + + return false; // default +} + +void *tree_sitter_org_external_scanner_create() { + Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); + scanner->indent_length_stack = (stack *)calloc(1, sizeof(stack)); + scanner->bullet_stack = (stack *)calloc(1, sizeof(stack)); + scanner->section_stack = (stack *)calloc(1, sizeof(stack)); + deserialize(scanner, NULL, 0); + return scanner; +} + +bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_org_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_org_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +void tree_sitter_org_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + VEC_FREE(scanner->indent_length_stack); + VEC_FREE(scanner->bullet_stack); + VEC_FREE(scanner->section_stack); + free(scanner->indent_length_stack); + free(scanner->bullet_stack); + free(scanner->section_stack); + free(scanner); +} diff --git a/src/scanner.cc b/src/scanner.cc deleted file mode 100644 index f198183..0000000 --- a/src/scanner.cc +++ /dev/null @@ -1,293 +0,0 @@ -#include -#include -#include - -namespace { - -using std::vector; -using std::iswspace; - -enum TokenType { - LISTSTART, - LISTEND, - LISTITEMEND, - BULLET, - HLSTARS, - SECTIONEND, - ENDOFFILE, -}; - -enum Bullet { - NOTABULLET, - DASH, - PLUS, - STAR, - LOWERDOT, - UPPERDOT, - LOWERPAREN, - UPPERPAREN, - NUMDOT, - NUMPAREN, -}; - -struct Scanner { - vector indent_length_stack; - vector bullet_stack; - vector section_stack; - - Scanner() { - deserialize(NULL, 0); - } - - unsigned serialize(char *buffer) { - size_t i = 0; - - size_t indent_count = indent_length_stack.size() - 1; - if (indent_count > UINT8_MAX) indent_count = UINT8_MAX; - buffer[i++] = indent_count; - - vector::iterator - iter = indent_length_stack.begin() + 1, - end = indent_length_stack.end(); - - for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) { - buffer[i++] = *iter; - } - - iter = bullet_stack.begin() + 1; - end = bullet_stack.end(); - for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) { - buffer[i++] = *iter; - } - - iter = section_stack.begin() + 1; - end = section_stack.end(); - - for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) { - buffer[i++] = *iter; - } - - return i; - } - - void deserialize(const char *buffer, unsigned length) { - section_stack.clear(); - section_stack.push_back(0); - indent_length_stack.clear(); - indent_length_stack.push_back(-1); - bullet_stack.clear(); - bullet_stack.push_back(NOTABULLET); - - if (length == 0) return; - - size_t i = 0; - - size_t indent_count = (uint8_t)buffer[i++]; - - for (; i <= indent_count ; i++) indent_length_stack.push_back(buffer[i]); - for (; i <= 2 * indent_count; i++) bullet_stack.push_back(buffer[i]); - for (; i < length ; i++) section_stack.push_back(buffer[i]); - - } - - void advance(TSLexer *lexer) { - lexer->advance(lexer, false); - } - - void skip(TSLexer *lexer) { - lexer->advance(lexer, true); - } - - bool dedent(TSLexer *lexer) { - indent_length_stack.pop_back(); - bullet_stack.pop_back(); - lexer->result_symbol = LISTEND; - return true; - } - - bool in_error_recovery(const bool *valid_symbols) { - return (valid_symbols[LISTSTART] && - valid_symbols[LISTEND] && - valid_symbols[LISTITEMEND] && - valid_symbols[BULLET] && - valid_symbols[HLSTARS] && - valid_symbols[SECTIONEND] && - valid_symbols[ENDOFFILE]); - } - - Bullet getbullet(TSLexer *lexer) { - if (lexer->lookahead == '-') { - advance(lexer); - if (iswspace(lexer->lookahead)) return DASH; - } else if (lexer->lookahead == '+') { - advance(lexer); - if (iswspace(lexer->lookahead)) return PLUS; - } else if (lexer->lookahead == '*') { - advance(lexer); - if (iswspace(lexer->lookahead)) return STAR; - } else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') { - advance(lexer); - if (lexer->lookahead == '.') { - advance(lexer); - if (iswspace(lexer->lookahead)) return LOWERDOT; - } else if (lexer->lookahead == ')') { - advance(lexer); - if (iswspace(lexer->lookahead)) return LOWERPAREN; - } - } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') { - advance(lexer); - if (lexer->lookahead == '.') { - advance(lexer); - if (iswspace(lexer->lookahead)) return UPPERDOT; - } else if (lexer->lookahead == ')') { - advance(lexer); - if (iswspace(lexer->lookahead)) return UPPERPAREN; - } - } else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') { - do { - advance(lexer); - } while ('0' <= lexer->lookahead && lexer->lookahead <= '9'); - if (lexer->lookahead == '.') { - advance(lexer); - if (iswspace(lexer->lookahead)) return NUMDOT; - } else if (lexer->lookahead == ')') { - advance(lexer); - if (iswspace(lexer->lookahead)) return NUMPAREN; - } - } - return NOTABULLET; - } - -bool scan(TSLexer *lexer, const bool *valid_symbols) { - - if (in_error_recovery(valid_symbols)) - return false; - - - // - Section ends - int16_t indent_length = 0; - lexer->mark_end(lexer); - for (;;) { - if (lexer->lookahead == ' ') { - indent_length++; - } else if (lexer->lookahead == '\t') { - indent_length += 8; - } else if (lexer->lookahead == '\0') { - - if (valid_symbols[LISTEND]) { lexer->result_symbol = LISTEND; } - else if (valid_symbols[SECTIONEND]) { lexer->result_symbol = SECTIONEND; } - else if (valid_symbols[ENDOFFILE]) { lexer->result_symbol = ENDOFFILE; } - else return false; - - return true; - } else { - break; - } - skip(lexer); - } - - // - Listiem ends - // Listend -> end of a line, looking for: - // 1. dedent - // 2. same indent, not a bullet - // 3. two eols - int16_t newlines = 0; - if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) { - for (;;) { - if (lexer->lookahead == ' ') { - indent_length++; - } else if (lexer->lookahead == '\t') { - indent_length += 8; - } else if (lexer->lookahead == '\0') { - return dedent(lexer); - } else if (lexer->lookahead == '\n') { - if (++newlines > 1) return dedent(lexer); - indent_length = 0; - } else { - break; - } - skip(lexer); - } - - if (indent_length < indent_length_stack.back()) { - return dedent(lexer); - } else if (indent_length == indent_length_stack.back()) { - if (getbullet(lexer) == bullet_stack.back()) { - lexer->result_symbol = LISTITEMEND; - return true; - } - return dedent(lexer); - } - } - - // - Col=0 star - if (indent_length == 0 && lexer->lookahead == '*') { - lexer->mark_end(lexer); - int16_t stars = 1; - skip(lexer); - while (lexer->lookahead == '*') { - stars++; - skip(lexer); - } - - if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) && stars > 0 && stars <= section_stack.back()) { - section_stack.pop_back(); - lexer->result_symbol = SECTIONEND; - return true; - } else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) { - section_stack.push_back(stars); - lexer->result_symbol = HLSTARS; - return true; - } - return false; - } - - // - Liststart and bullets - if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) { - Bullet bullet = getbullet(lexer); - - if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) { - lexer->mark_end(lexer); - lexer->result_symbol = BULLET; - return true; - } else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) { - indent_length_stack.push_back(indent_length); - bullet_stack.push_back(bullet); - lexer->result_symbol = LISTSTART; - return true; - } - } - - return false; // default -} -}; - -} - -extern "C" { - -void *tree_sitter_org_external_scanner_create() { - return new Scanner(); -} - -bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - Scanner *scanner = static_cast(payload); - return scanner->scan(lexer, valid_symbols); -} - -unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) { - Scanner *scanner = static_cast(payload); - return scanner->serialize(buffer); -} - -void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { - Scanner *scanner = static_cast(payload); - scanner->deserialize(buffer, length); -} - -void tree_sitter_org_external_scanner_destroy(void *payload) { - Scanner *scanner = static_cast(payload); - delete scanner; -} - -}