feat: rewrite the scanner in C (#40)

* feat: rewrite the scanner in C
* chore: update manifests & docs
This commit is contained in:
Amaan Qureshi 2023-06-19 18:05:11 -04:00 committed by GitHub
parent 081179c52b
commit 64cfbc213f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 348 additions and 317 deletions

View file

@ -23,7 +23,7 @@ Like in many regex systems, `*/+` is read as "0/1 or more", and `?` is 0 or 1.
## Example
``` org
```org
#+TITLE: Example
Some *marked up* words
@ -43,6 +43,7 @@ Text
```
Parses as:
```
(document [0, 0] - [16, 0]
body: (body [0, 0] - [4, 0]
@ -117,13 +118,13 @@ For manual install, use `make`.
For neovim, using `nvim-treesitter/nvim-treesitter`, add to your configuration:
``` lua
```lua
local parser_config = require "nvim-treesitter.parsers".get_parser_configs()
parser_config.org = {
install_info = {
url = 'https://github.com/milisims/tree-sitter-org',
revision = 'main',
files = { 'src/parser.c', 'src/scanner.cc' },
files = { 'src/parser.c', 'src/scanner.c' },
},
filetype = 'org',
}

View file

@ -9,7 +9,7 @@
"sources": [
"src/parser.c",
"bindings/node/binding.cc",
"src/scanner.cc"
"src/scanner.c"
],
"cflags_c": [
"-std=c99",

View file

@ -2,7 +2,7 @@ fn main() {
let src_dir = std::path::Path::new("src");
let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config.include(src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
@ -10,29 +10,10 @@ fn main() {
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
// If your language uses an external scanner written in C,
// then include this block of code:
/*
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
// If your language uses an external scanner written in C++,
// then include this block of code:
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
}

342
src/scanner.c Normal file
View file

@ -0,0 +1,342 @@
#include <assert.h>
#include <stdio.h>
#include <tree_sitter/parser.h>
#include <wctype.h>
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define VEC_RESIZE(vec, _cap) \
{ \
(vec)->data = realloc((vec)->data, (_cap) * sizeof((vec)->data[0])); \
assert((vec)->data != NULL); \
(vec)->cap = (_cap); \
}
#define VEC_PUSH(vec, el) \
{ \
if ((vec)->cap == (vec)->len) { \
VEC_RESIZE((vec), MAX(16, (vec)->len * 2)); \
} \
(vec)->data[(vec)->len++] = (el); \
}
#define VEC_POP(vec) (vec)->len--;
#define VEC_BACK(vec) ((vec)->data[(vec)->len - 1])
#define VEC_FREE(vec) \
{ \
if ((vec)->data != NULL) \
free((vec)->data); \
}
#define VEC_CLEAR(vec) \
{ (vec)->len = 0; }
enum TokenType {
LISTSTART,
LISTEND,
LISTITEMEND,
BULLET,
HLSTARS,
SECTIONEND,
ENDOFFILE,
};
typedef enum {
NOTABULLET,
DASH,
PLUS,
STAR,
LOWERDOT,
UPPERDOT,
LOWERPAREN,
UPPERPAREN,
NUMDOT,
NUMPAREN,
} Bullet;
typedef struct {
uint32_t len;
uint32_t cap;
int16_t *data;
} stack;
typedef struct {
stack *indent_length_stack;
stack *bullet_stack;
stack *section_stack;
} Scanner;
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
unsigned serialize(Scanner *scanner, char *buffer) {
size_t i = 0;
size_t indent_count = scanner->indent_length_stack->len - 1;
if (indent_count > UINT8_MAX)
indent_count = UINT8_MAX;
buffer[i++] = indent_count;
int iter = 1;
for (; iter < scanner->indent_length_stack->len &&
i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
++iter) {
buffer[i++] = scanner->indent_length_stack->data[iter];
}
iter = 1;
for (; iter < scanner->bullet_stack->len &&
i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
++iter) {
buffer[i++] = scanner->bullet_stack->data[iter];
}
iter = 1;
for (; iter < scanner->section_stack->len &&
i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
++iter) {
buffer[i++] = scanner->section_stack->data[iter];
}
return i;
}
void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
VEC_CLEAR(scanner->section_stack);
VEC_PUSH(scanner->section_stack, 0);
VEC_CLEAR(scanner->indent_length_stack);
VEC_PUSH(scanner->indent_length_stack, -1);
VEC_CLEAR(scanner->bullet_stack);
VEC_PUSH(scanner->bullet_stack, NOTABULLET);
if (length == 0)
return;
size_t i = 0;
size_t indent_count = (uint8_t)buffer[i++];
for (; i <= indent_count; i++)
VEC_PUSH(scanner->indent_length_stack, buffer[i]);
for (; i <= 2 * indent_count; i++)
VEC_PUSH(scanner->bullet_stack, buffer[i]);
for (; i < length; i++)
VEC_PUSH(scanner->section_stack, buffer[i]);
}
static bool dedent(Scanner *scanner, TSLexer *lexer) {
VEC_POP(scanner->indent_length_stack);
VEC_POP(scanner->bullet_stack);
lexer->result_symbol = LISTEND;
return true;
}
static bool in_error_recovery(const bool *valid_symbols) {
return (valid_symbols[LISTSTART] && valid_symbols[LISTEND] &&
valid_symbols[LISTITEMEND] && valid_symbols[BULLET] &&
valid_symbols[HLSTARS] && valid_symbols[SECTIONEND] &&
valid_symbols[ENDOFFILE]);
}
Bullet getbullet(TSLexer *lexer) {
if (lexer->lookahead == '-') {
advance(lexer);
if (iswspace(lexer->lookahead))
return DASH;
} else if (lexer->lookahead == '+') {
advance(lexer);
if (iswspace(lexer->lookahead))
return PLUS;
} else if (lexer->lookahead == '*') {
advance(lexer);
if (iswspace(lexer->lookahead))
return STAR;
} else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead))
return LOWERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead))
return LOWERPAREN;
}
} else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead))
return UPPERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead))
return UPPERPAREN;
}
} else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') {
do {
advance(lexer);
} while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead))
return NUMDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead))
return NUMPAREN;
}
}
return NOTABULLET;
}
bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
if (in_error_recovery(valid_symbols))
return false;
// - Section ends
int16_t indent_length = 0;
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
if (valid_symbols[LISTEND]) {
lexer->result_symbol = LISTEND;
} else if (valid_symbols[SECTIONEND]) {
lexer->result_symbol = SECTIONEND;
} else if (valid_symbols[ENDOFFILE]) {
lexer->result_symbol = ENDOFFILE;
} else
return false;
return true;
} else {
break;
}
skip(lexer);
}
// - Listiem ends
// Listend -> end of a line, looking for:
// 1. dedent
// 2. same indent, not a bullet
// 3. two eols
int16_t newlines = 0;
if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
return dedent(scanner, lexer);
} else if (lexer->lookahead == '\n') {
if (++newlines > 1)
return dedent(scanner, lexer);
indent_length = 0;
} else {
break;
}
skip(lexer);
}
if (indent_length < VEC_BACK(scanner->indent_length_stack)) {
return dedent(scanner, lexer);
} else if (indent_length == VEC_BACK(scanner->indent_length_stack)) {
if (getbullet(lexer) == VEC_BACK(scanner->bullet_stack)) {
lexer->result_symbol = LISTITEMEND;
return true;
}
return dedent(scanner, lexer);
}
}
// - Col=0 star
if (indent_length == 0 && lexer->lookahead == '*') {
lexer->mark_end(lexer);
int16_t stars = 1;
skip(lexer);
while (lexer->lookahead == '*') {
stars++;
skip(lexer);
}
if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) &&
stars > 0 && stars <= VEC_BACK(scanner->section_stack)) {
VEC_POP(scanner->section_stack);
lexer->result_symbol = SECTIONEND;
return true;
} else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) {
VEC_PUSH(scanner->section_stack, stars);
lexer->result_symbol = HLSTARS;
return true;
}
return false;
}
// - Liststart and bullets
if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) {
Bullet bullet = getbullet(lexer);
if (valid_symbols[BULLET] &&
bullet == VEC_BACK(scanner->bullet_stack) &&
indent_length == VEC_BACK(scanner->indent_length_stack)) {
lexer->mark_end(lexer);
lexer->result_symbol = BULLET;
return true;
} else if (valid_symbols[LISTSTART] && bullet != NOTABULLET &&
indent_length > VEC_BACK(scanner->indent_length_stack)) {
VEC_PUSH(scanner->indent_length_stack, indent_length);
VEC_PUSH(scanner->bullet_stack, bullet);
lexer->result_symbol = LISTSTART;
return true;
}
}
return false; // default
}
void *tree_sitter_org_external_scanner_create() {
Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner));
scanner->indent_length_stack = (stack *)calloc(1, sizeof(stack));
scanner->bullet_stack = (stack *)calloc(1, sizeof(stack));
scanner->section_stack = (stack *)calloc(1, sizeof(stack));
deserialize(scanner, NULL, 0);
return scanner;
}
bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = (Scanner *)payload;
return scan(scanner, lexer, valid_symbols);
}
unsigned tree_sitter_org_external_scanner_serialize(void *payload,
char *buffer) {
Scanner *scanner = (Scanner *)payload;
return serialize(scanner, buffer);
}
void tree_sitter_org_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
Scanner *scanner = (Scanner *)payload;
deserialize(scanner, buffer, length);
}
void tree_sitter_org_external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
VEC_FREE(scanner->indent_length_stack);
VEC_FREE(scanner->bullet_stack);
VEC_FREE(scanner->section_stack);
free(scanner->indent_length_stack);
free(scanner->bullet_stack);
free(scanner->section_stack);
free(scanner);
}

View file

@ -1,293 +0,0 @@
#include <tree_sitter/parser.h>
#include <vector>
#include <cwctype>
namespace {
using std::vector;
using std::iswspace;
enum TokenType {
LISTSTART,
LISTEND,
LISTITEMEND,
BULLET,
HLSTARS,
SECTIONEND,
ENDOFFILE,
};
enum Bullet {
NOTABULLET,
DASH,
PLUS,
STAR,
LOWERDOT,
UPPERDOT,
LOWERPAREN,
UPPERPAREN,
NUMDOT,
NUMPAREN,
};
struct Scanner {
vector<int16_t> indent_length_stack;
vector<int16_t> bullet_stack;
vector<int16_t> section_stack;
Scanner() {
deserialize(NULL, 0);
}
unsigned serialize(char *buffer) {
size_t i = 0;
size_t indent_count = indent_length_stack.size() - 1;
if (indent_count > UINT8_MAX) indent_count = UINT8_MAX;
buffer[i++] = indent_count;
vector<int16_t>::iterator
iter = indent_length_stack.begin() + 1,
end = indent_length_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = bullet_stack.begin() + 1;
end = bullet_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
iter = section_stack.begin() + 1;
end = section_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) {
buffer[i++] = *iter;
}
return i;
}
void deserialize(const char *buffer, unsigned length) {
section_stack.clear();
section_stack.push_back(0);
indent_length_stack.clear();
indent_length_stack.push_back(-1);
bullet_stack.clear();
bullet_stack.push_back(NOTABULLET);
if (length == 0) return;
size_t i = 0;
size_t indent_count = (uint8_t)buffer[i++];
for (; i <= indent_count ; i++) indent_length_stack.push_back(buffer[i]);
for (; i <= 2 * indent_count; i++) bullet_stack.push_back(buffer[i]);
for (; i < length ; i++) section_stack.push_back(buffer[i]);
}
void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
bool dedent(TSLexer *lexer) {
indent_length_stack.pop_back();
bullet_stack.pop_back();
lexer->result_symbol = LISTEND;
return true;
}
bool in_error_recovery(const bool *valid_symbols) {
return (valid_symbols[LISTSTART] &&
valid_symbols[LISTEND] &&
valid_symbols[LISTITEMEND] &&
valid_symbols[BULLET] &&
valid_symbols[HLSTARS] &&
valid_symbols[SECTIONEND] &&
valid_symbols[ENDOFFILE]);
}
Bullet getbullet(TSLexer *lexer) {
if (lexer->lookahead == '-') {
advance(lexer);
if (iswspace(lexer->lookahead)) return DASH;
} else if (lexer->lookahead == '+') {
advance(lexer);
if (iswspace(lexer->lookahead)) return PLUS;
} else if (lexer->lookahead == '*') {
advance(lexer);
if (iswspace(lexer->lookahead)) return STAR;
} else if ('a' <= lexer->lookahead && lexer->lookahead <= 'z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return LOWERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return LOWERPAREN;
}
} else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z') {
advance(lexer);
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return UPPERDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return UPPERPAREN;
}
} else if ('0' <= lexer->lookahead && lexer->lookahead <= '9') {
do {
advance(lexer);
} while ('0' <= lexer->lookahead && lexer->lookahead <= '9');
if (lexer->lookahead == '.') {
advance(lexer);
if (iswspace(lexer->lookahead)) return NUMDOT;
} else if (lexer->lookahead == ')') {
advance(lexer);
if (iswspace(lexer->lookahead)) return NUMPAREN;
}
}
return NOTABULLET;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (in_error_recovery(valid_symbols))
return false;
// - Section ends
int16_t indent_length = 0;
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
if (valid_symbols[LISTEND]) { lexer->result_symbol = LISTEND; }
else if (valid_symbols[SECTIONEND]) { lexer->result_symbol = SECTIONEND; }
else if (valid_symbols[ENDOFFILE]) { lexer->result_symbol = ENDOFFILE; }
else return false;
return true;
} else {
break;
}
skip(lexer);
}
// - Listiem ends
// Listend -> end of a line, looking for:
// 1. dedent
// 2. same indent, not a bullet
// 3. two eols
int16_t newlines = 0;
if (valid_symbols[LISTEND] || valid_symbols[LISTITEMEND]) {
for (;;) {
if (lexer->lookahead == ' ') {
indent_length++;
} else if (lexer->lookahead == '\t') {
indent_length += 8;
} else if (lexer->lookahead == '\0') {
return dedent(lexer);
} else if (lexer->lookahead == '\n') {
if (++newlines > 1) return dedent(lexer);
indent_length = 0;
} else {
break;
}
skip(lexer);
}
if (indent_length < indent_length_stack.back()) {
return dedent(lexer);
} else if (indent_length == indent_length_stack.back()) {
if (getbullet(lexer) == bullet_stack.back()) {
lexer->result_symbol = LISTITEMEND;
return true;
}
return dedent(lexer);
}
}
// - Col=0 star
if (indent_length == 0 && lexer->lookahead == '*') {
lexer->mark_end(lexer);
int16_t stars = 1;
skip(lexer);
while (lexer->lookahead == '*') {
stars++;
skip(lexer);
}
if (valid_symbols[SECTIONEND] && iswspace(lexer->lookahead) && stars > 0 && stars <= section_stack.back()) {
section_stack.pop_back();
lexer->result_symbol = SECTIONEND;
return true;
} else if (valid_symbols[HLSTARS] && iswspace(lexer->lookahead)) {
section_stack.push_back(stars);
lexer->result_symbol = HLSTARS;
return true;
}
return false;
}
// - Liststart and bullets
if ((valid_symbols[LISTSTART] || valid_symbols[BULLET]) && newlines == 0) {
Bullet bullet = getbullet(lexer);
if (valid_symbols[BULLET] && bullet == bullet_stack.back() && indent_length == indent_length_stack.back()) {
lexer->mark_end(lexer);
lexer->result_symbol = BULLET;
return true;
} else if (valid_symbols[LISTSTART] && bullet != NOTABULLET && indent_length > indent_length_stack.back()) {
indent_length_stack.push_back(indent_length);
bullet_stack.push_back(bullet);
lexer->result_symbol = LISTSTART;
return true;
}
}
return false; // default
}
};
}
extern "C" {
void *tree_sitter_org_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_org_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_org_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_org_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
void tree_sitter_org_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}