Просмотр исходного кода

Treesitter parser (#2902)

A push towards better editor support.
See utils/treesitter/README.md

---------

Co-authored-by: josh11b <josh11b@users.noreply.github.com>
mx42 2 лет назад
Родитель
Сommit
1cab6920f2

+ 3 - 0
.gitignore

@@ -30,3 +30,6 @@ compile_commands.json
 
 # vim temporary files
 .*.swp
+
+# generated by utils/treesitter/helix.sh
+/.helix/

+ 10 - 0
.pre-commit-config.yaml

@@ -186,6 +186,16 @@ repos:
           - ''
           - '" '
           - ''
+          - --custom_format
+          - '\.scm$'
+          - ''
+          - '; '
+          - ''
+          - --custom_format
+          - '\.c$'
+          - ''
+          - '// '
+          - ''
         exclude: |
           (?x)^(
               .bazelversion|

+ 29 - 0
WORKSPACE

@@ -271,3 +271,32 @@ local_repository(
     name = "woff2_carbon",
     path = "third_party/examples/woff2/carbon",
 )
+
+###############################################################################
+# Treesitter rules
+###############################################################################
+
+http_archive(
+    name = "rules_nodejs",
+    sha256 = "d124665ea12f89153086746821cf6c9ef93ab88360a50c1aeefa1fe522421704",
+    strip_prefix = "rules_nodejs-6.0.0-beta1",
+    url = "https://github.com/bazelbuild/rules_nodejs/releases/download/v6.0.0-beta1/rules_nodejs-v6.0.0-beta1.tar.gz",
+)
+
+load("@rules_nodejs//nodejs:repositories.bzl", "DEFAULT_NODE_VERSION", "nodejs_register_toolchains")
+
+nodejs_register_toolchains(
+    name = "nodejs",
+    node_version = DEFAULT_NODE_VERSION,
+)
+
+http_archive(
+    name = "rules_tree_sitter",
+    sha256 = "a09f177a2b8acb2f8a84def6ca0c41a5bd26b25634aa7313f22ade6c54e57ca1",
+    strip_prefix = "rules_tree_sitter-bc3a2131053207de7dfd9b24046b811ce770e35d",
+    urls = ["https://github.com/Maan2003/rules_tree_sitter/archive/bc3a2131053207de7dfd9b24046b811ce770e35d.tar.gz"],
+)
+
+load("@rules_tree_sitter//tree_sitter:tree_sitter.bzl", "tree_sitter_register_toolchains")
+
+tree_sitter_register_toolchains()

+ 5 - 1
docs/design/lexical_conventions/words.md

@@ -34,7 +34,11 @@ in Unicode Normalization Form C (NFC).
 
 ## Keywords
 
-<!-- Keep in sync with utils/textmate/Syntaxes/Carbon.plist -->
+<!--
+Keep in sync:
+- utils/textmate/Syntaxes/Carbon.plist
+- utils/treesitter/queries/highlights.scm
+-->
 
 The following words are interpreted as keywords:
 

+ 14 - 0
explorer/BUILD

@@ -80,3 +80,17 @@ filegroup(
     # Files are used for validating fuzzer completeness.
     visibility = ["//explorer/fuzzing:__pkg__"],
 )
+
+filegroup(
+    name = "treesitter_testdata",
+    srcs = glob(
+        ["testdata/**/*.carbon"],
+        exclude = [
+            "testdata/**/fail_*",
+
+            # multiline strings
+            "testdata/string/*",
+        ],
+    ),
+    visibility = ["//utils/treesitter:__pkg__"],
+)

+ 2 - 0
scripts/fix_cc_deps.py

@@ -147,6 +147,8 @@ def get_rules(bazel: str, targets: str, keep_going: bool) -> Dict[str, Rule]:
             elif rule_class == "genrule":
                 if list_name == "outs":
                     outs = get_bazel_list(list_child, True)
+            elif rule_class == "tree_sitter_cc_library":
+                continue
             else:
                 exit(f"unexpected rule type: {rule_class}")
         rules[rule_name] = Rule(hdrs, srcs, deps, outs)

+ 11 - 0
utils/treesitter/.gitignore

@@ -0,0 +1,11 @@
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# generated by tree-sitter-cli when building without bazel
+src/*
+!src/scanner.c
+Cargo.toml
+package.json
+binding.gyp
+bindings

+ 29 - 0
utils/treesitter/BUILD

@@ -0,0 +1,29 @@
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("@rules_tree_sitter//tree_sitter:tree_sitter.bzl", "tree_sitter_cc_library")
+
+tree_sitter_cc_library(
+    name = "treesitter",
+    srcs = ["src/scanner.c"],
+    grammar = ["grammar.js"],
+)
+
+cc_binary(
+    name = "test_runner",
+    srcs = ["test_runner.cpp"],
+    deps = [
+        ":treesitter",
+    ],
+)
+
+cc_test(
+    name = "explorer_tests",
+    srcs = ["test_runner.cpp"],
+    args = ["$(locations //explorer:treesitter_testdata)"],
+    data = ["//explorer:treesitter_testdata"],
+    deps = [
+        ":treesitter",
+    ],
+)

+ 27 - 0
utils/treesitter/README.md

@@ -0,0 +1,27 @@
+<!--
+Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+Exceptions. See /LICENSE for license information.
+SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+-->
+
+# Tree-sitter grammar for Carbon
+
+Tree-sitter is currently used for syntax highlighting in supported editors.
+
+## Editor Installation
+
+### Helix
+
+1. Install
+   [tree-sitter](https://tree-sitter.github.io/tree-sitter/creating-parsers#installation)
+   and Node.js.
+2. Install [Helix](https://docs.helix-editor.com/install.html).
+3. Run `./helix.sh`
+
+### Neovim
+
+TODO
+
+### Emacs
+
+TODO

+ 553 - 0
utils/treesitter/grammar.js

@@ -0,0 +1,553 @@
+/*
+ * Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+ * Exceptions. See /LICENSE for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+// This grammar is more permissive than toolchain because it is geared towards
+// editor use.
+
+function repeat_sep1(thing, sep) {
+  return seq(thing, repeat(seq(sep, thing)));
+}
+
+function comma_sep(thing) {
+  // Trailing comma is only allowed if there is at least one element.
+  return optional(seq(repeat_sep1(thing, ','), optional(',')));
+}
+
+// This is based on toolchain/parser/precedence.cpp
+const PREC = {
+  TermPrefix: 11,
+  TermPostfix: 11,
+  NumericPrefix: 10,
+  NumericPostfix: 10,
+  Multiplicative: 9,
+  Additive: 8,
+  BitwisePrefix: 7,
+  BitwiseAnd: 6,
+  BitwiseOr: 6,
+  BitwiseXor: 6,
+  BitShift: 6,
+  TypePostfix: 5,
+  LogicalPrefix: 4,
+  Relational: 3,
+  LogicalAnd: 2,
+  LogicalOr: 2,
+  WhereClause: 1,
+  IfExpression: 1,
+};
+
+module.exports = grammar({
+  name: 'carbon',
+
+  word: ($) => $.ident,
+
+  conflicts: ($) => [
+    [$.paren_pattern, $.paren_expression],
+    [$.struct_literal, $.struct_type_literal],
+  ],
+
+  extras: ($) => [/\s/, $.comment],
+
+  // NOTE: This must match the order in src/scanner.c, names are not used for matching.
+  externals: ($) => [$.binary_star, $.postfix_star],
+
+  rules: {
+    source_file: ($) =>
+      seq(
+        optional($.package_directive),
+        repeat($.import_directive),
+        repeat($.declaration)
+      ),
+
+    api_or_impl: ($) => choice('api', 'impl'),
+
+    library_path: ($) => seq('library', $.string_literal),
+
+    package_directive: ($) =>
+      seq('package', $.ident, optional($.library_path), $.api_or_impl, ';'),
+
+    import_directive: ($) =>
+      seq('import', $.ident, optional($.library_path), ';'),
+
+    comment: ($) => token(seq('//', /.*/)),
+
+    // NOTE: this must be before ident rule to increase its priority.
+    // https://github.com/carbon-language/carbon-lang/blob/trunk/proposals/p2015.md#syntax
+    numeric_type_literal: ($) => /[iuf][1-9][0-9]*/,
+
+    ident: ($) => /[A-Za-z_][A-Za-z0-9_]*/,
+
+    bool_literal: ($) => choice('true', 'false'),
+
+    numeric_literal: ($) => {
+      // This is using variables because rules are not allowed in
+      // token.immediate and token.
+      // https://github.com/tree-sitter/tree-sitter/issues/449
+      const decimal_integer_literal = choice('0', /[1-9](_?[0-9])*/);
+      const hex_digits = /[0-9A-F](_?[0-9A-F])*/;
+      const binary_integer_literal = /0b[01](_?[01])*/;
+      const hex_integer_literal = seq('0x', token.immediate(hex_digits));
+
+      const decimal_real_number_literal = seq(
+        decimal_integer_literal,
+        token.immediate(/\.[0-9](_?[0-9])*/),
+        optional(
+          seq(
+            token.immediate(/e[+-]?/),
+            token.immediate(decimal_integer_literal)
+          )
+        )
+      );
+
+      const hex_real_number_literal = seq(
+        hex_integer_literal,
+        token.immediate('.'),
+        token.immediate(hex_digits),
+        optional(
+          seq(
+            token.immediate(/p[+-]?/),
+            token.immediate(decimal_integer_literal)
+          )
+        )
+      );
+
+      return token(
+        choice(
+          decimal_integer_literal,
+          binary_integer_literal,
+          hex_integer_literal,
+          decimal_real_number_literal,
+          hex_real_number_literal
+        )
+      );
+    },
+
+    _string_content: ($) => token.immediate(/[^\\"]+/),
+
+    escape_sequence: ($) =>
+      token.immediate(
+        seq(
+          '\\',
+          choice(
+            'n',
+            't',
+            'r',
+            "'",
+            '"',
+            '\\',
+            '0',
+            /x[0-9A-F]{2}/,
+            /u\{[0-9A-F]+\}/
+          )
+        )
+      ),
+
+    // TODO: multiline string
+    string_literal: ($) =>
+      seq(
+        '"',
+        repeat(choice($._string_content, $.escape_sequence)),
+        token.immediate('"')
+      ),
+
+    array_literal: ($) =>
+      seq(
+        '[',
+        field('type', $._expression),
+        ';',
+        optional(field('size', $._expression)),
+        ']'
+      ),
+
+    struct_literal: ($) =>
+      seq('{', comma_sep(seq($.designator, '=', $._expression)), '}'),
+
+    struct_type_literal: ($) =>
+      seq('{', comma_sep(seq($.designator, ':', $._expression)), '}'),
+
+    builtin_type: ($) => choice('Self', 'String', 'bool', 'type'),
+
+    literal: ($) =>
+      choice(
+        $.bool_literal,
+        $.numeric_literal,
+        $.numeric_type_literal,
+        $.string_literal,
+        $.struct_literal,
+        $.struct_type_literal
+      ),
+
+    _binding_lhs: ($) => choice($.ident, '_'),
+
+    paren_pattern: ($) =>
+      seq(
+        '(',
+        comma_sep(choice($._pattern_without_expression, $._expression)),
+        ')'
+      ),
+
+    _pattern_without_expression: ($) =>
+      choice(
+        'auto',
+        seq($._binding_lhs, ':', $._expression),
+        seq($._binding_lhs, ':!', $._expression),
+        seq('template', $._binding_lhs, ':!', $._expression),
+        seq('var', $._pattern),
+        $.paren_pattern,
+        // alternative patterns
+        // example: Optional(i32).Some(x: i32)
+        seq($._expression, $.paren_pattern)
+      ),
+
+    _pattern: ($) => choice($._pattern_without_expression, $._expression),
+
+    unary_prefix_expression: ($) => {
+      const table = [
+        [PREC.NumericPrefix, '-'],
+        [PREC.NumericPrefix, '--'],
+        [PREC.NumericPrefix, '++'],
+        [PREC.BitwisePrefix, '^'],
+        [PREC.LogicalPrefix, 'not'],
+      ];
+
+      return choice(
+        ...table.map(([precedence, operator]) =>
+          prec(
+            precedence,
+            seq(field('operator', operator), field('value', $._expression))
+          )
+        )
+      );
+    },
+
+    binary_expression: ($) => {
+      const table = [
+        [PREC.LogicalAnd, 'and'],
+        [PREC.LogicalOr, 'or'],
+        [PREC.BitwiseAnd, '&'],
+        [PREC.BitwiseOr, '|'],
+        [PREC.BitwiseXor, '^'],
+        [PREC.BitShift, choice('<<', '>>')],
+        [PREC.Relational, choice('==', '!=', '<', '<=', '>', '>=')],
+        [PREC.Additive, choice('+', '-')],
+        [PREC.Multiplicative, choice($.binary_star, '/', '%')],
+      ];
+
+      return choice(
+        ...table.map(([precedence, operator]) =>
+          prec.left(
+            precedence,
+            seq(
+              field('left', $._expression),
+              field('operator', operator),
+              field('right', $._expression)
+            )
+          )
+        )
+      );
+    },
+
+    // This should be non-associative but conflicts are not allowed in tree-sitter
+    as_expression: ($) => prec.left(seq($._expression, 'as', $._expression)),
+
+    ref_expression: ($) => prec.right(PREC.TermPrefix, seq('&', $._expression)),
+
+    deref_expression: ($) =>
+      prec.right(PREC.TermPrefix, seq('*', $._expression)),
+
+    fn_type_expression: ($) =>
+      prec.left(seq('__Fn', $.paren_expression, '->', $._expression)),
+
+    if_expression: ($) =>
+      prec(
+        PREC.IfExpression,
+        seq('if', $._expression, 'then', $._expression, 'else', $._expression)
+      ),
+
+    paren_expression: ($) => seq('(', comma_sep($._expression), ')'),
+
+    index_expression: ($) =>
+      prec(PREC.TermPostfix, seq($._expression, '[', $._expression, ']')),
+
+    designator: ($) => seq('.', choice('base', $.ident)),
+
+    postfix_expression: ($) =>
+      prec(
+        PREC.TermPostfix,
+        seq(
+          $._expression,
+          choice(
+            '++',
+            '--',
+            $.designator,
+            seq('->', $.ident),
+            seq(choice('.', '->'), '(', $._expression, ')')
+          )
+        )
+      ),
+
+    where_clause: ($) =>
+      prec(
+        PREC.WhereClause,
+        choice(
+          seq($._expression, '==', $._expression),
+          seq($._expression, 'impls', $._expression),
+          seq($._expression, '=', $._expression),
+          // TODO: Fix conflict with logical and
+          prec.left(seq($.where_clause, 'and', $.where_clause))
+        )
+      ),
+
+    where_expression: ($) =>
+      prec.left(PREC.TermPostfix, seq($._expression, 'where', $.where_clause)),
+
+    call_expression: ($) =>
+      prec(PREC.TermPostfix, seq($._expression, $.paren_expression)),
+
+    pointer_expression: ($) =>
+      prec(PREC.TypePostfix, seq($._expression, $.postfix_star)),
+
+    _expression: ($) =>
+      choice(
+        $.array_literal,
+        $.as_expression,
+        $.binary_expression,
+        $.builtin_type,
+        $.call_expression,
+        $.deref_expression,
+        $.fn_type_expression,
+        $.ident,
+        $.if_expression,
+        $.index_expression,
+        $.literal,
+        $.paren_expression,
+        $.pointer_expression,
+        $.postfix_expression,
+        $.ref_expression,
+        $.unary_prefix_expression,
+        $.where_expression,
+        'self',
+        // TODO: Remove these two once `where` clauses don't use the expression rule
+        '.Self',
+        $.designator
+      ),
+
+    var_declaration: ($) =>
+      seq(
+        'var',
+        $._pattern_without_expression,
+        optional(seq('=', $._expression)),
+        ';'
+      ),
+
+    let_declaration: ($) =>
+      seq('let', $._pattern_without_expression, '=', $._expression, ';'),
+
+    assign_statement: ($) =>
+      seq($._expression, $._assign_operator, $._expression, ';'),
+
+    _assign_operator: ($) =>
+      choice('=', '+=', '/=', '*=', '%=', '-=', '&=', '|=', '^=', '<<=', '>>='),
+
+    match_clause: ($) =>
+      seq(choice(seq('case', $._pattern), 'default'), '=>', $.block),
+
+    match_statement: ($) =>
+      seq('match', '(', $._expression, ')', '{', repeat($.match_clause), '}'),
+
+    returned_var_statement: ($) => seq('returned', $.var_declaration),
+
+    while_statement: ($) => seq('while', '(', $._expression, ')', $.block),
+
+    break_statement: ($) => seq('break', ';'),
+
+    continue_statement: ($) => seq('continue', ';'),
+
+    return_statement: ($) =>
+      seq('return', optional(choice('var', $._expression)), ';'),
+
+    if_statement: ($) =>
+      seq('if', '(', $._expression, ')', $.block, optional($.else)),
+
+    else: ($) => choice(seq('else', $.if_statement), seq('else', $.block)),
+
+    for_statement: ($) =>
+      seq('for', '(', $._pattern, 'in', $._expression, ')', $.block),
+
+    statement: ($) =>
+      choice(
+        seq($._expression, ';'),
+        $.assign_statement,
+        $.var_declaration,
+        $.let_declaration,
+        $.match_statement,
+        $.returned_var_statement,
+        $.if_statement,
+        $.while_statement,
+        $.break_statement,
+        $.continue_statement,
+        $.return_statement,
+        $.for_statement
+      ),
+
+    block: ($) => seq('{', repeat($.statement), '}'),
+
+    declared_name: ($) => repeat_sep1($.ident, '.'),
+
+    generic_binding: ($) =>
+      seq(optional('template'), $.ident, ':!', $._expression),
+
+    deduced_param: ($) =>
+      choice(
+        $.generic_binding,
+        seq(optional('addr'), 'self', ':', $._expression)
+      ),
+
+    deduced_params: ($) => seq('[', comma_sep($.deduced_param), ']'),
+
+    return_type: ($) => seq('->', choice('auto', $._expression)),
+
+    function_declaration: ($) =>
+      seq(
+        optional(choice('abstract', 'virtual', 'impl')),
+        'fn',
+        $.declared_name,
+        optional($.deduced_params),
+        $.paren_pattern,
+        optional($.return_type),
+        choice($.block, ';')
+      ),
+
+    namespace_declaration: ($) => seq('namespace', $.declared_name, ';'),
+
+    alias_declaration: ($) =>
+      seq('alias', $.declared_name, '=', $._expression, ';'),
+
+    type_params: ($) => $.paren_pattern,
+
+    interface_body_item: ($) =>
+      choice(
+        $.function_declaration,
+        seq('let', $.generic_binding, ';'),
+        seq('extend', $._expression, ';'),
+        seq('require', $._expression, 'impls', $._expression, ';')
+      ),
+
+    interface_body: ($) => seq('{', repeat($.interface_body_item), '}'),
+
+    interface_declaration: ($) =>
+      seq(
+        'interface',
+        $.declared_name,
+        optional($.deduced_params),
+        optional($.type_params),
+        choice(';', $.interface_body)
+      ),
+
+    constraint_declaration: ($) =>
+      seq(
+        'constraint',
+        $.declared_name,
+        optional($.deduced_params),
+        optional($.type_params),
+        choice(';', $.interface_body)
+      ),
+
+    impl_body_item: ($) => choice($.function_declaration, $.alias_declaration),
+
+    impl_body: ($) => seq('{', repeat($.impl_body_item), '}'),
+
+    impl_declaration: ($) =>
+      seq(
+        'impl',
+        optional(seq('forall', $.deduced_params)),
+        optional($._expression),
+        'as',
+        $._expression,
+        $.impl_body
+      ),
+
+    extend_impl_declaration: ($) =>
+      seq('extend', 'impl', 'as', $._expression, $.impl_body),
+
+    extend_base_declaration: ($) =>
+      seq('extend', 'base', ':', $._expression, ';'),
+
+    destructor_declaration: ($) =>
+      seq(
+        optional(choice('virtual', 'impl')),
+        'destructor',
+        optional($.deduced_params),
+        choice($.block, ';')
+      ),
+
+    class_body_item: ($) =>
+      choice(
+        $.declaration,
+        $.extend_base_declaration,
+        $.extend_impl_declaration,
+        $.mix_declaration,
+        $.destructor_declaration
+      ),
+
+    class_body: ($) => seq('{', repeat($.class_body_item), '}'),
+
+    class_declaration: ($) =>
+      seq(
+        optional(choice('base', 'abstract')),
+        'class',
+        $.declared_name,
+        optional($.deduced_params),
+        optional($.type_params),
+        choice(';', $.class_body)
+      ),
+
+    choice_declaration: ($) =>
+      seq(
+        'choice',
+        $.declared_name,
+        optional($.type_params),
+        '{',
+        comma_sep(seq($.ident, optional($.paren_expression))),
+        '}'
+      ),
+
+    empty_declaration: ($) => ';',
+
+    declaration: ($) =>
+      choice(
+        $.empty_declaration,
+        $.namespace_declaration,
+        $.var_declaration,
+        $.let_declaration,
+        $.function_declaration,
+        $.alias_declaration,
+        $.interface_declaration,
+        $.constraint_declaration,
+        $.impl_declaration,
+        $.class_declaration,
+        $.choice_declaration,
+        $.mixin_declaration,
+        $.match_first_declaration
+      ),
+
+    // Explorer-only experimental features
+    mix_declaration: ($) => seq('__mix', $._expression, ';'),
+
+    mixin_declaration: ($) =>
+      seq(
+        '__mixin',
+        $.declared_name,
+        optional($.type_params),
+        optional(seq('for', $._expression)),
+        '{',
+        repeat(choice($.function_declaration, $.mix_declaration)),
+        '}'
+      ),
+
+    match_first_declaration: ($) =>
+      seq('__match_first', '{', repeat($.impl_declaration), '}'),
+  },
+});

+ 40 - 0
utils/treesitter/helix.sh

@@ -0,0 +1,40 @@
+#!/bin/sh
+
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set -euo pipefail
+
+ROOT="$(git rev-parse --show-toplevel)"
+cd "$ROOT/utils/treesitter"
+
+tree-sitter generate --no-bindings
+
+mkdir -p "$ROOT/.helix"
+
+cat > "$ROOT/.helix/languages.toml" << EOF
+use-grammars = { only = ["carbon"] }
+
+[[language]]
+name = "carbon"
+scope = "source.carbon"
+file-types = ["carbon"]
+comment-token = "//"
+indent = { tab-width = 2, unit = "  " }
+roots = [".git"]
+
+[[grammar]]
+name = "carbon"
+source = { path = "$PWD" }
+EOF
+
+mkdir -p ~/.config/helix/runtime/grammars ~/.config/helix/runtime/queries
+ln -sTf $PWD/queries ~/.config/helix/runtime/queries/carbon
+hx --grammar build
+
+echo
+hx --health carbon
+echo
+echo 'use `hx path/to/foo.carbon` to open files'
+echo 'Try different themes with :theme'

+ 132 - 0
utils/treesitter/queries/highlights.scm

@@ -0,0 +1,132 @@
+; Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+; Exceptions. See /LICENSE for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; This maps syntax node patterns to highlighting scopes.
+; The scopes are used by themes and editors to select the style for that node.
+
+(comment) @comment
+(builtin_type) @type.builtin
+(bool_literal) @constant.builtin
+(escape_sequence) @constant.character.escape
+(string_literal) @string
+(numeric_literal) @constant.builtin
+(numeric_type_literal) @type.builtin
+
+; function declaration or call expression => function
+(function_declaration (declared_name (ident) @function))
+(call_expression (ident) @function)
+
+; TODO: add more specific rules
+
+; upper case => type
+((ident) @type
+  (#match? @type "^[A-Z]"))
+
+; lower case => variable
+((ident) @variable
+  (#match? @variable "^[a-z_]"))
+
+[
+  "("
+  ")"
+  "{"
+  "}"
+  "["
+  "]"
+] @punctuation.bracket
+
+[
+  "."
+  ";"
+  ","
+  ":"
+  ":!"
+  "=>"
+] @punctuation.delimiter
+
+"->" @punctuation
+
+[
+  "+"
+  "-"
+  (binary_star)
+  "/"
+  "%"
+  "=="
+  "!="
+  "<"
+  "<="
+  ">"
+  ">="
+  "not"
+  "and"
+  "or"
+  "|"
+  "&"
+  "^"
+  ">>"
+  "<<"
+  "*" ; prefix star
+  (postfix_star)
+  "++"
+  "--"
+] @operator
+
+; keywords not used in grammar.js are commented out
+[
+  "abstract"
+  ; "adapt"
+  "addr"
+  "alias"
+  "and"
+  "api"
+  "as"
+  "auto"
+  "base"
+  "break"
+  "case"
+  "choice"
+  "class"
+  "constraint"
+  "continue"
+  "default"
+  "destructor"
+  "else"
+  "extend"
+  ; "final"
+  "fn"
+  "for"
+  "forall"
+  ; "friend"
+  "if"
+  "impl"
+  "impls"
+  "import"
+  "in"
+  "interface"
+  "let"
+  "library"
+  ; "like"
+  "match"
+  "namespace"
+  "not"
+  ; "observe"
+  "or"
+  ; "override"
+  "package"
+  ; "partial"
+  ; "private"
+  ; "protected"
+  "require"
+  "return"
+  "returned"
+  "Self"
+  "template"
+  "then"
+  "type"
+  "var"
+  "virtual"
+  "where"
+  "while"
+] @keyword

+ 87 - 0
utils/treesitter/src/scanner.c

@@ -0,0 +1,87 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "tree_sitter/parser.h"
+
+enum TokenType {  // Order must match the `externals` array in grammar.js.
+  BINARY_STAR,   // `*` used as a binary operator.
+  POSTFIX_STAR,  // `*` used as a postfix operator.
+};
+
+// Our scanner is stateless, so there is no payload to allocate. `(void)` makes
+// this a real prototype in C; an empty parameter list would leave the
+// parameters unspecified.
+void* tree_sitter_carbon_external_scanner_create(void) { return NULL; }
+
+// Serializes the scanner state into `buffer` and returns the number of bytes
+// written. The scanner keeps no state, so nothing is written. The parameters
+// are discarded with `(void)` casts, which every C compiler accepts, instead
+// of the GNU-specific `__attribute__((unused))`.
+unsigned tree_sitter_carbon_external_scanner_serialize(void* payload,
+                                                       char* buffer) {
+  (void)payload;
+  (void)buffer;
+  return 0;  // zero bytes used to serialize
+}
+
+// Restores the scanner state from `buffer`. The scanner keeps no state, so
+// there is nothing to restore. The parameters are discarded with `(void)`
+// casts, which every C compiler accepts, instead of the GNU-specific
+// `__attribute__((unused))`.
+void tree_sitter_carbon_external_scanner_deserialize(void* payload,
+                                                     const char* buffer,
+                                                     unsigned length) {
+  (void)payload;
+  (void)buffer;
+  (void)length;
+}
+
+// Releases the scanner payload. No payload is ever allocated, so there is
+// nothing to free. The parameter is discarded with a `(void)` cast, which
+// every C compiler accepts, instead of the GNU-specific
+// `__attribute__((unused))`.
+void tree_sitter_carbon_external_scanner_destroy(void* payload) {
+  (void)payload;
+}
+
+// https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/symbolic_tokens.md#overview
+// > the token after the operator must be an identifier, a literal, or any kind
+// > of opening bracket (for example, (, [, or {).
+//
+// Returns true if `c` can be the first character of such a token. `c` is the
+// full code point from the lexer's lookahead: narrowing it to `char` can make
+// a non-ASCII character alias an ASCII one (for example U+0141 and 'A' share
+// the same low byte).
+static bool token_allowed_after_binary_operator(int32_t c) {
+  return
+      // identifier
+      c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+      // string literal
+      c == '\"' ||
+      // TODO: character literal
+      // number literal
+      (c >= '0' && c <= '9') ||
+      // opening bracket
+      c == '(' || c == '[' || c == '{';
+}
+
+// Returns true if `c` is a whitespace character that may surround `*`. `c` is
+// the full code point from the lexer's lookahead, so that a non-ASCII
+// character is never narrowed into an ASCII space or newline. Tab and carriage
+// return are accepted too, in line with the grammar treating `/\s/` as
+// whitespace.
+static bool is_whitespace(int32_t c) {
+  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+// https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
+//
+// > If a token in the externals array is valid at a given position in the
+// > parse, the external scanner will be called first before anything else is
+// > done.
+//
+// > But the external scanner may return false and in this case Tree-sitter
+// > fallbacks to the internal lexing mechanism.
+//
+// Classifies a `*` as a binary or a postfix operator from the whitespace and
+// characters around it. Returns false, leaving the token to the internal
+// lexer, when the next non-whitespace character is not `*`.
+bool tree_sitter_carbon_external_scanner_scan(void* payload, TSLexer* lexer,
+                                              const bool* valid_symbols) {
+  // The scanner is stateless and does not consult the set of valid symbols.
+  // Both are discarded with casts, which every C compiler accepts.
+  (void)payload;
+  (void)valid_symbols;
+
+  // Skip past leading whitespace, remembering whether there was any.
+  bool whitespace = false;
+  while (is_whitespace(lexer->lookahead)) {
+    whitespace = true;
+    lexer->advance(lexer, /* skip= */ true);
+  }
+
+  // If the next symbol is anything other than `*`, fall back to the
+  // tree-sitter internal lexer.
+  if (lexer->lookahead != '*') {
+    return false;
+  }
+
+  // Move past the `*`, adding it to the current token.
+  lexer->advance(lexer, /* skip= */ false);
+
+  // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/symbolic_tokens.md
+  if (is_whitespace(lexer->lookahead) && whitespace) {
+    // foo * bar
+    lexer->result_symbol = BINARY_STAR;
+  } else if (!whitespace &&
+             token_allowed_after_binary_operator(lexer->lookahead)) {
+    // foo*bar or foo*(bar)
+    lexer->result_symbol = BINARY_STAR;
+  } else {
+    // foo*
+    lexer->result_symbol = POSTFIX_STAR;
+  }
+  return true;
+}

+ 80 - 0
utils/treesitter/test_runner.cpp

@@ -0,0 +1,80 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <tree_sitter/api.h>
+#include <tree_sitter/parser.h>
+
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <vector>
+
+extern "C" {
+TSLanguage* tree_sitter_carbon();
+}
+
+// Reads the file at `path` into a string. Returns `std::nullopt` when the file
+// cannot be opened, so that an unreadable file is distinguishable from an
+// empty one.
+static auto ReadFile(const std::filesystem::path& path)
+    -> std::optional<std::string> {
+  std::ifstream file(path);
+  if (!file) {
+    return std::nullopt;
+  }
+  std::stringstream buffer;
+  buffer << file.rdbuf();
+  return buffer.str();
+}
+
+// Parses every file named on the command line with the Carbon tree-sitter
+// grammar and prints each resulting syntax tree. Returns 0 when no file
+// failed, 1 when at least one file could not be read or parsed cleanly, and 2
+// on a usage or setup error.
+// TODO: use file_test.cpp
+auto main(int argc, char** argv) -> int {
+  if (argc < 2) {
+    std::cerr << "Usage: treesitter_carbon_tester <file>...\n";
+    return 2;
+  }
+
+  auto* parser = ts_parser_new();
+  // Without a language the parser cannot produce trees, so every later step
+  // would operate on null trees.
+  if (!ts_parser_set_language(parser, tree_sitter_carbon())) {
+    std::cerr << "Failed to set the Carbon tree-sitter language.\n";
+    ts_parser_delete(parser);
+    return 2;
+  }
+
+  std::vector<std::string> failed;
+  std::vector<std::string> skipped;
+  for (int i = 1; i < argc; i++) {
+    std::string file_path = argv[i];
+    std::optional<std::string> source = ReadFile(file_path);
+    if (!source) {
+      // An unreadable file must not be mistaken for an empty file that parses
+      // without errors.
+      std::cerr << "Unable to read " << file_path << "\n";
+      failed.push_back(file_path);
+      continue;
+    }
+
+    // `and` in where clauses is not parsed correctly.
+    // TODO: remove once where clause is implemented correctly.
+    if (source->find("where") != std::string::npos &&
+        source->find("and") != std::string::npos) {
+      skipped.push_back(file_path);
+      continue;
+    }
+    auto* tree = ts_parser_parse_string(parser, nullptr, source->data(),
+                                        source->size());
+    if (tree == nullptr) {
+      // No tree was produced at all; record the failure instead of
+      // dereferencing a null tree.
+      failed.push_back(file_path);
+      continue;
+    }
+
+    auto root = ts_tree_root_node(tree);
+    auto has_error = ts_node_has_error(root);
+    char* node_debug = ts_node_string(root);
+
+    std::cout << file_path << ":\n" << node_debug << "\n";
+    if (has_error) {
+      failed.push_back(file_path);
+    }
+
+    // The string returned by ts_node_string is owned by the caller.
+    free(node_debug);
+    ts_tree_delete(tree);
+  }
+  ts_parser_delete(parser);
+  for (const auto& file : skipped) {
+    std::cout << "SKIPPED " << file << "\n";
+  }
+  for (const auto& file : failed) {
+    std::cout << "FAILED " << file << "\n";
+  }
+  if (!skipped.empty()) {
+    std::cout << skipped.size() << " tests skipped.\n";
+  }
+  if (!failed.empty()) {
+    std::cout << failed.size() << " tests failing.\n";
+    return 1;
+  }
+  return 0;
+}