scanner.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "tree_sitter/parser.h"
  // Tokens this external scanner can produce.
  //
  // NOTE(review): tree-sitter identifies external tokens by position, so these
  // are expected to stay in the same order as the grammar's `externals` list --
  // confirm against grammar.js before reordering.
  enum TokenType {
    // `*` used as a binary operator, as in `foo * bar` or `foo*bar`.
    BINARY_STAR = 0,
    // Any other `*`, as in `foo*`.
    POSTFIX_STAR = 1,
  };
  // Creates the scanner's state object. The scanner is stateless, so there is
  // nothing to allocate and the returned payload is always NULL.
  //
  // Declared with `(void)` so the definition is a real prototype: in C, an
  // empty parameter list leaves the arguments unchecked.
  void* tree_sitter_carbon_external_scanner_create(void) { return NULL; }
  // Serialization hook. The scanner has no state, so `buffer` is left untouched.
  //
  // Returns the number of bytes written to `buffer`, which is always 0.
  unsigned tree_sitter_carbon_external_scanner_serialize(
      __attribute__((unused)) void* payload,
      __attribute__((unused)) char* buffer) {
    const unsigned bytes_written = 0;
    return bytes_written;
  }
  // Deserialization hook. Intentionally a no-op: the scanner has no state to
  // restore, so `payload`, `buffer` and `length` are all ignored.
  void tree_sitter_carbon_external_scanner_deserialize(
      __attribute__((unused)) void* payload,
      __attribute__((unused)) const char* buffer,
      __attribute__((unused)) unsigned length) {
    // Nothing to restore.
  }
  // Teardown hook. Intentionally a no-op: `payload` is ignored and nothing is
  // freed, because the scanner never allocates any state.
  void tree_sitter_carbon_external_scanner_destroy(
      __attribute__((unused)) void* payload) {
    // Nothing to release.
  }
  // Reports whether code point `c` can start a token that is allowed to follow
  // a binary operator.
  //
  // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/symbolic_tokens.md#overview
  // > the token after the operator must be an identifier, a literal, or any kind
  // > of opening bracket (for example, (, [, or {).
  //
  // `c` is the full 32-bit lookahead rather than a `char`: narrowing to `char`
  // would make a code point such as U+0161 (low byte 0x61, 'a') look like an
  // identifier start.
  //
  // Returns true for ASCII identifier starts, `"`, ASCII digits and opening
  // brackets; false for everything else, including all non-ASCII code points.
  static bool token_allowed_after_binary_operator(int32_t c) {
    const bool starts_identifier =
        c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool starts_string_literal = c == '\"';
    // TODO: character literal
    const bool starts_number_literal = c >= '0' && c <= '9';
    const bool opens_bracket = c == '(' || c == '[' || c == '{';

    return starts_identifier || starts_string_literal ||
           starts_number_literal || opens_bracket;
  }
  // Reports whether code point `c` counts as whitespace around a `*`: space,
  // horizontal tab, line feed or carriage return.
  //
  // Tab and carriage return are included so that tab-separated or CRLF source
  // is classified the same way as space/LF source.
  //
  // `c` is the full 32-bit lookahead rather than a `char`: narrowing to `char`
  // would make a code point such as U+0120 (low byte 0x20) look like a space.
  static bool is_whitespace(int32_t c) {
    switch (c) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
        return true;
      default:
        return false;
    }
  }
  38. // https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
  39. //
  40. // > If a token in the externals array is valid at a given position in the
  41. // > parse, the external scanner will be called first before anything else is
  42. // > done.
  43. //
  44. // > But the external scanner may return false and in this case Tree-sitter
  45. // > fallbacks to the internal lexing mechanism.
  46. bool tree_sitter_carbon_external_scanner_scan(
  47. __attribute__((unused)) void* payload,
  48. __attribute__((unused)) TSLexer* lexer,
  49. __attribute__((unused)) const bool* valid_symbols) {
  50. // skip past whitespace if any
  51. bool whitespace = false;
  52. while (is_whitespace(lexer->lookahead)) {
  53. whitespace = true;
  54. lexer->advance(lexer, /* skip= */ true);
  55. }
  56. // if any other symbol than *, fallback to treesitter internal lexer
  57. if (lexer->lookahead != '*') {
  58. return false;
  59. }
  60. // move to past the *, add * to current token
  61. lexer->advance(lexer, /* skip= */ false);
  62. // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/symbolic_tokens.md
  63. if (is_whitespace(lexer->lookahead) && whitespace) {
  64. // foo * bar
  65. lexer->result_symbol = BINARY_STAR;
  66. } else if (!whitespace &&
  67. token_allowed_after_binary_operator(lexer->lookahead)) {
  68. // foo*bar or foo*(bar)
  69. lexer->result_symbol = BINARY_STAR;
  70. } else {
  71. // foo*
  72. lexer->result_symbol = POSTFIX_STAR;
  73. }
  74. return true;
  75. }