character_set.h 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEXER_CHARACTER_SET_H_
  5. #define CARBON_TOOLCHAIN_LEXER_CHARACTER_SET_H_
  6. #include "llvm/ADT/StringExtras.h"
  7. #include "llvm/ADT/StringRef.h"
  8. namespace Carbon {
  9. // TODO: These definitions need to be updated to match whatever Unicode lexical
  10. // rules we pick. The function interfaces will need to change to accommodate
  11. // multi-byte characters.
  12. // Is this an alphabetical character according to Carbon's lexical rules?
  13. //
  14. // Alphabetical characters are permitted at the start of identifiers. This
  15. // currently includes 'A'..'Z' and 'a'..'z'.
  16. inline auto IsAlpha(char c) -> bool { return llvm::isAlpha(c); }
  17. // Is this a decimal digit according to Carbon's lexical rules?
  18. //
  19. // This currently includes '0'..'9'.
  20. inline auto IsDecimalDigit(char c) -> bool { return llvm::isDigit(c); }
  21. // Is this an alphanumeric character according to Carbon's lexical rules?
  22. //
  23. // Alphanumeric characters are permitted as trailing characters in identifiers
  24. // and numeric literals. This includes alphabetical characters plus decimal
  25. // digits.
  26. //
  27. // Note that '_' is not considered alphanumeric, despite in most circumstances
  28. // being a valid continuation character of an identifier or numeric literal.
  29. inline auto IsAlnum(char c) -> bool { return llvm::isAlnum(c); }
  // Returns true when `c` is a hexadecimal digit under Carbon's lexical rules.
  //
  // Hexadecimal digits may appear in `0x`-prefixed literals and after a `\x`
  // escape sequence.
  //
  // Only '0'..'9' and uppercase 'A'..'F' are accepted; lowercase 'a'..'f' are
  // currently not considered hexadecimal digits in any context.
  inline auto IsUpperHexDigit(char c) -> bool {
    // Decimal digits are always valid hexadecimal digits.
    if (c >= '0' && c <= '9') {
      return true;
    }
    // Otherwise only the uppercase letters 'A'..'F' qualify.
    return c >= 'A' && c <= 'F';
  }
  // Returns true when `c` is a lowercase letter, i.e. lies in 'a'..'z'.
  //
  // Within a numeric literal, a lowercase letter can be followed by `+` or `-`
  // to extend the literal.
  inline auto IsLower(char c) -> bool {
    // Reject anything outside the closed range ['a', 'z'].
    return !(c < 'a' || c > 'z');
  }
  // Returns true when `c` counts as horizontal whitespace: a space or a tab.
  //
  // These are the characters that can appear in the indentation of a line.
  inline auto IsHorizontalWhitespace(char c) -> bool {
    switch (c) {
      case ' ':
      case '\t':
        return true;
      default:
        return false;
    }
  }
  // Returns true when `c` counts as vertical whitespace, which is currently
  // only the newline character '\n'.
  //
  // Vertical whitespace characters are treated as line terminators.
  inline auto IsVerticalWhitespace(char c) -> bool {
    const bool is_newline = ('\n' == c);
    return is_newline;
  }
  55. // Is this character considered to be whitespace?
  56. //
  57. // Changes here will need matching changes in
  58. // `TokenizedBuffer::Lexer::SkipWhitespace`.
  59. inline auto IsSpace(char c) -> bool {
  60. return IsHorizontalWhitespace(c) || IsVerticalWhitespace(c);
  61. }
  62. } // namespace Carbon
  63. #endif // CARBON_TOOLCHAIN_LEXER_CHARACTER_SET_H_