tokenized_buffer.cpp 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/tokenized_buffer.h"
  5. #include <algorithm>
  6. #include <array>
  7. #include <cmath>
  8. #include "common/check.h"
  9. #include "common/string_helpers.h"
  10. #include "llvm/ADT/StringRef.h"
  11. #include "llvm/ADT/StringSwitch.h"
  12. #include "llvm/Support/ErrorHandling.h"
  13. #include "llvm/Support/Format.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "llvm/Support/raw_ostream.h"
  16. #include "toolchain/lexer/character_set.h"
  17. #include "toolchain/lexer/lex_helpers.h"
  18. #include "toolchain/lexer/numeric_literal.h"
  19. #include "toolchain/lexer/string_literal.h"
  20. #if __x86_64__
  21. #include <x86intrin.h>
  22. #endif
  23. namespace Carbon {
  24. // TODO: Move Overload and VariantMatch somewhere more central.
  25. // Form an overload set from a list of functions. For example:
  26. //
  27. // ```
  28. // auto overloaded = Overload{[] (int) {}, [] (float) {}};
  29. // ```
  30. template <typename... Fs>
  31. struct Overload : Fs... {
  32. using Fs::operator()...;
  33. };
  34. template <typename... Fs>
  35. Overload(Fs...) -> Overload<Fs...>;
  36. // Pattern-match against the type of the value stored in the variant `V`. Each
  37. // element of `fs` should be a function that takes one or more of the variant
  38. // values in `V`.
  39. template <typename V, typename... Fs>
  40. auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
  41. return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
  42. }
  43. // Scans the provided text and returns the prefix `StringRef` of contiguous
  44. // identifier characters.
  45. //
  46. // This is a performance sensitive function and so uses vectorized code
  47. // sequences to optimize its scanning. When modifying, the identifier lexing
  48. // benchmarks should be checked for regressions.
  49. //
  50. // Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
  51. //
  52. // TODO: Currently, this code does not implement Carbon's design for Unicode
  53. // characters in identifiers. It does work on UTF-8 code unit sequences, but
  54. // currently considers non-ASCII characters to be non-identifier characters.
  55. // Some work has been done to ensure the hot loop, while optimized, retains
  56. // enough information to add Unicode handling without completely destroying the
  57. // relevant optimizations.
static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
  // A table of booleans that we can use to classify bytes as being valid
  // identifier (or keyword) characters. This is used in the generic,
  // non-vectorized fallback code to scan for length of an identifier.
  constexpr std::array<bool, 256> IsIdentifierByteTable = []() constexpr {
    std::array<bool, 256> table = {};
    for (char c = '0'; c <= '9'; ++c) {
      table[c] = true;
    }
    for (char c = 'A'; c <= 'Z'; ++c) {
      table[c] = true;
    }
    for (char c = 'a'; c <= 'z'; ++c) {
      table[c] = true;
    }
    table['_'] = true;
    return table;
  }();

#if __x86_64__
  // This code uses a scheme derived from the techniques in Geoff Langdale and
  // Daniel Lemire's work on parsing JSON[1]. Specifically, that paper outlines
  // a technique of using two 4-bit indexed in-register look-up tables (LUTs) to
  // classify bytes in a branchless SIMD code sequence.
  //
  // [1]: https://arxiv.org/pdf/1902.08318.pdf
  //
  // The goal is to get a bit mask classifying different sets of bytes. For each
  // input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
  // character. Otherwise, we want the mask bits to be set with the following
  // logic derived by inspecting the high nibble and low nibble of the input:
  // bit0 = 1 for `_`: high `0x5` and low `0xF`
  // bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
  // bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
  // bit3 = 1 for `P-Z` and `p-z`: high `0x5` or `0x7` and low `0x0` - `0xA`
  // bit4 = unused
  // bit5 = unused
  // bit6 = unused
  // bit7 = unused
  //
  // No bits set means definitively non-ID ASCII character.
  //
  // bits 4-7 remain unused if we need to classify more characters.
  //
  // A byte is an identifier character iff the bitwise AND of its high-nibble
  // LUT entry and its low-nibble LUT entry is non-zero.
  const auto high_lut = _mm_setr_epi8(
      /*0x0:*/ 0b0000'0000,
      /*0x1:*/ 0b0000'0000,
      /*0x2:*/ 0b0000'0000,
      /*0x3:*/ 0b0000'0010,
      /*0x4:*/ 0b0000'0100,
      /*0x5:*/ 0b0000'1001,
      /*0x6:*/ 0b0000'0100,
      /*0x7:*/ 0b0000'1000,
      /*0x8:*/ 0b0000'0000,
      /*0x9:*/ 0b0000'0000,
      /*0xA:*/ 0b0000'0000,
      /*0xB:*/ 0b0000'0000,
      /*0xC:*/ 0b0000'0000,
      /*0xD:*/ 0b0000'0000,
      /*0xE:*/ 0b0000'0000,
      /*0xF:*/ 0b0000'0000);
  const auto low_lut = _mm_setr_epi8(
      /*0x0:*/ 0b0000'1010,
      /*0x1:*/ 0b0000'1110,
      /*0x2:*/ 0b0000'1110,
      /*0x3:*/ 0b0000'1110,
      /*0x4:*/ 0b0000'1110,
      /*0x5:*/ 0b0000'1110,
      /*0x6:*/ 0b0000'1110,
      /*0x7:*/ 0b0000'1110,
      /*0x8:*/ 0b0000'1110,
      /*0x9:*/ 0b0000'1110,
      /*0xA:*/ 0b0000'1100,
      /*0xB:*/ 0b0000'0100,
      /*0xC:*/ 0b0000'0100,
      /*0xD:*/ 0b0000'0100,
      /*0xE:*/ 0b0000'0100,
      /*0xF:*/ 0b0000'0101);

  // Use `ssize_t` for performance here as we index memory in a tight loop.
  ssize_t i = 0;
  const ssize_t size = text.size();
  while ((i + 16) <= size) {
    __m128i input =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));

    // The high bits of each byte indicate a non-ASCII character encoded using
    // UTF-8. Test those and fall back to the scalar code if present. These
    // bytes will also cause spurious zeros in the LUT results, but we can
    // ignore that because we track them independently here.
#if __SSE4_1__
    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
      break;
    }
#else
    if (_mm_movemask_epi8(input) != 0) {
      break;
    }
#endif

    // Do two LUT lookups and mask the results together to get the results for
    // both low and high nibbles. Note that we don't need to mask out the high
    // bit of input here because we track that above for UTF-8 handling.
    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
    // Note that the input needs to be masked to only include the high nibble or
    // we could end up with bit7 set forcing the result to a zero byte.
    __m128i input_high =
        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
    __m128i mask = _mm_and_si128(low_mask, high_mask);

    // Now compare to find the completely zero bytes. `cmpeq` turns exactly
    // the non-identifier lanes into 0xFF, so each set bit in the movemask
    // below marks a non-identifier byte.
    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);

    // Check if there are bits in the tail mask, which means zero bytes and the
    // end of the identifier. We could do this without materializing the scalar
    // mask on more recent CPUs, but we generally expect the median length we
    // encounter to be <16 characters and so we avoid the extra instruction in
    // that case and predict this branch to succeed so it is laid out in a
    // reasonable way.
    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
      // Move past the definitively classified bytes that are part of the
      // identifier, and return the complete identifier text.
      i += __builtin_ctz(tail_ascii_mask);
      return text.substr(0, i);
    }
    i += 16;
  }

  // Fallback to scalar loop. We only end up here when we don't have >=16
  // bytes to scan or we find a UTF-8 unicode character.
  // TODO: This assumes all Unicode characters are non-identifiers.
  while (i < size &&
         IsIdentifierByteTable[static_cast<unsigned char>(text[i])]) {
    ++i;
  }
  return text.substr(0, i);
#else
  // TODO: Optimize this with SIMD for other architectures.
  return text.take_while([](char c) {
    return IsIdentifierByteTable[static_cast<unsigned char>(c)];
  });
#endif
}
  195. // Implementation of the lexer logic itself.
  196. //
  197. // The design is that lexing can loop over the source buffer, consuming it into
  198. // tokens by calling into this API. This class handles the state and breaks down
  199. // the different lexing steps that may be used. It directly updates the provided
  200. // tokenized buffer with the lexed tokens.
  201. class TokenizedBuffer::Lexer {
  202. public:
  203. // Symbolic result of a lexing action. This indicates whether we successfully
  204. // lexed a token, or whether other lexing actions should be attempted.
  205. //
  206. // While it wraps a simple boolean state, its API both helps make the failures
  207. // more self documenting, and by consuming the actual token constructively
  208. // when one is produced, it helps ensure the correct result is returned.
  209. class LexResult {
  210. public:
  211. // Consumes (and discard) a valid token to construct a result
  212. // indicating a token has been produced. Relies on implicit conversions.
  213. // NOLINTNEXTLINE(google-explicit-constructor)
  214. LexResult(Token /*discarded_token*/) : LexResult(true) {}
  215. // Returns a result indicating no token was produced.
  216. static auto NoMatch() -> LexResult { return LexResult(false); }
  217. // Tests whether a token was produced by the lexing routine, and
  218. // the lexer can continue forming tokens.
  219. explicit operator bool() const { return formed_token_; }
  220. private:
  221. explicit LexResult(bool formed_token) : formed_token_(formed_token) {}
  222. bool formed_token_;
  223. };
  // The signature of a dispatch routine: given the lexer and the remaining
  // source text, consume some prefix and report whether a token was formed.
  using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef& source_text)
      -> LexResult;
  // One dispatch function for each possible leading byte of the remaining
  // source text.
  using DispatchTableT = std::array<DispatchFunctionT*, 256>;
  // Wires the lexer up to append lines and tokens to `buffer`, reporting
  // diagnostics through `consumer`. Seeds the buffer with an initial line
  // starting at offset 0.
  //
  // Two emitters are kept: `emitter_` locates diagnostics by source-buffer
  // position, while `token_emitter_` locates them by token.
  Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
      : buffer_(&buffer),
        translator_(&buffer),
        emitter_(translator_, consumer),
        token_translator_(&buffer),
        token_emitter_(token_translator_, consumer),
        current_line_(buffer.AddLine(LineInfo(0))),
        current_line_info_(&buffer.GetLineInfo(current_line_)) {}
  // Perform the necessary bookkeeping to step past a newline at the current
  // line and column: finalize the current line's length, append a new line
  // record starting just past the newline, and reset column/indent tracking.
  auto HandleNewline() -> void {
    current_line_info_->length = current_column_;

    // Note: `AddLine` may invalidate the cached `current_line_info_` pointer,
    // so the `start` field is read as part of the argument (before the call)
    // and the pointer is re-fetched immediately afterwards.
    current_line_ = buffer_->AddLine(
        LineInfo(current_line_info_->start + current_column_ + 1));
    current_line_info_ = &buffer_->GetLineInfo(current_line_);
    current_column_ = 0;
    set_indent_ = false;
  }
  245. auto NoteWhitespace() -> void {
  246. if (!buffer_->token_infos_.empty()) {
  247. buffer_->token_infos_.back().has_trailing_space = true;
  248. }
  249. }
  // Skips whitespace and comments at the front of `source_text`, keeping the
  // line/column bookkeeping up to date. Returns true when non-whitespace text
  // remains to be lexed, false when the source is exhausted.
  //
  // Comments are line-oriented (`//` to end of line) and lexed as-if they
  // were whitespace; misplaced or malformed comment introducers are
  // diagnosed here.
  auto SkipWhitespace(llvm::StringRef& source_text) -> bool {
    // Remember where we started so we can tell below whether any whitespace
    // was actually consumed before the next token.
    const char* const whitespace_start = source_text.begin();

    while (!source_text.empty()) {
      // We only support line-oriented commenting and lex comments as-if they
      // were whitespace.
      if (source_text.startswith("//")) {
        // Any comment must be the only non-whitespace on the line.
        if (set_indent_) {
          CARBON_DIAGNOSTIC(TrailingComment, Error,
                            "Trailing comments are not permitted.");

          emitter_.Emit(source_text.begin(), TrailingComment);
        }
        // The introducer '//' must be followed by whitespace or EOF.
        if (source_text.size() > 2 && !IsSpace(source_text[2])) {
          CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
                            "Whitespace is required after '//'.");

          emitter_.Emit(source_text.begin() + 2,
                        NoWhitespaceAfterCommentIntroducer);
        }
        // Consume the remainder of the commented line (but not its newline).
        while (!source_text.empty() && source_text.front() != '\n') {
          ++current_column_;
          source_text = source_text.drop_front();
        }
        if (source_text.empty()) {
          break;
        }
      }

      switch (source_text.front()) {
        default:
          // If we find a non-whitespace character without exhausting the
          // buffer, return true to continue lexing.
          CARBON_CHECK(!IsSpace(source_text.front()));
          if (whitespace_start != source_text.begin()) {
            NoteWhitespace();
          }
          return true;

        case '\n':
          // If this is the last character in the source, directly return here
          // to avoid creating an empty line.
          source_text = source_text.drop_front();
          if (source_text.empty()) {
            current_line_info_->length = current_column_;
            return false;
          }

          // Otherwise, add a line and set up to continue lexing.
          HandleNewline();
          continue;

        case ' ':
        case '\t':
          // Skip other forms of whitespace while tracking column.
          // TODO: This obviously needs looooots more work to handle unicode
          // whitespace as well as special handling to allow better tokenization
          // of operators. This is just a stub to check that our column
          // management works.
          ++current_column_;
          source_text = source_text.drop_front();
          continue;
      }
    }

    CARBON_CHECK(source_text.empty())
        << "Cannot reach here w/o finishing the text!";
    // Update the line length as this is also the end of a line.
    current_line_info_->length = current_column_;
    return false;
  }
  // Lexes a numeric literal (integer or real) from the front of
  // `source_text`. When no numeric literal can be formed at all, this falls
  // back to `LexError`; when the literal is recognized but its value cannot
  // be recovered, an error token covering the literal's text is produced.
  auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
    std::optional<LexedNumericLiteral> literal =
        LexedNumericLiteral::Lex(source_text);
    if (!literal) {
      return LexError(source_text);
    }

    // Capture the starting column and consume the literal's text before
    // dispatching on its computed value.
    int int_column = current_column_;
    int token_size = literal->text().size();
    current_column_ += token_size;
    source_text = source_text.drop_front(token_size);

    if (!set_indent_) {
      current_line_info_->indent = int_column;
      set_indent_ = true;
    }

    return VariantMatch(
        literal->ComputeValue(emitter_),
        [&](LexedNumericLiteral::IntegerValue&& value) {
          auto token = buffer_->AddToken({.kind = TokenKind::IntegerLiteral,
                                          .token_line = current_line_,
                                          .column = int_column});
          buffer_->GetTokenInfo(token).literal_index =
              buffer_->literal_int_storage_.size();
          buffer_->literal_int_storage_.push_back(std::move(value.value));
          return token;
        },
        [&](LexedNumericLiteral::RealValue&& value) {
          auto token = buffer_->AddToken({.kind = TokenKind::RealLiteral,
                                          .token_line = current_line_,
                                          .column = int_column});
          // A real literal stores two consecutive entries in the integer
          // storage: the mantissa first, then the exponent.
          buffer_->GetTokenInfo(token).literal_index =
              buffer_->literal_int_storage_.size();
          buffer_->literal_int_storage_.push_back(std::move(value.mantissa));
          buffer_->literal_int_storage_.push_back(std::move(value.exponent));
          CARBON_CHECK(buffer_->GetRealLiteral(token).IsDecimal() ==
                       (value.radix == LexedNumericLiteral::Radix::Decimal));
          return token;
        },
        [&](LexedNumericLiteral::UnrecoverableError) {
          auto token = buffer_->AddToken({
              .kind = TokenKind::Error,
              .token_line = current_line_,
              .column = int_column,
              .error_length = token_size,
          });
          return token;
        });
  }
  // Lexes a string literal from the front of `source_text`. Handles both
  // single-line and multi-line literals, updating line/column bookkeeping for
  // any newlines the literal spans. An unterminated literal produces an error
  // token covering the literal's text.
  auto LexStringLiteral(llvm::StringRef& source_text) -> LexResult {
    std::optional<LexedStringLiteral> literal =
        LexedStringLiteral::Lex(source_text);
    if (!literal) {
      return LexError(source_text);
    }

    // The token is attributed to the line/column where the literal begins,
    // even if it spans several lines.
    Line string_line = current_line_;
    int string_column = current_column_;
    int literal_size = literal->text().size();
    source_text = source_text.drop_front(literal_size);

    if (!set_indent_) {
      current_line_info_->indent = string_column;
      set_indent_ = true;
    }

    // Update line and column information.
    if (!literal->is_multi_line()) {
      current_column_ += literal_size;
    } else {
      for (char c : literal->text()) {
        if (c == '\n') {
          HandleNewline();
          // The indentation of all lines in a multi-line string literal is
          // that of the first line.
          current_line_info_->indent = string_column;
          set_indent_ = true;
        } else {
          ++current_column_;
        }
      }
    }

    if (literal->is_terminated()) {
      auto token =
          buffer_->AddToken({.kind = TokenKind::StringLiteral,
                             .token_line = string_line,
                             .column = string_column,
                             .literal_index = static_cast<int32_t>(
                                 buffer_->literal_string_storage_.size())});
      buffer_->literal_string_storage_.push_back(
          literal->ComputeValue(emitter_));
      return token;
    } else {
      CARBON_DIAGNOSTIC(UnterminatedString, Error,
                        "String is missing a terminator.");
      emitter_.Emit(literal->text().begin(), UnterminatedString);
      return buffer_->AddToken({.kind = TokenKind::Error,
                                .token_line = string_line,
                                .column = string_column,
                                .error_length = literal_size});
    }
  }
  // Lexes a symbol token from the front of `source_text`. When `kind` is the
  // default `Error` placeholder, the definitive symbol kind is recomputed
  // from the full symbol spelling table; otherwise the provided kind is used
  // directly (and cross-checked in debug builds). Also maintains the stack of
  // open grouping symbols, pairing openers with closers and diagnosing a
  // closer that has no matching opener.
  auto LexSymbolToken(llvm::StringRef& source_text,
                      TokenKind kind = TokenKind::Error) -> LexResult {
    auto compute_symbol_kind = [](llvm::StringRef source_text) {
      return llvm::StringSwitch<TokenKind>(source_text)
#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
  .StartsWith(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
          .Default(TokenKind::Error);
    };

    // We use the `error` token as a place-holder for cases where one character
    // isn't enough to pick a definitive symbol token. Recompute the kind using
    // the full symbol set.
    if (LLVM_UNLIKELY(kind == TokenKind::Error)) {
      kind = compute_symbol_kind(source_text);
      if (kind == TokenKind::Error) {
        return LexError(source_text);
      }
    } else {
      // Verify in a debug build that the incoming token kind is correct.
      CARBON_DCHECK(kind == compute_symbol_kind(source_text))
          << "Incoming token kind '" << kind
          << "' does not match computed kind '"
          << compute_symbol_kind(source_text) << "'!";
    }

    if (!set_indent_) {
      current_line_info_->indent = current_column_;
      set_indent_ = true;
    }

    CloseInvalidOpenGroups(kind);

    const char* location = source_text.begin();
    Token token = buffer_->AddToken(
        {.kind = kind, .token_line = current_line_, .column = current_column_});
    current_column_ += kind.fixed_spelling().size();
    source_text = source_text.drop_front(kind.fixed_spelling().size());

    // Opening symbols just need to be pushed onto our queue of opening groups.
    if (kind.is_opening_symbol()) {
      open_groups_.push_back(token);
      return token;
    }

    // Only closing symbols need further special handling.
    if (!kind.is_closing_symbol()) {
      return token;
    }

    TokenInfo& closing_token_info = buffer_->GetTokenInfo(token);

    // Check that there is a matching opening symbol before we consume this as
    // a closing symbol.
    if (open_groups_.empty()) {
      closing_token_info.kind = TokenKind::Error;
      closing_token_info.error_length = kind.fixed_spelling().size();

      CARBON_DIAGNOSTIC(
          UnmatchedClosing, Error,
          "Closing symbol without a corresponding opening symbol.");
      emitter_.Emit(location, UnmatchedClosing);
      // Note that this still returns true as we do consume a symbol.
      return token;
    }

    // Finally can handle a normal closing symbol.
    Token opening_token = open_groups_.pop_back_val();
    TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
    opening_token_info.closing_token = token;
    closing_token_info.opening_token = opening_token;
    return token;
  }
  475. // Given a word that has already been lexed, determine whether it is a type
  476. // literal and if so form the corresponding token.
  477. auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
  478. -> LexResult {
  479. if (word.size() < 2) {
  480. // Too short to form one of these tokens.
  481. return LexResult::NoMatch();
  482. }
  483. if (word[1] < '1' || word[1] > '9') {
  484. // Doesn't start with a valid initial digit.
  485. return LexResult::NoMatch();
  486. }
  487. std::optional<TokenKind> kind;
  488. switch (word.front()) {
  489. case 'i':
  490. kind = TokenKind::IntegerTypeLiteral;
  491. break;
  492. case 'u':
  493. kind = TokenKind::UnsignedIntegerTypeLiteral;
  494. break;
  495. case 'f':
  496. kind = TokenKind::FloatingPointTypeLiteral;
  497. break;
  498. default:
  499. return LexResult::NoMatch();
  500. };
  501. llvm::StringRef suffix = word.substr(1);
  502. if (!CanLexInteger(emitter_, suffix)) {
  503. return buffer_->AddToken(
  504. {.kind = TokenKind::Error,
  505. .token_line = current_line_,
  506. .column = column,
  507. .error_length = static_cast<int32_t>(word.size())});
  508. }
  509. llvm::APInt suffix_value;
  510. if (suffix.getAsInteger(10, suffix_value)) {
  511. return LexResult::NoMatch();
  512. }
  513. auto token = buffer_->AddToken(
  514. {.kind = *kind, .token_line = current_line_, .column = column});
  515. buffer_->GetTokenInfo(token).literal_index =
  516. buffer_->literal_int_storage_.size();
  517. buffer_->literal_int_storage_.push_back(std::move(suffix_value));
  518. return token;
  519. }
  // Closes all open groups that cannot remain open across the symbol `K`.
  // Users may pass `Error` to close all open groups.
  //
  // For each group popped, a diagnostic is emitted on the opening token and a
  // synthetic recovery closing token is inserted so the group stays balanced.
  auto CloseInvalidOpenGroups(TokenKind kind) -> void {
    // Only closing symbols (or the forced `Error` case) can invalidate open
    // groups.
    if (!kind.is_closing_symbol() && kind != TokenKind::Error) {
      return;
    }

    while (!open_groups_.empty()) {
      Token opening_token = open_groups_.back();
      TokenKind opening_kind = buffer_->GetTokenInfo(opening_token).kind;
      // Stop once the innermost open group matches the incoming closer.
      if (kind == opening_kind.closing_symbol()) {
        return;
      }

      open_groups_.pop_back();
      CARBON_DIAGNOSTIC(
          MismatchedClosing, Error,
          "Closing symbol does not match most recent opening symbol.");
      token_emitter_.Emit(opening_token, MismatchedClosing);

      CARBON_CHECK(!buffer_->tokens().empty())
          << "Must have a prior opening token!";
      Token prev_token = buffer_->tokens().end()[-1];
      // TODO: do a smarter backwards scan for where to put the closing
      // token.
      Token closing_token = buffer_->AddToken(
          {.kind = opening_kind.closing_symbol(),
           .has_trailing_space = buffer_->HasTrailingWhitespace(prev_token),
           .is_recovery = true,
           .token_line = current_line_,
           .column = current_column_});
      // Note: both infos are fetched after `AddToken` so the references are
      // not invalidated by the insertion.
      TokenInfo& opening_token_info = buffer_->GetTokenInfo(opening_token);
      TokenInfo& closing_token_info = buffer_->GetTokenInfo(closing_token);
      opening_token_info.closing_token = closing_token;
      closing_token_info.opening_token = opening_token;
    }
  }
  554. auto GetOrCreateIdentifier(llvm::StringRef text) -> Identifier {
  555. auto insert_result = buffer_->identifier_map_.insert(
  556. {text, Identifier(buffer_->identifier_infos_.size())});
  557. if (insert_result.second) {
  558. buffer_->identifier_infos_.push_back({text});
  559. }
  560. return insert_result.first->second;
  561. }
  // Lexes a keyword, type literal, or identifier from the front of
  // `source_text`. Non-ASCII leading bytes currently fall back to the error
  // lexer, as Unicode identifiers are not yet supported.
  auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
    if (static_cast<unsigned char>(source_text.front()) > 0x7F) {
      // TODO: Need to add support for Unicode lexing.
      return LexError(source_text);
    }
    CARBON_CHECK(IsAlpha(source_text.front()) || source_text.front() == '_');

    if (!set_indent_) {
      current_line_info_->indent = current_column_;
      set_indent_ = true;
    }

    // Take the valid characters off the front of the source buffer.
    llvm::StringRef identifier_text = ScanForIdentifierPrefix(source_text);
    CARBON_CHECK(!identifier_text.empty())
        << "Must have at least one character!";
    int identifier_column = current_column_;
    current_column_ += identifier_text.size();
    source_text = source_text.drop_front(identifier_text.size());

    // Check if the text is a type literal, and if so form such a literal.
    if (LexResult result =
            LexWordAsTypeLiteralToken(identifier_text, identifier_column)) {
      return result;
    }

    // Check if the text matches a keyword token, and if so use that.
    TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
#define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
#include "toolchain/lexer/token_kind.def"
                         .Default(TokenKind::Error);
    if (kind != TokenKind::Error) {
      return buffer_->AddToken({.kind = kind,
                                .token_line = current_line_,
                                .column = identifier_column});
    }

    // Otherwise we have a generic identifier.
    return buffer_->AddToken({.kind = TokenKind::Identifier,
                              .token_line = current_line_,
                              .column = identifier_column,
                              .id = GetOrCreateIdentifier(identifier_text)});
  }
  // Consumes a run of unrecognized characters from the front of
  // `source_text`, forming a single error token that covers them and emitting
  // one diagnostic. Always consumes at least one byte so lexing makes
  // progress.
  auto LexError(llvm::StringRef& source_text) -> LexResult {
    llvm::StringRef error_text = source_text.take_while([](char c) {
      // Stop the error run at any character that could begin a valid token:
      // alphanumerics, `_`, whitespace we handle, or any symbol spelling.
      if (IsAlnum(c)) {
        return false;
      }
      switch (c) {
        case '_':
        case '\t':
        case '\n':
          return false;
        default:
          break;
      }
      return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
#define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
#include "toolchain/lexer/token_kind.def"
          .Default(true);
    });
    if (error_text.empty()) {
      // TODO: Reimplement this to use the lexer properly. In the meantime,
      // guarantee that we eat at least one byte.
      error_text = source_text.take_front(1);
    }

    auto token = buffer_->AddToken(
        {.kind = TokenKind::Error,
         .token_line = current_line_,
         .column = current_column_,
         .error_length = static_cast<int32_t>(error_text.size())});
    CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
                      "Encountered unrecognized characters while parsing.");
    emitter_.Emit(error_text.begin(), UnrecognizedCharacters);

    current_column_ += error_text.size();
    source_text = source_text.drop_front(error_text.size());
    return token;
  }
  // Appends the trailing end-of-file token at the current line and column.
  auto AddEndOfFileToken() -> void {
    buffer_->AddToken({.kind = TokenKind::EndOfFile,
                       .token_line = current_line_,
                       .column = current_column_});
  }
  // Builds the 256-entry table mapping a leading source byte to its lexing
  // routine, evaluated at compile time. Bytes that cannot start any token
  // dispatch to the error lexer.
  constexpr static auto MakeDispatchTable() -> DispatchTableT {
    DispatchTableT table = {};
    auto dispatch_lex_error = +[](Lexer& lexer, llvm::StringRef& source_text) {
      return lexer.LexError(source_text);
    };
    for (int i = 0; i < 256; ++i) {
      table[i] = dispatch_lex_error;
    }

    // Symbols have some special dispatching. First, set the first character of
    // each symbol token spelling to dispatch to the symbol lexer. We don't
    // provide a pre-computed token here, so the symbol lexer will compute the
    // exact symbol token kind.
    auto dispatch_lex_symbol = +[](Lexer& lexer, llvm::StringRef& source_text) {
      return lexer.LexSymbolToken(source_text);
    };
#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = dispatch_lex_symbol;
#include "toolchain/lexer/token_kind.def"

    // Now special cased single-character symbols that are guaranteed to not
    // join with another symbol. These are grouping symbols, terminators,
    // or separators in the grammar and have a good reason to be
    // orthogonal to any other punctuation. We do this separately because this
    // needs to override some of the generic handling above, and provide a
    // custom token.
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)                 \
  table[(Spelling)[0]] = +[](Lexer& lexer, llvm::StringRef& source_text) { \
    return lexer.LexSymbolToken(source_text, TokenKind::TokenName);        \
  };
#include "toolchain/lexer/token_kind.def"

    auto dispatch_lex_word = +[](Lexer& lexer, llvm::StringRef& source_text) {
      return lexer.LexKeywordOrIdentifier(source_text);
    };
    table['_'] = dispatch_lex_word;
    // Note that we don't use `llvm::seq` because this needs to be `constexpr`
    // evaluated.
    for (unsigned char c = 'a'; c <= 'z'; ++c) {
      table[c] = dispatch_lex_word;
    }
    for (unsigned char c = 'A'; c <= 'Z'; ++c) {
      table[c] = dispatch_lex_word;
    }
    // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
    // as whitespace characters should already have been skipped and the
    // only remaining valid Unicode characters would be part of an
    // identifier. That code can either accept or reject.
    for (int i = 0x80; i < 0x100; ++i) {
      table[i] = dispatch_lex_word;
    }

    auto dispatch_lex_numeric =
        +[](Lexer& lexer, llvm::StringRef& source_text) {
          return lexer.LexNumericLiteral(source_text);
        };
    for (unsigned char c = '0'; c <= '9'; ++c) {
      table[c] = dispatch_lex_numeric;
    }

    auto dispatch_lex_string = +[](Lexer& lexer, llvm::StringRef& source_text) {
      return lexer.LexStringLiteral(source_text);
    };
    table['\''] = dispatch_lex_string;
    table['"'] = dispatch_lex_string;
    table['#'] = dispatch_lex_string;

    return table;
  };
 private:
  // The buffer that tokens and lines are accumulated into.
  TokenizedBuffer* buffer_;

  // Translators and emitters used to report diagnostics, keyed either by a
  // raw source-buffer position or by a token.
  SourceBufferLocationTranslator translator_;
  LexerDiagnosticEmitter emitter_;
  TokenLocationTranslator token_translator_;
  TokenDiagnosticEmitter token_emitter_;

  // The line currently being lexed and a cached pointer to its info entry.
  Line current_line_;
  LineInfo* current_line_info_;

  // Zero-based column position within the current line.
  int current_column_ = 0;

  // NOTE(review): presumably tracks whether the current line's indent has
  // been recorded yet -- confirm against the lexing routines earlier in this
  // file.
  bool set_indent_ = false;

  // Opening grouping tokens that have not yet been matched with a closer.
  llvm::SmallVector<Token> open_groups_;
};
  715. auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
  716. -> TokenizedBuffer {
  717. TokenizedBuffer buffer(source);
  718. ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
  719. Lexer lexer(buffer, error_tracking_consumer);
  720. // Build a table of function pointers that we can use to dispatch to the
  721. // correct lexer routine based on the first byte of source text.
  722. //
  723. // While it is tempting to simply use a `switch` on the first byte and
  724. // dispatch with cases into this, in practice that doesn't produce great code.
  725. // There seem to be two issues that are the root cause.
  726. //
  727. // First, there are lots of different values of bytes that dispatch to a
  728. // fairly small set of routines, and then some byte values that dispatch
  729. // differently for each byte. This pattern isn't one that the compiler-based
  730. // lowering of switches works well with -- it tries to balance all the cases,
  731. // and in doing so emits several compares and other control flow rather than a
  732. // simple jump table.
  733. //
  734. // Second, with a `case`, it isn't as obvious how to create a single, uniform
  735. // interface that is effective for *every* byte value, and thus makes for a
  736. // single consistent table-based dispatch. By forcing these to be function
  737. // pointers, we also coerce the code to use a strictly homogeneous structure
  738. // that can form a single dispatch table.
  739. //
  740. // These two actually interact -- the second issue is part of what makes the
  741. // non-table lowering in the first one desirable for many switches and cases.
  742. //
  743. // Ultimately, when table-based dispatch is such an important technique, we
  744. // get better results by taking full control and manually creating the
  745. // dispatch structures.
  746. constexpr Lexer::DispatchTableT DispatchTable = Lexer::MakeDispatchTable();
  747. llvm::StringRef source_text = source.text();
  748. while (lexer.SkipWhitespace(source_text)) {
  749. Lexer::LexResult result =
  750. DispatchTable[static_cast<unsigned char>(source_text.front())](
  751. lexer, source_text);
  752. CARBON_CHECK(result) << "Failed to form a token!";
  753. }
  754. // The end-of-file token is always considered to be whitespace.
  755. lexer.NoteWhitespace();
  756. lexer.CloseInvalidOpenGroups(TokenKind::Error);
  757. lexer.AddEndOfFileToken();
  758. if (error_tracking_consumer.seen_error()) {
  759. buffer.has_errors_ = true;
  760. }
  761. return buffer;
  762. }
  763. auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
  764. return GetTokenInfo(token).kind;
  765. }
  766. auto TokenizedBuffer::GetLine(Token token) const -> Line {
  767. return GetTokenInfo(token).token_line;
  768. }
  769. auto TokenizedBuffer::GetLineNumber(Token token) const -> int {
  770. return GetLineNumber(GetLine(token));
  771. }
  772. auto TokenizedBuffer::GetColumnNumber(Token token) const -> int {
  773. return GetTokenInfo(token).column + 1;
  774. }
// Returns the spelling of the given token, either from its kind's fixed
// spelling, by re-reading (and for literals, re-lexing) the source text, or
// from the interned identifier text.
auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
  const auto& token_info = GetTokenInfo(token);
  // Keywords and symbols have a fixed spelling independent of the source.
  llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
  if (!fixed_spelling.empty()) {
    return fixed_spelling;
  }

  // Error tokens recorded how many bytes of source they consumed.
  if (token_info.kind == TokenKind::Error) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    return source_->text().substr(token_start, token_info.error_length);
  }

  // Refer back to the source text to preserve oddities like radix or digit
  // separators the author included. Re-lexing recovers the literal's length.
  if (token_info.kind == TokenKind::IntegerLiteral ||
      token_info.kind == TokenKind::RealLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<LexedNumericLiteral> relexed_token =
        LexedNumericLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
    return relexed_token->text();
  }

  // Refer back to the source text to find the original spelling, including
  // escape sequences etc.
  if (token_info.kind == TokenKind::StringLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<LexedStringLiteral> relexed_token =
        LexedStringLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
    return relexed_token->text();
  }

  // Refer back to the source text to avoid needing to reconstruct the
  // spelling from the size. The spelling is the single leading letter plus
  // the run of decimal digits after it, so scan forward from the second byte
  // and then widen the result to re-include the first.
  if (token_info.kind.is_sized_type_literal()) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    llvm::StringRef suffix =
        source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
    return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  }

  // The end-of-file token has an empty spelling.
  if (token_info.kind == TokenKind::EndOfFile) {
    return llvm::StringRef();
  }

  // Everything else must be an identifier, whose text is interned.
  CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  return GetIdentifierText(token_info.id);
}
  822. auto TokenizedBuffer::GetIdentifier(Token token) const -> Identifier {
  823. const auto& token_info = GetTokenInfo(token);
  824. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  825. return token_info.id;
  826. }
  827. auto TokenizedBuffer::GetIntegerLiteral(Token token) const
  828. -> const llvm::APInt& {
  829. const auto& token_info = GetTokenInfo(token);
  830. CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
  831. return literal_int_storage_[token_info.literal_index];
  832. }
  833. auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
  834. const auto& token_info = GetTokenInfo(token);
  835. CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  836. // Note that every real literal is at least three characters long, so we can
  837. // safely look at the second character to determine whether we have a
  838. // decimal or hexadecimal literal.
  839. const auto& line_info = GetLineInfo(token_info.token_line);
  840. int64_t token_start = line_info.start + token_info.column;
  841. char second_char = source_->text()[token_start + 1];
  842. bool is_decimal = second_char != 'x' && second_char != 'b';
  843. return RealLiteralValue(this, token_info.literal_index, is_decimal);
  844. }
  845. auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
  846. const auto& token_info = GetTokenInfo(token);
  847. CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  848. return literal_string_storage_[token_info.literal_index];
  849. }
  850. auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
  851. -> const llvm::APInt& {
  852. const auto& token_info = GetTokenInfo(token);
  853. CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  854. return literal_int_storage_[token_info.literal_index];
  855. }
  856. auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
  857. -> Token {
  858. const auto& opening_token_info = GetTokenInfo(opening_token);
  859. CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
  860. << opening_token_info.kind;
  861. return opening_token_info.closing_token;
  862. }
  863. auto TokenizedBuffer::GetMatchedOpeningToken(Token closing_token) const
  864. -> Token {
  865. const auto& closing_token_info = GetTokenInfo(closing_token);
  866. CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
  867. << closing_token_info.kind;
  868. return closing_token_info.opening_token;
  869. }
  870. auto TokenizedBuffer::HasLeadingWhitespace(Token token) const -> bool {
  871. auto it = TokenIterator(token);
  872. return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
  873. }
  874. auto TokenizedBuffer::HasTrailingWhitespace(Token token) const -> bool {
  875. return GetTokenInfo(token).has_trailing_space;
  876. }
  877. auto TokenizedBuffer::IsRecoveryToken(Token token) const -> bool {
  878. return GetTokenInfo(token).is_recovery;
  879. }
  880. auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
  881. return line.index + 1;
  882. }
  883. auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
  884. return GetLineInfo(line).indent + 1;
  885. }
  886. auto TokenizedBuffer::GetIdentifierText(Identifier identifier) const
  887. -> llvm::StringRef {
  888. return identifier_infos_[identifier.index].text;
  889. }
  890. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  891. index = std::max(widths.index, index);
  892. kind = std::max(widths.kind, kind);
  893. column = std::max(widths.column, column);
  894. line = std::max(widths.line, line);
  895. indent = std::max(widths.indent, indent);
  896. }
  897. // Compute the printed width of a number. When numbers are printed in decimal,
  898. // the number of digits needed is is one more than the log-base-10 of the
  899. // value. We handle a value of `zero` explicitly.
  900. //
  901. // This routine requires its argument to be *non-negative*.
  902. static auto ComputeDecimalPrintedWidth(int number) -> int {
  903. CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  904. if (number == 0) {
  905. return 1;
  906. }
  907. return static_cast<int>(std::log10(number)) + 1;
  908. }
  909. auto TokenizedBuffer::GetTokenPrintWidths(Token token) const -> PrintWidths {
  910. PrintWidths widths = {};
  911. widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  912. widths.kind = GetKind(token).name().size();
  913. widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  914. widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  915. widths.indent =
  916. ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  917. return widths;
  918. }
  919. auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  920. if (tokens().begin() == tokens().end()) {
  921. return;
  922. }
  923. PrintWidths widths = {};
  924. widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  925. for (Token token : tokens()) {
  926. widths.Widen(GetTokenPrintWidths(token));
  927. }
  928. output_stream << "[\n";
  929. for (Token token : tokens()) {
  930. PrintToken(output_stream, token, widths);
  931. output_stream << "\n";
  932. }
  933. output_stream << "]\n";
  934. }
  935. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
  936. Token token) const -> void {
  937. PrintToken(output_stream, token, {});
  938. }
// Prints one token as a single bracketed record, using (at least) the given
// column widths for alignment.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
                                 PrintWidths widths) const -> void {
  // Make sure the columns are wide enough for this token's own values.
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      "{ index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
                           widths.indent),
      token_text);

  // Append fields specific to the token's kind.
  switch (token_info.kind) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntegerLiteral:
      output_stream << ", value: `";
      GetIntegerLiteral(token).print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `" << GetRealLiteral(token) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `" << GetStringLiteral(token) << "`";
      break;
    default:
      // Grouping symbols print the index of their matched partner.
      if (token_info.kind.is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind.is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }

  if (token_info.has_trailing_space) {
    output_stream << ", has_trailing_space: true";
  }
  if (token_info.is_recovery) {
    output_stream << ", recovery: true";
  }

  output_stream << " },";
}
  992. auto TokenizedBuffer::GetLineInfo(Line line) -> LineInfo& {
  993. return line_infos_[line.index];
  994. }
  995. auto TokenizedBuffer::GetLineInfo(Line line) const -> const LineInfo& {
  996. return line_infos_[line.index];
  997. }
  998. auto TokenizedBuffer::AddLine(LineInfo info) -> Line {
  999. line_infos_.push_back(info);
  1000. return Line(static_cast<int>(line_infos_.size()) - 1);
  1001. }
  1002. auto TokenizedBuffer::GetTokenInfo(Token token) -> TokenInfo& {
  1003. return token_infos_[token.index];
  1004. }
  1005. auto TokenizedBuffer::GetTokenInfo(Token token) const -> const TokenInfo& {
  1006. return token_infos_[token.index];
  1007. }
  1008. auto TokenizedBuffer::AddToken(TokenInfo info) -> Token {
  1009. token_infos_.push_back(info);
  1010. expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  1011. return Token(static_cast<int>(token_infos_.size()) - 1);
  1012. }
  1013. auto TokenizedBuffer::TokenIterator::Print(llvm::raw_ostream& output) const
  1014. -> void {
  1015. output << token_.index;
  1016. }
// Maps a raw pointer into the source buffer to a diagnostic location
// (file name, line text, and one-based line/column numbers).
auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
    const char* loc) -> DiagnosticLocation {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
      << "location not within buffer";
  // Byte offset of `loc` from the start of the source buffer.
  int64_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location. Note that we can't
  // inspect `line.length` here because it is not necessarily correct for the
  // final line during lexing (but will be correct later for the parse tree).
  const auto* line_it = std::partition_point(
      buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
      << "location precedes the start of the first line";
  --line_it;
  // Both values are zero-based here; converted to one-based on return.
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;

  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  // the length will be npos and the line will be grabbed from the known start
  // to the end of the buffer; we'll then adjust the length.
  llvm::StringRef line =
      buffer_->source_->text().substr(line_it->start, line_it->length);
  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
        << "Currently we assume no unlexed newlines prior to the error column, "
           "but there was one when erroring at "
        << buffer_->source_->filename() << ":" << line_number << ":"
        << column_number;
    // Look for the next newline since we don't know the length. We can start at
    // the column because prior newlines will have been lexed.
    auto end_newline_pos = line.find('\n', column_number);
    if (end_newline_pos != llvm::StringRef::npos) {
      line = line.take_front(end_newline_pos);
    }
  }

  return {.file_name = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}
  1057. auto TokenizedBuffer::TokenLocationTranslator::GetLocation(Token token)
  1058. -> DiagnosticLocation {
  1059. // Map the token location into a position within the source buffer.
  1060. const auto& token_info = buffer_->GetTokenInfo(token);
  1061. const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  1062. const char* token_start =
  1063. buffer_->source_->text().begin() + line_info.start + token_info.column;
  1064. // Find the corresponding file location.
  1065. // TODO: Should we somehow indicate in the diagnostic location if this token
  1066. // is a recovery token that doesn't correspond to the original source?
  1067. return SourceBufferLocationTranslator(buffer_).GetLocation(token_start);
  1068. }
  1069. } // namespace Carbon