tokenized_buffer.cpp

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "toolchain/lex/tokenized_buffer.h"

#include <cmath>

#include "common/check.h"
#include "common/string_helpers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
#include "toolchain/base/value_store.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/character_set.h"
#include "toolchain/lex/numeric_literal.h"
#include "toolchain/lex/string_literal.h"

namespace Carbon::Lex {

auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return GetTokenInfo(token).kind;
}

auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
  return GetTokenInfo(token).token_line;
}

auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
  return GetLineNumber(GetLine(token));
}

auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
  return GetTokenInfo(token).column + 1;
}

auto TokenizedBuffer::GetEndLocation(TokenIndex token) const
    -> std::pair<LineIndex, int> {
  auto line = GetLine(token);
  int column = GetColumnNumber(token);
  auto token_text = GetTokenText(token);
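  // Note: `rsplit` leaves the whole text in `before_newline` when there is no
  // newline, so the size comparison below detects single-line tokens.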
  if (auto [before_newline, after_newline] = token_text.rsplit('\n');
      before_newline.size() == token_text.size()) {
    // Token fits on one line, advance the column number.
    column += before_newline.size();
  } else {
    // Token contains newlines.
    line.index += before_newline.count('\n') + 1;
    column = 1 + after_newline.size();
  }
  return {line, column};
}

auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
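  // Token kinds with a fixed spelling don't need the source text; return the
  // spelling directly.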
  if (!fixed_spelling.empty()) {
    return fixed_spelling;
  }

  if (token_info.kind == TokenKind::Error) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    return source_->text().substr(token_start, token_info.error_length);
  }

  // Refer back to the source text to preserve oddities like radix or digit
  // separators the author included.
  if (token_info.kind == TokenKind::IntLiteral ||
      token_info.kind == TokenKind::RealLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<NumericLiteral> relexed_token =
        NumericLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
    return relexed_token->text();
  }

  // Refer back to the source text to find the original spelling, including
  // escape sequences etc.
  if (token_info.kind == TokenKind::StringLiteral) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    std::optional<StringLiteral> relexed_token =
        StringLiteral::Lex(source_->text().substr(token_start));
    CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
    return relexed_token->text();
  }

  // Refer back to the source text to avoid needing to reconstruct the
  // spelling from the size.
  if (token_info.kind.is_sized_type_literal()) {
    const auto& line_info = GetLineInfo(token_info.token_line);
    int64_t token_start = line_info.start + token_info.column;
    llvm::StringRef suffix =
        source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
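    // The digits above follow a single type letter; widen the suffix one
    // character to the left so the returned spelling includes that letter.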
    return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  }

  if (token_info.kind == TokenKind::FileStart ||
      token_info.kind == TokenKind::FileEnd) {
    return llvm::StringRef();
  }

  CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  return value_stores_->identifiers().Get(token_info.ident_id);
}

auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  return token_info.ident_id;
}

auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind == TokenKind::IntLiteral) << token_info.kind;
  return token_info.int_id;
}

auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  return token_info.real_id;
}

auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
    -> StringLiteralValueId {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  return token_info.string_literal_id;
}

auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const
    -> const llvm::APInt& {
  const auto& token_info = GetTokenInfo(token);
  CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  return value_stores_->ints().Get(token_info.int_id);
}

auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
    -> TokenIndex {
  const auto& opening_token_info = GetTokenInfo(opening_token);
  CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
      << opening_token_info.kind;
  return opening_token_info.closing_token;
}

auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
    -> TokenIndex {
  const auto& closing_token_info = GetTokenInfo(closing_token);
  CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
      << closing_token_info.kind;
  return closing_token_info.opening_token;
}

auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const -> bool {
  auto it = TokenIterator(token);
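  // Whitespace is recorded as trailing space on the preceding token, so check
  // that here; the first token in the buffer is treated as having leading
  // whitespace.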
  return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
}

auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const -> bool {
  return GetTokenInfo(token).has_trailing_space;
}

auto TokenizedBuffer::IsRecoveryToken(TokenIndex token) const -> bool {
  return GetTokenInfo(token).is_recovery;
}

auto TokenizedBuffer::GetLineNumber(LineIndex line) const -> int {
  return line.index + 1;
}

auto TokenizedBuffer::GetNextLine(LineIndex line) const -> LineIndex {
  LineIndex next(line.index + 1);
  CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
  return next;
}

auto TokenizedBuffer::GetPrevLine(LineIndex line) const -> LineIndex {
  CARBON_CHECK(line.index > 0);
  return LineIndex(line.index - 1);
}

auto TokenizedBuffer::GetIndentColumnNumber(LineIndex line) const -> int {
  return GetLineInfo(line).indent + 1;
}

auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  index = std::max(widths.index, index);
  kind = std::max(widths.kind, kind);
  column = std::max(widths.column, column);
  line = std::max(widths.line, line);
  indent = std::max(widths.indent, indent);
}

// Compute the printed width of a number. When numbers are printed in decimal,
// the number of digits needed is one more than the log-base-10 of the value.
// We handle a value of zero explicitly.
//
// This routine requires its argument to be *non-negative*.
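// For example, 999 needs 3 printed digits and 1000 needs 4.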
static auto ComputeDecimalPrintedWidth(int number) -> int {
  CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  if (number == 0) {
    return 1;
  }
  return static_cast<int>(std::log10(number)) + 1;
}

auto TokenizedBuffer::GetTokenPrintWidths(TokenIndex token) const
    -> PrintWidths {
  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  widths.kind = GetKind(token).name().size();
  widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  widths.indent =
      ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  return widths;
}
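
// Prints the buffer as a YAML-like list of tokens, pre-computing the column
// widths so that the printed fields line up.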
auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  if (tokens().begin() == tokens().end()) {
    return;
  }

  output_stream << "- filename: " << source_->filename() << "\n"
                << "  tokens: [\n";

  PrintWidths widths = {};
  widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  for (TokenIndex token : tokens()) {
    widths.Widen(GetTokenPrintWidths(token));
  }

  for (TokenIndex token : tokens()) {
    PrintToken(output_stream, token, widths);
    output_stream << "\n";
  }
  output_stream << "  ]\n";
}

auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 TokenIndex token) const -> void {
  PrintToken(output_stream, token, {});
}

auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 TokenIndex token, PrintWidths widths) const
    -> void {
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      "    { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
                           widths.indent),
      token_text);

  switch (token_info.kind) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntLiteral:
      output_stream << ", value: `";
      value_stores_->ints()
          .Get(GetIntLiteral(token))
          .print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `"
                    << value_stores_->reals().Get(GetRealLiteral(token)) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `"
                    << value_stores_->string_literal_values().Get(
                           GetStringLiteralValue(token))
                    << "`";
      break;
    default:
      if (token_info.kind.is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind.is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }

  if (token_info.has_trailing_space) {
    output_stream << ", has_trailing_space: true";
  }
  if (token_info.is_recovery) {
    output_stream << ", recovery: true";
  }

  output_stream << " },";
}

auto TokenizedBuffer::GetLineInfo(LineIndex line) -> LineInfo& {
  return line_infos_[line.index];
}

auto TokenizedBuffer::GetLineInfo(LineIndex line) const -> const LineInfo& {
  return line_infos_[line.index];
}

auto TokenizedBuffer::AddLine(LineInfo info) -> LineIndex {
  line_infos_.push_back(info);
  return LineIndex(static_cast<int>(line_infos_.size()) - 1);
}

auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
  return token_infos_[token.index];
}

auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const -> const TokenInfo& {
  return token_infos_[token.index];
}

auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  token_infos_.push_back(info);
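  // Fold this token's contribution into the running estimate of the expected
  // parse tree size.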
  expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  return TokenIndex(static_cast<int>(token_infos_.size()) - 1);
}

auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
  output << token_.index;
}
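
// Maps a raw pointer into the source buffer's text to a diagnostic location
// with the line and column containing that pointer.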
auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
    const char* loc) -> DiagnosticLocation {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
      << "location not within buffer";
  int64_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location. Note that we can't
  // inspect `line.length` here because it is not necessarily correct for the
  // final line during lexing (but will be correct later for the parse tree).
  const auto* line_it = std::partition_point(
      buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
      << "location precedes the start of the first line";
  --line_it;
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;
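  // `line_number` and `column_number` are zero-based here; the returned
  // location converts them to one-based values.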

  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  // the length will be npos and the line will be grabbed from the known start
  // to the end of the buffer; we'll then adjust the length.
  llvm::StringRef line =
      buffer_->source_->text().substr(line_it->start, line_it->length);
  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
        << "Currently we assume no unlexed newlines prior to the error column, "
           "but there was one when erroring at "
        << buffer_->source_->filename() << ":" << line_number << ":"
        << column_number;
    // Look for the next newline since we don't know the length. We can start
    // at the column because prior newlines will have been lexed.
    auto end_newline_pos = line.find('\n', column_number);
    if (end_newline_pos != llvm::StringRef::npos) {
      line = line.take_front(end_newline_pos);
    }
  }

  return {.file_name = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}

auto TokenLocationTranslator::GetLocation(TokenIndex token)
    -> DiagnosticLocation {
  // Map the token location into a position within the source buffer.
  const auto& token_info = buffer_->GetTokenInfo(token);
  const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  const char* token_start =
      buffer_->source_->text().begin() + line_info.start + token_info.column;

  // Find the corresponding file location.
  // TODO: Should we somehow indicate in the diagnostic location if this token
  // is a recovery token that doesn't correspond to the original source?
  DiagnosticLocation loc =
      TokenizedBuffer::SourceBufferLocationTranslator(buffer_).GetLocation(
          token_start);
  loc.length = buffer_->GetTokenText(token).size();
  return loc;
}

}  // namespace Carbon::Lex