tokenized_buffer.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <algorithm>
  6. #include <cmath>
  7. #include "common/check.h"
  8. #include "common/string_helpers.h"
  9. #include "llvm/ADT/StringRef.h"
  10. #include "llvm/Support/Format.h"
  11. #include "llvm/Support/FormatVariadic.h"
  12. #include "toolchain/base/shared_value_stores.h"
  13. #include "toolchain/diagnostics/diagnostic_emitter.h"
  14. #include "toolchain/lex/character_set.h"
  15. #include "toolchain/lex/numeric_literal.h"
  16. #include "toolchain/lex/string_literal.h"
  17. namespace Carbon::Lex {
  18. auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
  19. return FindLineIndex(GetTokenInfo(token).byte_offset());
  20. }
  21. auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
  22. return GetLine(token).index + 1;
  23. }
  24. auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
  25. const auto& token_info = GetTokenInfo(token);
  26. const auto& line_info = GetLineInfo(FindLineIndex(token_info.byte_offset()));
  27. return token_info.byte_offset() - line_info.start + 1;
  28. }
  29. auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
  30. -> std::pair<LineIndex, int> {
  31. auto line = GetLine(token);
  32. int column = GetColumnNumber(token);
  33. auto token_text = GetTokenText(token);
  34. if (auto [before_newline, after_newline] = token_text.rsplit('\n');
  35. before_newline.size() == token_text.size()) {
  36. // Token fits on one line, advance the column number.
  37. column += before_newline.size();
  38. } else {
  39. // Token contains newlines.
  40. line.index += before_newline.count('\n') + 1;
  41. column = 1 + after_newline.size();
  42. }
  43. return {line, column};
  44. }
// Returns the textual spelling of `token` as it appears in the source.
//
// Kinds with a fixed spelling return the canonical string from the kind;
// everything else refers back into the original source text.
auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
  const auto& token_info = GetTokenInfo(token);
  // Symbols and keywords carry their spelling on the kind itself.
  llvm::StringRef fixed_spelling = token_info.kind().fixed_spelling();
  if (!fixed_spelling.empty()) {
    return fixed_spelling;
  }

  // Error tokens record an explicit length; slice that span out of the source.
  if (token_info.kind() == TokenKind::Error) {
    return source_->text().substr(token_info.byte_offset(),
                                  token_info.error_length());
  }

  // Refer back to the source text to preserve oddities like radix or digit
  // separators the author included. Re-lexing from the token's start recovers
  // the literal's extent.
  if (token_info.kind() == TokenKind::IntLiteral ||
      token_info.kind() == TokenKind::RealLiteral) {
    std::optional<NumericLiteral> relexed_token =
        NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()),
                            token_info.kind() == TokenKind::RealLiteral);
    CARBON_CHECK(relexed_token, "Could not reform numeric literal token.");
    return relexed_token->text();
  }

  // Refer back to the source text to find the original spelling, including
  // escape sequences etc.
  if (token_info.kind() == TokenKind::StringLiteral) {
    std::optional<StringLiteral> relexed_token =
        StringLiteral::Lex(source_->text().substr(token_info.byte_offset()));
    CARBON_CHECK(relexed_token, "Could not reform string literal token.");
    return relexed_token->text();
  }

  // Refer back to the source text to avoid needing to reconstruct the
  // spelling from the size.
  if (token_info.kind().is_sized_type_literal()) {
    // The suffix is the run of decimal digits after the leading letter; the
    // returned ref is widened by one byte to include that letter.
    llvm::StringRef suffix = source_->text()
                                 .substr(token_info.byte_offset() + 1)
                                 .take_while(IsDecimalDigit);
    return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  }

  // File boundary tokens have no spelling in the source.
  if (token_info.kind() == TokenKind::FileStart ||
      token_info.kind() == TokenKind::FileEnd) {
    return llvm::StringRef();
  }

  // Everything else must be an identifier, whose spelling is interned in the
  // identifier value store.
  CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
               token_info.kind());
  return value_stores_->identifiers().Get(token_info.ident_id());
}
  89. auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
  90. const auto& token_info = GetTokenInfo(token);
  91. CARBON_CHECK(token_info.kind() == TokenKind::Identifier, "{0}",
  92. token_info.kind());
  93. return token_info.ident_id();
  94. }
  95. auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
  96. const auto& token_info = GetTokenInfo(token);
  97. CARBON_CHECK(token_info.kind() == TokenKind::IntLiteral, "{0}",
  98. token_info.kind());
  99. return token_info.int_id();
  100. }
  101. auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
  102. const auto& token_info = GetTokenInfo(token);
  103. CARBON_CHECK(token_info.kind() == TokenKind::RealLiteral, "{0}",
  104. token_info.kind());
  105. return token_info.real_id();
  106. }
  107. auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
  108. -> StringLiteralValueId {
  109. const auto& token_info = GetTokenInfo(token);
  110. CARBON_CHECK(token_info.kind() == TokenKind::StringLiteral, "{0}",
  111. token_info.kind());
  112. return token_info.string_literal_id();
  113. }
  114. auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
  115. const auto& token_info = GetTokenInfo(token);
  116. CARBON_CHECK(token_info.kind().is_sized_type_literal(), "{0}",
  117. token_info.kind());
  118. return token_info.int_id();
  119. }
  120. auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
  121. -> TokenIndex {
  122. const auto& opening_token_info = GetTokenInfo(opening_token);
  123. CARBON_CHECK(opening_token_info.kind().is_opening_symbol(), "{0}",
  124. opening_token_info.kind());
  125. return opening_token_info.closing_token_index();
  126. }
  127. auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
  128. -> TokenIndex {
  129. const auto& closing_token_info = GetTokenInfo(closing_token);
  130. CARBON_CHECK(closing_token_info.kind().is_closing_symbol(), "{0}",
  131. closing_token_info.kind());
  132. return closing_token_info.opening_token_index();
  133. }
  134. auto TokenizedBuffer::IsRecoveryToken(TokenIndex token) const -> bool {
  135. if (recovery_tokens_.empty()) {
  136. return false;
  137. }
  138. return recovery_tokens_[token.index];
  139. }
  140. auto TokenizedBuffer::GetNextLine(LineIndex line) const -> LineIndex {
  141. LineIndex next(line.index + 1);
  142. CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
  143. return next;
  144. }
  145. auto TokenizedBuffer::GetPrevLine(LineIndex line) const -> LineIndex {
  146. CARBON_CHECK(line.index > 0);
  147. return LineIndex(line.index - 1);
  148. }
// Returns the 1-based column at which `line`'s indentation ends.
auto TokenizedBuffer::GetIndentColumnNumber(LineIndex line) const -> int {
  return GetLineInfo(line).indent + 1;
}
  152. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  153. index = std::max(widths.index, index);
  154. kind = std::max(widths.kind, kind);
  155. column = std::max(widths.column, column);
  156. line = std::max(widths.line, line);
  157. indent = std::max(widths.indent, indent);
  158. }
  159. // Compute the printed width of a number. When numbers are printed in decimal,
  160. // the number of digits needed is one more than the log-base-10 of the
  161. // value. We handle a value of `zero` explicitly.
  162. //
  163. // This routine requires its argument to be *non-negative*.
  164. static auto ComputeDecimalPrintedWidth(int number) -> int {
  165. CARBON_CHECK(number >= 0, "Negative numbers are not supported.");
  166. if (number == 0) {
  167. return 1;
  168. }
  169. return static_cast<int>(std::log10(number)) + 1;
  170. }
  171. auto TokenizedBuffer::GetTokenPrintWidths(TokenIndex token) const
  172. -> PrintWidths {
  173. PrintWidths widths = {};
  174. widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  175. widths.kind = GetKind(token).name().size();
  176. widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  177. widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  178. widths.indent =
  179. ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  180. return widths;
  181. }
  182. auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream,
  183. bool omit_file_boundary_tokens) const -> void {
  184. output_stream << "- filename: " << source_->filename() << "\n"
  185. << " tokens:\n";
  186. PrintWidths widths = {};
  187. widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  188. for (TokenIndex token : tokens()) {
  189. widths.Widen(GetTokenPrintWidths(token));
  190. }
  191. for (TokenIndex token : tokens()) {
  192. if (omit_file_boundary_tokens) {
  193. auto kind = GetKind(token);
  194. if (kind == TokenKind::FileStart || kind == TokenKind::FileEnd) {
  195. continue;
  196. }
  197. }
  198. PrintToken(output_stream, token, widths);
  199. output_stream << "\n";
  200. }
  201. }
  202. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
  203. TokenIndex token) const -> void {
  204. PrintToken(output_stream, token, {});
  205. }
// Prints a single token as one `{ ... }` entry, padding the numeric fields to
// `widths` so a sequence of printed tokens lines up in columns.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
                                 TokenIndex token, PrintWidths widths) const
    -> void {
  // Make sure the padding is at least wide enough for this token's own fields.
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  LineIndex line_index = FindLineIndex(token_info.byte_offset());
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      " - { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(
          llvm::formatv("'{0}'", token_info.kind().name()).str(),
          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(line_index), widths.indent),
      token_text);

  // Append kind-specific fields that aren't derivable from the spelling.
  switch (token_info.kind()) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntLiteral:
      output_stream << ", value: `";
      value_stores_->ints()
          .Get(GetIntLiteral(token))
          .print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `"
                    << value_stores_->reals().Get(GetRealLiteral(token)) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `"
                    << value_stores_->string_literal_values().Get(
                           GetStringLiteralValue(token))
                    << "`";
      break;
    default:
      // Grouping symbols print a cross-reference to their matched partner.
      if (token_info.kind().is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind().is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }
  if (token_info.has_leading_space()) {
    output_stream << ", has_leading_space: true";
  }
  if (IsRecoveryToken(token)) {
    output_stream << ", recovery: true";
  }
  output_stream << " }";
}
// Find the line index corresponding to a specific byte offset within the source
// text for this tokenized buffer.
//
// This takes advantage of the lines being sorted by their starting byte offsets
// to do a binary search for the line that contains the provided offset.
auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
  CARBON_DCHECK(!line_infos_.empty());
  // `partition_point` finds the first line starting *after* `byte_offset`, so
  // the containing line is the one immediately before it.
  const auto* line_it =
      llvm::partition_point(line_infos_, [byte_offset](LineInfo line_info) {
        return line_info.start <= byte_offset;
      });
  --line_it;
  // If this isn't the first line but it starts past the end of the source, then
  // this is a synthetic line added for simplicity of lexing. Step back one
  // further to find the last non-synthetic line.
  if (line_it != line_infos_.begin() &&
      line_it->start == static_cast<int32_t>(source_->text().size())) {
    --line_it;
  }
  CARBON_DCHECK(line_it->start <= byte_offset);
  return LineIndex(line_it - line_infos_.begin());
}
// Mutable accessor for the info record of `line`.
auto TokenizedBuffer::GetLineInfo(LineIndex line) -> LineInfo& {
  return line_infos_[line.index];
}
// Const accessor for the info record of `line`.
auto TokenizedBuffer::GetLineInfo(LineIndex line) const -> const LineInfo& {
  return line_infos_[line.index];
}
  295. auto TokenizedBuffer::AddLine(LineInfo info) -> LineIndex {
  296. line_infos_.push_back(info);
  297. return LineIndex(static_cast<int>(line_infos_.size()) - 1);
  298. }
  299. auto TokenizedBuffer::IsAfterComment(TokenIndex token,
  300. CommentIndex comment_index) const -> bool {
  301. const auto& comment_data = comments_[comment_index.index];
  302. return GetTokenInfo(token).byte_offset() > comment_data.start;
  303. }
  304. auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
  305. -> llvm::StringRef {
  306. const auto& comment_data = comments_[comment_index.index];
  307. return source_->text().substr(comment_data.start, comment_data.length);
  308. }
// Records a comment spanning bytes [start, end) of the source text.
//
// When the previous recorded comment ends exactly `indent` bytes before
// `start`, the new range is folded into it rather than stored separately —
// presumably so consecutive comment lines at the same indentation form one
// logical comment (TODO(review): confirm the separator accounting).
auto TokenizedBuffer::AddComment(int32_t indent, int32_t start, int32_t end)
    -> void {
  if (!comments_.empty()) {
    auto& comment = comments_.back();
    if (comment.start + comment.length + indent == start) {
      // Extend the previous comment to cover this range as well.
      comment.length = end - comment.start;
      return;
    }
  }
  comments_.push_back({.start = start, .length = end - start});
}
  320. auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
  321. llvm::StringRef label) const -> void {
  322. mem_usage.Collect(MemUsage::ConcatLabel(label, "allocator_"), allocator_);
  323. mem_usage.Collect(MemUsage::ConcatLabel(label, "token_infos_"), token_infos_);
  324. mem_usage.Collect(MemUsage::ConcatLabel(label, "line_infos_"), line_infos_);
  325. mem_usage.Collect(MemUsage::ConcatLabel(label, "comments_"), comments_);
  326. }
// Converts a raw pointer into the source text into a diagnostic location:
// filename, 1-based line and column numbers, and the containing line's text.
auto TokenizedBuffer::SourceBufferDiagnosticConverter::ConvertLoc(
    const char* loc, ContextFnT /*context_fn*/) const -> DiagnosticLoc {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc),
               "location not within buffer");
  int32_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location.
  const auto* next_line_it = llvm::partition_point(
      buffer_->line_infos_,
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(next_line_it != buffer_->line_infos_.begin(),
               "location precedes the start of the first line");
  const auto* line_it = std::prev(next_line_it);
  // Both of these are 0-based here; they are converted to 1-based below.
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;

  // Grab the line from the buffer by slicing from this line to the next
  // minus the newline. When on the last line, instead use the start to the end
  // of the buffer.
  llvm::StringRef text = buffer_->source_->text();
  llvm::StringRef line = next_line_it != buffer_->line_infos_.end()
                             ? text.slice(line_it->start, next_line_it->start)
                             : text.substr(line_it->start);

  // Remove a newline at the end of the line if present.
  // TODO: This should expand to remove all vertical whitespace bytes at the
  // tail of the line such as CR+LF, etc.
  line.consume_back("\n");

  return {.filename = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}
// Converts a token into a diagnostic location by delegating to the
// source-buffer converter at the token's starting byte.
auto TokenDiagnosticConverter::ConvertLoc(TokenIndex token,
                                          ContextFnT context_fn) const
    -> DiagnosticLoc {
  // Map the token location into a position within the source buffer.
  const auto& token_info = buffer_->GetTokenInfo(token);
  const char* token_start =
      buffer_->source_->text().begin() + token_info.byte_offset();

  // Find the corresponding file location.
  // TODO: Should we somehow indicate in the diagnostic location if this token
  // is a recovery token that doesn't correspond to the original source?
  DiagnosticLoc loc =
      TokenizedBuffer::SourceBufferDiagnosticConverter(buffer_).ConvertLoc(
          token_start, context_fn);
  // Widen the location to cover the token's full spelling.
  loc.length = buffer_->GetTokenText(token).size();
  return loc;
}
  374. } // namespace Carbon::Lex