tokenized_buffer.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <cmath>
  6. #include "common/check.h"
  7. #include "common/string_helpers.h"
  8. #include "llvm/ADT/StringRef.h"
  9. #include "llvm/Support/Format.h"
  10. #include "llvm/Support/FormatVariadic.h"
  11. #include "toolchain/base/value_store.h"
  12. #include "toolchain/lex/character_set.h"
  13. #include "toolchain/lex/numeric_literal.h"
  14. #include "toolchain/lex/string_literal.h"
  15. namespace Carbon::Lex {
  16. auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
  17. return GetTokenInfo(token).kind;
  18. }
  19. auto TokenizedBuffer::GetLine(Token token) const -> Line {
  20. return GetTokenInfo(token).token_line;
  21. }
  22. auto TokenizedBuffer::GetLineNumber(Token token) const -> int {
  23. return GetLineNumber(GetLine(token));
  24. }
  25. auto TokenizedBuffer::GetColumnNumber(Token token) const -> int {
  26. return GetTokenInfo(token).column + 1;
  27. }
  28. auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
  29. const auto& token_info = GetTokenInfo(token);
  30. llvm::StringRef fixed_spelling = token_info.kind.fixed_spelling();
  31. if (!fixed_spelling.empty()) {
  32. return fixed_spelling;
  33. }
  34. if (token_info.kind == TokenKind::Error) {
  35. const auto& line_info = GetLineInfo(token_info.token_line);
  36. int64_t token_start = line_info.start + token_info.column;
  37. return source_->text().substr(token_start, token_info.error_length);
  38. }
  39. // Refer back to the source text to preserve oddities like radix or digit
  40. // separators the author included.
  41. if (token_info.kind == TokenKind::IntegerLiteral ||
  42. token_info.kind == TokenKind::RealLiteral) {
  43. const auto& line_info = GetLineInfo(token_info.token_line);
  44. int64_t token_start = line_info.start + token_info.column;
  45. std::optional<NumericLiteral> relexed_token =
  46. NumericLiteral::Lex(source_->text().substr(token_start));
  47. CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
  48. return relexed_token->text();
  49. }
  50. // Refer back to the source text to find the original spelling, including
  51. // escape sequences etc.
  52. if (token_info.kind == TokenKind::StringLiteral) {
  53. const auto& line_info = GetLineInfo(token_info.token_line);
  54. int64_t token_start = line_info.start + token_info.column;
  55. std::optional<StringLiteral> relexed_token =
  56. StringLiteral::Lex(source_->text().substr(token_start));
  57. CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
  58. return relexed_token->text();
  59. }
  60. // Refer back to the source text to avoid needing to reconstruct the
  61. // spelling from the size.
  62. if (token_info.kind.is_sized_type_literal()) {
  63. const auto& line_info = GetLineInfo(token_info.token_line);
  64. int64_t token_start = line_info.start + token_info.column;
  65. llvm::StringRef suffix =
  66. source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
  67. return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
  68. }
  69. if (token_info.kind == TokenKind::StartOfFile ||
  70. token_info.kind == TokenKind::EndOfFile) {
  71. return llvm::StringRef();
  72. }
  73. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  74. return value_stores_->identifiers().Get(token_info.ident_id);
  75. }
  76. auto TokenizedBuffer::GetIdentifier(Token token) const -> IdentifierId {
  77. const auto& token_info = GetTokenInfo(token);
  78. CARBON_CHECK(token_info.kind == TokenKind::Identifier) << token_info.kind;
  79. return token_info.ident_id;
  80. }
  81. auto TokenizedBuffer::GetIntegerLiteral(Token token) const -> IntegerId {
  82. const auto& token_info = GetTokenInfo(token);
  83. CARBON_CHECK(token_info.kind == TokenKind::IntegerLiteral) << token_info.kind;
  84. return token_info.integer_id;
  85. }
  86. auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealId {
  87. const auto& token_info = GetTokenInfo(token);
  88. CARBON_CHECK(token_info.kind == TokenKind::RealLiteral) << token_info.kind;
  89. return token_info.real_id;
  90. }
  91. auto TokenizedBuffer::GetStringLiteral(Token token) const -> StringLiteralId {
  92. const auto& token_info = GetTokenInfo(token);
  93. CARBON_CHECK(token_info.kind == TokenKind::StringLiteral) << token_info.kind;
  94. return token_info.string_literal_id;
  95. }
  96. auto TokenizedBuffer::GetTypeLiteralSize(Token token) const
  97. -> const llvm::APInt& {
  98. const auto& token_info = GetTokenInfo(token);
  99. CARBON_CHECK(token_info.kind.is_sized_type_literal()) << token_info.kind;
  100. return value_stores_->integers().Get(token_info.integer_id);
  101. }
  102. auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
  103. -> Token {
  104. const auto& opening_token_info = GetTokenInfo(opening_token);
  105. CARBON_CHECK(opening_token_info.kind.is_opening_symbol())
  106. << opening_token_info.kind;
  107. return opening_token_info.closing_token;
  108. }
  109. auto TokenizedBuffer::GetMatchedOpeningToken(Token closing_token) const
  110. -> Token {
  111. const auto& closing_token_info = GetTokenInfo(closing_token);
  112. CARBON_CHECK(closing_token_info.kind.is_closing_symbol())
  113. << closing_token_info.kind;
  114. return closing_token_info.opening_token;
  115. }
  116. auto TokenizedBuffer::HasLeadingWhitespace(Token token) const -> bool {
  117. auto it = TokenIterator(token);
  118. return it == tokens().begin() || GetTokenInfo(*(it - 1)).has_trailing_space;
  119. }
  120. auto TokenizedBuffer::HasTrailingWhitespace(Token token) const -> bool {
  121. return GetTokenInfo(token).has_trailing_space;
  122. }
  123. auto TokenizedBuffer::IsRecoveryToken(Token token) const -> bool {
  124. return GetTokenInfo(token).is_recovery;
  125. }
  126. auto TokenizedBuffer::GetLineNumber(Line line) const -> int {
  127. return line.index + 1;
  128. }
  129. auto TokenizedBuffer::GetNextLine(Line line) const -> Line {
  130. Line next(line.index + 1);
  131. CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
  132. return next;
  133. }
  134. auto TokenizedBuffer::GetPrevLine(Line line) const -> Line {
  135. CARBON_CHECK(line.index > 0);
  136. return Line(line.index - 1);
  137. }
  138. auto TokenizedBuffer::GetIndentColumnNumber(Line line) const -> int {
  139. return GetLineInfo(line).indent + 1;
  140. }
  141. auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
  142. index = std::max(widths.index, index);
  143. kind = std::max(widths.kind, kind);
  144. column = std::max(widths.column, column);
  145. line = std::max(widths.line, line);
  146. indent = std::max(widths.indent, indent);
  147. }
  148. // Compute the printed width of a number. When numbers are printed in decimal,
  149. // the number of digits needed is is one more than the log-base-10 of the
  150. // value. We handle a value of `zero` explicitly.
  151. //
  152. // This routine requires its argument to be *non-negative*.
  153. static auto ComputeDecimalPrintedWidth(int number) -> int {
  154. CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
  155. if (number == 0) {
  156. return 1;
  157. }
  158. return static_cast<int>(std::log10(number)) + 1;
  159. }
  160. auto TokenizedBuffer::GetTokenPrintWidths(Token token) const -> PrintWidths {
  161. PrintWidths widths = {};
  162. widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
  163. widths.kind = GetKind(token).name().size();
  164. widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
  165. widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
  166. widths.indent =
  167. ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
  168. return widths;
  169. }
  170. auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
  171. if (tokens().begin() == tokens().end()) {
  172. return;
  173. }
  174. output_stream << "- filename: " << source_->filename() << "\n"
  175. << " tokens: [\n";
  176. PrintWidths widths = {};
  177. widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
  178. for (Token token : tokens()) {
  179. widths.Widen(GetTokenPrintWidths(token));
  180. }
  181. for (Token token : tokens()) {
  182. PrintToken(output_stream, token, widths);
  183. output_stream << "\n";
  184. }
  185. output_stream << " ]\n";
  186. }
  187. auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
  188. Token token) const -> void {
  189. PrintToken(output_stream, token, {});
  190. }
// Prints a single token as one `{ ... },` entry, using `widths` (further
// widened for this token) to align columns across entries.
auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
                                 PrintWidths widths) const -> void {
  // Make sure the widths can accommodate at least this token's own fields.
  widths.Widen(GetTokenPrintWidths(token));
  int token_index = token.index;
  const auto& token_info = GetTokenInfo(token);
  llvm::StringRef token_text = GetTokenText(token);

  // Output the main chunk using one format string. We have to do the
  // justification manually in order to use the dynamically computed widths
  // and get the quotes included.
  output_stream << llvm::formatv(
      " { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
      "spelling: '{5}'",
      llvm::format_decimal(token_index, widths.index),
      llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                          widths.kind + 2),
      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
      llvm::format_decimal(GetColumnNumber(token), widths.column),
      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
                           widths.indent),
      token_text);

  // Append kind-specific fields: interned values for identifiers and
  // literals, and the matching delimiter index for grouping symbols.
  switch (token_info.kind) {
    case TokenKind::Identifier:
      output_stream << ", identifier: " << GetIdentifier(token).index;
      break;
    case TokenKind::IntegerLiteral:
      output_stream << ", value: `";
      value_stores_->integers()
          .Get(GetIntegerLiteral(token))
          .print(output_stream, /*isSigned=*/false);
      output_stream << "`";
      break;
    case TokenKind::RealLiteral:
      output_stream << ", value: `"
                    << value_stores_->reals().Get(GetRealLiteral(token)) << "`";
      break;
    case TokenKind::StringLiteral:
      output_stream << ", value: `"
                    << value_stores_->string_literals().Get(
                           GetStringLiteral(token))
                    << "`";
      break;
    default:
      if (token_info.kind.is_opening_symbol()) {
        output_stream << ", closing_token: "
                      << GetMatchedClosingToken(token).index;
      } else if (token_info.kind.is_closing_symbol()) {
        output_stream << ", opening_token: "
                      << GetMatchedOpeningToken(token).index;
      }
      break;
  }

  // Optional boolean flags are only printed when set.
  if (token_info.has_trailing_space) {
    output_stream << ", has_trailing_space: true";
  }
  if (token_info.is_recovery) {
    output_stream << ", recovery: true";
  }
  output_stream << " },";
}
  250. auto TokenizedBuffer::GetLineInfo(Line line) -> LineInfo& {
  251. return line_infos_[line.index];
  252. }
  253. auto TokenizedBuffer::GetLineInfo(Line line) const -> const LineInfo& {
  254. return line_infos_[line.index];
  255. }
  256. auto TokenizedBuffer::AddLine(LineInfo info) -> Line {
  257. line_infos_.push_back(info);
  258. return Line(static_cast<int>(line_infos_.size()) - 1);
  259. }
  260. auto TokenizedBuffer::GetTokenInfo(Token token) -> TokenInfo& {
  261. return token_infos_[token.index];
  262. }
  263. auto TokenizedBuffer::GetTokenInfo(Token token) const -> const TokenInfo& {
  264. return token_infos_[token.index];
  265. }
  266. auto TokenizedBuffer::AddToken(TokenInfo info) -> Token {
  267. token_infos_.push_back(info);
  268. expected_parse_tree_size_ += info.kind.expected_parse_tree_size();
  269. return Token(static_cast<int>(token_infos_.size()) - 1);
  270. }
  271. auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
  272. output << token_.index;
  273. }
// Maps a raw pointer into the source buffer to a diagnostic location
// (filename, line text, 1-based line and column numbers). `loc` must point
// into the buffer's source text (checked).
auto TokenizedBuffer::SourceBufferLocationTranslator::GetLocation(
    const char* loc) -> DiagnosticLocation {
  CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
      << "location not within buffer";
  int64_t offset = loc - buffer_->source_->text().begin();

  // Find the first line starting after the given location. Note that we can't
  // inspect `line.length` here because it is not necessarily correct for the
  // final line during lexing (but will be correct later for the parse tree).
  // `partition_point` works because line starts are monotonically increasing.
  const auto* line_it = std::partition_point(
      buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
      [offset](const LineInfo& line) { return line.start <= offset; });

  // Step back one line to find the line containing the given position.
  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
      << "location precedes the start of the first line";
  --line_it;
  int line_number = line_it - buffer_->line_infos_.begin();
  int column_number = offset - line_it->start;

  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
  // the length will be npos and the line will be grabbed from the known start
  // to the end of the buffer; we'll then adjust the length.
  llvm::StringRef line =
      buffer_->source_->text().substr(line_it->start, line_it->length);
  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
        << "Currently we assume no unlexed newlines prior to the error column, "
           "but there was one when erroring at "
        << buffer_->source_->filename() << ":" << line_number << ":"
        << column_number;
    // Look for the next newline since we don't know the length. We can start at
    // the column because prior newlines will have been lexed.
    auto end_newline_pos = line.find('\n', column_number);
    if (end_newline_pos != llvm::StringRef::npos) {
      line = line.take_front(end_newline_pos);
    }
  }

  // Internal line/column values are 0-based; diagnostics are 1-based.
  return {.file_name = buffer_->source_->filename(),
          .line = line,
          .line_number = line_number + 1,
          .column_number = column_number + 1};
}
  314. auto TokenLocationTranslator::GetLocation(Token token) -> DiagnosticLocation {
  315. // Map the token location into a position within the source buffer.
  316. const auto& token_info = buffer_->GetTokenInfo(token);
  317. const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
  318. const char* token_start =
  319. buffer_->source_->text().begin() + line_info.start + token_info.column;
  320. // Find the corresponding file location.
  321. // TODO: Should we somehow indicate in the diagnostic location if this token
  322. // is a recovery token that doesn't correspond to the original source?
  323. return TokenizedBuffer::SourceBufferLocationTranslator(buffer_).GetLocation(
  324. token_start);
  325. }
  326. } // namespace Carbon::Lex