| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438 |
- // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- // Exceptions. See /LICENSE for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- #include "toolchain/lex/tokenized_buffer.h"
- #include <algorithm>
- #include <cmath>
- #include "common/check.h"
- #include "common/string_helpers.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/Support/Format.h"
- #include "llvm/Support/FormatVariadic.h"
- #include "toolchain/base/value_store.h"
- #include "toolchain/diagnostics/diagnostic_emitter.h"
- #include "toolchain/lex/character_set.h"
- #include "toolchain/lex/numeric_literal.h"
- #include "toolchain/lex/string_literal.h"
- namespace Carbon::Lex {
- auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
- return GetTokenInfo(token).kind();
- }
- auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
- return FindLineIndex(GetTokenInfo(token).byte_offset());
- }
- auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
- return GetLineNumber(GetLine(token));
- }
- auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
- const auto& token_info = GetTokenInfo(token);
- const auto& line_info = GetLineInfo(FindLineIndex(token_info.byte_offset()));
- return token_info.byte_offset() - line_info.start + 1;
- }
- auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
- -> std::pair<LineIndex, int> {
- auto line = GetLine(token);
- int column = GetColumnNumber(token);
- auto token_text = GetTokenText(token);
- if (auto [before_newline, after_newline] = token_text.rsplit('\n');
- before_newline.size() == token_text.size()) {
- // Token fits on one line, advance the column number.
- column += before_newline.size();
- } else {
- // Token contains newlines.
- line.index += before_newline.count('\n') + 1;
- column = 1 + after_newline.size();
- }
- return {line, column};
- }
- auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
- const auto& token_info = GetTokenInfo(token);
- llvm::StringRef fixed_spelling = token_info.kind().fixed_spelling();
- if (!fixed_spelling.empty()) {
- return fixed_spelling;
- }
- if (token_info.kind() == TokenKind::Error) {
- return source_->text().substr(token_info.byte_offset(),
- token_info.error_length());
- }
- // Refer back to the source text to preserve oddities like radix or digit
- // separators the author included.
- if (token_info.kind() == TokenKind::IntLiteral ||
- token_info.kind() == TokenKind::RealLiteral) {
- std::optional<NumericLiteral> relexed_token =
- NumericLiteral::Lex(source_->text().substr(token_info.byte_offset()));
- CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
- return relexed_token->text();
- }
- // Refer back to the source text to find the original spelling, including
- // escape sequences etc.
- if (token_info.kind() == TokenKind::StringLiteral) {
- std::optional<StringLiteral> relexed_token =
- StringLiteral::Lex(source_->text().substr(token_info.byte_offset()));
- CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
- return relexed_token->text();
- }
- // Refer back to the source text to avoid needing to reconstruct the
- // spelling from the size.
- if (token_info.kind().is_sized_type_literal()) {
- llvm::StringRef suffix = source_->text()
- .substr(token_info.byte_offset() + 1)
- .take_while(IsDecimalDigit);
- return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
- }
- if (token_info.kind() == TokenKind::FileStart ||
- token_info.kind() == TokenKind::FileEnd) {
- return llvm::StringRef();
- }
- CARBON_CHECK(token_info.kind() == TokenKind::Identifier) << token_info.kind();
- return value_stores_->identifiers().Get(token_info.ident_id());
- }
- auto TokenizedBuffer::GetIdentifier(TokenIndex token) const -> IdentifierId {
- const auto& token_info = GetTokenInfo(token);
- CARBON_CHECK(token_info.kind() == TokenKind::Identifier) << token_info.kind();
- return token_info.ident_id();
- }
- auto TokenizedBuffer::GetIntLiteral(TokenIndex token) const -> IntId {
- const auto& token_info = GetTokenInfo(token);
- CARBON_CHECK(token_info.kind() == TokenKind::IntLiteral) << token_info.kind();
- return token_info.int_id();
- }
- auto TokenizedBuffer::GetRealLiteral(TokenIndex token) const -> RealId {
- const auto& token_info = GetTokenInfo(token);
- CARBON_CHECK(token_info.kind() == TokenKind::RealLiteral)
- << token_info.kind();
- return token_info.real_id();
- }
- auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
- -> StringLiteralValueId {
- const auto& token_info = GetTokenInfo(token);
- CARBON_CHECK(token_info.kind() == TokenKind::StringLiteral)
- << token_info.kind();
- return token_info.string_literal_id();
- }
- auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
- const auto& token_info = GetTokenInfo(token);
- CARBON_CHECK(token_info.kind().is_sized_type_literal()) << token_info.kind();
- return token_info.int_id();
- }
- auto TokenizedBuffer::GetMatchedClosingToken(TokenIndex opening_token) const
- -> TokenIndex {
- const auto& opening_token_info = GetTokenInfo(opening_token);
- CARBON_CHECK(opening_token_info.kind().is_opening_symbol())
- << opening_token_info.kind();
- return opening_token_info.closing_token_index();
- }
- auto TokenizedBuffer::GetMatchedOpeningToken(TokenIndex closing_token) const
- -> TokenIndex {
- const auto& closing_token_info = GetTokenInfo(closing_token);
- CARBON_CHECK(closing_token_info.kind().is_closing_symbol())
- << closing_token_info.kind();
- return closing_token_info.opening_token_index();
- }
- auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const -> bool {
- return GetTokenInfo(token).has_leading_space();
- }
- auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const -> bool {
- TokenIterator it(token);
- ++it;
- return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
- }
- auto TokenizedBuffer::IsRecoveryToken(TokenIndex token) const -> bool {
- if (recovery_tokens_.empty()) {
- return false;
- }
- return recovery_tokens_[token.index];
- }
- auto TokenizedBuffer::GetLineNumber(LineIndex line) const -> int {
- return line.index + 1;
- }
- auto TokenizedBuffer::GetNextLine(LineIndex line) const -> LineIndex {
- LineIndex next(line.index + 1);
- CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
- return next;
- }
- auto TokenizedBuffer::GetPrevLine(LineIndex line) const -> LineIndex {
- CARBON_CHECK(line.index > 0);
- return LineIndex(line.index - 1);
- }
- auto TokenizedBuffer::GetIndentColumnNumber(LineIndex line) const -> int {
- return GetLineInfo(line).indent + 1;
- }
- auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
- index = std::max(widths.index, index);
- kind = std::max(widths.kind, kind);
- column = std::max(widths.column, column);
- line = std::max(widths.line, line);
- indent = std::max(widths.indent, indent);
- }
- // Compute the printed width of a number. When numbers are printed in decimal,
- // the number of digits needed is one more than the log-base-10 of the
- // value. We handle a value of `zero` explicitly.
- //
- // This routine requires its argument to be *non-negative*.
- static auto ComputeDecimalPrintedWidth(int number) -> int {
- CARBON_CHECK(number >= 0) << "Negative numbers are not supported.";
- if (number == 0) {
- return 1;
- }
- return static_cast<int>(std::log10(number)) + 1;
- }
- auto TokenizedBuffer::GetTokenPrintWidths(TokenIndex token) const
- -> PrintWidths {
- PrintWidths widths = {};
- widths.index = ComputeDecimalPrintedWidth(token_infos_.size());
- widths.kind = GetKind(token).name().size();
- widths.line = ComputeDecimalPrintedWidth(GetLineNumber(token));
- widths.column = ComputeDecimalPrintedWidth(GetColumnNumber(token));
- widths.indent =
- ComputeDecimalPrintedWidth(GetIndentColumnNumber(GetLine(token)));
- return widths;
- }
- auto TokenizedBuffer::Print(llvm::raw_ostream& output_stream) const -> void {
- if (tokens().begin() == tokens().end()) {
- return;
- }
- output_stream << "- filename: " << source_->filename() << "\n"
- << " tokens: [\n";
- PrintWidths widths = {};
- widths.index = ComputeDecimalPrintedWidth((token_infos_.size()));
- for (TokenIndex token : tokens()) {
- widths.Widen(GetTokenPrintWidths(token));
- }
- for (TokenIndex token : tokens()) {
- PrintToken(output_stream, token, widths);
- output_stream << "\n";
- }
- output_stream << " ]\n";
- }
- auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
- TokenIndex token) const -> void {
- PrintToken(output_stream, token, {});
- }
- auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
- TokenIndex token, PrintWidths widths) const
- -> void {
- widths.Widen(GetTokenPrintWidths(token));
- int token_index = token.index;
- const auto& token_info = GetTokenInfo(token);
- LineIndex line_index = FindLineIndex(token_info.byte_offset());
- llvm::StringRef token_text = GetTokenText(token);
- // Output the main chunk using one format string. We have to do the
- // justification manually in order to use the dynamically computed widths
- // and get the quotes included.
- output_stream << llvm::formatv(
- " { index: {0}, kind: {1}, line: {2}, column: {3}, indent: {4}, "
- "spelling: '{5}'",
- llvm::format_decimal(token_index, widths.index),
- llvm::right_justify(
- llvm::formatv("'{0}'", token_info.kind().name()).str(),
- widths.kind + 2),
- llvm::format_decimal(GetLineNumber(GetLine(token)), widths.line),
- llvm::format_decimal(GetColumnNumber(token), widths.column),
- llvm::format_decimal(GetIndentColumnNumber(line_index), widths.indent),
- token_text);
- switch (token_info.kind()) {
- case TokenKind::Identifier:
- output_stream << ", identifier: " << GetIdentifier(token).index;
- break;
- case TokenKind::IntLiteral:
- output_stream << ", value: `";
- value_stores_->ints()
- .Get(GetIntLiteral(token))
- .print(output_stream, /*isSigned=*/false);
- output_stream << "`";
- break;
- case TokenKind::RealLiteral:
- output_stream << ", value: `"
- << value_stores_->reals().Get(GetRealLiteral(token)) << "`";
- break;
- case TokenKind::StringLiteral:
- output_stream << ", value: `"
- << value_stores_->string_literal_values().Get(
- GetStringLiteralValue(token))
- << "`";
- break;
- default:
- if (token_info.kind().is_opening_symbol()) {
- output_stream << ", closing_token: "
- << GetMatchedClosingToken(token).index;
- } else if (token_info.kind().is_closing_symbol()) {
- output_stream << ", opening_token: "
- << GetMatchedOpeningToken(token).index;
- }
- break;
- }
- if (token_info.has_leading_space()) {
- output_stream << ", has_leading_space: true";
- }
- if (IsRecoveryToken(token)) {
- output_stream << ", recovery: true";
- }
- output_stream << " },";
- }
- // Find the line index corresponding to a specific byte offset within the source
- // text for this tokenized buffer.
- //
- // This takes advantage of the lines being sorted by their starting byte offsets
- // to do a binary search for the line that contains the provided offset.
- auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
- CARBON_DCHECK(!line_infos_.empty());
- const auto* line_it =
- std::partition_point(line_infos_.begin(), line_infos_.end(),
- [byte_offset](LineInfo line_info) {
- return line_info.start <= byte_offset;
- });
- --line_it;
- // If this isn't the first line but it starts past the end of the source, then
- // this is a synthetic line added for simplicity of lexing. Step back one
- // further to find the last non-synthetic line.
- if (line_it != line_infos_.begin() &&
- line_it->start == static_cast<int32_t>(source_->text().size())) {
- --line_it;
- }
- CARBON_DCHECK(line_it->start <= byte_offset);
- return LineIndex(line_it - line_infos_.begin());
- }
- auto TokenizedBuffer::GetLineInfo(LineIndex line) -> LineInfo& {
- return line_infos_[line.index];
- }
- auto TokenizedBuffer::GetLineInfo(LineIndex line) const -> const LineInfo& {
- return line_infos_[line.index];
- }
- auto TokenizedBuffer::AddLine(LineInfo info) -> LineIndex {
- line_infos_.push_back(info);
- return LineIndex(static_cast<int>(line_infos_.size()) - 1);
- }
- auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
- return token_infos_[token.index];
- }
- auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const -> const TokenInfo& {
- return token_infos_[token.index];
- }
- auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
- token_infos_.push_back(info);
- expected_parse_tree_size_ += info.kind().expected_parse_tree_size();
- return TokenIndex(static_cast<int>(token_infos_.size()) - 1);
- }
- auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
- llvm::StringRef label) const -> void {
- mem_usage.Add(MemUsage::ConcatLabel(label, "allocator_"), allocator_);
- mem_usage.Add(MemUsage::ConcatLabel(label, "token_infos_"), token_infos_);
- mem_usage.Add(MemUsage::ConcatLabel(label, "line_infos_"), line_infos_);
- }
- auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
- output << token_.index;
- }
- auto TokenizedBuffer::SourceBufferDiagnosticConverter::ConvertLoc(
- const char* loc, ContextFnT /*context_fn*/) const -> DiagnosticLoc {
- CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
- << "location not within buffer";
- int32_t offset = loc - buffer_->source_->text().begin();
- // Find the first line starting after the given location.
- const auto* next_line_it = std::partition_point(
- buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
- [offset](const LineInfo& line) { return line.start <= offset; });
- // Step back one line to find the line containing the given position.
- CARBON_CHECK(next_line_it != buffer_->line_infos_.begin())
- << "location precedes the start of the first line";
- const auto* line_it = std::prev(next_line_it);
- int line_number = line_it - buffer_->line_infos_.begin();
- int column_number = offset - line_it->start;
- // Grab the line from the buffer by slicing from this line to the next
- // minus the newline. When on the last line, instead use the start to the end
- // of the buffer.
- llvm::StringRef text = buffer_->source_->text();
- llvm::StringRef line = next_line_it != buffer_->line_infos_.end()
- ? text.slice(line_it->start, next_line_it->start)
- : text.substr(line_it->start);
- // Remove a newline at the end of the line if present.
- // TODO: This should expand to remove all vertical whitespace bytes at the
- // tail of the line such as CR+LF, etc.
- line.consume_back("\n");
- return {.filename = buffer_->source_->filename(),
- .line = line,
- .line_number = line_number + 1,
- .column_number = column_number + 1};
- }
- auto TokenDiagnosticConverter::ConvertLoc(TokenIndex token,
- ContextFnT context_fn) const
- -> DiagnosticLoc {
- // Map the token location into a position within the source buffer.
- const auto& token_info = buffer_->GetTokenInfo(token);
- const char* token_start =
- buffer_->source_->text().begin() + token_info.byte_offset();
- // Find the corresponding file location.
- // TODO: Should we somehow indicate in the diagnostic location if this token
- // is a recovery token that doesn't correspond to the original source?
- DiagnosticLoc loc =
- TokenizedBuffer::SourceBufferDiagnosticConverter(buffer_).ConvertLoc(
- token_start, context_fn);
- loc.length = buffer_->GetTokenText(token).size();
- return loc;
- }
- } // namespace Carbon::Lex
|