/*
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/

/*
 * Flex scanner specification. The sections are, in order: a C++ prologue,
 * scanner options and start conditions, named pattern definitions, the
 * rules that map patterns to parser tokens, and trailing user code.
 */

%{
  // Standard headers for the std::string and std::move uses in the actions.
  #include <string>
  #include <utility>

  #include "common/check.h"
  #include "common/error.h"
  #include "explorer/syntax/lex_helper.h"
  #include "explorer/syntax/lex_scan_helper.h"
  #include "explorer/syntax/parse_and_lex_context.h"
  #include "explorer/syntax/parser.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/FormatVariadic.h"
%}

/* Turn off legacy bits we don't need. */
%option noyywrap nounput nodefault

%option reentrant

/* Lexing a token immediately after consuming some whitespace. */
%s AFTER_WHITESPACE
/*
 * Lexing a token immediately after consuming an operand-ending token:
 * a closing bracket, identifier, or literal.
 */
%s AFTER_OPERAND

/* table-begin */
ABSTRACT              "abstract"
ADDR                  "addr"
ALIAS                 "alias"
AMPERSAND             "&"
AND                   "and"
API                   "api"
ARROW                 "->"
AS                    "as"
AUTO                  "auto"
AWAIT                 "__await"
BASE                  "base"
BOOL                  "bool"
BREAK                 "break"
CARET                 "^"
CASE                  "case"
CHOICE                "choice"
CLASS                 "class"
COLON                 ":"
COLON_BANG            ":!"
COMMA                 ","
CONSTRAINT            "constraint"
CONTINUATION          "__continuation"
CONTINUATION_TYPE     "__Continuation"
CONTINUE              "continue"
DEFAULT               "default"
DESTRUCTOR            "destructor"
DOUBLE_ARROW          "=>"
ELSE                  "else"
EQUAL                 "="
EQUAL_EQUAL           "=="
EXTENDS               "extends"
EXTERNAL              "external"
FALSE                 "false"
FN                    "fn"
FN_TYPE               "__Fn"
FOR                   "for"
FORALL                "forall"
GREATER               ">"
GREATER_EQUAL         ">="
GREATER_GREATER       ">>"
IF                    "if"
IMPL                  "impl"
IMPORT                "import"
IN                    "in"
INTERFACE             "interface"
IS                    "is"
LEFT_CURLY_BRACE      "{"
LEFT_PARENTHESIS      "("
LEFT_SQUARE_BRACKET   "["
LESS                  "<"
LESS_EQUAL            "<="
LESS_LESS             "<<"
LET                   "let"
LIBRARY               "library"
MATCH                 "match"
MINUS                 "-"
MIX                   "__mix"
MIXIN                 "__mixin"
NOT                   "not"
NOT_EQUAL             "!="
OR                    "or"
PACKAGE               "package"
PERCENT               "%"
PERIOD                "."
PIPE                  "|"
PLUS                  "+"
RETURN                "return"
RETURNED              "returned"
RIGHT_CURLY_BRACE     "}"
RIGHT_PARENTHESIS     ")"
RIGHT_SQUARE_BRACKET  "]"
RUN                   "__run"
SELF                  "Self"
SEMICOLON             ";"
SLASH                 "/"
STRING                "String"
THEN                  "then"
TRUE                  "true"
TYPE                  "type"
UNDERSCORE            "_"
UNIMPL_EXAMPLE        "__unimplemented_example_infix"
VAR                   "var"
VIRTUAL               "virtual"
WHERE                 "where"
WHILE                 "while"
/* table-end */

/* This should be kept table-like, but isn't automatic due to spaces. */
identifier            [A-Za-z_][A-Za-z0-9_]*
/* TODO: Remove Print special casing once we have variadics or overloads. */
intrinsic_identifier  (Print|__intrinsic_[A-Za-z0-9_]*)
sized_type_literal    [iuf][1-9][0-9]*
integer_literal       [0-9]+
horizontal_whitespace [ \t\r]
whitespace            [ \t\r\n]
one_line_comment      \/\/[^\n]*\n
operand_start         [(A-Za-z0-9_\"]

%%

%{
  // Code run each time yylex is called.

  // Begin with an empty token span starting where its previous end was.
  context.current_token_position.step();
%}

 /*
  * Fixed-spelling tokens. Flex prefers the longest match and, on a tie, the
  * earliest rule, so these rules must stay ahead of the identifier rule for
  * keywords to win over identifiers of the same spelling.
  */
 /* table-begin */
{ABSTRACT}              { return CARBON_SIMPLE_TOKEN(ABSTRACT); }
{ADDR}                  { return CARBON_SIMPLE_TOKEN(ADDR); }
{ALIAS}                 { return CARBON_SIMPLE_TOKEN(ALIAS); }
{AMPERSAND}             { return CARBON_SIMPLE_TOKEN(AMPERSAND); }
{AND}                   { return CARBON_SIMPLE_TOKEN(AND); }
{API}                   { return CARBON_SIMPLE_TOKEN(API); }
{ARROW}                 { return CARBON_SIMPLE_TOKEN(ARROW); }
{AS}                    { return CARBON_SIMPLE_TOKEN(AS); }
{AUTO}                  { return CARBON_SIMPLE_TOKEN(AUTO); }
{AWAIT}                 { return CARBON_SIMPLE_TOKEN(AWAIT); }
{BASE}                  { return CARBON_SIMPLE_TOKEN(BASE); }
{BOOL}                  { return CARBON_SIMPLE_TOKEN(BOOL); }
{BREAK}                 { return CARBON_SIMPLE_TOKEN(BREAK); }
{CARET}                 { return CARBON_SIMPLE_TOKEN(CARET); }
{CASE}                  { return CARBON_SIMPLE_TOKEN(CASE); }
{CHOICE}                { return CARBON_SIMPLE_TOKEN(CHOICE); }
{CLASS}                 { return CARBON_SIMPLE_TOKEN(CLASS); }
{COLON_BANG}            { return CARBON_SIMPLE_TOKEN(COLON_BANG); }
{COLON}                 { return CARBON_SIMPLE_TOKEN(COLON); }
{COMMA}                 { return CARBON_SIMPLE_TOKEN(COMMA); }
{CONSTRAINT}            { return CARBON_SIMPLE_TOKEN(CONSTRAINT); }
{CONTINUATION_TYPE}     { return CARBON_SIMPLE_TOKEN(CONTINUATION_TYPE); }
{CONTINUATION}          { return CARBON_SIMPLE_TOKEN(CONTINUATION); }
{CONTINUE}              { return CARBON_SIMPLE_TOKEN(CONTINUE); }
{DEFAULT}               { return CARBON_SIMPLE_TOKEN(DEFAULT); }
{DESTRUCTOR}            { return CARBON_SIMPLE_TOKEN(DESTRUCTOR); }
{DOUBLE_ARROW}          { return CARBON_SIMPLE_TOKEN(DOUBLE_ARROW); }
{ELSE}                  { return CARBON_SIMPLE_TOKEN(ELSE); }
{EQUAL_EQUAL}           { return CARBON_SIMPLE_TOKEN(EQUAL_EQUAL); }
{EQUAL}                 { return CARBON_SIMPLE_TOKEN(EQUAL); }
{EXTENDS}               { return CARBON_SIMPLE_TOKEN(EXTENDS); }
{EXTERNAL}              { return CARBON_SIMPLE_TOKEN(EXTERNAL); }
{FALSE}                 { return CARBON_SIMPLE_TOKEN(FALSE); }
{FN_TYPE}               { return CARBON_SIMPLE_TOKEN(FN_TYPE); }
{FN}                    { return CARBON_SIMPLE_TOKEN(FN); }
{FORALL}                { return CARBON_SIMPLE_TOKEN(FORALL); }
{FOR}                   { return CARBON_SIMPLE_TOKEN(FOR); }
{GREATER_EQUAL}         { return CARBON_SIMPLE_TOKEN(GREATER_EQUAL); }
{GREATER_GREATER}       { return CARBON_SIMPLE_TOKEN(GREATER_GREATER); }
{GREATER}               { return CARBON_SIMPLE_TOKEN(GREATER); }
{IF}                    { return CARBON_SIMPLE_TOKEN(IF); }
{IMPL}                  { return CARBON_SIMPLE_TOKEN(IMPL); }
{IMPORT}                { return CARBON_SIMPLE_TOKEN(IMPORT); }
{INTERFACE}             { return CARBON_SIMPLE_TOKEN(INTERFACE); }
{IN}                    { return CARBON_SIMPLE_TOKEN(IN); }
{IS}                    { return CARBON_SIMPLE_TOKEN(IS); }
{LEFT_CURLY_BRACE}      { return CARBON_SIMPLE_TOKEN(LEFT_CURLY_BRACE); }
{LEFT_PARENTHESIS}      { return CARBON_SIMPLE_TOKEN(LEFT_PARENTHESIS); }
{LEFT_SQUARE_BRACKET}   { return CARBON_SIMPLE_TOKEN(LEFT_SQUARE_BRACKET); }
{LESS_EQUAL}            { return CARBON_SIMPLE_TOKEN(LESS_EQUAL); }
{LESS_LESS}             { return CARBON_SIMPLE_TOKEN(LESS_LESS); }
{LESS}                  { return CARBON_SIMPLE_TOKEN(LESS); }
{LET}                   { return CARBON_SIMPLE_TOKEN(LET); }
{LIBRARY}               { return CARBON_SIMPLE_TOKEN(LIBRARY); }
{MATCH}                 { return CARBON_SIMPLE_TOKEN(MATCH); }
{MINUS}                 { return CARBON_SIMPLE_TOKEN(MINUS); }
{MIXIN}                 { return CARBON_SIMPLE_TOKEN(MIXIN); }
{MIX}                   { return CARBON_SIMPLE_TOKEN(MIX); }
{NOT_EQUAL}             { return CARBON_SIMPLE_TOKEN(NOT_EQUAL); }
{NOT}                   { return CARBON_SIMPLE_TOKEN(NOT); }
{OR}                    { return CARBON_SIMPLE_TOKEN(OR); }
{PACKAGE}               { return CARBON_SIMPLE_TOKEN(PACKAGE); }
{PERCENT}               { return CARBON_SIMPLE_TOKEN(PERCENT); }
{PERIOD}                { return CARBON_SIMPLE_TOKEN(PERIOD); }
{PIPE}                  { return CARBON_SIMPLE_TOKEN(PIPE); }
{PLUS}                  { return CARBON_SIMPLE_TOKEN(PLUS); }
{RETURNED}              { return CARBON_SIMPLE_TOKEN(RETURNED); }
{RETURN}                { return CARBON_SIMPLE_TOKEN(RETURN); }
{RUN}                   { return CARBON_SIMPLE_TOKEN(RUN); }
{SELF}                  { return CARBON_SIMPLE_TOKEN(SELF); }
{SEMICOLON}             { return CARBON_SIMPLE_TOKEN(SEMICOLON); }
{SLASH}                 { return CARBON_SIMPLE_TOKEN(SLASH); }
{STRING}                { return CARBON_SIMPLE_TOKEN(STRING); }
{THEN}                  { return CARBON_SIMPLE_TOKEN(THEN); }
{TRUE}                  { return CARBON_SIMPLE_TOKEN(TRUE); }
{TYPE}                  { return CARBON_SIMPLE_TOKEN(TYPE); }
{UNDERSCORE}            { return CARBON_SIMPLE_TOKEN(UNDERSCORE); }
{UNIMPL_EXAMPLE}        { return CARBON_SIMPLE_TOKEN(UNIMPL_EXAMPLE); }
{VAR}                   { return CARBON_SIMPLE_TOKEN(VAR); }
{VIRTUAL}               { return CARBON_SIMPLE_TOKEN(VIRTUAL); }
{WHERE}                 { return CARBON_SIMPLE_TOKEN(WHERE); }
{WHILE}                 { return CARBON_SIMPLE_TOKEN(WHILE); }
 /* table-end */

 /* More modern Bisons provide make_EOF. */
<<EOF>>               { return CARBON_SIMPLE_TOKEN(END_OF_FILE); }

 /*
  * Closing brackets end an operand, so they are kept out of the table above
  * and switch to AFTER_OPERAND before returning their token.
  */
{RIGHT_PARENTHESIS} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_PARENTHESIS);
}
{RIGHT_CURLY_BRACE} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_CURLY_BRACE);
}
{RIGHT_SQUARE_BRACKET} {
  BEGIN(AFTER_OPERAND);
  return CARBON_SIMPLE_TOKEN(RIGHT_SQUARE_BRACKET);
}

 /*
  * For a `*` operator, we look at whitespace and local context to determine the
  * arity and fixity. There are two ways to write a binary operator:
  *
  * 1) Whitespace on both sides.
  * 2) Whitespace on neither side, and the previous token is considered to be
  *    the end of an operand, and the next token is considered to be the start
  *    of an operand.
  *
  * Otherwise, the operator is unary, but we also check for whitespace to help
  * the parser enforce the rule that whitespace is not permitted between the
  * operator and its operand, leading to three more cases:
  *
  * 3) Whitespace before (but implicitly not after, because that would give a
  *    longer match and hit case 1): this can only be a prefix operator.
  * 4) Whitespace after and not before: this can only be a postfix operator.
  * 5) No whitespace on either side (otherwise the longest match would take us
  *    to case 4): this is a unary operator and could be either prefix or
  *    postfix.
  *
  * The start-condition prefixes below encode the "before" half of each case;
  * the pattern itself encodes the "after" half.
  */

 /* `*` operator case 1: */
<AFTER_WHITESPACE>"*"{whitespace}+ {
  BEGIN(AFTER_WHITESPACE);
  return CARBON_SIMPLE_TOKEN(BINARY_STAR);
}

 /* `*` operator case 2: */
<AFTER_OPERAND>"*"/{operand_start} {
  return CARBON_SIMPLE_TOKEN(BINARY_STAR);
}

 /* `*` operator case 3: */
<AFTER_WHITESPACE>"*" {
  return CARBON_SIMPLE_TOKEN(PREFIX_STAR);
}

 /* `*` operator case 4: */
<INITIAL,AFTER_OPERAND>"*"{whitespace}+ {
  BEGIN(AFTER_WHITESPACE);
  return CARBON_SIMPLE_TOKEN(POSTFIX_STAR);
}

 /* `*` operator case 5: */
<INITIAL,AFTER_OPERAND>"*" {
  return CARBON_SIMPLE_TOKEN(UNARY_STAR);
}

 /*
  * Operand tokens. Each one switches to AFTER_OPERAND. The order matters:
  * sized type literals and intrinsic identifiers must precede the general
  * identifier rule so that they win on equal-length matches.
  */
{sized_type_literal} {
  BEGIN(AFTER_OPERAND);
  return CARBON_ARG_TOKEN(sized_type_literal, yytext);
}

{intrinsic_identifier} {
  BEGIN(AFTER_OPERAND);
  // Resolve the spelling to a known intrinsic; an unknown name is reported
  // as a syntax error carrying the error produced by the lookup.
  auto intrinsic = Carbon::IntrinsicExpression::FindIntrinsic(
      yytext, context.source_loc());
  if (intrinsic.ok()) {
    return CARBON_ARG_TOKEN(intrinsic_identifier, *intrinsic);
  } else {
    return context.RecordSyntaxError(std::move(intrinsic).error());
  }
}

{identifier} {
  BEGIN(AFTER_OPERAND);
  return CARBON_ARG_TOKEN(identifier, yytext);
}

{integer_literal} {
  BEGIN(AFTER_OPERAND);
  int val = 0;
  // A failed conversion to int is reported as a syntax error.
  if (!llvm::to_integer(yytext, val)) {
    return context.RecordSyntaxError(
        llvm::formatv("Invalid integer literal: {0}", yytext));
  }
  return CARBON_ARG_TOKEN(integer_literal, val);
}

#*(\"\"\"|\") {
  // Raw string literal.
  // yytext (the token that matches the above regex) and chars scanned by
  // str_lex_helper hold the source text, not the string the source represents.
  Carbon::StringLexHelper str_lex_helper(yytext, yyscanner, context);
  const std::string& s = str_lex_helper.str();
  // The opener is zero or more hashes followed by one or three quotes, so the
  // index of the first quote is the number of leading hashes.
  const int hashtag_num = s.find_first_of('"');
  const int leading_quotes = s.size() - hashtag_num;
  if (leading_quotes == 3 && hashtag_num > 0) {
    // Check if it's a single-line string, like #"""#.
    // TODO: Extend with other single-line string cases, like #""""#, based on
    // the definition of block string in the design doc.
    if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
      return Carbon::ProcessSingleLineString(str_lex_helper.str(), context,
                                             hashtag_num);
    } else if (str_lex_helper.is_eof()) {
      return CARBON_SIMPLE_TOKEN(END_OF_FILE);
    }
  } else if (!str_lex_helper.Advance()) {
    return CARBON_SIMPLE_TOKEN(END_OF_FILE);
  }
  // 3 quotes indicates multi-line, otherwise it'll be one.
  const bool multi_line = leading_quotes == 3;
  while (!str_lex_helper.is_eof()) {
    switch (str_lex_helper.last_char()) {
      case '\n':  // Fall through.
      case '\v':  // Fall through.
      case '\f':  // Fall through.
      case '\r':
        if (!multi_line) {
          return context.RecordSyntaxError(
              llvm::formatv("missing closing quote in single-line string: {0}",
                            str_lex_helper.str()));
        }
        str_lex_helper.Advance();
        break;
      case '"':
        if (multi_line) {
          // Check for 2 more '"'s on block string.
          if (!(str_lex_helper.Advance() &&
                str_lex_helper.last_char() == '"')) {
            continue;
          }
          if (!(str_lex_helper.Advance() &&
                str_lex_helper.last_char() == '"')) {
            continue;
          }
          // Now we are at the last " of """.
        }
        if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
          // Reach closing quotes, break out of the loop.
          if (multi_line) {
            return Carbon::ProcessMultiLineString(str_lex_helper.str(), context,
                                                  hashtag_num);
          } else {
            return Carbon::ProcessSingleLineString(str_lex_helper.str(),
                                                   context, hashtag_num);
          }
        }
        break;
      case '\\':
        if (Carbon::ReadHashTags(str_lex_helper, hashtag_num)) {
          // Read the escaped char.
          if (!str_lex_helper.Advance()) {
            continue;
          }
          // Read the next char.
          str_lex_helper.Advance();
        }
        break;
      default:
        str_lex_helper.Advance();
    }
  }
  // The input ended before a closing delimiter was found.
  return CARBON_SIMPLE_TOKEN(END_OF_FILE);
}

{one_line_comment} {
  // Advance end by 1 line, resetting the column to zero.
  context.current_token_position.lines(1);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
}

{horizontal_whitespace}+ {
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

\n+ {
  // Advance end by yyleng lines, resetting the column to zero.
  context.current_token_position.lines(yyleng);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

 /* Catch-all, required because nodefault is set: report the byte in hex. */
. {
  return context.RecordSyntaxError(
      llvm::formatv("invalid character '\\x{0}' in source file.",
                    llvm::toHex(llvm::StringRef(yytext, 1))));
}

%%

// Forwards to the generated scanner's yyinput for the given scanner instance
// and returns its result unchanged.
// NOTE(review): presumably exists so code outside this file can read input
// characters through the scanner -- confirm against the callers.
auto YyinputWrapper(yyscan_t yyscanner) -> int { return yyinput(yyscanner); }