/*
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/

/*
 * Flex scanner specification. Each rule below either returns a token built
 * by one of the Carbon::Parser::make_* factories, or consumes whitespace and
 * comments while keeping context.current_token_position up to date.
 */

%{
  #include <optional>

  #include "common/check.h"
  #include "common/string_helpers.h"
  #include "executable_semantics/common/tracing_flag.h"
  #include "executable_semantics/syntax/parse_and_lex_context.h"
  #include "executable_semantics/syntax/parser.h"
  #include "llvm/ADT/StringExtras.h"
%}

/* Turn off legacy bits we don't need. */
%option noyywrap nounput nodefault noinput

%option reentrant

/*
 * The two start conditions below are inclusive (%s), so every rule without an
 * explicit start-condition prefix stays active in them. Only the `*` operator
 * rules are restricted to particular start conditions.
 */

/* Lexing a token immediately after consuming some whitespace. */
%s AFTER_WHITESPACE
/*
 * Lexing a token immediately after consuming an operand-ending token:
 * a closing bracket, identifier, or literal.
 */
%s AFTER_OPERAND

/* table-begin */
AND                  "and"
API                  "api"
ARROW                "->"
AUTO                 "auto"
AWAIT                "__await"
BOOL                 "Bool"
BREAK                "break"
CASE                 "case"
CHOICE               "choice"
CLASS                "class"
COLON                ":"
COLON_BANG           ":!"
COMMA                ","
CONTINUATION         "__continuation"
CONTINUATION_TYPE    "__Continuation"
CONTINUE             "continue"
DEFAULT              "default"
DOUBLE_ARROW         "=>"
ELSE                 "else"
EQUAL                "="
EQUAL_EQUAL          "=="
FALSE                "false"
FN                   "fn"
FNTY                 "fnty"
IF                   "if"
IMPL                 "impl"
IMPORT               "import"
LEFT_CURLY_BRACE     "{"
LEFT_PARENTHESIS     "("
LEFT_SQUARE_BRACKET  "["
LIBRARY              "library"
MATCH                "match"
MINUS                "-"
NOT                  "not"
OR                   "or"
PACKAGE              "package"
PERIOD               "."
PLUS                 "+"
RETURN               "return"
RIGHT_CURLY_BRACE    "}"
RIGHT_PARENTHESIS    ")"
RIGHT_SQUARE_BRACKET "]"
RUN                  "__run"
SEMICOLON            ";"
SLASH                "/"
STRING               "String"
TRUE                 "true"
TYPE                 "Type"
UNDERSCORE           "_"
VAR                  "var"
WHILE                "while"
/* table-end */

/* This should be kept table-like, but isn't automatic due to spaces. */
identifier            [A-Za-z_][A-Za-z0-9_]*
sized_type_literal    [iuf][1-9][0-9]*
integer_literal       [0-9]+
horizontal_whitespace [ \t\r]
whitespace            [ \t\r\n]
one_line_comment      \/\/[^\n]*\n
operand_start         [(A-Za-z0-9_\"]

/* Single-line string literals should reject vertical whitespace. */
string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"

%{
  // This macro is expanded immediately before each action specified below.
  //
  // Advances the current token position by yyleng columns without changing
  // the line number, and takes us out of the after-whitespace / after-operand
  // state.
  #define YY_USER_ACTION                                             \
    context.current_token_position.columns(yyleng);                  \
    if (YY_START == AFTER_WHITESPACE || YY_START == AFTER_OPERAND) { \
      BEGIN(INITIAL);                                                \
    }

  // Builds a token that carries no value, located at the current position.
  #define SIMPLE_TOKEN(name) \
    Carbon::Parser::make_##name(context.current_token_position);

  // Builds a token that carries the value `arg`, located at the current
  // position.
  #define ARG_TOKEN(name, arg) \
    Carbon::Parser::make_##name(arg, context.current_token_position);
%}

%%

%{
  // Code run each time yylex is called.

  // Begin with an empty token span starting where its previous end was.
  context.current_token_position.step();
%}

 /* table-begin */
{AND}                 { return SIMPLE_TOKEN(AND);                 }
{API}                 { return SIMPLE_TOKEN(API);                 }
{ARROW}               { return SIMPLE_TOKEN(ARROW);               }
{AUTO}                { return SIMPLE_TOKEN(AUTO);                }
{AWAIT}               { return SIMPLE_TOKEN(AWAIT);               }
{BOOL}                { return SIMPLE_TOKEN(BOOL);                }
{BREAK}               { return SIMPLE_TOKEN(BREAK);               }
{CASE}                { return SIMPLE_TOKEN(CASE);                }
{CHOICE}              { return SIMPLE_TOKEN(CHOICE);              }
{CLASS}               { return SIMPLE_TOKEN(CLASS);               }
{COLON_BANG}          { return SIMPLE_TOKEN(COLON_BANG);          }
{COLON}               { return SIMPLE_TOKEN(COLON);               }
{COMMA}               { return SIMPLE_TOKEN(COMMA);               }
{CONTINUATION_TYPE}   { return SIMPLE_TOKEN(CONTINUATION_TYPE);   }
{CONTINUATION}        { return SIMPLE_TOKEN(CONTINUATION);        }
{CONTINUE}            { return SIMPLE_TOKEN(CONTINUE);            }
{DEFAULT}             { return SIMPLE_TOKEN(DEFAULT);             }
{DOUBLE_ARROW}        { return SIMPLE_TOKEN(DOUBLE_ARROW);        }
{ELSE}                { return SIMPLE_TOKEN(ELSE);                }
{EQUAL_EQUAL}         { return SIMPLE_TOKEN(EQUAL_EQUAL);         }
{EQUAL}               { return SIMPLE_TOKEN(EQUAL);               }
{FALSE}               { return SIMPLE_TOKEN(FALSE);               }
{FNTY}                { return SIMPLE_TOKEN(FNTY);                }
{FN}                  { return SIMPLE_TOKEN(FN);                  }
{IF}                  { return SIMPLE_TOKEN(IF);                  }
{IMPL}                { return SIMPLE_TOKEN(IMPL);                }
{IMPORT}              { return SIMPLE_TOKEN(IMPORT);              }
{LEFT_CURLY_BRACE}    { return SIMPLE_TOKEN(LEFT_CURLY_BRACE);    }
{LEFT_PARENTHESIS}    { return SIMPLE_TOKEN(LEFT_PARENTHESIS);    }
{LEFT_SQUARE_BRACKET} { return SIMPLE_TOKEN(LEFT_SQUARE_BRACKET); }
{LIBRARY}             { return SIMPLE_TOKEN(LIBRARY);             }
{MATCH}               { return SIMPLE_TOKEN(MATCH);               }
{MINUS}               { return SIMPLE_TOKEN(MINUS);               }
{NOT}                 { return SIMPLE_TOKEN(NOT);                 }
{OR}                  { return SIMPLE_TOKEN(OR);                  }
{PACKAGE}             { return SIMPLE_TOKEN(PACKAGE);             }
{PERIOD}              { return SIMPLE_TOKEN(PERIOD);              }
{PLUS}                { return SIMPLE_TOKEN(PLUS);                }
{RETURN}              { return SIMPLE_TOKEN(RETURN);              }
{RUN}                 { return SIMPLE_TOKEN(RUN);                 }
{SEMICOLON}           { return SIMPLE_TOKEN(SEMICOLON);           }
{SLASH}               { return SIMPLE_TOKEN(SLASH);               }
{STRING}              { return SIMPLE_TOKEN(STRING);              }
{TRUE}                { return SIMPLE_TOKEN(TRUE);                }
{TYPE}                { return SIMPLE_TOKEN(TYPE);                }
{UNDERSCORE}          { return SIMPLE_TOKEN(UNDERSCORE);          }
{VAR}                 { return SIMPLE_TOKEN(VAR);                 }
{WHILE}               { return SIMPLE_TOKEN(WHILE);               }
 /* table-end */

 /* More modern Bisons provide make_EOF. */
<<EOF>>               { return SIMPLE_TOKEN(END_OF_FILE); }

 /*
  * Closing brackets end an operand, so they switch to AFTER_OPERAND. The
  * BEGIN in the action runs after YY_USER_ACTION has reset the state, so the
  * new state is what the next token sees.
  */
{RIGHT_PARENTHESIS} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_PARENTHESIS);
}
{RIGHT_CURLY_BRACE} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_CURLY_BRACE);
}
{RIGHT_SQUARE_BRACKET} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_SQUARE_BRACKET);
}

 /*
  * For a `*` operator, we look at whitespace and local context to determine the
  * arity and fixity. There are two ways to write a binary operator:
  *
  * 1) Whitespace on both sides.
  * 2) Whitespace on neither side, and the previous token is considered to be
  *    the end of an operand, and the next token is considered to be the start
  *    of an operand.
  *
  * Otherwise, the operator is unary, but we also check for whitespace to help
  * the parser enforce the rule that whitespace is not permitted between the
  * operator and its operand, leading to three more cases:
  *
  * 3) Whitespace before (but implicitly not after, because that would give a
  *    longer match and hit case 1): this can only be a prefix operator.
  * 4) Whitespace after and not before: this can only be a postfix operator.
  * 5) No whitespace on either side (otherwise the longest match would take us
  *    to case 4): this is a unary operator and could be either prefix or
  *    postfix.
  *
  * "Whitespace before" is encoded by the AFTER_WHITESPACE start condition and
  * "previous token ends an operand" by AFTER_OPERAND, so each rule below must
  * carry its start-condition prefix; without the prefixes cases 1/4 and 3/5
  * are identical patterns and the later one can never match.
  */
 /* `*` operator case 1: */
<AFTER_WHITESPACE>"*"{whitespace}+ {
  BEGIN(AFTER_WHITESPACE);
  return SIMPLE_TOKEN(BINARY_STAR);
}
 /* `*` operator case 2: */
<AFTER_OPERAND>"*"/{operand_start} {
  return SIMPLE_TOKEN(BINARY_STAR);
}
 /* `*` operator case 3: */
<AFTER_WHITESPACE>"*" {
  return SIMPLE_TOKEN(PREFIX_STAR);
}
 /* `*` operator case 4: */
<INITIAL,AFTER_OPERAND>"*"{whitespace}+ {
  BEGIN(AFTER_WHITESPACE);
  return SIMPLE_TOKEN(POSTFIX_STAR);
}
 /* `*` operator case 5: */
<INITIAL,AFTER_OPERAND>"*" {
  return SIMPLE_TOKEN(UNARY_STAR);
}

{sized_type_literal} {
  return ARG_TOKEN(sized_type_literal, yytext);
}

{identifier} {
  BEGIN(AFTER_OPERAND);
  return ARG_TOKEN(identifier, yytext);
}

{integer_literal} {
  BEGIN(AFTER_OPERAND);
  int val = 0;
  // The pattern only guarantees a run of digits, so conversion can still fail
  // when the value does not fit in an int. That is a problem with the user's
  // source, so report it as a compilation error rather than through CHECK.
  if (!llvm::to_integer(yytext, val)) {
    if (Carbon::tracing_output) {
      // Print a newline because tracing prints an incomplete line
      // "Reading a token: ".
      llvm::errs() << "\n";
    }
    // NOTE(review): relies on FATAL_COMPILATION_ERROR not returning, as the
    // string-literal rule below already does -- confirm.
    FATAL_COMPILATION_ERROR(context.SourceLoc())
        << "Invalid integer literal: " << yytext;
  }
  return ARG_TOKEN(integer_literal, val);
}

{string_literal} {
  // Strip the surrounding quotes; the pattern guarantees both are present.
  llvm::StringRef str(yytext);
  CHECK(str.consume_front("\"") && str.consume_back("\""));
  std::optional unescaped = Carbon::UnescapeStringLiteral(str);
  if (unescaped == std::nullopt) {
    if (Carbon::tracing_output) {
      // Print a newline because tracing prints an incomplete line
      // "Reading a token: ".
      llvm::errs() << "\n";
    }
    FATAL_COMPILATION_ERROR(context.SourceLoc())
        << "Invalid escaping in string: " << yytext;
  }
  return ARG_TOKEN(string_literal, *unescaped);
}

{one_line_comment} {
  // Advance end by 1 line, resetting the column to zero.
  context.current_token_position.lines(1);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
}

{horizontal_whitespace}+ {
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

\n+ {
  // Advance end by yyleng lines, resetting the column to zero.
  context.current_token_position.lines(yyleng);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

 /* Catch-all: any single character no earlier rule accepted is an error. */
. {
  if (Carbon::tracing_output) {
    // Print a newline because tracing prints an incomplete line
    // "Reading a token: ".
    llvm::errs() << "\n";
  }
  FATAL_COMPILATION_ERROR(context.SourceLoc())
      << "invalid character '\\x" << llvm::toHex(llvm::StringRef(yytext, 1))
      << "' in source file.";
}

%%