tomteb
/
carbon-lang


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
							/*
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/

%{
  #include <cstdlib>

  #include "common/check.h"
  #include "common/error.h"
  #include "common/string_helpers.h"
  #include "explorer/syntax/parse_and_lex_context.h"
  #include "explorer/syntax/parser.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/FormatVariadic.h"

  // Reads and returns a single character from the scanner's input (via
  // yyinput). Reports an "Unexpected end of file" syntax error on `context`
  // when the value read is <= 0; that value is still returned, so callers
  // must check for `c <= 0` themselves.
  auto ReadChar(yyscan_t yyscanner, Carbon::ParseAndLexContext& context) -> int;
%}

/*
 * Turn off legacy bits we don't need:
 *  - noyywrap:  do not call yywrap() at end of input; there is one input only.
 *  - nounput:   do not generate the unused unput() function.
 *  - nodefault: do not generate the default "echo unmatched text" rule; the
 *               rules below are expected to match every possible input.
 */
%option noyywrap nounput nodefault

/* Generate a reentrant scanner: its state is carried in a yyscan_t value. */
%option reentrant

/*
 * Both start conditions below are inclusive (%s), so rules written without a
 * <...> prefix stay active in them.
 */

/* Lexing a token immediately after consuming some whitespace. */
%s AFTER_WHITESPACE
/*
 * Lexing a token immediately after consuming an operand-ending token:
 * a closing bracket, identifier, or literal.
 */
%s AFTER_OPERAND

/*
 * Named patterns for keywords and punctuation. Each name is a literal string
 * and is referenced as {NAME} by a rule in the rules section below.
 */
/* table-begin */
ADDR                 "addr"
ALIAS                "alias"
AMPERSAND            "&"
AND                  "and"
API                  "api"
ARROW                "->"
AS                   "as"
AUTO                 "auto"
AWAIT                "__await"
BOOL                 "Bool"
BREAK                "break"
CASE                 "case"
CHOICE               "choice"
CLASS                "class"
COLON                ":"
COLON_BANG           ":!"
COMMA                ","
CONTINUATION         "__continuation"
CONTINUATION_TYPE    "__Continuation"
CONTINUE             "continue"
DEFAULT              "default"
DOUBLE_ARROW         "=>"
ELSE                 "else"
EQUAL                "="
EQUAL_EQUAL          "=="
EXTERNAL             "external"
FALSE                "false"
FN                   "fn"
FN_TYPE              "__Fn"
FORALL               "forall"
IF                   "if"
IMPL                 "impl"
IMPORT               "import"
INTERFACE            "interface"
IS                   "is"
LEFT_CURLY_BRACE     "{"
LEFT_PARENTHESIS     "("
LEFT_SQUARE_BRACKET  "["
LET                  "let"
LIBRARY              "library"
MATCH                "match"
MINUS                "-"
NOT                  "not"
OR                   "or"
PACKAGE              "package"
PERIOD               "."
PLUS                 "+"
RETURN               "return"
RIGHT_CURLY_BRACE    "}"
RIGHT_PARENTHESIS    ")"
RIGHT_SQUARE_BRACKET "]"
RUN                  "__run"
SELF                 "Self"
SEMICOLON            ";"
SLASH                "/"
STRING               "String"
THEN                 "then"
TRUE                 "true"
TYPE                 "Type"
UNDERSCORE           "_"
UNIMPL_EXAMPLE       "__unimplemented_example_infix"
VAR                  "var"
WHERE                "where"
WHILE                "while"
/* table-end */

/*
 * Patterns for identifiers, literals, whitespace and comments.
 *  - intrinsic_identifier: an identifier beginning with `__intrinsic_`.
 *  - sized_type_literal: `i`, `u` or `f` followed by a decimal number that
 *    has no leading zero.
 *  - whitespace: horizontal_whitespace plus the newline character.
 *  - one_line_comment: includes the terminating newline, so `//` text on a
 *    final line that has no trailing newline does not match this pattern.
 *  - operand_start: the first character of something treated as an operand;
 *    used as lookahead by the binary `*` rule.
 */
/* This should be kept table-like, but isn't automatic due to spaces. */
identifier            [A-Za-z_][A-Za-z0-9_]*
intrinsic_identifier  __intrinsic_[A-Za-z0-9_]*
sized_type_literal    [iuf][1-9][0-9]*
integer_literal       [0-9]+
horizontal_whitespace [ \t\r]
whitespace            [ \t\r\n]
one_line_comment      \/\/[^\n]*\n
operand_start         [(A-Za-z0-9_\"]

/*
 * Single-line string literals should reject vertical whitespace. A backslash
 * followed by any character other than a newline is accepted here; escapes
 * are validated later by the string_literal rule's action.
 */
string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"

%{
  // This macro is expanded immediately before each action specified below.
  //
  // Advances the current token position by yyleng columns without changing
  // the line number, and takes us out of the after-whitespace / after-operand
  // state. An action that wants one of those states for the next token must
  // call BEGIN itself; since this runs first, the action's BEGIN wins.
  //
  // This one is a statement sequence (not an expression), so it keeps its own
  // terminating punctuation.
  #define YY_USER_ACTION                                             \
    context.current_token_position.columns(yyleng);                  \
    if (YY_START == AFTER_WHITESPACE || YY_START == AFTER_OPERAND) { \
      BEGIN(INITIAL);                                                \
    }

  // Builds the parser token `name`, located at the current token position.
  //
  // Expands to a single expression with no trailing semicolon, so the use
  // site supplies its own `;` (as in `return SIMPLE_TOKEN(X);`) and the macro
  // stays safe inside larger expressions or unbraced if/else bodies.
  #define SIMPLE_TOKEN(name) \
    Carbon::Parser::make_##name(context.current_token_position)

  // Like SIMPLE_TOKEN, but for tokens that carry a value: `arg` is passed to
  // the token constructor ahead of the location.
  #define ARG_TOKEN(name, arg) \
    Carbon::Parser::make_##name(arg, context.current_token_position)
%}

%%

%{
  // Code run each time yylex is called.
  //
  // Rules that consume input without returning a token (whitespace and
  // comments) stay inside yylex and therefore call step() themselves.

  // Begin with an empty token span starting where its previous end was.
  context.current_token_position.step();
%}

 /*
  * Keyword and punctuation rules. Each returns a token that carries only its
  * source location. The closing brackets are not listed here: they have their
  * own rules further down because they also switch to AFTER_OPERAND.
  *
  * Flex picks the longest match and, among equally long matches, the rule
  * listed first. These rules must therefore stay ahead of {identifier} so that
  * a keyword is not lexed as an identifier.
  */
 /* table-begin */
{ADDR}                { return SIMPLE_TOKEN(ADDR);                }
{ALIAS}               { return SIMPLE_TOKEN(ALIAS);               }
{AMPERSAND}           { return SIMPLE_TOKEN(AMPERSAND);           }
{AND}                 { return SIMPLE_TOKEN(AND);                 }
{API}                 { return SIMPLE_TOKEN(API);                 }
{ARROW}               { return SIMPLE_TOKEN(ARROW);               }
{AS}                  { return SIMPLE_TOKEN(AS);                  }
{AUTO}                { return SIMPLE_TOKEN(AUTO);                }
{AWAIT}               { return SIMPLE_TOKEN(AWAIT);               }
{BOOL}                { return SIMPLE_TOKEN(BOOL);                }
{BREAK}               { return SIMPLE_TOKEN(BREAK);               }
{CASE}                { return SIMPLE_TOKEN(CASE);                }
{CHOICE}              { return SIMPLE_TOKEN(CHOICE);              }
{CLASS}               { return SIMPLE_TOKEN(CLASS);               }
{COLON_BANG}          { return SIMPLE_TOKEN(COLON_BANG);          }
{COLON}               { return SIMPLE_TOKEN(COLON);               }
{COMMA}               { return SIMPLE_TOKEN(COMMA);               }
{CONTINUATION_TYPE}   { return SIMPLE_TOKEN(CONTINUATION_TYPE);   }
{CONTINUATION}        { return SIMPLE_TOKEN(CONTINUATION);        }
{CONTINUE}            { return SIMPLE_TOKEN(CONTINUE);            }
{DEFAULT}             { return SIMPLE_TOKEN(DEFAULT);             }
{DOUBLE_ARROW}        { return SIMPLE_TOKEN(DOUBLE_ARROW);        }
{ELSE}                { return SIMPLE_TOKEN(ELSE);                }
{EQUAL_EQUAL}         { return SIMPLE_TOKEN(EQUAL_EQUAL);         }
{EQUAL}               { return SIMPLE_TOKEN(EQUAL);               }
{EXTERNAL}            { return SIMPLE_TOKEN(EXTERNAL);            }
{FALSE}               { return SIMPLE_TOKEN(FALSE);               }
{FN_TYPE}             { return SIMPLE_TOKEN(FN_TYPE);             }
{FN}                  { return SIMPLE_TOKEN(FN);                  }
{FORALL}              { return SIMPLE_TOKEN(FORALL);              }
{IF}                  { return SIMPLE_TOKEN(IF);                  }
{IMPL}                { return SIMPLE_TOKEN(IMPL);                }
{IMPORT}              { return SIMPLE_TOKEN(IMPORT);              }
{INTERFACE}           { return SIMPLE_TOKEN(INTERFACE);           }
{IS}                  { return SIMPLE_TOKEN(IS);                  }
{LEFT_CURLY_BRACE}    { return SIMPLE_TOKEN(LEFT_CURLY_BRACE);    }
{LEFT_PARENTHESIS}    { return SIMPLE_TOKEN(LEFT_PARENTHESIS);    }
{LEFT_SQUARE_BRACKET} { return SIMPLE_TOKEN(LEFT_SQUARE_BRACKET); }
{LET}                 { return SIMPLE_TOKEN(LET);                 }
{LIBRARY}             { return SIMPLE_TOKEN(LIBRARY);             }
{MATCH}               { return SIMPLE_TOKEN(MATCH);               }
{MINUS}               { return SIMPLE_TOKEN(MINUS);               }
{NOT}                 { return SIMPLE_TOKEN(NOT);                 }
{OR}                  { return SIMPLE_TOKEN(OR);                  }
{PACKAGE}             { return SIMPLE_TOKEN(PACKAGE);             }
{PERIOD}              { return SIMPLE_TOKEN(PERIOD);              }
{PLUS}                { return SIMPLE_TOKEN(PLUS);                }
{RETURN}              { return SIMPLE_TOKEN(RETURN);              }
{RUN}                 { return SIMPLE_TOKEN(RUN);                 }
{SELF}                { return SIMPLE_TOKEN(SELF);                }
{SEMICOLON}           { return SIMPLE_TOKEN(SEMICOLON);           }
{SLASH}               { return SIMPLE_TOKEN(SLASH);               }
{STRING}              { return SIMPLE_TOKEN(STRING);              }
{THEN}                { return SIMPLE_TOKEN(THEN);                }
{TRUE}                { return SIMPLE_TOKEN(TRUE);                }
{TYPE}                { return SIMPLE_TOKEN(TYPE);                }
{UNDERSCORE}          { return SIMPLE_TOKEN(UNDERSCORE);          }
{UNIMPL_EXAMPLE}      { return SIMPLE_TOKEN(UNIMPL_EXAMPLE);      }
{VAR}                 { return SIMPLE_TOKEN(VAR);                 }
{WHERE}               { return SIMPLE_TOKEN(WHERE);               }
{WHILE}               { return SIMPLE_TOKEN(WHILE);               }
 /* table-end */

 /*
  * At end of input, hand the parser an explicit END_OF_FILE token.
  * More modern Bisons provide make_EOF.
  */
<<EOF>>               { return SIMPLE_TOKEN(END_OF_FILE); }

 /*
  * Closing brackets end an operand, so each of these rules enters
  * AFTER_OPERAND for the benefit of the `*` rules below. YY_USER_ACTION has
  * already run by the time the action executes, so this BEGIN is what the
  * next token sees.
  */
{RIGHT_PARENTHESIS} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_PARENTHESIS);
}
{RIGHT_CURLY_BRACE} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_CURLY_BRACE);
}
{RIGHT_SQUARE_BRACKET} {
  BEGIN(AFTER_OPERAND);
  return SIMPLE_TOKEN(RIGHT_SQUARE_BRACKET);
}

 /*
  * For a `*` operator, we look at whitespace and local context to determine the
  * arity and fixity. There are two ways to write a binary operator:
  *
  * 1) Whitespace on both sides.
  * 2) Whitespace on neither side, and the previous token is considered to be
  *    the end of an operand, and the next token is considered to be the start
  *    of an operand.
  *
  * Otherwise, the operator is unary, but we also check for whitespace to help
  * the parser enforce the rule that whitespace is not permitted between the
  * operator and its operand, leading to three more cases:
  *
  * 3) Whitespace before (but implicitly not after, because that would give a
  *    longer match and hit case 1): this can only be a prefix operator.
  * 4) Whitespace after and not before: this can only be a postfix operator.
  * 5) No whitespace on either side (otherwise the longest match would take us
  *    to case 4): this is a unary operator and could be either prefix or
  *    postfix.
  *
  * What came before the `*` is read from the start condition
  * (AFTER_WHITESPACE / AFTER_OPERAND / INITIAL). What comes after it is read
  * from the pattern itself: a trailing {whitespace}+ that is consumed as part
  * of the token, or an {operand_start} lookahead that is not consumed.
  *
  * NOTE(review): {whitespace} includes '\n', but cases 1 and 4 only advance
  * the column count (through YY_USER_ACTION) and never call lines(). Confirm
  * whether a newline directly after `*` skews later line numbers.
  */

 /* `*` operator case 1: */
<AFTER_WHITESPACE>"*"{whitespace}+ {
  // The match ends in whitespace, so the next token is after whitespace too.
  BEGIN(AFTER_WHITESPACE);
  return SIMPLE_TOKEN(BINARY_STAR);
}
 /* `*` operator case 2: */
<AFTER_OPERAND>"*"/{operand_start} { return SIMPLE_TOKEN(BINARY_STAR); }
 /* `*` operator case 3: */
<AFTER_WHITESPACE>"*" { return SIMPLE_TOKEN(PREFIX_STAR); }
 /* `*` operator case 4: */
<INITIAL,AFTER_OPERAND>"*"{whitespace}+ {
  // The match ends in whitespace, so the next token is after whitespace.
  BEGIN(AFTER_WHITESPACE);
  return SIMPLE_TOKEN(POSTFIX_STAR);
}
 /* `*` operator case 5: */
<INITIAL,AFTER_OPERAND>"*" { return SIMPLE_TOKEN(UNARY_STAR); }

 /*
  * Sized type literals also match {identifier}; this rule is listed first so
  * that it wins the tie. The matched text is passed through as the token value.
  */
{sized_type_literal} {
  BEGIN(AFTER_OPERAND);
  return ARG_TOKEN(sized_type_literal, yytext);
}

{intrinsic_identifier} {
  BEGIN(AFTER_OPERAND);
  Carbon::ErrorOr<Carbon::IntrinsicExpression::Intrinsic> intrinsic =
      Carbon::IntrinsicExpression::FindIntrinsic(yytext, context.source_loc());
  if (intrinsic.ok()) {
    return ARG_TOKEN(intrinsic_identifier, *intrinsic);
  } else {
    return context.RecordSyntaxError(intrinsic.error().message());
  }
}

 /*
  * Any other identifier. Keywords, sized type literals and intrinsic names
  * match this pattern as well, but their rules are listed earlier and win.
  */
{identifier} {
  BEGIN(AFTER_OPERAND);
  return ARG_TOKEN(identifier, yytext);
}

{integer_literal} {
  BEGIN(AFTER_OPERAND);
  int val = 0;
  if (!llvm::to_integer(yytext, val)) {
    return context.RecordSyntaxError(
        llvm::formatv("Invalid integer literal: {0}", yytext));
  }
  return ARG_TOKEN(integer_literal, val);
}

{string_literal} {
  llvm::StringRef str(yytext);
  CARBON_CHECK(str.consume_front("\"") && str.consume_back("\""));
  std::optional<std::string> unescaped = Carbon::UnescapeStringLiteral(str);
  if (unescaped == std::nullopt) {
    return context.RecordSyntaxError(
        llvm::formatv("Invalid escaping in string: {0}", yytext));
  }
  return ARG_TOKEN(string_literal, *unescaped);
}

\"\"\" {
  // Block string literal. The pattern matches only the opening `"""`; the
  // rest of the literal is consumed one character at a time with ReadChar and
  // accumulated (delimiters included) in `s`.
  std::string s(yytext);
  // Scans for the closing """, i.e. three consecutive unescaped quotes,
  // checking for possible escape sequences like \""". A backslash escapes the
  // character after it wherever it occurs, including directly after one or
  // two quotes, so `"\"""` does not end the literal.
  //
  // NOTE(review): characters consumed through ReadChar are not added to
  // context.current_token_position; only the opening `"""` is counted (by
  // YY_USER_ACTION). Confirm whether locations following a multi-line block
  // string need adjusting.
  int consecutive_quotes = 0;
  while (consecutive_quotes < 3) {
    int c = ReadChar(yyscanner, context);
    if (c <= 0) {
      // ReadChar has already recorded the "unexpected end of file" error.
      return SIMPLE_TOKEN(END_OF_FILE);
    }
    s.push_back(c);
    if (c == '"') {
      ++consecutive_quotes;
      continue;
    }
    // Anything other than a quote interrupts a run of quotes.
    consecutive_quotes = 0;
    if (c == '\\') {
      // \" in \""" is not a terminator: take the escaped character verbatim
      // so that it can never count towards the closing delimiter.
      c = ReadChar(yyscanner, context);
      if (c <= 0) {
        return SIMPLE_TOKEN(END_OF_FILE);
      }
      s.push_back(c);
    }
  }
  // `s` now holds the whole literal; indentation and escape processing are
  // delegated to ParseBlockStringLiteral.
  Carbon::ErrorOr<std::string> block_string =
      Carbon::ParseBlockStringLiteral(s);
  if (!block_string.ok()) {
    return context.RecordSyntaxError(llvm::formatv(
        "Invalid block string: {0}", block_string.error().message()));
  }
  return ARG_TOKEN(string_literal, *block_string);
}

 /*
  * Comments produce no token: the action does not return, so the scanner
  * carries on with the next match.
  *
  * NOTE(review): the match ends in a newline, yet the state is left as INITIAL
  * (set by YY_USER_ACTION) rather than AFTER_WHITESPACE. Confirm that this is
  * the intended context for a `*` that directly follows a comment line.
  */
{one_line_comment} {
  // Advance end by 1 line, resetting the column to zero.
  context.current_token_position.lines(1);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
}

 /*
  * Spaces, tabs and carriage returns produce no token. The columns were
  * already counted by YY_USER_ACTION; the action only discards the span and
  * records that the next token follows whitespace.
  */
{horizontal_whitespace}+ {
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

 /*
  * Newlines produce no token. Only '\n' characters are matched, so yyleng
  * equals the number of lines to advance.
  */
\n+ {
  // Advance end by yyleng lines, resetting the column to zero.
  context.current_token_position.lines(yyleng);
  // Make the span empty by setting start to end.
  context.current_token_position.step();
  BEGIN(AFTER_WHITESPACE);
}

. {
  return context.RecordSyntaxError(
      llvm::formatv("invalid character '\\x{0}' in source file.",
                    llvm::toHex(llvm::StringRef(yytext, 1))));
}

%%

// Reads one character from the scanner's input with yyinput and returns it.
// A result <= 0 is treated as end of input: an "Unexpected end of file" syntax
// error is recorded on `context` and the same value is returned, so callers
// still have to check for it.
auto ReadChar(yyscan_t yyscanner, Carbon::ParseAndLexContext& context) -> int {
  const int next_char = yyinput(yyscanner);
  if (next_char > 0) {
    return next_char;
  }
  context.RecordSyntaxError("Unexpected end of file");
  return next_char;
}