tokenized_buffer_test.cpp 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "lexer/tokenized_buffer.h"

#include <deque>
#include <iterator>

#include "diagnostics/diagnostic_emitter.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "lexer/tokenized_buffer_test_helpers.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
  18. namespace Carbon {
  19. namespace {
  20. using ::Carbon::Testing::ExpectedToken;
  21. using ::Carbon::Testing::HasTokens;
  22. using ::Carbon::Testing::IsKeyValueScalars;
  23. using ::testing::Eq;
  24. using ::testing::NotNull;
  25. using ::testing::StrEq;
  26. struct LexerTest : ::testing::Test {
  27. llvm::SmallVector<SourceBuffer, 16> source_storage;
  28. auto GetSourceBuffer(llvm::Twine text) -> SourceBuffer& {
  29. source_storage.push_back(SourceBuffer::CreateFromText(text.str()));
  30. return source_storage.back();
  31. }
  32. auto Lex(llvm::Twine text) -> TokenizedBuffer {
  33. // TODO: build a full mock for this.
  34. return TokenizedBuffer::Lex(GetSourceBuffer(text), NullDiagnosticEmitter());
  35. }
  36. };
  37. TEST_F(LexerTest, HandlesEmptyBuffer) {
  38. auto buffer = Lex("");
  39. EXPECT_FALSE(buffer.HasErrors());
  40. EXPECT_EQ(buffer.Tokens().begin(), buffer.Tokens().end());
  41. }
  42. TEST_F(LexerTest, TracksLinesAndColumns) {
  43. auto buffer = Lex("\n ;;\n ;;;\n");
  44. EXPECT_FALSE(buffer.HasErrors());
  45. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  46. {.kind = TokenKind::Semi(),
  47. .line = 2,
  48. .column = 3,
  49. .indent_column = 3},
  50. {.kind = TokenKind::Semi(),
  51. .line = 2,
  52. .column = 4,
  53. .indent_column = 3},
  54. {.kind = TokenKind::Semi(),
  55. .line = 3,
  56. .column = 4,
  57. .indent_column = 4},
  58. {.kind = TokenKind::Semi(),
  59. .line = 3,
  60. .column = 5,
  61. .indent_column = 4},
  62. {.kind = TokenKind::Semi(),
  63. .line = 3,
  64. .column = 6,
  65. .indent_column = 4},
  66. }));
  67. }
  68. TEST_F(LexerTest, HandlesIntegerLiteral) {
  69. auto buffer = Lex("12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567");
  70. EXPECT_FALSE(buffer.HasErrors());
  71. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  72. {.kind = TokenKind::IntegerLiteral(),
  73. .line = 1,
  74. .column = 1,
  75. .indent_column = 1,
  76. .text = "12"},
  77. {.kind = TokenKind::Minus(),
  78. .line = 1,
  79. .column = 3,
  80. .indent_column = 1},
  81. {.kind = TokenKind::IntegerLiteral(),
  82. .line = 1,
  83. .column = 4,
  84. .indent_column = 1,
  85. .text = "578"},
  86. {.kind = TokenKind::IntegerLiteral(),
  87. .line = 2,
  88. .column = 3,
  89. .indent_column = 3,
  90. .text = "1"},
  91. {.kind = TokenKind::IntegerLiteral(),
  92. .line = 2,
  93. .column = 6,
  94. .indent_column = 3,
  95. .text = "2"},
  96. {.kind = TokenKind::IntegerLiteral(),
  97. .line = 3,
  98. .column = 1,
  99. .indent_column = 1,
  100. .text = "0x12_3ABC"},
  101. {.kind = TokenKind::IntegerLiteral(),
  102. .line = 4,
  103. .column = 1,
  104. .indent_column = 1,
  105. .text = "0b10_10_11"},
  106. {.kind = TokenKind::IntegerLiteral(),
  107. .line = 5,
  108. .column = 1,
  109. .indent_column = 1,
  110. .text = "1_234_567"},
  111. }));
  112. auto token_12 = buffer.Tokens().begin();
  113. EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
  114. auto token_578 = buffer.Tokens().begin() + 2;
  115. EXPECT_EQ(buffer.GetIntegerLiteral(*token_578), 578);
  116. auto token_1 = buffer.Tokens().begin() + 3;
  117. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1), 1);
  118. auto token_2 = buffer.Tokens().begin() + 4;
  119. EXPECT_EQ(buffer.GetIntegerLiteral(*token_2), 2);
  120. auto token_0x12_3abc = buffer.Tokens().begin() + 5;
  121. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0x12_3abc), 0x12'3abc);
  122. auto token_0b10_10_11 = buffer.Tokens().begin() + 6;
  123. EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
  124. auto token_1_234_567 = buffer.Tokens().begin() + 7;
  125. EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
  126. }
  127. TEST_F(LexerTest, ValidatesBaseSpecifier) {
  128. llvm::StringLiteral valid[] = {
  129. // Decimal integer literals.
  130. "0",
  131. "1",
  132. "123456789000000000000000000000000000000000000",
  133. // Hexadecimal integer literals.
  134. "0x0123456789ABCDEF",
  135. "0x0000000000000000000000000000000",
  136. // Binary integer literals.
  137. "0b10110100101001010",
  138. "0b0000000",
  139. };
  140. for (llvm::StringLiteral literal : valid) {
  141. auto buffer = Lex(literal);
  142. EXPECT_FALSE(buffer.HasErrors()) << literal;
  143. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  144. {.kind = TokenKind::IntegerLiteral(),
  145. .line = 1,
  146. .column = 1,
  147. .indent_column = 1,
  148. .text = literal}}));
  149. }
  150. llvm::StringLiteral invalid[] = {
  151. "00", "0X123", "0o123", "0B1",
  152. "007", "123L", "123456789A", "0x",
  153. "0b", "0x123abc", "0b011101201001", "0b10A",
  154. };
  155. for (llvm::StringLiteral literal : invalid) {
  156. auto buffer = Lex(literal);
  157. EXPECT_TRUE(buffer.HasErrors()) << literal;
  158. ASSERT_THAT(
  159. buffer,
  160. HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
  161. .line = 1,
  162. .column = 1,
  163. .indent_column = 1,
  164. .text = literal}}));
  165. }
  166. }
  167. TEST_F(LexerTest, ValidatesIntegerDigitSeparators) {
  168. llvm::StringLiteral valid[] = {
  169. // Decimal literals optionally have digit separators every 3 places.
  170. "1_234",
  171. "123_456",
  172. "1_234_567",
  173. // Hexadecimal literals optionally have digit separators every 4 places.
  174. "0x1_0000",
  175. "0x1000_0000",
  176. "0x1_0000_0000",
  177. // Binary integer literals can have digit separators anywhere..
  178. "0b1_0_1_0_1_0",
  179. "0b111_0000",
  180. };
  181. for (llvm::StringLiteral literal : valid) {
  182. auto buffer = Lex(literal);
  183. EXPECT_FALSE(buffer.HasErrors()) << literal;
  184. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  185. {.kind = TokenKind::IntegerLiteral(),
  186. .line = 1,
  187. .column = 1,
  188. .indent_column = 1,
  189. .text = literal}}));
  190. }
  191. llvm::StringLiteral invalid[] = {
  192. // Decimal literals.
  193. "12_34",
  194. "123_4_6_789",
  195. "12_3456_789",
  196. "12__345",
  197. "1_",
  198. // Hexadecimal literals.
  199. "0x_1234",
  200. "0x123_",
  201. "0x12_3",
  202. "0x_234_5678",
  203. "0x1234_567",
  204. // Binary literals.
  205. "0b_10101",
  206. "0b1__01",
  207. "0b1011_",
  208. "0b1_01_01_",
  209. };
  210. for (llvm::StringLiteral literal : invalid) {
  211. auto buffer = Lex(literal);
  212. EXPECT_TRUE(buffer.HasErrors()) << literal;
  213. // We expect to produce a token even for a literal containing invalid digit
  214. // separators, for better error recovery.
  215. ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  216. {.kind = TokenKind::IntegerLiteral(),
  217. .line = 1,
  218. .column = 1,
  219. .indent_column = 1,
  220. .text = literal}}));
  221. }
  222. }
  223. TEST_F(LexerTest, HandlesGarbageCharacters) {
  224. constexpr char GarbageText[] = "$$💩-$\n$\0$12$";
  225. auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  226. EXPECT_TRUE(buffer.HasErrors());
  227. EXPECT_THAT(
  228. buffer,
  229. HasTokens(llvm::ArrayRef<ExpectedToken>{
  230. {.kind = TokenKind::Error(),
  231. .line = 1,
  232. .column = 1,
  233. .text = llvm::StringRef("$$💩", 6)},
  234. // 💩 takes 4 bytes, and we count column as bytes offset.
  235. {.kind = TokenKind::Minus(), .line = 1, .column = 7},
  236. {.kind = TokenKind::Error(), .line = 1, .column = 8, .text = "$"},
  237. // newline
  238. {.kind = TokenKind::Error(),
  239. .line = 2,
  240. .column = 1,
  241. .text = llvm::StringRef("$\0$", 3)},
  242. {.kind = TokenKind::IntegerLiteral(),
  243. .line = 2,
  244. .column = 4,
  245. .text = "12"},
  246. {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
  247. }));
  248. }
  249. TEST_F(LexerTest, Symbols) {
  250. // We don't need to exhaustively test symbols here as they're handled with
  251. // common code, but we want to check specific patterns to verify things like
  252. // max-munch rule and handling of interesting symbols.
  253. auto buffer = Lex("<<<");
  254. EXPECT_FALSE(buffer.HasErrors());
  255. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  256. {TokenKind::LessLess()},
  257. {TokenKind::Less()},
  258. }));
  259. buffer = Lex("<<=>>");
  260. EXPECT_FALSE(buffer.HasErrors());
  261. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  262. {TokenKind::LessLessEqual()},
  263. {TokenKind::GreaterGreater()},
  264. }));
  265. buffer = Lex("< <=> >");
  266. EXPECT_FALSE(buffer.HasErrors());
  267. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  268. {TokenKind::Less()},
  269. {TokenKind::LessEqualGreater()},
  270. {TokenKind::Greater()},
  271. }));
  272. buffer = Lex("\\/?#@&^!");
  273. EXPECT_FALSE(buffer.HasErrors());
  274. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  275. {TokenKind::Backslash()},
  276. {TokenKind::Slash()},
  277. {TokenKind::Question()},
  278. {TokenKind::Hash()},
  279. {TokenKind::At()},
  280. {TokenKind::Amp()},
  281. {TokenKind::Caret()},
  282. {TokenKind::Exclaim()},
  283. }));
  284. }
  285. TEST_F(LexerTest, Parens) {
  286. auto buffer = Lex("()");
  287. EXPECT_FALSE(buffer.HasErrors());
  288. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  289. {TokenKind::OpenParen()},
  290. {TokenKind::CloseParen()},
  291. }));
  292. buffer = Lex("((()()))");
  293. EXPECT_FALSE(buffer.HasErrors());
  294. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  295. {TokenKind::OpenParen()},
  296. {TokenKind::OpenParen()},
  297. {TokenKind::OpenParen()},
  298. {TokenKind::CloseParen()},
  299. {TokenKind::OpenParen()},
  300. {TokenKind::CloseParen()},
  301. {TokenKind::CloseParen()},
  302. {TokenKind::CloseParen()},
  303. }));
  304. }
  305. TEST_F(LexerTest, CurlyBraces) {
  306. auto buffer = Lex("{}");
  307. EXPECT_FALSE(buffer.HasErrors());
  308. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  309. {TokenKind::OpenCurlyBrace()},
  310. {TokenKind::CloseCurlyBrace()},
  311. }));
  312. buffer = Lex("{{{}{}}}");
  313. EXPECT_FALSE(buffer.HasErrors());
  314. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  315. {TokenKind::OpenCurlyBrace()},
  316. {TokenKind::OpenCurlyBrace()},
  317. {TokenKind::OpenCurlyBrace()},
  318. {TokenKind::CloseCurlyBrace()},
  319. {TokenKind::OpenCurlyBrace()},
  320. {TokenKind::CloseCurlyBrace()},
  321. {TokenKind::CloseCurlyBrace()},
  322. {TokenKind::CloseCurlyBrace()},
  323. }));
  324. }
  325. TEST_F(LexerTest, MatchingGroups) {
  326. {
  327. TokenizedBuffer buffer = Lex("(){}");
  328. ASSERT_FALSE(buffer.HasErrors());
  329. auto it = buffer.Tokens().begin();
  330. auto open_paren_token = *it++;
  331. auto close_paren_token = *it++;
  332. EXPECT_EQ(close_paren_token,
  333. buffer.GetMatchedClosingToken(open_paren_token));
  334. EXPECT_EQ(open_paren_token,
  335. buffer.GetMatchedOpeningToken(close_paren_token));
  336. auto open_curly_token = *it++;
  337. auto close_curly_token = *it++;
  338. EXPECT_EQ(close_curly_token,
  339. buffer.GetMatchedClosingToken(open_curly_token));
  340. EXPECT_EQ(open_curly_token,
  341. buffer.GetMatchedOpeningToken(close_curly_token));
  342. EXPECT_EQ(buffer.Tokens().end(), it);
  343. }
  344. {
  345. TokenizedBuffer buffer = Lex("({x}){(y)} {{((z))}}");
  346. ASSERT_FALSE(buffer.HasErrors());
  347. auto it = buffer.Tokens().begin();
  348. auto open_paren_token = *it++;
  349. auto open_curly_token = *it++;
  350. ASSERT_EQ("x", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
  351. auto close_curly_token = *it++;
  352. auto close_paren_token = *it++;
  353. EXPECT_EQ(close_paren_token,
  354. buffer.GetMatchedClosingToken(open_paren_token));
  355. EXPECT_EQ(open_paren_token,
  356. buffer.GetMatchedOpeningToken(close_paren_token));
  357. EXPECT_EQ(close_curly_token,
  358. buffer.GetMatchedClosingToken(open_curly_token));
  359. EXPECT_EQ(open_curly_token,
  360. buffer.GetMatchedOpeningToken(close_curly_token));
  361. open_curly_token = *it++;
  362. open_paren_token = *it++;
  363. ASSERT_EQ("y", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
  364. close_paren_token = *it++;
  365. close_curly_token = *it++;
  366. EXPECT_EQ(close_curly_token,
  367. buffer.GetMatchedClosingToken(open_curly_token));
  368. EXPECT_EQ(open_curly_token,
  369. buffer.GetMatchedOpeningToken(close_curly_token));
  370. EXPECT_EQ(close_paren_token,
  371. buffer.GetMatchedClosingToken(open_paren_token));
  372. EXPECT_EQ(open_paren_token,
  373. buffer.GetMatchedOpeningToken(close_paren_token));
  374. open_curly_token = *it++;
  375. auto inner_open_curly_token = *it++;
  376. open_paren_token = *it++;
  377. auto inner_open_paren_token = *it++;
  378. ASSERT_EQ("z", buffer.GetIdentifierText(buffer.GetIdentifier(*it++)));
  379. auto inner_close_paren_token = *it++;
  380. close_paren_token = *it++;
  381. auto inner_close_curly_token = *it++;
  382. close_curly_token = *it++;
  383. EXPECT_EQ(close_curly_token,
  384. buffer.GetMatchedClosingToken(open_curly_token));
  385. EXPECT_EQ(open_curly_token,
  386. buffer.GetMatchedOpeningToken(close_curly_token));
  387. EXPECT_EQ(inner_close_curly_token,
  388. buffer.GetMatchedClosingToken(inner_open_curly_token));
  389. EXPECT_EQ(inner_open_curly_token,
  390. buffer.GetMatchedOpeningToken(inner_close_curly_token));
  391. EXPECT_EQ(close_paren_token,
  392. buffer.GetMatchedClosingToken(open_paren_token));
  393. EXPECT_EQ(open_paren_token,
  394. buffer.GetMatchedOpeningToken(close_paren_token));
  395. EXPECT_EQ(inner_close_paren_token,
  396. buffer.GetMatchedClosingToken(inner_open_paren_token));
  397. EXPECT_EQ(inner_open_paren_token,
  398. buffer.GetMatchedOpeningToken(inner_close_paren_token));
  399. EXPECT_EQ(buffer.Tokens().end(), it);
  400. }
  401. }
  402. TEST_F(LexerTest, MismatchedGroups) {
  403. auto buffer = Lex("{");
  404. EXPECT_TRUE(buffer.HasErrors());
  405. EXPECT_THAT(buffer,
  406. HasTokens(llvm::ArrayRef<ExpectedToken>{
  407. {TokenKind::OpenCurlyBrace()},
  408. {.kind = TokenKind::CloseCurlyBrace(), .recovery = true},
  409. }));
  410. buffer = Lex("}");
  411. EXPECT_TRUE(buffer.HasErrors());
  412. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  413. {.kind = TokenKind::Error(), .text = "}"},
  414. }));
  415. buffer = Lex("{(}");
  416. EXPECT_TRUE(buffer.HasErrors());
  417. EXPECT_THAT(
  418. buffer,
  419. HasTokens(llvm::ArrayRef<ExpectedToken>{
  420. {.kind = TokenKind::OpenCurlyBrace(), .column = 1},
  421. {.kind = TokenKind::OpenParen(), .column = 2},
  422. {.kind = TokenKind::CloseParen(), .column = 3, .recovery = true},
  423. {.kind = TokenKind::CloseCurlyBrace(), .column = 3},
  424. }));
  425. buffer = Lex(")({)");
  426. EXPECT_TRUE(buffer.HasErrors());
  427. EXPECT_THAT(
  428. buffer,
  429. HasTokens(llvm::ArrayRef<ExpectedToken>{
  430. {.kind = TokenKind::Error(), .column = 1, .text = ")"},
  431. {.kind = TokenKind::OpenParen(), .column = 2},
  432. {.kind = TokenKind::OpenCurlyBrace(), .column = 3},
  433. {.kind = TokenKind::CloseCurlyBrace(), .column = 4, .recovery = true},
  434. {.kind = TokenKind::CloseParen(), .column = 4},
  435. }));
  436. }
  437. TEST_F(LexerTest, Keywords) {
  438. auto buffer = Lex(" fn");
  439. EXPECT_FALSE(buffer.HasErrors());
  440. EXPECT_THAT(
  441. buffer,
  442. HasTokens(llvm::ArrayRef<ExpectedToken>{
  443. {.kind = TokenKind::FnKeyword(), .column = 4, .indent_column = 4},
  444. }));
  445. buffer = Lex("and or not if else for loop return var break continue _");
  446. EXPECT_FALSE(buffer.HasErrors());
  447. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  448. {TokenKind::AndKeyword()},
  449. {TokenKind::OrKeyword()},
  450. {TokenKind::NotKeyword()},
  451. {TokenKind::IfKeyword()},
  452. {TokenKind::ElseKeyword()},
  453. {TokenKind::ForKeyword()},
  454. {TokenKind::LoopKeyword()},
  455. {TokenKind::ReturnKeyword()},
  456. {TokenKind::VarKeyword()},
  457. {TokenKind::BreakKeyword()},
  458. {TokenKind::ContinueKeyword()},
  459. {TokenKind::UnderscoreKeyword()},
  460. }));
  461. }
  462. TEST_F(LexerTest, Comments) {
  463. auto buffer = Lex(" ;\n // foo\n ;");
  464. EXPECT_FALSE(buffer.HasErrors());
  465. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  466. {.kind = TokenKind::Semi(),
  467. .line = 1,
  468. .column = 2,
  469. .indent_column = 2},
  470. {.kind = TokenKind::Semi(),
  471. .line = 3,
  472. .column = 3,
  473. .indent_column = 3},
  474. }));
  475. buffer = Lex("// foo\n//\n// bar");
  476. EXPECT_FALSE(buffer.HasErrors());
  477. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  478. // Make sure weird characters aren't a problem.
  479. buffer = Lex(" //foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  480. EXPECT_FALSE(buffer.HasErrors());
  481. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{}));
  482. }
  483. TEST_F(LexerTest, DocComments) {
  484. auto buffer = Lex(" /// foo");
  485. EXPECT_FALSE(buffer.HasErrors());
  486. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  487. {.kind = TokenKind::DocComment(),
  488. .line = 1,
  489. .column = 3,
  490. .indent_column = 3,
  491. .text = "/// foo"},
  492. }));
  493. buffer = Lex("/// foo\n//\n/// bar");
  494. EXPECT_FALSE(buffer.HasErrors());
  495. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  496. {.kind = TokenKind::DocComment(),
  497. .line = 1,
  498. .column = 1,
  499. .indent_column = 1,
  500. .text = "/// foo"},
  501. {.kind = TokenKind::DocComment(),
  502. .line = 3,
  503. .column = 1,
  504. .indent_column = 1,
  505. .text = "/// bar"},
  506. }));
  507. buffer = Lex("/// foo\n///\n/// bar");
  508. EXPECT_FALSE(buffer.HasErrors());
  509. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  510. {.kind = TokenKind::DocComment(),
  511. .line = 1,
  512. .column = 1,
  513. .indent_column = 1,
  514. .text = "/// foo"},
  515. {.kind = TokenKind::DocComment(),
  516. .line = 2,
  517. .column = 1,
  518. .indent_column = 1,
  519. .text = "///"},
  520. {.kind = TokenKind::DocComment(),
  521. .line = 3,
  522. .column = 1,
  523. .indent_column = 1,
  524. .text = "/// bar"},
  525. }));
  526. // Make sure weird characters aren't a problem.
  527. buffer = Lex(" ///foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  528. EXPECT_FALSE(buffer.HasErrors());
  529. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  530. {.kind = TokenKind::DocComment(),
  531. .line = 1,
  532. .column = 3,
  533. .indent_column = 3,
  534. .text = "///foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]"},
  535. }));
  536. }
  537. TEST_F(LexerTest, Identifiers) {
  538. auto buffer = Lex(" foobar");
  539. EXPECT_FALSE(buffer.HasErrors());
  540. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  541. {.kind = TokenKind::Identifier(),
  542. .column = 4,
  543. .indent_column = 4,
  544. .text = "foobar"},
  545. }));
  546. // Check different kinds of identifier character sequences.
  547. buffer = Lex("_foo_bar");
  548. EXPECT_FALSE(buffer.HasErrors());
  549. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  550. {.kind = TokenKind::Identifier(), .text = "_foo_bar"},
  551. }));
  552. buffer = Lex("foo2bar00");
  553. EXPECT_FALSE(buffer.HasErrors());
  554. EXPECT_THAT(buffer,
  555. HasTokens(llvm::ArrayRef<ExpectedToken>{
  556. {.kind = TokenKind::Identifier(), .text = "foo2bar00"},
  557. }));
  558. // Check that we can parse identifiers that start with a keyword.
  559. buffer = Lex("fnord");
  560. EXPECT_FALSE(buffer.HasErrors());
  561. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  562. {.kind = TokenKind::Identifier(), .text = "fnord"},
  563. }));
  564. // Check multiple identifiers with indent and interning.
  565. buffer = Lex(" foo;bar\nbar \n foo\tfoo");
  566. EXPECT_FALSE(buffer.HasErrors());
  567. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  568. {.kind = TokenKind::Identifier(),
  569. .line = 1,
  570. .column = 4,
  571. .indent_column = 4,
  572. .text = "foo"},
  573. {.kind = TokenKind::Semi()},
  574. {.kind = TokenKind::Identifier(),
  575. .line = 1,
  576. .column = 8,
  577. .indent_column = 4,
  578. .text = "bar"},
  579. {.kind = TokenKind::Identifier(),
  580. .line = 2,
  581. .column = 1,
  582. .indent_column = 1,
  583. .text = "bar"},
  584. {.kind = TokenKind::Identifier(),
  585. .line = 3,
  586. .column = 3,
  587. .indent_column = 3,
  588. .text = "foo"},
  589. {.kind = TokenKind::Identifier(),
  590. .line = 3,
  591. .column = 7,
  592. .indent_column = 3,
  593. .text = "foo"},
  594. }));
  595. }
  596. auto GetAndDropLine(llvm::StringRef& text) -> std::string {
  597. auto newline_offset = text.find_first_of('\n');
  598. llvm::StringRef line = text.slice(0, newline_offset);
  599. if (newline_offset != llvm::StringRef::npos) {
  600. text = text.substr(newline_offset + 1);
  601. } else {
  602. text = "";
  603. }
  604. return line.str();
  605. }
  606. TEST_F(LexerTest, Printing) {
  607. auto buffer = Lex(";");
  608. ASSERT_FALSE(buffer.HasErrors());
  609. std::string print_storage;
  610. llvm::raw_string_ostream print_stream(print_storage);
  611. buffer.Print(print_stream);
  612. llvm::StringRef print = print_stream.str();
  613. EXPECT_THAT(GetAndDropLine(print),
  614. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  615. "indent: 1, spelling: ';' }"));
  616. EXPECT_TRUE(print.empty()) << print;
  617. // Test kind padding.
  618. buffer = Lex("(;foo;)");
  619. ASSERT_FALSE(buffer.HasErrors());
  620. print_storage.clear();
  621. buffer.Print(print_stream);
  622. print = print_stream.str();
  623. EXPECT_THAT(GetAndDropLine(print),
  624. StrEq("token: { index: 0, kind: 'OpenParen', line: 1, column: "
  625. "1, indent: 1, spelling: '(', closing_token: 4 }"));
  626. EXPECT_THAT(GetAndDropLine(print),
  627. StrEq("token: { index: 1, kind: 'Semi', line: 1, column: "
  628. "2, indent: 1, spelling: ';' }"));
  629. EXPECT_THAT(GetAndDropLine(print),
  630. StrEq("token: { index: 2, kind: 'Identifier', line: 1, column: "
  631. "3, indent: 1, spelling: 'foo', identifier: 0 }"));
  632. EXPECT_THAT(GetAndDropLine(print),
  633. StrEq("token: { index: 3, kind: 'Semi', line: 1, column: "
  634. "6, indent: 1, spelling: ';' }"));
  635. EXPECT_THAT(GetAndDropLine(print),
  636. StrEq("token: { index: 4, kind: 'CloseParen', line: 1, column: "
  637. "7, indent: 1, spelling: ')', opening_token: 0 }"));
  638. EXPECT_TRUE(print.empty()) << print;
  639. // Test digit padding with max values of 9, 10, and 11.
  640. buffer = Lex(";\n\n\n\n\n\n\n\n\n\n ;;");
  641. ASSERT_FALSE(buffer.HasErrors());
  642. print_storage.clear();
  643. buffer.Print(print_stream);
  644. print = print_stream.str();
  645. EXPECT_THAT(GetAndDropLine(print),
  646. StrEq("token: { index: 0, kind: 'Semi', line: 1, column: 1, "
  647. "indent: 1, spelling: ';' }"));
  648. EXPECT_THAT(GetAndDropLine(print),
  649. StrEq("token: { index: 1, kind: 'Semi', line: 11, column: 9, "
  650. "indent: 9, spelling: ';' }"));
  651. EXPECT_THAT(GetAndDropLine(print),
  652. StrEq("token: { index: 2, kind: 'Semi', line: 11, column: 10, "
  653. "indent: 9, spelling: ';' }"));
  654. EXPECT_TRUE(print.empty()) << print;
  655. }
  656. TEST_F(LexerTest, PrintingAsYaml) {
  657. // Test that we can parse this into YAML and verify line and indent data.
  658. auto buffer = Lex("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  659. ASSERT_FALSE(buffer.HasErrors());
  660. std::string print_output;
  661. llvm::raw_string_ostream print_stream(print_output);
  662. buffer.Print(print_stream);
  663. print_stream.flush();
  664. // Parse the output into a YAML stream. This will print errors to stderr.
  665. llvm::SourceMgr source_manager;
  666. llvm::yaml::Stream yaml_stream(print_output, source_manager);
  667. auto yaml_it = yaml_stream.begin();
  668. auto* root_node = llvm::dyn_cast<llvm::yaml::MappingNode>(yaml_it->getRoot());
  669. ASSERT_THAT(root_node, NotNull());
  670. // Walk the top-level mapping of tokens, dig out the sub-mapping of data for
  671. // each taken, and then verify those entries.
  672. auto mapping_it = llvm::cast<llvm::yaml::MappingNode>(root_node)->begin();
  673. auto* token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  674. ASSERT_THAT(token_node, NotNull());
  675. auto* token_key_node =
  676. llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  677. ASSERT_THAT(token_key_node, NotNull());
  678. EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  679. auto* token_value_node =
  680. llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  681. ASSERT_THAT(token_value_node, NotNull());
  682. auto token_it = token_value_node->begin();
  683. EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "0"));
  684. ++token_it;
  685. EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  686. ++token_it;
  687. EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "2"));
  688. ++token_it;
  689. EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "2"));
  690. ++token_it;
  691. EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "2"));
  692. ++token_it;
  693. EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  694. EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  695. ++mapping_it;
  696. token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  697. ASSERT_THAT(token_node, NotNull());
  698. token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  699. ASSERT_THAT(token_key_node, NotNull());
  700. EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  701. token_value_node =
  702. llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  703. ASSERT_THAT(token_value_node, NotNull());
  704. token_it = token_value_node->begin();
  705. EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "1"));
  706. ++token_it;
  707. EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  708. ++token_it;
  709. EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  710. ++token_it;
  711. EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "1"));
  712. ++token_it;
  713. EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  714. ++token_it;
  715. EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  716. EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  717. ++mapping_it;
  718. token_node = llvm::dyn_cast<llvm::yaml::KeyValueNode>(&*mapping_it);
  719. ASSERT_THAT(token_node, NotNull());
  720. token_key_node = llvm::dyn_cast<llvm::yaml::ScalarNode>(token_node->getKey());
  721. ASSERT_THAT(token_key_node, NotNull());
  722. EXPECT_THAT(token_key_node->getRawValue(), StrEq("token"));
  723. token_value_node =
  724. llvm::dyn_cast<llvm::yaml::MappingNode>(token_node->getValue());
  725. ASSERT_THAT(token_value_node, NotNull());
  726. token_it = token_value_node->begin();
  727. EXPECT_THAT(&*token_it, IsKeyValueScalars("index", "2"));
  728. ++token_it;
  729. EXPECT_THAT(&*token_it, IsKeyValueScalars("kind", "Semi"));
  730. ++token_it;
  731. EXPECT_THAT(&*token_it, IsKeyValueScalars("line", "5"));
  732. ++token_it;
  733. EXPECT_THAT(&*token_it, IsKeyValueScalars("column", "3"));
  734. ++token_it;
  735. EXPECT_THAT(&*token_it, IsKeyValueScalars("indent", "1"));
  736. ++token_it;
  737. EXPECT_THAT(&*token_it, IsKeyValueScalars("spelling", ";"));
  738. EXPECT_THAT(++token_it, Eq(token_value_node->end()));
  739. ASSERT_THAT(++mapping_it, Eq(root_node->end()));
  740. ASSERT_THAT(++yaml_it, Eq(yaml_stream.end()));
  741. }
  742. } // namespace
  743. } // namespace Carbon