// toolchain/lex/tokenized_buffer_test.cpp
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <gmock/gmock.h>
  6. #include <gtest/gtest.h>
  7. #include <cmath>
  8. #include <forward_list>
  9. #include <iterator>
  10. #include "llvm/ADT/ArrayRef.h"
  11. #include "llvm/Support/FormatVariadic.h"
  12. #include "testing/base/test_raw_ostream.h"
  13. #include "toolchain/base/value_store.h"
  14. #include "toolchain/diagnostics/diagnostic_emitter.h"
  15. #include "toolchain/diagnostics/mocks.h"
  16. #include "toolchain/lex/lex.h"
  17. #include "toolchain/lex/tokenized_buffer_test_helpers.h"
  18. #include "toolchain/testing/compile_helper.h"
  19. #include "toolchain/testing/yaml_test_helpers.h"
  20. namespace Carbon::Lex {
  21. namespace {
  22. using ::Carbon::Testing::ExpectedToken;
  23. using ::Carbon::Testing::IsSingleDiagnostic;
  24. using ::Carbon::Testing::TestRawOstream;
  25. using ::testing::_;
  26. using ::testing::ElementsAre;
  27. using ::testing::Eq;
  28. using ::testing::HasSubstr;
  29. using ::testing::Pair;
  30. namespace Yaml = ::Carbon::Testing::Yaml;
// Test fixture for lexer tests. Provides a CompileHelper that owns the
// lexed TokenizedBuffers (and any shared value stores) for each test case.
class LexerTest : public ::testing::Test {
 protected:
  Testing::CompileHelper compile_helper_;
};
  35. TEST_F(LexerTest, HandlesEmptyBuffer) {
  36. auto& buffer = compile_helper_.GetTokenizedBuffer("");
  37. EXPECT_FALSE(buffer.has_errors());
  38. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  39. {.kind = TokenKind::FileStart},
  40. {.kind = TokenKind::FileEnd}}));
  41. }
// Verifies line, column, and indent-column tracking across LF newlines,
// including tokens inside and after a multi-line ''' block string literal.
// NOTE(review): the expected columns imply multi-space indentation in the
// source string; runs of spaces in this literal look collapsed by a paste —
// verify against upstream before relying on it.
TEST_F(LexerTest, TracksLinesAndColumns) {
  auto& buffer = compile_helper_.GetTokenizedBuffer(
      "\n ;;\n ;;;\n x\"foo\" '''baz\n a\n ''' y");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart,
           .line = 1,
           .column = 1,
           .indent_column = 1},
          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
          {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
          {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
          {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
          {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
          {.kind = TokenKind::Identifier,
           .line = 4,
           .column = 4,
           .indent_column = 4,
           .text = "x"},
          {.kind = TokenKind::StringLiteral,
           .line = 4,
           .column = 5,
           .indent_column = 4},
          // The ''' block string token is positioned where it opens on line 4,
          // even though its contents span through line 6.
          {.kind = TokenKind::StringLiteral,
           .line = 4,
           .column = 11,
           .indent_column = 4},
          {.kind = TokenKind::Identifier,
           .line = 6,
           .column = 6,
           .indent_column = 11,
           .text = "y"},
          {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
      }));
}
// Same positional expectations as TracksLinesAndColumns, but with CRLF line
// endings: a "\r\n" pair must advance the line exactly like a lone "\n" and
// must not perturb column or indent-column computation.
TEST_F(LexerTest, TracksLinesAndColumnsCRLF) {
  auto& buffer = compile_helper_.GetTokenizedBuffer(
      "\r\n ;;\r\n ;;;\r\n x\"foo\" '''baz\r\n a\r\n ''' y");
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart,
           .line = 1,
           .column = 1,
           .indent_column = 1},
          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
          {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
          {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
          {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
          {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
          {.kind = TokenKind::Identifier,
           .line = 4,
           .column = 4,
           .indent_column = 4,
           .text = "x"},
          {.kind = TokenKind::StringLiteral,
           .line = 4,
           .column = 5,
           .indent_column = 4},
          {.kind = TokenKind::StringLiteral,
           .line = 4,
           .column = 11,
           .indent_column = 4},
          {.kind = TokenKind::Identifier,
           .line = 6,
           .column = 6,
           .indent_column = 11,
           .text = "y"},
          {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
      }));
}
// A bare '\r' (not part of a CRLF pair) is an error, but lexing recovers:
// the '\r' does not start a new line, so the following ';' stays on line 2.
TEST_F(LexerTest, InvalidCR) {
  auto& buffer = compile_helper_.GetTokenizedBuffer("\n ;;\r ;\n x");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart,
           .line = 1,
           .column = 1,
           .indent_column = 1},
          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
          // The ';' after the stray '\r' is still on line 2; columns count the
          // '\r' byte like any other character.
          {.kind = TokenKind::Semi, .line = 2, .column = 6, .indent_column = 2},
          {.kind = TokenKind::Identifier,
           .line = 3,
           .column = 4,
           .indent_column = 4,
           .text = "x"},
          {.kind = TokenKind::FileEnd, .line = 3, .column = 5},
      }));
}
// An LFCR sequence ("\n\r", the reverse of a valid CRLF) is an error: the
// '\n' ends line 2, and the stray '\r' lands at the start of line 3.
TEST_F(LexerTest, InvalidLFCR) {
  auto& buffer = compile_helper_.GetTokenizedBuffer("\n ;;\n\r ;\n x");
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart,
           .line = 1,
           .column = 1,
           .indent_column = 1},
          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
          // Line 3 begins with the stray '\r', so the indent is computed from
          // column 1 while the ';' itself sits at column 3.
          {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 1},
          {.kind = TokenKind::Identifier,
           .line = 4,
           .column = 4,
           .indent_column = 4,
           .text = "x"},
          {.kind = TokenKind::FileEnd, .line = 4, .column = 5},
      }));
}
// Lexes decimal, hex, binary, digit-separated, and real literals, then checks
// both the token stream and the interned values in the shared value stores.
TEST_F(LexerTest, HandlesNumericLiteral) {
  auto [buffer, value_stores] =
      compile_helper_.GetTokenizedBufferWithSharedValueStore(
          "12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
  EXPECT_FALSE(buffer.has_errors());
  ASSERT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // "12-578" splits into int, minus, int: '-' is not part of a
                  // numeric literal here.
                  {.kind = TokenKind::IntLiteral,
                   .line = 1,
                   .column = 1,
                   .indent_column = 1,
                   .text = "12"},
                  {.kind = TokenKind::Minus,
                   .line = 1,
                   .column = 3,
                   .indent_column = 1},
                  {.kind = TokenKind::IntLiteral,
                   .line = 1,
                   .column = 4,
                   .indent_column = 1,
                   .text = "578"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 2,
                   .column = 3,
                   .indent_column = 3,
                   .text = "1"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 2,
                   .column = 6,
                   .indent_column = 3,
                   .text = "2"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 3,
                   .column = 1,
                   .indent_column = 1,
                   .text = "0x12_3ABC"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 4,
                   .column = 1,
                   .indent_column = 1,
                   .text = "0b10_10_11"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 5,
                   .column = 1,
                   .indent_column = 1,
                   .text = "1_234_567"},
                  {.kind = TokenKind::RealLiteral,
                   .line = 6,
                   .column = 1,
                   .indent_column = 1,
                   .text = "1.5e9"},
                  {.kind = TokenKind::FileEnd, .line = 6, .column = 6},
              }));
  // Walk the token stream (skipping FileStart) and check each literal's
  // interned value; '_' digit separators must be ignored in the value.
  auto token_start = buffer.tokens().begin();
  auto token_12 = token_start + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_12)), 12);
  auto token_578 = token_12 + 2;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_578)), 578);
  auto token_1 = token_578 + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_1)), 1);
  auto token_2 = token_1 + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_2)), 2);
  auto token_0x12_3abc = token_2 + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_0x12_3abc)),
            0x12'3abc);
  auto token_0b10_10_11 = token_0x12_3abc + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_0b10_10_11)),
            0b10'10'11);
  auto token_1_234_567 = token_0b10_10_11 + 1;
  EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_1_234_567)),
            1'234'567);
  // 1.5e9 is stored as mantissa 15 with decimal exponent 8 (15 * 10^8).
  auto token_1_5e9 = token_1_234_567 + 1;
  auto value_1_5e9 =
      value_stores.reals().Get(buffer.GetRealLiteral(*token_1_5e9));
  EXPECT_EQ(value_1_5e9.mantissa.getZExtValue(), 15);
  EXPECT_EQ(value_1_5e9.exponent.getSExtValue(), 8);
  EXPECT_EQ(value_1_5e9.is_decimal, true);
}
// Malformed numeric literals lex as single Error tokens covering the whole
// would-be literal, while adjacent valid literals still lex normally.
TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
  auto& buffer =
      compile_helper_.GetTokenizedBuffer("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
  EXPECT_TRUE(buffer.has_errors());
  ASSERT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // Trailing alphabetic junk makes the whole token an error.
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 1,
                   .indent_column = 1,
                   .text = "14x"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 1,
                   .column = 5,
                   .indent_column = 1,
                   .text = "15_49"},
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 11,
                   .indent_column = 1,
                   .text = "0x3.5q"},
                  // Hex real with digit separators is valid.
                  {.kind = TokenKind::RealLiteral,
                   .line = 1,
                   .column = 18,
                   .indent_column = 1,
                   .text = "0x3_4.5_6"},
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 28,
                   .indent_column = 1,
                   .text = "0ops"},
                  {.kind = TokenKind::FileEnd, .line = 1, .column = 32},
              }));
}
// Checks where the lexer decides a numeric literal ends and an operator or
// identifier begins: '.' only joins a literal when followed by a digit, and
// 'e+' only extends a real literal when it forms a valid exponent.
TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
  llvm::StringLiteral source_text = R"(
    1.
    .2
    3.+foo
    4.0-bar
    5.0e+123+456
    6.0e+1e+2
    1e7
    8..10
    9.0.9.5
    10.foo
    11.0.foo
    12e+1
    13._
  )";
  auto& buffer = compile_helper_.GetTokenizedBuffer(source_text);
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          // "1." splits: no digit after the period.
                          {.kind = TokenKind::IntLiteral, .text = "1"},
                          {.kind = TokenKind::Period},
                          // newline
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "3"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "4.0"},
                          {.kind = TokenKind::Minus},
                          {.kind = TokenKind::Identifier, .text = "bar"},
                          // newline
                          // Only the first '+' belongs to the exponent.
                          {.kind = TokenKind::RealLiteral, .text = "5.0e+123"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "456"},
                          // newline
                          // A second 'e' inside the exponent is an error.
                          {.kind = TokenKind::Error, .text = "6.0e+1e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::Error, .text = "1e7"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "8"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "10"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "9.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::RealLiteral, .text = "9.5"},
                          // newline
                          {.kind = TokenKind::Error, .text = "10.foo"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "11.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          {.kind = TokenKind::Error, .text = "12e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "1"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "13"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Underscore},
                          // newline
                          {.kind = TokenKind::FileEnd},
                      }));
}
// Garbage bytes — including multi-byte UTF-8, an embedded NUL, and unmatched
// quotes — produce Error tokens without derailing adjacent valid tokens. The
// explicit StringRef sizes keep the embedded NUL from truncating the literal.
TEST_F(LexerTest, HandlesGarbageCharacters) {
  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\\\"\\\n\"x";
  auto& buffer = compile_helper_.GetTokenizedBuffer(
      // sizeof - 1 drops the terminating NUL but keeps the embedded one.
      llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart, .line = 1, .column = 1},
          {.kind = TokenKind::Error,
           .line = 1,
           .column = 1,
           // 💩 takes 4 bytes, and we count column as bytes offset.
           .text = llvm::StringRef("$$💩", 6)},
          {.kind = TokenKind::Minus, .line = 1, .column = 7},
          {.kind = TokenKind::Error, .line = 1, .column = 8, .text = "$"},
          // newline
          {.kind = TokenKind::Error,
           .line = 2,
           .column = 1,
           .text = llvm::StringRef("$\0$", 3)},
          {.kind = TokenKind::IntLiteral, .line = 2, .column = 4, .text = "12"},
          {.kind = TokenKind::Error, .line = 2, .column = 6, .text = "$"},
          // newline
          {.kind = TokenKind::Backslash, .line = 3, .column = 1, .text = "\\"},
          {.kind = TokenKind::Error, .line = 3, .column = 2, .text = "\"\\"},
          // newline
          {.kind = TokenKind::Error, .line = 4, .column = 1, .text = "\"x"},
          {.kind = TokenKind::FileEnd, .line = 4, .column = 3},
      }));
}
// Spot-checks symbol lexing: max-munch tokenization of runs like "<<<" and
// "<<=>>", and a sampling of single-character symbols.
TEST_F(LexerTest, Symbols) {
  // We don't need to exhaustively test symbols here as they're handled with
  // common code, but we want to check specific patterns to verify things like
  // max-munch rule and handling of interesting symbols.
  auto& buffer1 = compile_helper_.GetTokenizedBuffer("<<<");
  EXPECT_FALSE(buffer1.has_errors());
  EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::LessLess},
                           {.kind = TokenKind::Less},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Max-munch takes "<<=" before "<<", then ">>" greedily.
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("<<=>>");
  EXPECT_FALSE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::LessLessEqual},
                           {.kind = TokenKind::GreaterGreater},
                           {.kind = TokenKind::FileEnd},
                       }));
  auto& buffer3 = compile_helper_.GetTokenizedBuffer("< <=> >");
  EXPECT_FALSE(buffer3.has_errors());
  EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Less},
                           {.kind = TokenKind::LessEqualGreater},
                           {.kind = TokenKind::Greater},
                           {.kind = TokenKind::FileEnd},
                       }));
  auto& buffer4 = compile_helper_.GetTokenizedBuffer("\\/?@&^!");
  EXPECT_FALSE(buffer4.has_errors());
  EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Backslash},
                           {.kind = TokenKind::Slash},
                           {.kind = TokenKind::Question},
                           {.kind = TokenKind::At},
                           {.kind = TokenKind::Amp},
                           {.kind = TokenKind::Caret},
                           {.kind = TokenKind::Exclaim},
                           {.kind = TokenKind::FileEnd},
                       }));
}
  417. TEST_F(LexerTest, Parens) {
  418. auto& buffer1 = compile_helper_.GetTokenizedBuffer("()");
  419. EXPECT_FALSE(buffer1.has_errors());
  420. EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
  421. {.kind = TokenKind::FileStart},
  422. {.kind = TokenKind::OpenParen},
  423. {.kind = TokenKind::CloseParen},
  424. {.kind = TokenKind::FileEnd},
  425. }));
  426. auto& buffer2 = compile_helper_.GetTokenizedBuffer("((()()))");
  427. EXPECT_FALSE(buffer2.has_errors());
  428. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  429. {.kind = TokenKind::FileStart},
  430. {.kind = TokenKind::OpenParen},
  431. {.kind = TokenKind::OpenParen},
  432. {.kind = TokenKind::OpenParen},
  433. {.kind = TokenKind::CloseParen},
  434. {.kind = TokenKind::OpenParen},
  435. {.kind = TokenKind::CloseParen},
  436. {.kind = TokenKind::CloseParen},
  437. {.kind = TokenKind::CloseParen},
  438. {.kind = TokenKind::FileEnd},
  439. }));
  440. }
  441. TEST_F(LexerTest, CurlyBraces) {
  442. auto& buffer1 = compile_helper_.GetTokenizedBuffer("{}");
  443. EXPECT_FALSE(buffer1.has_errors());
  444. EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
  445. {.kind = TokenKind::FileStart},
  446. {.kind = TokenKind::OpenCurlyBrace},
  447. {.kind = TokenKind::CloseCurlyBrace},
  448. {.kind = TokenKind::FileEnd},
  449. }));
  450. auto& buffer2 = compile_helper_.GetTokenizedBuffer("{{{}{}}}");
  451. EXPECT_FALSE(buffer2.has_errors());
  452. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  453. {.kind = TokenKind::FileStart},
  454. {.kind = TokenKind::OpenCurlyBrace},
  455. {.kind = TokenKind::OpenCurlyBrace},
  456. {.kind = TokenKind::OpenCurlyBrace},
  457. {.kind = TokenKind::CloseCurlyBrace},
  458. {.kind = TokenKind::OpenCurlyBrace},
  459. {.kind = TokenKind::CloseCurlyBrace},
  460. {.kind = TokenKind::CloseCurlyBrace},
  461. {.kind = TokenKind::CloseCurlyBrace},
  462. {.kind = TokenKind::FileEnd},
  463. }));
  464. }
// Walks the token stream and verifies GetMatchedClosingToken /
// GetMatchedOpeningToken round-trip for every bracket pair, including mixed
// nesting of parens and curlies. The iterator walk is order-dependent: each
// `*it++` consumes exactly one token past the skipped FileStart.
TEST_F(LexerTest, MatchingGroups) {
  {
    auto& buffer = compile_helper_.GetTokenizedBuffer("(){}");
    ASSERT_FALSE(buffer.has_errors());
    // Pre-increment skips the FileStart token.
    auto it = ++buffer.tokens().begin();
    auto open_paren_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    auto open_curly_token = *it++;
    auto close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::FileEnd);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
  {
    auto [buffer, value_stores] =
        compile_helper_.GetTokenizedBufferWithSharedValueStore(
            "({x}){(y)} {{((z))}}");
    ASSERT_FALSE(buffer.has_errors());
    auto it = ++buffer.tokens().begin();
    // First group: "({x})" — curly nested inside paren.
    auto open_paren_token = *it++;
    auto open_curly_token = *it++;
    ASSERT_EQ("x", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    auto close_curly_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    // Second group: "{(y)}" — paren nested inside curly.
    open_curly_token = *it++;
    open_paren_token = *it++;
    ASSERT_EQ("y", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    close_paren_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    // Third group: "{{((z))}}" — doubly nested pairs of each kind.
    open_curly_token = *it++;
    auto inner_open_curly_token = *it++;
    open_paren_token = *it++;
    auto inner_open_paren_token = *it++;
    ASSERT_EQ("z", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    auto inner_close_paren_token = *it++;
    close_paren_token = *it++;
    auto inner_close_curly_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(inner_close_curly_token,
              buffer.GetMatchedClosingToken(inner_open_curly_token));
    EXPECT_EQ(inner_open_curly_token,
              buffer.GetMatchedOpeningToken(inner_close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(inner_close_paren_token,
              buffer.GetMatchedClosingToken(inner_open_paren_token));
    EXPECT_EQ(inner_open_paren_token,
              buffer.GetMatchedOpeningToken(inner_close_paren_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::FileEnd);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
}
// Unbalanced brackets mark the buffer as errored. A lone opener or closer
// becomes an Error token; a closer of the wrong kind triggers insertion of a
// recovery token (flagged with `.recovery = true`) to rebalance the stream.
TEST_F(LexerTest, MismatchedGroups) {
  auto& buffer1 = compile_helper_.GetTokenizedBuffer("{");
  EXPECT_TRUE(buffer1.has_errors());
  EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Error, .text = "{"},
                           {.kind = TokenKind::FileEnd},
                       }));
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("}");
  EXPECT_TRUE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Error, .text = "}"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // "{(}": a ')' is synthesized before the '}' so both groups close.
  auto& buffer3 = compile_helper_.GetTokenizedBuffer("{(}");
  EXPECT_TRUE(buffer3.has_errors());
  EXPECT_THAT(
      buffer3,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart},
          {.kind = TokenKind::OpenCurlyBrace, .column = 1},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::CloseParen, .column = 3, .recovery = true},
          {.kind = TokenKind::CloseCurlyBrace, .column = 3},
          {.kind = TokenKind::FileEnd},
      }));
  // ")({)": leading ')' is an error; a '}' is synthesized before the ')'.
  auto& buffer4 = compile_helper_.GetTokenizedBuffer(")({)");
  EXPECT_TRUE(buffer4.has_errors());
  EXPECT_THAT(
      buffer4,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart},
          {.kind = TokenKind::Error, .column = 1, .text = ")"},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::OpenCurlyBrace, .column = 3},
          {.kind = TokenKind::CloseCurlyBrace, .column = 4, .recovery = true},
          {.kind = TokenKind::CloseParen, .column = 4},
          {.kind = TokenKind::FileEnd},
      }));
}
// Checks HasLeadingWhitespace/HasTrailingWhitespace for each token, including
// around the recovery ')' inserted for the unmatched '(' in "{( } {(".
TEST_F(LexerTest, Whitespace) {
  auto& buffer = compile_helper_.GetTokenizedBuffer("{( } {(");
  // Whether there should be whitespace before/after each token. space[i] is
  // the boundary before token i (and after token i-1), so N tokens need N+1
  // entries: the loop reads space[pos] for leading and space[pos+1] for
  // trailing edge of each token.
  bool space[] = {false,  // start-of-file
                  true,   // {
                  false,  // (
                  true,   // inserted )
                  true,   // }
                  true,   // error {
                  false,  // error (
                  true,   // EOF
                  false};
  int pos = 0;
  for (TokenIndex token : buffer.tokens()) {
    SCOPED_TRACE(
        llvm::formatv("Token #{0}: '{1}'", token, buffer.GetTokenText(token)));
    ASSERT_LT(pos, std::size(space));
    EXPECT_THAT(buffer.HasLeadingWhitespace(token), Eq(space[pos]));
    ++pos;
    ASSERT_LT(pos, std::size(space));
    EXPECT_THAT(buffer.HasTrailingWhitespace(token), Eq(space[pos]));
  }
  // Every boundary entry must have been consumed exactly once.
  ASSERT_EQ(pos + 1, std::size(space));
}
// Lexes every keyword spelling generated from token_kind.def and checks that
// each produces exactly one token of the matching kind. The empty
// CARBON_TOKEN expansion filters out non-keyword tokens from the X-macro.
TEST_F(LexerTest, Keywords) {
  TokenKind keywords[] = {
#define CARBON_TOKEN(TokenName)
#define CARBON_KEYWORD_TOKEN(TokenName, ...) TokenKind::TokenName,
#include "toolchain/lex/token_kind.def"
  };
  for (const auto& keyword : keywords) {
    auto& buffer = compile_helper_.GetTokenizedBuffer(keyword.fixed_spelling());
    EXPECT_FALSE(buffer.has_errors());
    EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                            {.kind = TokenKind::FileStart},
                            {.kind = keyword, .column = 1, .indent_column = 1},
                            {.kind = TokenKind::FileEnd},
                        }));
  }
}
// Valid "//" comments produce no tokens: between real tokens, as the whole
// input, containing arbitrary non-ASCII bytes, or at end-of-input.
TEST_F(LexerTest, Comments) {
  auto& buffer1 = compile_helper_.GetTokenizedBuffer(" ;\n // foo\n ;\n");
  EXPECT_FALSE(buffer1.has_errors());
  EXPECT_THAT(
      buffer1,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart, .line = 1, .column = 1},
          {.kind = TokenKind::Semi, .line = 1, .column = 2, .indent_column = 2},
          // The comment line contributes no token; the next ';' is line 3.
          {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 3},
          {.kind = TokenKind::FileEnd, .line = 3, .column = 4},
      }));
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("// foo\n//\n// bar");
  EXPECT_FALSE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::FileEnd}}));
  // Make sure weird characters aren't a problem.
  auto& buffer3 =
      compile_helper_.GetTokenizedBuffer(" // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  EXPECT_FALSE(buffer3.has_errors());
  EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::FileEnd}}));
  // Make sure we can lex a comment at the end of the input.
  auto& buffer4 = compile_helper_.GetTokenizedBuffer("//");
  EXPECT_FALSE(buffer4.has_errors());
  EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::FileEnd}}));
}
  667. TEST_F(LexerTest, InvalidComments) {
  668. llvm::StringLiteral testcases[] = {
  669. " /// foo\n",
  670. "foo // bar\n",
  671. "//! hello",
  672. " //world",
  673. };
  674. for (llvm::StringLiteral testcase : testcases) {
  675. auto& buffer = compile_helper_.GetTokenizedBuffer(testcase);
  676. EXPECT_TRUE(buffer.has_errors());
  677. }
  678. }
// Identifier lexing: leading indentation, underscore and digit characters,
// keyword prefixes ("fnord" contains "fn"), and repeated identifiers across
// lines with differing indentation.
// NOTE(review): expected columns imply multi-space indents in the source
// strings; whitespace runs in these literals look paste-collapsed — verify
// against upstream.
TEST_F(LexerTest, Identifiers) {
  auto& buffer1 = compile_helper_.GetTokenizedBuffer(" foobar");
  EXPECT_FALSE(buffer1.has_errors());
  EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier,
                            .column = 4,
                            .indent_column = 4,
                            .text = "foobar"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Check different kinds of identifier character sequences.
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("_foo_bar");
  EXPECT_FALSE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "_foo_bar"},
                           {.kind = TokenKind::FileEnd},
                       }));
  auto& buffer3 = compile_helper_.GetTokenizedBuffer("foo2bar00");
  EXPECT_FALSE(buffer3.has_errors());
  EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "foo2bar00"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Check that we can parse identifiers that start with a keyword.
  auto& buffer4 = compile_helper_.GetTokenizedBuffer("fnord");
  EXPECT_FALSE(buffer4.has_errors());
  EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "fnord"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Check multiple identifiers with indent and interning.
  auto& buffer5 =
      compile_helper_.GetTokenizedBuffer(" foo;bar\nbar \n foo\tfoo");
  EXPECT_FALSE(buffer5.has_errors());
  EXPECT_THAT(buffer5,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  {.kind = TokenKind::Identifier,
                   .line = 1,
                   .column = 4,
                   .indent_column = 4,
                   .text = "foo"},
                  {.kind = TokenKind::Semi},
                  // Tokens after the first on a line share its indent column.
                  {.kind = TokenKind::Identifier,
                   .line = 1,
                   .column = 8,
                   .indent_column = 4,
                   .text = "bar"},
                  {.kind = TokenKind::Identifier,
                   .line = 2,
                   .column = 1,
                   .indent_column = 1,
                   .text = "bar"},
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 3,
                   .indent_column = 3,
                   .text = "foo"},
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 7,
                   .indent_column = 3,
                   .text = "foo"},
                  {.kind = TokenKind::FileEnd, .line = 3, .column = 10},
              }));
}
// Checks lexing of simple, block (`'''`), raw (`#"..."#`), and adjacent
// string literals, including line/column/indent tracking and escape handling.
TEST_F(LexerTest, StringLiterals) {
  // NOTE(review): the asserted lines and columns below imply multi-space
  // indentation and blank separator lines inside this raw string that are not
  // present in the text here — the whitespace appears to have been collapsed
  // by some transformation; restore it from the original source.
  llvm::StringLiteral testcase = R"(
"hello world\n"
'''foo
test \
\xAB
''' trailing
#"""#
"\0"
#"\0"foo"\1"#
"""x"""
)";
  auto [buffer, value_stores] =
      compile_helper_.GetTokenizedBufferWithSharedValueStore(testcase);
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // Simple literal whose contents include a `\n` escape.
                  {.kind = TokenKind::StringLiteral,
                   .line = 2,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"hello world\n"}},
                  // Block literal: its contents exercise a trailing-`\` line
                  // continuation and a `\xAB` hex escape.
                  {.kind = TokenKind::StringLiteral,
                   .line = 4,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {" test \xAB\n"}},
                  // Identifier on the same line as the block terminator.
                  {.kind = TokenKind::Identifier,
                   .line = 7,
                   .column = 10,
                   .indent_column = 5,
                   .text = "trailing"},
                  // Raw literal whose contents are a single `"`.
                  {.kind = TokenKind::StringLiteral,
                   .line = 9,
                   .column = 7,
                   .indent_column = 7,
                   .value_stores = &value_stores,
                   .string_contents = {"\""}},
                  // `\0` lexes to an embedded NUL in the contents.
                  {.kind = TokenKind::StringLiteral,
                   .line = 11,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = llvm::StringLiteral::withInnerNUL("\0")},
                  // In a raw literal, `\0`, inner `"`, and `\1` stay as
                  // literal text.
                  {.kind = TokenKind::StringLiteral,
                   .line = 13,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"\\0\"foo\"\\1"}},
                  // """x""" is three string literals, not one invalid
                  // attempt at a block string literal.
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {""}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 7,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"x"}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 10,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {""}},
                  {.kind = TokenKind::FileEnd, .line = 16, .column = 3},
              }));
}
  825. TEST_F(LexerTest, InvalidStringLiterals) {
  826. llvm::StringLiteral invalid[] = {
  827. // clang-format off
  828. R"(")",
  829. R"('''
  830. '')",
  831. R"("\)",
  832. R"("\")",
  833. R"("\\)",
  834. R"("\\\")",
  835. R"(''')",
  836. R"('''
  837. )",
  838. R"('''\)",
  839. R"(#'''
  840. ''')",
  841. // clang-format on
  842. };
  843. for (llvm::StringLiteral test : invalid) {
  844. SCOPED_TRACE(test);
  845. auto& buffer = compile_helper_.GetTokenizedBuffer(test);
  846. EXPECT_TRUE(buffer.has_errors());
  847. // We should have formed at least one error token.
  848. bool found_error = false;
  849. for (TokenIndex token : buffer.tokens()) {
  850. if (buffer.GetKind(token) == TokenKind::Error) {
  851. found_error = true;
  852. break;
  853. }
  854. }
  855. EXPECT_TRUE(found_error);
  856. }
  857. }
  858. TEST_F(LexerTest, TypeLiterals) {
  859. llvm::StringLiteral testcase = R"(
  860. i0 i1 i20 i999999999999 i0x1
  861. u0 u1 u64 u64b
  862. f32 f80 f1 fi
  863. s1
  864. )";
  865. auto [buffer, value_stores] =
  866. compile_helper_.GetTokenizedBufferWithSharedValueStore(testcase);
  867. EXPECT_FALSE(buffer.has_errors());
  868. ASSERT_THAT(buffer,
  869. HasTokens(llvm::ArrayRef<ExpectedToken>{
  870. {.kind = TokenKind::FileStart, .line = 1, .column = 1},
  871. {.kind = TokenKind::Identifier,
  872. .line = 2,
  873. .column = 5,
  874. .indent_column = 5,
  875. .text = {"i0"}},
  876. {.kind = TokenKind::IntTypeLiteral,
  877. .line = 2,
  878. .column = 8,
  879. .indent_column = 5,
  880. .text = {"i1"}},
  881. {.kind = TokenKind::IntTypeLiteral,
  882. .line = 2,
  883. .column = 11,
  884. .indent_column = 5,
  885. .text = {"i20"}},
  886. {.kind = TokenKind::IntTypeLiteral,
  887. .line = 2,
  888. .column = 15,
  889. .indent_column = 5,
  890. .text = {"i999999999999"}},
  891. {.kind = TokenKind::Identifier,
  892. .line = 2,
  893. .column = 29,
  894. .indent_column = 5,
  895. .text = {"i0x1"}},
  896. {.kind = TokenKind::Identifier,
  897. .line = 3,
  898. .column = 5,
  899. .indent_column = 5,
  900. .text = {"u0"}},
  901. {.kind = TokenKind::UnsignedIntTypeLiteral,
  902. .line = 3,
  903. .column = 8,
  904. .indent_column = 5,
  905. .text = {"u1"}},
  906. {.kind = TokenKind::UnsignedIntTypeLiteral,
  907. .line = 3,
  908. .column = 11,
  909. .indent_column = 5,
  910. .text = {"u64"}},
  911. {.kind = TokenKind::Identifier,
  912. .line = 3,
  913. .column = 15,
  914. .indent_column = 5,
  915. .text = {"u64b"}},
  916. {.kind = TokenKind::FloatTypeLiteral,
  917. .line = 4,
  918. .column = 5,
  919. .indent_column = 5,
  920. .text = {"f32"}},
  921. {.kind = TokenKind::FloatTypeLiteral,
  922. .line = 4,
  923. .column = 9,
  924. .indent_column = 5,
  925. .text = {"f80"}},
  926. {.kind = TokenKind::FloatTypeLiteral,
  927. .line = 4,
  928. .column = 13,
  929. .indent_column = 5,
  930. .text = {"f1"}},
  931. {.kind = TokenKind::Identifier,
  932. .line = 4,
  933. .column = 16,
  934. .indent_column = 5,
  935. .text = {"fi"}},
  936. {.kind = TokenKind::Identifier,
  937. .line = 5,
  938. .column = 5,
  939. .indent_column = 5,
  940. .text = {"s1"}},
  941. {.kind = TokenKind::FileEnd, .line = 6, .column = 3},
  942. }));
  943. auto type_size = [&](int token_index) {
  944. auto token = buffer.tokens().begin()[token_index];
  945. return value_stores.ints().Get(buffer.GetTypeLiteralSize(token));
  946. };
  947. EXPECT_EQ(type_size(2), 1);
  948. EXPECT_EQ(type_size(3), 20);
  949. EXPECT_EQ(type_size(4), 999999999999ULL);
  950. EXPECT_EQ(type_size(7), 1);
  951. EXPECT_EQ(type_size(8), 64);
  952. EXPECT_EQ(type_size(10), 32);
  953. EXPECT_EQ(type_size(11), 80);
  954. EXPECT_EQ(type_size(12), 1);
  955. }
// Checks the boundary where a type literal's bit-width has too many digits:
// widths lex as `IntTypeLiteral` tokens until the digit count overflows, at
// which point a single `Error` token and a `TooManyTypeBitWidthDigits`
// diagnostic are produced.
TEST_F(LexerTest, TypeLiteralTooManyDigits) {
  // We increase the number of digits until the first one that is too large.
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::TooManyTypeBitWidthDigits,
                            DiagnosticLevel::Error, 1, 2, _)));
  std::string code = "i";
  // A 128-bit APInt should be plenty large, but if needed in the future it can
  // be widened without issue.
  llvm::APInt bits = llvm::APInt::getZero(128);
  for ([[maybe_unused]] int _ : llvm::seq(1, 30)) {
    code.append("9");
    // Track the expected numeric bit-width value alongside the source text.
    bits = bits * 10 + 9;
    auto [buffer, value_stores] =
        compile_helper_.GetTokenizedBufferWithSharedValueStore(code, &consumer);
    if (buffer.has_errors()) {
      // First width with too many digits: the whole spelling becomes one
      // error token.
      ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                              {.kind = TokenKind::FileStart},
                              {.kind = TokenKind::Error, .text = code},
                              {.kind = TokenKind::FileEnd},
                          }));
      break;
    }
    // Still in range: must lex as a type literal whose stored width matches
    // the digits appended so far.
    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                            {.kind = TokenKind::FileStart},
                            {.kind = TokenKind::IntTypeLiteral, .text = code},
                            {.kind = TokenKind::FileEnd},
                        }));
    auto token = buffer.tokens().begin()[1];
    EXPECT_TRUE(llvm::APInt::isSameValue(
        value_stores.ints().Get(buffer.GetTypeLiteralSize(token)), bits));
  }

  // Make sure we can also gracefully reject very large number of digits
  // without crashing or hanging, and show the correct number.
  constexpr int Count = 10000;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::TooManyTypeBitWidthDigits,
                            DiagnosticLevel::Error, 1, 2,
                            HasSubstr(llvm::formatv(" {0} ", Count)))));
  code = "i";
  code.append(Count, '9');
  auto& buffer = compile_helper_.GetTokenizedBuffer(code, &consumer);
  ASSERT_TRUE(buffer.has_errors());
  ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          {.kind = TokenKind::Error, .text = code},
                          {.kind = TokenKind::FileEnd},
                      }));
}
// A comment after code on the same line is diagnosed as a trailing comment.
TEST_F(LexerTest, DiagnosticTrailingComment) {
  // NOTE(review): the diagnostic expects the comment at line 3, column 19,
  // which implies indentation inside this raw string that is not present
  // here — leading whitespace appears to have been collapsed; verify against
  // the original source.
  llvm::StringLiteral testcase = R"(
// Hello!
var String x; // trailing comment
)";
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::TrailingComment,
                            DiagnosticLevel::Error, 3, 19, _)));
  compile_helper_.GetTokenizedBuffer(testcase, &consumer);
}
// A `//` introducer must be followed by whitespace; the diagnostic points at
// column 3, immediately after the introducer.
TEST_F(LexerTest, DiagnosticWhitespace) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::NoWhitespaceAfterCommentIntroducer,
                            DiagnosticLevel::Error, 1, 3, _)));
  compile_helper_.GetTokenizedBuffer("//no space after comment", &consumer);
}
// An unknown escape sequence (`\b`) in a string literal is diagnosed at the
// escape's position (column 8) and names the offending character.
TEST_F(LexerTest, DiagnosticUnrecognizedEscape) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::UnknownEscapeSequence,
                            DiagnosticLevel::Error, 1, 8, HasSubstr("`b`"))));
  compile_helper_.GetTokenizedBuffer(R"("hello\bworld")", &consumer);
}
// A `\x` escape with no acceptable digits after it is diagnosed at the digit
// position (column 9). The input uses lowercase `ab`, which this diagnostic
// indicates does not count as hex digits here — presumably uppercase is
// required; verify against the lexer's escape rules.
TEST_F(LexerTest, DiagnosticBadHex) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::HexadecimalEscapeMissingDigits,
                            DiagnosticLevel::Error, 1, 9, _)));
  compile_helper_.GetTokenizedBuffer(R"("hello\xabworld")", &consumer);
}
// A lowercase `a` inside the hex literal `0x123abc` is diagnosed as an
// invalid digit at its column (6) — the diagnostic names the character.
TEST_F(LexerTest, DiagnosticInvalidDigit) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::InvalidDigit,
                            DiagnosticLevel::Error, 1, 6, HasSubstr("'a'"))));
  compile_helper_.GetTokenizedBuffer("0x123abc", &consumer);
}
// A raw string literal (`#"..."#`) whose closing `"#` is missing is diagnosed
// as unterminated, anchored at the literal's start (column 1).
TEST_F(LexerTest, DiagnosticMissingTerminator) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::UnterminatedString,
                            DiagnosticLevel::Error, 1, 1, _)));
  compile_helper_.GetTokenizedBuffer(R"(#" ")", &consumer);
}
// A control character (backspace, `\b`) outside any literal is diagnosed as an
// unrecognized character at its position.
TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            DiagnosticKind::UnrecognizedCharacters,
                            DiagnosticLevel::Error, 1, 1, _)));
  compile_helper_.GetTokenizedBuffer("\b", &consumer);
}
// Checks that lexing an oversized input stops with a `TooManyTokens`
// diagnostic. The expected line, `TokenizedBuffer::MaxTokens / 2`, is
// consistent with each "{}" line lexing to two tokens.
TEST_F(LexerTest, DiagnosticFileTooLarge) {
  Testing::MockDiagnosticConsumer consumer;
  static constexpr size_t NumLines = 10'000'000;
  std::string input;
  // Each appended line is 3 characters: "{}\n".
  input.reserve(NumLines * 3);
  for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
    input += "{}\n";
  }

  EXPECT_CALL(consumer,
              HandleDiagnostic(IsSingleDiagnostic(
                  DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
                  TokenizedBuffer::MaxTokens / 2, 1, _)));
  compile_helper_.GetTokenizedBuffer(input, &consumer);
}
  1072. // Appends comment lines to the string, to create a comment block.
  1073. static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
  1074. -> void {
  1075. llvm::raw_string_ostream out(str);
  1076. for (int i : llvm::seq(count)) {
  1077. out << "// " << tag << i << "\n";
  1078. }
  1079. }
// Checks that contiguous `//` comment lines form a single comment block, even
// when an empty `//` line joins two differently-tagged runs of comments.
TEST_F(LexerTest, CommentBlock) {
  for (int comments_before = 0; comments_before < 5; ++comments_before) {
    std::string prefix;
    AppendCommentLines(prefix, comments_before, "B");
    for (int comments_after = 1; comments_after < 5; ++comments_after) {
      std::string source = prefix;
      // Join the two runs with an empty comment line so the block stays
      // contiguous.
      if (comments_before > 0) {
        source += "//\n";
      }
      AppendCommentLines(source, comments_after, "C");
      SCOPED_TRACE(llvm::formatv(
          "{0} comment lines before the empty comment line, {1} after",
          comments_before, comments_after));
      auto& buffer = compile_helper_.GetTokenizedBuffer(source);
      ASSERT_FALSE(buffer.has_errors());
      // The whole block must be recorded as exactly one comment.
      EXPECT_THAT(buffer.comments_size(), Eq(1));
    }
  }
}
  1099. TEST_F(LexerTest, IndentedComments) {
  1100. for (int indent = 0; indent < 40; ++indent) {
  1101. SCOPED_TRACE(llvm::formatv("Indent: {0}", indent));
  1102. std::string source;
  1103. llvm::raw_string_ostream source_stream(source);
  1104. source_stream.indent(indent);
  1105. source_stream << "// Comment\n";
  1106. auto& buffer = compile_helper_.GetTokenizedBuffer(source);
  1107. ASSERT_FALSE(buffer.has_errors());
  1108. EXPECT_THAT(buffer.comments_size(), Eq(1));
  1109. std::string simd_source =
  1110. source +
  1111. "\"Add a bunch of padding so that SIMD logic shouldn't hit EOF\"";
  1112. auto& simd_buffer = compile_helper_.GetTokenizedBuffer(source);
  1113. ASSERT_FALSE(simd_buffer.has_errors());
  1114. EXPECT_THAT(simd_buffer.comments_size(), Eq(1));
  1115. }
  1116. }
// Checks that distinct comment blocks are recorded separately, that each
// block's text round-trips through `GetCommentText`, and that comments
// interleave correctly with a string literal and an identifier.
// NOTE(review): several comment strings below describe indentation ("a
// different indent", "a high indent") that is not visible in the literals —
// leading whitespace may have been collapsed; verify against the original
// source.
TEST_F(LexerTest, MultipleComments) {
  constexpr llvm::StringLiteral Format = R"(
{0}
{1}
{2}
{3}
'''This is a string, not a comment. The next comment will stop SIMD due to being
too close to the EOF.
'''
{4}
x
)";
  constexpr llvm::StringLiteral Comments[] = {
      // NOLINTNEXTLINE(bugprone-suspicious-missing-comma)
      "// This comment should be possible to parse with SIMD.\n"
      "// This one too.\n",
      "// This one as well, though it's a different indent.\n"
      " // And mixes indent.\n"
      " // And mixes indent more.\n",
      "// This is one comment:\n"
      "//Invalid\n"
      "// Valid\n"
      "//Invalid\n"
      "//\n"
      "// Valid\n"
      "//\n"
      "// Valid\n",
      "// This uses a high indent, which stops SIMD.\n", "//\n"};
  std::string source = llvm::formatv(Format.data(), Comments[0], Comments[1],
                                     Comments[2], Comments[3], Comments[4])
                           .str();
  auto& buffer = compile_helper_.GetTokenizedBuffer(source);
  // The "//Invalid" lines (no whitespace after the introducer) make the
  // buffer report errors overall, but the comments are still recorded.
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer.comments_size(), Eq(std::size(Comments)));
  // Each recorded comment's text must match its input block exactly.
  for (int i :
       llvm::seq(std::min<int>(buffer.comments_size(), std::size(Comments)))) {
    EXPECT_THAT(buffer.GetCommentText(CommentIndex(i)).str(),
                testing::StrEq(Comments[i]));
  }
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          {.kind = TokenKind::StringLiteral},
                          {.kind = TokenKind::Identifier},
                          {.kind = TokenKind::FileEnd},
                      }));
}
// Checks the YAML debug printing of a tokenized buffer: the printed document
// must carry the filename plus, for each token, its index, kind, line,
// column, indent, spelling, and (where set) leading-space flag.
TEST_F(LexerTest, PrintingOutputYaml) {
  // Test that we can parse this into YAML and verify line and indent data.
  auto& buffer =
      compile_helper_.GetTokenizedBuffer("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  ASSERT_FALSE(buffer.has_errors());
  TestRawOstream print_stream;
  buffer.Print(print_stream);
  EXPECT_THAT(
      Yaml::Value::FromText(print_stream.TakeStr()),
      IsYaml(ElementsAre(Yaml::Sequence(ElementsAre(Yaml::Mapping(ElementsAre(
          Pair("filename", buffer.source().filename().str()),
          Pair("tokens", Yaml::Sequence(ElementsAre(
                             Yaml::Mapping(ElementsAre(
                                 Pair("index", "0"), Pair("kind", "FileStart"),
                                 Pair("line", "1"), Pair("column", "1"),
                                 Pair("indent", "1"), Pair("spelling", ""))),
                             Yaml::Mapping(ElementsAre(
                                 Pair("index", "1"), Pair("kind", "Semi"),
                                 Pair("line", "2"), Pair("column", "2"),
                                 Pair("indent", "2"), Pair("spelling", ";"),
                                 Pair("has_leading_space", "true"))),
                             Yaml::Mapping(ElementsAre(
                                 Pair("index", "2"), Pair("kind", "Semi"),
                                 Pair("line", "5"), Pair("column", "1"),
                                 Pair("indent", "1"), Pair("spelling", ";"),
                                 Pair("has_leading_space", "true"))),
                             Yaml::Mapping(ElementsAre(
                                 Pair("index", "3"), Pair("kind", "Semi"),
                                 Pair("line", "5"), Pair("column", "3"),
                                 Pair("indent", "1"), Pair("spelling", ";"),
                                 Pair("has_leading_space", "true"))),
                             Yaml::Mapping(ElementsAre(
                                 Pair("index", "4"), Pair("kind", "FileEnd"),
                                 Pair("line", "15"), Pair("column", "1"),
                                 Pair("indent", "1"), Pair("spelling", ""),
                                 Pair("has_leading_space", "true")))))))))))));
}
  1200. } // namespace
  1201. } // namespace Carbon::Lex