  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/tokenized_buffer.h"
  5. #include <gmock/gmock.h>
  6. #include <gtest/gtest.h>
  7. #include <algorithm>
  8. #include <cmath>
  9. #include <forward_list>
  10. #include <iterator>
  11. #include <string>
  12. #include "common/raw_string_ostream.h"
  13. #include "llvm/ADT/ArrayRef.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "toolchain/base/shared_value_stores.h"
  16. #include "toolchain/diagnostics/diagnostic_emitter.h"
  17. #include "toolchain/diagnostics/mocks.h"
  18. #include "toolchain/lex/lex.h"
  19. #include "toolchain/lex/tokenized_buffer_test_helpers.h"
  20. #include "toolchain/testing/compile_helper.h"
  21. #include "toolchain/testing/yaml_test_helpers.h"
  22. namespace Carbon::Lex {
  23. namespace {
  24. using ::Carbon::Testing::ExpectedToken;
  25. using ::Carbon::Testing::IsSingleDiagnostic;
  26. using ::testing::_;
  27. using ::testing::ElementsAre;
  28. using ::testing::Eq;
  29. using ::testing::HasSubstr;
  30. using ::testing::Pair;
  31. namespace Yaml = ::Carbon::Testing::Yaml;
// Test fixture for lexer tests: owns a CompileHelper used to turn source
// strings into TokenizedBuffers for each case.
class LexerTest : public ::testing::Test {
 public:
  Testing::CompileHelper compile_helper_;
};
  36. TEST_F(LexerTest, HandlesEmptyBuffer) {
  37. auto& buffer = compile_helper_.GetTokenizedBuffer("");
  38. EXPECT_FALSE(buffer.has_errors());
  39. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  40. {.kind = TokenKind::FileStart},
  41. {.kind = TokenKind::FileEnd}}));
  42. }
  43. TEST_F(LexerTest, NullStringRef) {
  44. auto& buffer = compile_helper_.GetTokenizedBuffer(llvm::StringRef());
  45. EXPECT_FALSE(buffer.has_errors());
  46. EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
  47. {.kind = TokenKind::FileStart},
  48. {.kind = TokenKind::FileEnd}}));
  49. }
  50. TEST_F(LexerTest, TracksLinesAndColumns) {
  51. auto& buffer = compile_helper_.GetTokenizedBuffer(
  52. "\n ;;\n ;;;\n x\"foo\" '''baz\n a\n ''' y");
  53. EXPECT_FALSE(buffer.has_errors());
  54. EXPECT_THAT(
  55. buffer,
  56. HasTokens(llvm::ArrayRef<ExpectedToken>{
  57. {.kind = TokenKind::FileStart,
  58. .line = 1,
  59. .column = 1,
  60. .indent_column = 1},
  61. {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
  62. {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
  63. {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
  64. {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
  65. {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
  66. {.kind = TokenKind::Identifier,
  67. .line = 4,
  68. .column = 4,
  69. .indent_column = 4,
  70. .text = "x"},
  71. {.kind = TokenKind::StringLiteral,
  72. .line = 4,
  73. .column = 5,
  74. .indent_column = 4},
  75. {.kind = TokenKind::StringLiteral,
  76. .line = 4,
  77. .column = 11,
  78. .indent_column = 4},
  79. {.kind = TokenKind::Identifier,
  80. .line = 6,
  81. .column = 6,
  82. .indent_column = 11,
  83. .text = "y"},
  84. {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
  85. }));
  86. }
  87. TEST_F(LexerTest, TracksLinesAndColumnsCrLf) {
  88. auto& buffer = compile_helper_.GetTokenizedBuffer(
  89. "\r\n ;;\r\n ;;;\r\n x\"foo\" '''baz\r\n a\r\n ''' y");
  90. EXPECT_FALSE(buffer.has_errors());
  91. EXPECT_THAT(
  92. buffer,
  93. HasTokens(llvm::ArrayRef<ExpectedToken>{
  94. {.kind = TokenKind::FileStart,
  95. .line = 1,
  96. .column = 1,
  97. .indent_column = 1},
  98. {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
  99. {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
  100. {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
  101. {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
  102. {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
  103. {.kind = TokenKind::Identifier,
  104. .line = 4,
  105. .column = 4,
  106. .indent_column = 4,
  107. .text = "x"},
  108. {.kind = TokenKind::StringLiteral,
  109. .line = 4,
  110. .column = 5,
  111. .indent_column = 4},
  112. {.kind = TokenKind::StringLiteral,
  113. .line = 4,
  114. .column = 11,
  115. .indent_column = 4},
  116. {.kind = TokenKind::Identifier,
  117. .line = 6,
  118. .column = 6,
  119. .indent_column = 11,
  120. .text = "y"},
  121. {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
  122. }));
  123. }
  124. TEST_F(LexerTest, InvalidCR) {
  125. auto& buffer = compile_helper_.GetTokenizedBuffer("\n ;;\r ;\n x");
  126. EXPECT_TRUE(buffer.has_errors());
  127. EXPECT_THAT(
  128. buffer,
  129. HasTokens(llvm::ArrayRef<ExpectedToken>{
  130. {.kind = TokenKind::FileStart,
  131. .line = 1,
  132. .column = 1,
  133. .indent_column = 1},
  134. {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
  135. {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
  136. {.kind = TokenKind::Semi, .line = 2, .column = 6, .indent_column = 2},
  137. {.kind = TokenKind::Identifier,
  138. .line = 3,
  139. .column = 4,
  140. .indent_column = 4,
  141. .text = "x"},
  142. {.kind = TokenKind::FileEnd, .line = 3, .column = 5},
  143. }));
  144. }
  145. TEST_F(LexerTest, InvalidLfCr) {
  146. auto& buffer = compile_helper_.GetTokenizedBuffer("\n ;;\n\r ;\n x");
  147. EXPECT_TRUE(buffer.has_errors());
  148. EXPECT_THAT(
  149. buffer,
  150. HasTokens(llvm::ArrayRef<ExpectedToken>{
  151. {.kind = TokenKind::FileStart,
  152. .line = 1,
  153. .column = 1,
  154. .indent_column = 1},
  155. {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
  156. {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
  157. {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 1},
  158. {.kind = TokenKind::Identifier,
  159. .line = 4,
  160. .column = 4,
  161. .indent_column = 4,
  162. .text = "x"},
  163. {.kind = TokenKind::FileEnd, .line = 4, .column = 5},
  164. }));
  165. }
  166. TEST_F(LexerTest, HandlesNumericLiteral) {
  167. auto [buffer, value_stores] =
  168. compile_helper_.GetTokenizedBufferWithSharedValueStore(
  169. "12-578\n 1 2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
  170. EXPECT_FALSE(buffer.has_errors());
  171. ASSERT_THAT(buffer,
  172. HasTokens(llvm::ArrayRef<ExpectedToken>{
  173. {.kind = TokenKind::FileStart, .line = 1, .column = 1},
  174. {.kind = TokenKind::IntLiteral,
  175. .line = 1,
  176. .column = 1,
  177. .indent_column = 1,
  178. .text = "12"},
  179. {.kind = TokenKind::Minus,
  180. .line = 1,
  181. .column = 3,
  182. .indent_column = 1},
  183. {.kind = TokenKind::IntLiteral,
  184. .line = 1,
  185. .column = 4,
  186. .indent_column = 1,
  187. .text = "578"},
  188. {.kind = TokenKind::IntLiteral,
  189. .line = 2,
  190. .column = 3,
  191. .indent_column = 3,
  192. .text = "1"},
  193. {.kind = TokenKind::IntLiteral,
  194. .line = 2,
  195. .column = 6,
  196. .indent_column = 3,
  197. .text = "2"},
  198. {.kind = TokenKind::IntLiteral,
  199. .line = 3,
  200. .column = 1,
  201. .indent_column = 1,
  202. .text = "0x12_3ABC"},
  203. {.kind = TokenKind::IntLiteral,
  204. .line = 4,
  205. .column = 1,
  206. .indent_column = 1,
  207. .text = "0b10_10_11"},
  208. {.kind = TokenKind::IntLiteral,
  209. .line = 5,
  210. .column = 1,
  211. .indent_column = 1,
  212. .text = "1_234_567"},
  213. {.kind = TokenKind::RealLiteral,
  214. .line = 6,
  215. .column = 1,
  216. .indent_column = 1,
  217. .text = "1.5e9"},
  218. {.kind = TokenKind::FileEnd, .line = 6, .column = 6},
  219. }));
  220. auto token_start = buffer.tokens().begin();
  221. auto token_12 = token_start + 1;
  222. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_12)), 12);
  223. auto token_578 = token_12 + 2;
  224. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_578)), 578);
  225. auto token_1 = token_578 + 1;
  226. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_1)), 1);
  227. auto token_2 = token_1 + 1;
  228. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_2)), 2);
  229. auto token_0x12_3abc = token_2 + 1;
  230. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_0x12_3abc)),
  231. 0x12'3abc);
  232. auto token_0b10_10_11 = token_0x12_3abc + 1;
  233. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_0b10_10_11)),
  234. 0b10'10'11);
  235. auto token_1_234_567 = token_0b10_10_11 + 1;
  236. EXPECT_EQ(value_stores.ints().Get(buffer.GetIntLiteral(*token_1_234_567)),
  237. 1'234'567);
  238. auto token_1_5e9 = token_1_234_567 + 1;
  239. auto value_1_5e9 =
  240. value_stores.reals().Get(buffer.GetRealLiteral(*token_1_5e9));
  241. EXPECT_EQ(value_1_5e9.mantissa.getZExtValue(), 15);
  242. EXPECT_EQ(value_1_5e9.exponent.getSExtValue(), 8);
  243. EXPECT_EQ(value_1_5e9.is_decimal, true);
  244. }
TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
  // Invalid numeric spellings lex as single Error tokens covering the whole
  // malformed literal, while valid neighbors still lex normally.
  auto& buffer =
      compile_helper_.GetTokenizedBuffer("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
  EXPECT_TRUE(buffer.has_errors());
  ASSERT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // Trailing alphabetic suffix: error.
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 1,
                   .indent_column = 1,
                   .text = "14x"},
                  {.kind = TokenKind::IntLiteral,
                   .line = 1,
                   .column = 5,
                   .indent_column = 1,
                   .text = "15_49"},
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 11,
                   .indent_column = 1,
                   .text = "0x3.5q"},
                  // Hex real with digit separators is accepted.
                  {.kind = TokenKind::RealLiteral,
                   .line = 1,
                   .column = 18,
                   .indent_column = 1,
                   .text = "0x3_4.5_6"},
                  {.kind = TokenKind::Error,
                   .line = 1,
                   .column = 28,
                   .indent_column = 1,
                   .text = "0ops"},
                  {.kind = TokenKind::FileEnd, .line = 1, .column = 32},
              }));
}
TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
  // Each line exercises a boundary case for where a numeric literal ends and
  // the following token begins (trailing '.', operators after exponents,
  // member access after a literal, etc.).
  llvm::StringLiteral source_text = R"(
1.
.2
3.+foo
4.0-bar
5.0e+123+456
6.0e+1e+2
1e7
8..10
9.0.9.5
10.foo
11.0.foo
12e+1
13._
)";
  auto& buffer = compile_helper_.GetTokenizedBuffer(source_text);
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          // "1." splits: a real literal needs digits after '.'.
                          {.kind = TokenKind::IntLiteral, .text = "1"},
                          {.kind = TokenKind::Period},
                          // newline
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "3"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "4.0"},
                          {.kind = TokenKind::Minus},
                          {.kind = TokenKind::Identifier, .text = "bar"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "5.0e+123"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "456"},
                          // newline
                          {.kind = TokenKind::Error, .text = "6.0e+1e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "2"},
                          // newline
                          {.kind = TokenKind::Error, .text = "1e7"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "8"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "10"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "9.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "9"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::IntLiteral, .text = "5"},
                          // newline
                          {.kind = TokenKind::Error, .text = "10.foo"},
                          // newline
                          {.kind = TokenKind::RealLiteral, .text = "11.0"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Identifier, .text = "foo"},
                          // newline
                          {.kind = TokenKind::Error, .text = "12e"},
                          {.kind = TokenKind::Plus},
                          {.kind = TokenKind::IntLiteral, .text = "1"},
                          // newline
                          {.kind = TokenKind::IntLiteral, .text = "13"},
                          {.kind = TokenKind::Period},
                          {.kind = TokenKind::Underscore},
                          // newline
                          {.kind = TokenKind::FileEnd},
                      }));
}
TEST_F(LexerTest, HandlesGarbageCharacters) {
  // Garbage (including an embedded NUL byte) lexes as Error tokens without
  // derailing neighboring valid tokens. sizeof - 1 keeps the embedded '\0'
  // but drops the array's terminating NUL.
  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\\\"\\\n\"x";
  auto& buffer = compile_helper_.GetTokenizedBuffer(
      llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(
      buffer,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart, .line = 1, .column = 1},
          {.kind = TokenKind::Error,
           .line = 1,
           .column = 1,
           // 💩 takes 4 bytes, and we count column as bytes offset.
           .text = llvm::StringRef("$$💩", 6)},
          {.kind = TokenKind::Minus, .line = 1, .column = 7},
          {.kind = TokenKind::Error, .line = 1, .column = 8, .text = "$"},
          // newline
          // Explicit length keeps the embedded NUL inside the token text.
          {.kind = TokenKind::Error,
           .line = 2,
           .column = 1,
           .text = llvm::StringRef("$\0$", 3)},
          {.kind = TokenKind::IntLiteral, .line = 2, .column = 4, .text = "12"},
          {.kind = TokenKind::Error, .line = 2, .column = 6, .text = "$"},
          // newline
          {.kind = TokenKind::Backslash, .line = 3, .column = 1, .text = "\\"},
          {.kind = TokenKind::Error, .line = 3, .column = 2, .text = "\"\\"},
          // newline
          {.kind = TokenKind::Error, .line = 4, .column = 1, .text = "\"x"},
          {.kind = TokenKind::FileEnd, .line = 4, .column = 3},
      }));
}
  384. TEST_F(LexerTest, Symbols) {
  385. // We don't need to exhaustively test symbols here as they're handled with
  386. // common code, but we want to check specific patterns to verify things like
  387. // max-munch rule and handling of interesting symbols.
  388. auto& buffer1 = compile_helper_.GetTokenizedBuffer("<<<");
  389. EXPECT_FALSE(buffer1.has_errors());
  390. EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
  391. {.kind = TokenKind::FileStart},
  392. {.kind = TokenKind::LessLess},
  393. {.kind = TokenKind::Less},
  394. {.kind = TokenKind::FileEnd},
  395. }));
  396. auto& buffer2 = compile_helper_.GetTokenizedBuffer("<<=>>");
  397. EXPECT_FALSE(buffer2.has_errors());
  398. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  399. {.kind = TokenKind::FileStart},
  400. {.kind = TokenKind::LessLessEqual},
  401. {.kind = TokenKind::GreaterGreater},
  402. {.kind = TokenKind::FileEnd},
  403. }));
  404. auto& buffer3 = compile_helper_.GetTokenizedBuffer("< <=> >");
  405. EXPECT_FALSE(buffer3.has_errors());
  406. EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
  407. {.kind = TokenKind::FileStart},
  408. {.kind = TokenKind::Less},
  409. {.kind = TokenKind::LessEqualGreater},
  410. {.kind = TokenKind::Greater},
  411. {.kind = TokenKind::FileEnd},
  412. }));
  413. auto& buffer4 = compile_helper_.GetTokenizedBuffer("\\/?@&^!");
  414. EXPECT_FALSE(buffer4.has_errors());
  415. EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
  416. {.kind = TokenKind::FileStart},
  417. {.kind = TokenKind::Backslash},
  418. {.kind = TokenKind::Slash},
  419. {.kind = TokenKind::Question},
  420. {.kind = TokenKind::At},
  421. {.kind = TokenKind::Amp},
  422. {.kind = TokenKind::Caret},
  423. {.kind = TokenKind::Exclaim},
  424. {.kind = TokenKind::FileEnd},
  425. }));
  426. }
  427. TEST_F(LexerTest, Parens) {
  428. auto& buffer1 = compile_helper_.GetTokenizedBuffer("()");
  429. EXPECT_FALSE(buffer1.has_errors());
  430. EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
  431. {.kind = TokenKind::FileStart},
  432. {.kind = TokenKind::OpenParen},
  433. {.kind = TokenKind::CloseParen},
  434. {.kind = TokenKind::FileEnd},
  435. }));
  436. auto& buffer2 = compile_helper_.GetTokenizedBuffer("((()()))");
  437. EXPECT_FALSE(buffer2.has_errors());
  438. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  439. {.kind = TokenKind::FileStart},
  440. {.kind = TokenKind::OpenParen},
  441. {.kind = TokenKind::OpenParen},
  442. {.kind = TokenKind::OpenParen},
  443. {.kind = TokenKind::CloseParen},
  444. {.kind = TokenKind::OpenParen},
  445. {.kind = TokenKind::CloseParen},
  446. {.kind = TokenKind::CloseParen},
  447. {.kind = TokenKind::CloseParen},
  448. {.kind = TokenKind::FileEnd},
  449. }));
  450. }
  451. TEST_F(LexerTest, CurlyBraces) {
  452. auto& buffer1 = compile_helper_.GetTokenizedBuffer("{}");
  453. EXPECT_FALSE(buffer1.has_errors());
  454. EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
  455. {.kind = TokenKind::FileStart},
  456. {.kind = TokenKind::OpenCurlyBrace},
  457. {.kind = TokenKind::CloseCurlyBrace},
  458. {.kind = TokenKind::FileEnd},
  459. }));
  460. auto& buffer2 = compile_helper_.GetTokenizedBuffer("{{{}{}}}");
  461. EXPECT_FALSE(buffer2.has_errors());
  462. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  463. {.kind = TokenKind::FileStart},
  464. {.kind = TokenKind::OpenCurlyBrace},
  465. {.kind = TokenKind::OpenCurlyBrace},
  466. {.kind = TokenKind::OpenCurlyBrace},
  467. {.kind = TokenKind::CloseCurlyBrace},
  468. {.kind = TokenKind::OpenCurlyBrace},
  469. {.kind = TokenKind::CloseCurlyBrace},
  470. {.kind = TokenKind::CloseCurlyBrace},
  471. {.kind = TokenKind::CloseCurlyBrace},
  472. {.kind = TokenKind::FileEnd},
  473. }));
  474. }
TEST_F(LexerTest, MatchingGroups) {
  // Each open bracket token must report its matching close token and vice
  // versa, across siblings and arbitrary nesting.
  {
    auto& buffer = compile_helper_.GetTokenizedBuffer("(){}");
    ASSERT_FALSE(buffer.has_errors());
    // Pre-increment skips the FileStart token.
    auto it = ++buffer.tokens().begin();
    auto open_paren_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    auto open_curly_token = *it++;
    auto close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::FileEnd);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
  {
    auto [buffer, value_stores] =
        compile_helper_.GetTokenizedBufferWithSharedValueStore(
            "({x}){(y)} {{((z))}}");
    ASSERT_FALSE(buffer.has_errors());
    // Pre-increment skips the FileStart token.
    auto it = ++buffer.tokens().begin();
    // First group: "({x})" -- curly nested inside paren.
    auto open_paren_token = *it++;
    auto open_curly_token = *it++;
    ASSERT_EQ("x", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    auto close_curly_token = *it++;
    auto close_paren_token = *it++;
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    // Second group: "{(y)}" -- paren nested inside curly.
    open_curly_token = *it++;
    open_paren_token = *it++;
    ASSERT_EQ("y", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    close_paren_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    // Third group: "{{((z))}}" -- doubly nested curlies and parens.
    open_curly_token = *it++;
    auto inner_open_curly_token = *it++;
    open_paren_token = *it++;
    auto inner_open_paren_token = *it++;
    ASSERT_EQ("z", value_stores.identifiers().Get(buffer.GetIdentifier(*it++)));
    auto inner_close_paren_token = *it++;
    close_paren_token = *it++;
    auto inner_close_curly_token = *it++;
    close_curly_token = *it++;
    EXPECT_EQ(close_curly_token,
              buffer.GetMatchedClosingToken(open_curly_token));
    EXPECT_EQ(open_curly_token,
              buffer.GetMatchedOpeningToken(close_curly_token));
    EXPECT_EQ(inner_close_curly_token,
              buffer.GetMatchedClosingToken(inner_open_curly_token));
    EXPECT_EQ(inner_open_curly_token,
              buffer.GetMatchedOpeningToken(inner_close_curly_token));
    EXPECT_EQ(close_paren_token,
              buffer.GetMatchedClosingToken(open_paren_token));
    EXPECT_EQ(open_paren_token,
              buffer.GetMatchedOpeningToken(close_paren_token));
    EXPECT_EQ(inner_close_paren_token,
              buffer.GetMatchedClosingToken(inner_open_paren_token));
    EXPECT_EQ(inner_open_paren_token,
              buffer.GetMatchedOpeningToken(inner_close_paren_token));
    auto eof_token = *it++;
    EXPECT_EQ(buffer.GetKind(eof_token), TokenKind::FileEnd);
    EXPECT_EQ(buffer.tokens().end(), it);
  }
}
TEST_F(LexerTest, MismatchedGroups) {
  // An unmatched open bracket becomes an Error token.
  auto& buffer1 = compile_helper_.GetTokenizedBuffer("{");
  EXPECT_TRUE(buffer1.has_errors());
  EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Error, .text = "{"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Likewise for an unmatched close bracket.
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("}");
  EXPECT_TRUE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Error, .text = "}"},
                           {.kind = TokenKind::FileEnd},
                       }));
  // Crossed nesting: a recovery CloseParen is synthesized before the '}'.
  auto& buffer3 = compile_helper_.GetTokenizedBuffer("{(}");
  EXPECT_TRUE(buffer3.has_errors());
  EXPECT_THAT(
      buffer3,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart},
          {.kind = TokenKind::OpenCurlyBrace, .column = 1},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::CloseParen, .column = 3, .recovery = true},
          {.kind = TokenKind::CloseCurlyBrace, .column = 3},
          {.kind = TokenKind::FileEnd},
      }));
  // Leading ')' is an error; the trailing '{' gets a recovery close before
  // the final ')'.
  auto& buffer4 = compile_helper_.GetTokenizedBuffer(")({)");
  EXPECT_TRUE(buffer4.has_errors());
  EXPECT_THAT(
      buffer4,
      HasTokens(llvm::ArrayRef<ExpectedToken>{
          {.kind = TokenKind::FileStart},
          {.kind = TokenKind::Error, .column = 1, .text = ")"},
          {.kind = TokenKind::OpenParen, .column = 2},
          {.kind = TokenKind::OpenCurlyBrace, .column = 3},
          {.kind = TokenKind::CloseCurlyBrace, .column = 4, .recovery = true},
          {.kind = TokenKind::CloseParen, .column = 4},
          {.kind = TokenKind::FileEnd},
      }));
}
  599. TEST_F(LexerTest, Whitespace) {
  600. auto& buffer = compile_helper_.GetTokenizedBuffer("{( } {(");
  601. // Whether there should be whitespace before/after each token.
  602. bool space[] = {false,
  603. // start-of-file
  604. true,
  605. // {
  606. false,
  607. // (
  608. true,
  609. // inserted )
  610. true,
  611. // }
  612. true,
  613. // error {
  614. false,
  615. // error (
  616. true,
  617. // EOF
  618. false};
  619. int pos = 0;
  620. for (TokenIndex token : buffer.tokens()) {
  621. SCOPED_TRACE(
  622. llvm::formatv("Token #{0}: '{1}'", token, buffer.GetTokenText(token)));
  623. ASSERT_LT(pos, std::size(space));
  624. EXPECT_THAT(buffer.HasLeadingWhitespace(token), Eq(space[pos]));
  625. ++pos;
  626. ASSERT_LT(pos, std::size(space));
  627. EXPECT_THAT(buffer.HasTrailingWhitespace(token), Eq(space[pos]));
  628. }
  629. ASSERT_EQ(pos + 1, std::size(space));
  630. }
TEST_F(LexerTest, Keywords) {
  // Collect every keyword token kind via the X-macro .def file: CARBON_TOKEN
  // expands to nothing, so only CARBON_KEYWORD_TOKEN entries contribute.
  TokenKind keywords[] = {
#define CARBON_TOKEN(TokenName)
#define CARBON_KEYWORD_TOKEN(TokenName, ...) TokenKind::TokenName,
#include "toolchain/lex/token_kind.def"
  };
  // Each keyword, lexed on its own, must produce exactly one token of its
  // kind with no errors.
  for (const auto& keyword : keywords) {
    auto& buffer = compile_helper_.GetTokenizedBuffer(keyword.fixed_spelling());
    EXPECT_FALSE(buffer.has_errors());
    EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                            {.kind = TokenKind::FileStart},
                            {.kind = keyword, .column = 1, .indent_column = 1},
                            {.kind = TokenKind::FileEnd},
                        }));
  }
}
  647. TEST_F(LexerTest, Comments) {
  648. auto& buffer1 = compile_helper_.GetTokenizedBuffer(" ;\n // foo\n ;\n");
  649. EXPECT_FALSE(buffer1.has_errors());
  650. EXPECT_THAT(
  651. buffer1,
  652. HasTokens(llvm::ArrayRef<ExpectedToken>{
  653. {.kind = TokenKind::FileStart, .line = 1, .column = 1},
  654. {.kind = TokenKind::Semi, .line = 1, .column = 2, .indent_column = 2},
  655. {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 3},
  656. {.kind = TokenKind::FileEnd, .line = 3, .column = 4},
  657. }));
  658. auto& buffer2 = compile_helper_.GetTokenizedBuffer("// foo\n//\n// bar");
  659. EXPECT_FALSE(buffer2.has_errors());
  660. EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
  661. {.kind = TokenKind::FileStart},
  662. {.kind = TokenKind::FileEnd}}));
  663. // Make sure weird characters aren't a problem.
  664. auto& buffer3 =
  665. compile_helper_.GetTokenizedBuffer(" // foo#$!^?@-_💩🍫⃠ [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]");
  666. EXPECT_FALSE(buffer3.has_errors());
  667. EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
  668. {.kind = TokenKind::FileStart},
  669. {.kind = TokenKind::FileEnd}}));
  670. // Make sure we can lex a comment at the end of the input.
  671. auto& buffer4 = compile_helper_.GetTokenizedBuffer("//");
  672. EXPECT_FALSE(buffer4.has_errors());
  673. EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
  674. {.kind = TokenKind::FileStart},
  675. {.kind = TokenKind::FileEnd}}));
  676. }
  677. TEST_F(LexerTest, InvalidComments) {
  678. llvm::StringLiteral testcases[] = {
  679. " /// foo\n",
  680. "foo // bar\n",
  681. "//! hello",
  682. " //world",
  683. };
  684. for (llvm::StringLiteral testcase : testcases) {
  685. auto& buffer = compile_helper_.GetTokenizedBuffer(testcase);
  686. EXPECT_TRUE(buffer.has_errors());
  687. }
  688. }
// Tests lexing of identifiers: basic names, leading underscores, embedded
// digits, keyword prefixes, and line/column/indent tracking across lines.
TEST_F(LexerTest, Identifiers) {
  // A simple identifier with leading whitespace; `indent_column` reflects the
  // first non-whitespace column of the line.
  auto& buffer1 = compile_helper_.GetTokenizedBuffer("   foobar");
  EXPECT_FALSE(buffer1.has_errors());
  EXPECT_THAT(buffer1, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier,
                            .column = 4,
                            .indent_column = 4,
                            .text = "foobar"},
                           {.kind = TokenKind::FileEnd},
                       }));

  // Check different kinds of identifier character sequences.
  auto& buffer2 = compile_helper_.GetTokenizedBuffer("_foo_bar");
  EXPECT_FALSE(buffer2.has_errors());
  EXPECT_THAT(buffer2, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "_foo_bar"},
                           {.kind = TokenKind::FileEnd},
                       }));

  // Digits are allowed after the first character.
  auto& buffer3 = compile_helper_.GetTokenizedBuffer("foo2bar00");
  EXPECT_FALSE(buffer3.has_errors());
  EXPECT_THAT(buffer3, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "foo2bar00"},
                           {.kind = TokenKind::FileEnd},
                       }));

  // Check that we can parse identifiers that start with a keyword ("fn" here).
  auto& buffer4 = compile_helper_.GetTokenizedBuffer("fnord");
  EXPECT_FALSE(buffer4.has_errors());
  EXPECT_THAT(buffer4, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::FileStart},
                           {.kind = TokenKind::Identifier, .text = "fnord"},
                           {.kind = TokenKind::FileEnd},
                       }));

  // Check multiple identifiers with indent and interning: repeated `foo` and
  // `bar` spellings exercise the identifier interning path, and the mix of
  // spaces and a tab exercises column computation.
  auto& buffer5 =
      compile_helper_.GetTokenizedBuffer("   foo;bar\nbar \n  foo\tfoo");
  EXPECT_FALSE(buffer5.has_errors());
  EXPECT_THAT(buffer5,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  {.kind = TokenKind::Identifier,
                   .line = 1,
                   .column = 4,
                   .indent_column = 4,
                   .text = "foo"},
                  {.kind = TokenKind::Semi},
                  {.kind = TokenKind::Identifier,
                   .line = 1,
                   .column = 8,
                   .indent_column = 4,
                   .text = "bar"},
                  {.kind = TokenKind::Identifier,
                   .line = 2,
                   .column = 1,
                   .indent_column = 1,
                   .text = "bar"},
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 3,
                   .indent_column = 3,
                   .text = "foo"},
                  // A tab counts as a single column here.
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 7,
                   .indent_column = 3,
                   .text = "foo"},
                  {.kind = TokenKind::FileEnd, .line = 3, .column = 10},
              }));
}
// Tests lexing of string literals: simple escaped strings, block (`'''`)
// strings with indent stripping and line continuation, raw (`#"..."#`)
// strings, and adjacent-literal splitting of `"""x"""`.
TEST_F(LexerTest, StringLiterals) {
  llvm::StringLiteral testcase = R"(
    "hello world\n"

    '''foo
     test \
    \xAB
    '''  trailing

      #"""#

    "\0"

    #"\0"foo"\1"#

    """x"""
  )";
  auto [buffer, value_stores] =
      compile_helper_.GetTokenizedBufferWithSharedValueStore(testcase);
  EXPECT_FALSE(buffer.has_errors());
  EXPECT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // `\n` escape is processed into a real newline.
                  {.kind = TokenKind::StringLiteral,
                   .line = 2,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"hello world\n"}},
                  // Block string: terminator indent is stripped, the trailing
                  // `\` joins lines, and `\xAB` becomes a single byte.
                  {.kind = TokenKind::StringLiteral,
                   .line = 4,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {" test \xAB\n"}},
                  // Lexing continues normally after the block terminator.
                  {.kind = TokenKind::Identifier,
                   .line = 7,
                   .column = 10,
                   .indent_column = 5,
                   .text = "trailing"},
                  // Raw string `#"""#` contains a single `"`.
                  {.kind = TokenKind::StringLiteral,
                   .line = 9,
                   .column = 7,
                   .indent_column = 7,
                   .value_stores = &value_stores,
                   .string_contents = {"\""}},
                  // `\0` escape yields an embedded NUL byte.
                  {.kind = TokenKind::StringLiteral,
                   .line = 11,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = llvm::StringLiteral::withInnerNUL("\0")},
                  // In a raw string, `\0` and `\1` are literal text and inner
                  // quotes need no escaping.
                  {.kind = TokenKind::StringLiteral,
                   .line = 13,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"\\0\"foo\"\\1"}},
                  // """x""" is three string literals, not one invalid
                  // attempt at a block string literal.
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 5,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {""}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 7,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {"x"}},
                  {.kind = TokenKind::StringLiteral,
                   .line = 15,
                   .column = 10,
                   .indent_column = 5,
                   .value_stores = &value_stores,
                   .string_contents = {""}},
                  {.kind = TokenKind::FileEnd, .line = 16, .column = 3},
              }));
}
// Tests that unterminated or otherwise malformed string literals produce both
// a buffer-level error and at least one `Error` token.
TEST_F(LexerTest, InvalidStringLiterals) {
  llvm::StringLiteral invalid[] = {
      // clang-format off
      R"(")",        // Unterminated simple string.
      R"('''
      '')",          // Block string with a truncated terminator.
      R"("\)",       // Trailing escape at EOF.
      R"("\")",      // The escaped quote leaves the string unterminated.
      R"("\\)",      // Escaped backslash, still unterminated.
      R"("\\\")",    // Escaped backslash then escaped quote, unterminated.
      R"(''')",      // Block string opener with no terminator.
      R"('''
      )",            // Block string opener, content, no terminator.
      R"('''\)",     // Block string opener ending in an escape.
      R"(#'''
      ''')",         // Raw block string whose terminator lacks the `#`.
      // clang-format on
  };
  for (llvm::StringLiteral test : invalid) {
    SCOPED_TRACE(test);
    auto& buffer = compile_helper_.GetTokenizedBuffer(test);
    EXPECT_TRUE(buffer.has_errors());
    // We should have formed at least one error token.
    bool found_error = false;
    for (TokenIndex token : buffer.tokens()) {
      if (buffer.GetKind(token) == TokenKind::Error) {
        found_error = true;
        break;
      }
    }
    EXPECT_TRUE(found_error);
  }
}
// Tests lexing of `iN`/`uN`/`fN` type literals, including which spellings are
// type literals versus ordinary identifiers (`i0`, `i0x1`, `u64b`, `fi`,
// `s1`), and that the bit-width value is recorded for real type literals.
TEST_F(LexerTest, TypeLiterals) {
  llvm::StringLiteral testcase = R"(
    i0 i1 i20 i999999999999 i0x1
    u0 u1 u64 u64b
    f32 f80 f1 fi
    s1
  )";
  auto [buffer, value_stores] =
      compile_helper_.GetTokenizedBufferWithSharedValueStore(testcase);
  EXPECT_FALSE(buffer.has_errors());
  ASSERT_THAT(buffer,
              HasTokens(llvm::ArrayRef<ExpectedToken>{
                  {.kind = TokenKind::FileStart, .line = 1, .column = 1},
                  // `i0` is not a valid width, so it lexes as an identifier.
                  {.kind = TokenKind::Identifier,
                   .line = 2,
                   .column = 5,
                   .indent_column = 5,
                   .text = {"i0"}},
                  {.kind = TokenKind::IntTypeLiteral,
                   .line = 2,
                   .column = 8,
                   .indent_column = 5,
                   .text = {"i1"}},
                  {.kind = TokenKind::IntTypeLiteral,
                   .line = 2,
                   .column = 11,
                   .indent_column = 5,
                   .text = {"i20"}},
                  // Very large widths still lex as type literals.
                  {.kind = TokenKind::IntTypeLiteral,
                   .line = 2,
                   .column = 15,
                   .indent_column = 5,
                   .text = {"i999999999999"}},
                  // Non-decimal suffix characters make it an identifier.
                  {.kind = TokenKind::Identifier,
                   .line = 2,
                   .column = 29,
                   .indent_column = 5,
                   .text = {"i0x1"}},
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 5,
                   .indent_column = 5,
                   .text = {"u0"}},
                  {.kind = TokenKind::UnsignedIntTypeLiteral,
                   .line = 3,
                   .column = 8,
                   .indent_column = 5,
                   .text = {"u1"}},
                  {.kind = TokenKind::UnsignedIntTypeLiteral,
                   .line = 3,
                   .column = 11,
                   .indent_column = 5,
                   .text = {"u64"}},
                  // Trailing letter makes `u64b` an identifier.
                  {.kind = TokenKind::Identifier,
                   .line = 3,
                   .column = 15,
                   .indent_column = 5,
                   .text = {"u64b"}},
                  {.kind = TokenKind::FloatTypeLiteral,
                   .line = 4,
                   .column = 5,
                   .indent_column = 5,
                   .text = {"f32"}},
                  {.kind = TokenKind::FloatTypeLiteral,
                   .line = 4,
                   .column = 9,
                   .indent_column = 5,
                   .text = {"f80"}},
                  {.kind = TokenKind::FloatTypeLiteral,
                   .line = 4,
                   .column = 13,
                   .indent_column = 5,
                   .text = {"f1"}},
                  {.kind = TokenKind::Identifier,
                   .line = 4,
                   .column = 16,
                   .indent_column = 5,
                   .text = {"fi"}},
                  // `s` is not a type-literal prefix.
                  {.kind = TokenKind::Identifier,
                   .line = 5,
                   .column = 5,
                   .indent_column = 5,
                   .text = {"s1"}},
                  {.kind = TokenKind::FileEnd, .line = 6, .column = 3},
              }));

  // Looks up the recorded bit width for the token at `token_index` (an index
  // into the token sequence asserted above).
  auto type_size = [&](int token_index) {
    auto token = buffer.tokens().begin()[token_index];
    return value_stores.ints().Get(buffer.GetTypeLiteralSize(token));
  };
  EXPECT_EQ(type_size(2), 1);              // i1
  EXPECT_EQ(type_size(3), 20);             // i20
  EXPECT_EQ(type_size(4), 999999999999ULL);  // i999999999999
  EXPECT_EQ(type_size(7), 1);              // u1
  EXPECT_EQ(type_size(8), 64);             // u64
  EXPECT_EQ(type_size(10), 32);            // f32
  EXPECT_EQ(type_size(11), 80);            // f80
  EXPECT_EQ(type_size(12), 1);             // f1
}
// Tests the diagnostic for type literals whose bit-width has too many digits,
// and that widths below the limit round-trip exactly through the lexer.
TEST_F(LexerTest, TypeLiteralTooManyDigits) {
  // We increase the number of digits until the first one that is too large.
  Testing::MockDiagnosticConsumer consumer;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            Diagnostics::Kind::TooManyTypeBitWidthDigits,
                            Diagnostics::Level::Error, 1, 2, _)));
  std::string code = "i";
  // A 128-bit APInt should be plenty large, but if needed in the future it can
  // be widened without issue.
  llvm::APInt bits = llvm::APInt::getZero(128);
  for ([[maybe_unused]] auto _ : llvm::seq(1, 30)) {
    // Append one more `9` digit and track the expected width value.
    code.append("9");
    bits = bits * 10 + 9;
    auto [buffer, value_stores] =
        compile_helper_.GetTokenizedBufferWithSharedValueStore(code, &consumer);
    if (buffer.has_errors()) {
      // Once over the limit, the whole spelling becomes one error token.
      ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                              {.kind = TokenKind::FileStart},
                              {.kind = TokenKind::Error, .text = code},
                              {.kind = TokenKind::FileEnd},
                          }));
      break;
    }
    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                            {.kind = TokenKind::FileStart},
                            {.kind = TokenKind::IntTypeLiteral, .text = code},
                            {.kind = TokenKind::FileEnd},
                        }));
    // The stored width must match the decimal value of the digits so far.
    auto token = buffer.tokens().begin()[1];
    EXPECT_TRUE(llvm::APInt::isSameValue(
        value_stores.ints().Get(buffer.GetTypeLiteralSize(token)), bits));
  }

  // Make sure we can also gracefully reject very large number of digits without
  // crashing or hanging, and show the correct number.
  constexpr int Count = 10000;
  EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
                            Diagnostics::Kind::TooManyTypeBitWidthDigits,
                            Diagnostics::Level::Error, 1, 2,
                            HasSubstr(llvm::formatv(" {0} ", Count)))));
  code = "i";
  code.append(Count, '9');
  auto& buffer = compile_helper_.GetTokenizedBuffer(code, &consumer);
  ASSERT_TRUE(buffer.has_errors());
  ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          {.kind = TokenKind::Error, .text = code},
                          {.kind = TokenKind::FileEnd},
                      }));
}
  1015. TEST_F(LexerTest, DiagnosticTrailingComment) {
  1016. llvm::StringLiteral testcase = R"(
  1017. // Hello!
  1018. var String x; // trailing comment
  1019. )";
  1020. Testing::MockDiagnosticConsumer consumer;
  1021. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1022. Diagnostics::Kind::TrailingComment,
  1023. Diagnostics::Level::Error, 3, 19, _)));
  1024. compile_helper_.GetTokenizedBuffer(testcase, &consumer);
  1025. }
  1026. TEST_F(LexerTest, DiagnosticWhitespace) {
  1027. Testing::MockDiagnosticConsumer consumer;
  1028. EXPECT_CALL(consumer,
  1029. HandleDiagnostic(IsSingleDiagnostic(
  1030. Diagnostics::Kind::NoWhitespaceAfterCommentIntroducer,
  1031. Diagnostics::Level::Error, 1, 3, _)));
  1032. compile_helper_.GetTokenizedBuffer("//no space after comment", &consumer);
  1033. }
  1034. TEST_F(LexerTest, DiagnosticUnrecognizedEscape) {
  1035. Testing::MockDiagnosticConsumer consumer;
  1036. EXPECT_CALL(consumer,
  1037. HandleDiagnostic(IsSingleDiagnostic(
  1038. Diagnostics::Kind::UnknownEscapeSequence,
  1039. Diagnostics::Level::Error, 1, 8, HasSubstr("`b`"))));
  1040. compile_helper_.GetTokenizedBuffer(R"("hello\bworld")", &consumer);
  1041. }
  1042. TEST_F(LexerTest, DiagnosticBadHex) {
  1043. Testing::MockDiagnosticConsumer consumer;
  1044. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1045. Diagnostics::Kind::HexadecimalEscapeMissingDigits,
  1046. Diagnostics::Level::Error, 1, 9, _)));
  1047. compile_helper_.GetTokenizedBuffer(R"("hello\xabworld")", &consumer);
  1048. }
  1049. TEST_F(LexerTest, DiagnosticInvalidDigit) {
  1050. Testing::MockDiagnosticConsumer consumer;
  1051. EXPECT_CALL(consumer,
  1052. HandleDiagnostic(IsSingleDiagnostic(
  1053. Diagnostics::Kind::InvalidDigit, Diagnostics::Level::Error, 1,
  1054. 6, HasSubstr("'a'"))));
  1055. compile_helper_.GetTokenizedBuffer("0x123abc", &consumer);
  1056. }
  1057. TEST_F(LexerTest, DiagnosticCR) {
  1058. Testing::MockDiagnosticConsumer consumer;
  1059. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1060. Diagnostics::Kind::UnsupportedCrLineEnding,
  1061. Diagnostics::Level::Error, 1, 1, _)));
  1062. compile_helper_.GetTokenizedBuffer("\r", &consumer);
  1063. }
  1064. TEST_F(LexerTest, DiagnosticLfCr) {
  1065. Testing::MockDiagnosticConsumer consumer;
  1066. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1067. Diagnostics::Kind::UnsupportedLfCrLineEnding,
  1068. Diagnostics::Level::Error, 2, 1, _)));
  1069. compile_helper_.GetTokenizedBuffer("\n\r", &consumer);
  1070. }
  1071. TEST_F(LexerTest, DiagnosticMissingTerminator) {
  1072. Testing::MockDiagnosticConsumer consumer;
  1073. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1074. Diagnostics::Kind::UnterminatedString,
  1075. Diagnostics::Level::Error, 1, 1, _)));
  1076. compile_helper_.GetTokenizedBuffer(R"(#" ")", &consumer);
  1077. }
  1078. TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
  1079. Testing::MockDiagnosticConsumer consumer;
  1080. EXPECT_CALL(consumer, HandleDiagnostic(IsSingleDiagnostic(
  1081. Diagnostics::Kind::UnrecognizedCharacters,
  1082. Diagnostics::Level::Error, 1, 1, _)));
  1083. compile_helper_.GetTokenizedBuffer("\b", &consumer);
  1084. }
  1085. TEST_F(LexerTest, DiagnosticFileTooLarge) {
  1086. Testing::MockDiagnosticConsumer consumer;
  1087. static constexpr size_t NumLines = 10'000'000;
  1088. std::string input;
  1089. input.reserve(NumLines * 3);
  1090. for ([[maybe_unused]] auto _ : llvm::seq(NumLines)) {
  1091. input += "{}\n";
  1092. }
  1093. EXPECT_CALL(consumer,
  1094. HandleDiagnostic(IsSingleDiagnostic(
  1095. Diagnostics::Kind::TooManyTokens, Diagnostics::Level::Error,
  1096. TokenIndex::Max / 2, 1, _)));
  1097. compile_helper_.GetTokenizedBuffer(input, &consumer);
  1098. }
  1099. // Outputs comments to the stream to create a comment block.
  1100. static auto AppendCommentLines(RawStringOstream& out, int count,
  1101. llvm::StringRef tag) -> void {
  1102. for (int i : llvm::seq(count)) {
  1103. out << "// " << tag << i << "\n";
  1104. }
  1105. }
  1106. TEST_F(LexerTest, CommentBlock) {
  1107. for (int comments_before = 0; comments_before < 5; ++comments_before) {
  1108. RawStringOstream prefix;
  1109. AppendCommentLines(prefix, comments_before, "B");
  1110. std::string prefix_out = prefix.TakeStr();
  1111. for (int comments_after = 1; comments_after < 5; ++comments_after) {
  1112. RawStringOstream source;
  1113. source << prefix_out;
  1114. if (comments_before > 0) {
  1115. source << "//\n";
  1116. }
  1117. AppendCommentLines(source, comments_after, "C");
  1118. SCOPED_TRACE(llvm::formatv(
  1119. "{0} comment lines before the empty comment line, {1} after",
  1120. comments_before, comments_after));
  1121. auto& buffer = compile_helper_.GetTokenizedBuffer(source.TakeStr());
  1122. ASSERT_FALSE(buffer.has_errors());
  1123. EXPECT_THAT(buffer.comments_size(), Eq(1));
  1124. }
  1125. }
  1126. }
  1127. TEST_F(LexerTest, IndentedComments) {
  1128. for (int indent = 0; indent < 40; ++indent) {
  1129. SCOPED_TRACE(llvm::formatv("Indent: {0}", indent));
  1130. RawStringOstream source;
  1131. source.indent(indent);
  1132. source << "// Comment\n";
  1133. std::string source_str = source.TakeStr();
  1134. auto& buffer = compile_helper_.GetTokenizedBuffer(source_str);
  1135. ASSERT_FALSE(buffer.has_errors());
  1136. EXPECT_THAT(buffer.comments_size(), Eq(1));
  1137. std::string simd_source =
  1138. source_str +
  1139. "\"Add a bunch of padding so that SIMD logic shouldn't hit EOF\"";
  1140. auto& simd_buffer = compile_helper_.GetTokenizedBuffer(simd_source);
  1141. ASSERT_FALSE(simd_buffer.has_errors());
  1142. EXPECT_THAT(simd_buffer.comments_size(), Eq(1));
  1143. }
  1144. }
// Tests that several distinct comment blocks in one file are each recorded
// separately and their text round-trips exactly, covering both the SIMD and
// scalar comment-lexing paths.
TEST_F(LexerTest, MultipleComments) {
  // TODO: Switch format to `llvm::StringLiteral` if
  // `llvm::StringLiteral::c_str` is added.
  constexpr char Format[] = R"(
{0}
{1}
{2}
{3}
'''This is a string, not a comment. The next comment will stop SIMD due to being
too close to the EOF.
'''
{4}
x
)";
  // Each entry is one expected comment block; adjacent C++ string literals
  // concatenate into a single multi-line comment.
  constexpr llvm::StringLiteral Comments[] = {
      // NOLINTNEXTLINE(bugprone-suspicious-missing-comma)
      "// This comment should be possible to parse with SIMD.\n"
      "// This one too.\n",
      "// This one as well, though it's a different indent.\n"
      " // And mixes indent.\n"
      " // And mixes indent more.\n",
      // `//Invalid` lines (no space after `//`) are diagnosed but still
      // belong to the same comment block.
      "// This is one comment:\n"
      "//Invalid\n"
      "// Valid\n"
      "//Invalid\n"
      "//\n"
      "// Valid\n"
      "//\n"
      "// Valid\n",
      "// This uses a high indent, which stops SIMD.\n", "//\n"};
  std::string source = llvm::formatv(Format, Comments[0], Comments[1],
                                     Comments[2], Comments[3], Comments[4])
                           .str();
  auto& buffer = compile_helper_.GetTokenizedBuffer(source);
  // The `//Invalid` lines above make the buffer report errors.
  EXPECT_TRUE(buffer.has_errors());
  EXPECT_THAT(buffer.comments_size(), Eq(std::size(Comments)));
  // Each recorded comment's text must match the corresponding input block.
  for (int i :
       llvm::seq(std::min<int>(buffer.comments_size(), std::size(Comments)))) {
    EXPECT_THAT(buffer.GetCommentText(CommentIndex(i)).str(),
                testing::StrEq(Comments[i]));
  }
  // Only the block string and trailing `x` produce real tokens.
  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                          {.kind = TokenKind::FileStart},
                          {.kind = TokenKind::StringLiteral},
                          {.kind = TokenKind::Identifier},
                          {.kind = TokenKind::FileEnd},
                      }));
}
// Tests that `TokenizedBuffer::Print` emits YAML that round-trips through the
// YAML parser, verifying per-token index, kind, position, indent, spelling,
// and leading-whitespace flags.
TEST_F(LexerTest, PrintingOutputYaml) {
  // Test that we can parse this into YAML and verify line and indent data.
  auto& buffer =
      compile_helper_.GetTokenizedBuffer("\n ;\n\n\n; ;\n\n\n\n\n\n\n\n\n\n\n");
  ASSERT_FALSE(buffer.has_errors());
  RawStringOstream print_stream;
  buffer.Print(print_stream);
  EXPECT_THAT(
      Yaml::Value::FromText(print_stream.TakeStr()),
      IsYaml(ElementsAre(Yaml::Sequence(ElementsAre(Yaml::Mapping(ElementsAre(
          Pair("filename", buffer.source().filename().str()),
          Pair("tokens",
               Yaml::Sequence(ElementsAre(
                   // FileStart has no spelling and no leading space.
                   Yaml::Mapping(ElementsAre(
                       Pair("index", "0"), Pair("kind", "FileStart"),
                       Pair("line", "1"), Pair("column", "1"),
                       Pair("indent", "1"), Pair("spelling", ""))),
                   // The indented `;` on line 2.
                   Yaml::Mapping(ElementsAre(
                       Pair("index", "1"), Pair("kind", "Semi"),
                       Pair("line", "2"), Pair("column", "2"),
                       Pair("indent", "2"), Pair("spelling", ";"),
                       Pair("has_leading_space", "true"))),
                   // The two `;` tokens on line 5 at columns 1 and 3.
                   Yaml::Mapping(ElementsAre(
                       Pair("index", "2"), Pair("kind", "Semi"),
                       Pair("line", "5"), Pair("column", "1"),
                       Pair("indent", "1"), Pair("spelling", ";"),
                       Pair("has_leading_space", "true"))),
                   Yaml::Mapping(ElementsAre(
                       Pair("index", "3"), Pair("kind", "Semi"),
                       Pair("line", "5"), Pair("column", "3"),
                       Pair("indent", "1"), Pair("spelling", ";"),
                       Pair("has_leading_space", "true"))),
                   // FileEnd lands after the run of trailing newlines.
                   Yaml::Mapping(ElementsAre(
                       Pair("index", "4"), Pair("kind", "FileEnd"),
                       Pair("line", "15"), Pair("column", "1"),
                       Pair("indent", "1"), Pair("spelling", ""),
                       Pair("has_leading_space", "true")))))))))))));
}
  1230. } // namespace
  1231. } // namespace Carbon::Lex