Browse Source

Split lexing logic and storage to separate files. (#3365)

Just reorganizing logic a little, trying to mirror the direction we've
gone with check, lower, etc. That is, lex.h contains a function `Lex`
that is used directly.

Note, I'm avoiding making meaningful changes here. It could in theory
still affect inlining in benchmarks, but I'm not seeing an impact.

Before:

```
------------------------------------------------------------------------------------------------------
Benchmark                                            Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------------------------------------
BM_ValidKeywords                               2784949 ns      2784867 ns          249 bytes_per_second=214.452M/s tokens_per_second=35.9084M/s
BM_ValidKeywordsAsRawIdentifiers               3222597 ns      3222551 ns          210 bytes_per_second=244.513M/s tokens_per_second=31.0313M/s
BM_RawIdentifierFocus                          5907836 ns      5907518 ns          103 bytes_per_second=264.873M/s tokens_per_second=16.9276M/s
BM_ValidIdentifiers<1, 64, false>              6255128 ns      6254297 ns          105 bytes_per_second=235.488M/s tokens_per_second=15.989M/s
BM_ValidIdentifiers<1, 1, true>                3677630 ns      3677398 ns          192 bytes_per_second=77.7999M/s tokens_per_second=27.1931M/s
BM_ValidIdentifiers<3, 5, true>                5427693 ns      5427116 ns          110 bytes_per_second=105.434M/s tokens_per_second=18.426M/s
BM_ValidIdentifiers<3, 16, true>               5063246 ns      5062761 ns          115 bytes_per_second=216.623M/s tokens_per_second=19.7521M/s
BM_ValidIdentifiers<12, 64, true>              5518589 ns      5518118 ns          100 bytes_per_second=691.264M/s tokens_per_second=18.1221M/s
BM_ValidIdentifiers<16, 16, true>              4890776 ns      4890782 ns          112 bytes_per_second=350.989M/s tokens_per_second=20.4466M/s
BM_ValidIdentifiers<24, 24, true>              4974729 ns      4974582 ns          112 bytes_per_second=498.444M/s tokens_per_second=20.1022M/s
BM_ValidIdentifiers<32, 32, true>              5517583 ns      5517085 ns           99 bytes_per_second=587.718M/s tokens_per_second=18.1255M/s
BM_ValidIdentifiers<48, 48, true>              5914759 ns      5914222 ns           94 bytes_per_second=806.255M/s tokens_per_second=16.9084M/s
BM_ValidIdentifiers<64, 64, true>              7556040 ns      7556036 ns           77 bytes_per_second=833.009M/s tokens_per_second=13.2345M/s
BM_ValidIdentifiers<80, 80, true>              7739113 ns      7737696 ns           76 bytes_per_second=1010.65M/s tokens_per_second=12.9237M/s
BM_HorizontalWhitespace/1                      5015062 ns      5014443 ns          108 bytes_per_second=114.111M/s tokens_per_second=19.9424M/s
BM_HorizontalWhitespace/4                      5165496 ns      5165425 ns          111 bytes_per_second=166.163M/s tokens_per_second=19.3595M/s
BM_HorizontalWhitespace/16                     5616796 ns      5616447 ns          102 bytes_per_second=356.578M/s tokens_per_second=17.8049M/s
BM_HorizontalWhitespace/64                     7912904 ns      7912346 ns           78 bytes_per_second=831.648M/s tokens_per_second=12.6385M/s
BM_HorizontalWhitespace/128                   11086218 ns     11083155 ns           57 bytes_per_second=1.11759G/s tokens_per_second=9.0227M/s
BM_RandomSource                                4796549 ns      4795733 ns          145 bytes_per_second=216.783M/s lines_per_second=6.61943M/s tokens_per_second=20.8519M/s
BM_GroupingSymbols/1/0/0                       3937151 ns      3936581 ns          176 bytes_per_second=216.914M/s lines_per_second=19.0521M/s tokens_per_second=25.4028M/s
BM_GroupingSymbols/2/0/0                       3000029 ns      2999506 ns          239 bytes_per_second=243.125M/s lines_per_second=27.7812M/s tokens_per_second=33.3388M/s
BM_GroupingSymbols/3/0/0                       2729059 ns      2728834 ns          251 bytes_per_second=261.26M/s lines_per_second=32.065M/s tokens_per_second=36.6457M/s
BM_GroupingSymbols/4/0/0                       2432363 ns      2432209 ns          291 bytes_per_second=304.738M/s lines_per_second=37.0034M/s tokens_per_second=41.1149M/s
BM_GroupingSymbols/8/0/0                       2144080 ns      2143967 ns          326 bytes_per_second=468.547M/s lines_per_second=44.0468M/s tokens_per_second=46.6425M/s
BM_GroupingSymbols/16/0/0                      2290055 ns      2289711 ns          308 bytes_per_second=741.709M/s lines_per_second=42.3866M/s tokens_per_second=43.6736M/s
BM_GroupingSymbols/32/0/0                      3102790 ns      3102186 ns          220 bytes_per_second=1027.1M/s lines_per_second=31.7437M/s tokens_per_second=32.2353M/s
BM_GroupingSymbols/0/1/0                       3406964 ns      3406421 ns          207 bytes_per_second=222.677M/s lines_per_second=7.33908M/s tokens_per_second=29.3563M/s
BM_GroupingSymbols/0/2/0                       2244101 ns      2244006 ns          312 bytes_per_second=239.985M/s lines_per_second=7.42689M/s tokens_per_second=44.5632M/s
BM_GroupingSymbols/0/3/0                       1976449 ns      1976080 ns          347 bytes_per_second=216M/s lines_per_second=6.32566M/s tokens_per_second=50.6052M/s
BM_GroupingSymbols/0/4/0                       1600539 ns      1600325 ns          429 bytes_per_second=224.777M/s lines_per_second=6.24873M/s tokens_per_second=62.4873M/s
BM_GroupingSymbols/0/8/0                       1085044 ns      1084941 ns          646 bytes_per_second=222.765M/s lines_per_second=5.12009M/s tokens_per_second=92.1709M/s
BM_GroupingSymbols/0/16/0                       824804 ns       824756 ns          817 bytes_per_second=209.166M/s lines_per_second=3.5659M/s tokens_per_second=121.248M/s
BM_GroupingSymbols/0/32/0                       685741 ns       685741 ns         1004 bytes_per_second=196.576M/s lines_per_second=2.20929M/s tokens_per_second=145.828M/s
BM_GroupingSymbols/0/0/1                       3467507 ns      3467077 ns          206 bytes_per_second=218.781M/s lines_per_second=7.21069M/s tokens_per_second=28.8427M/s
BM_GroupingSymbols/0/0/2                       2284154 ns      2283937 ns          309 bytes_per_second=235.79M/s lines_per_second=7.29705M/s tokens_per_second=43.784M/s
BM_GroupingSymbols/0/0/3                       1965548 ns      1965225 ns          356 bytes_per_second=217.193M/s lines_per_second=6.36059M/s tokens_per_second=50.8848M/s
BM_GroupingSymbols/0/0/4                       1623965 ns      1623725 ns          440 bytes_per_second=221.538M/s lines_per_second=6.15868M/s tokens_per_second=61.5868M/s
BM_GroupingSymbols/0/0/8                       1080601 ns      1080452 ns          650 bytes_per_second=223.691M/s lines_per_second=5.14137M/s tokens_per_second=92.5539M/s
BM_GroupingSymbols/0/0/16                       840820 ns       840677 ns          828 bytes_per_second=205.205M/s lines_per_second=3.49837M/s tokens_per_second=118.952M/s
BM_GroupingSymbols/0/0/32                       707793 ns       707734 ns          991 bytes_per_second=190.467M/s lines_per_second=2.14063M/s tokens_per_second=141.296M/s
BM_GroupingSymbols/32/1/0                      2804159 ns      2803869 ns          245 bytes_per_second=1103.7M/s lines_per_second=34.0779M/s tokens_per_second=35.665M/s
BM_GroupingSymbols/32/2/0                      2676229 ns      2675855 ns          261 bytes_per_second=1123.78M/s lines_per_second=34.688M/s tokens_per_second=37.3712M/s
BM_GroupingSymbols/32/3/0                      2652102 ns      2651836 ns          262 bytes_per_second=1103.24M/s lines_per_second=34.0217M/s tokens_per_second=37.7097M/s
BM_GroupingSymbols/32/4/0                      2600677 ns      2600552 ns          268 bytes_per_second=1096.17M/s lines_per_second=33.7678M/s tokens_per_second=38.4534M/s
BM_GroupingSymbols/32/8/0                      2382017 ns      2381869 ns          294 bytes_per_second=1083.42M/s lines_per_second=33.2659M/s tokens_per_second=41.9838M/s
BM_GroupingSymbols/32/16/0                     2102340 ns      2102243 ns          325 bytes_per_second=1034.45M/s lines_per_second=31.5377M/s tokens_per_second=47.5682M/s
BM_GroupingSymbols/32/32/0                     1745079 ns      1744914 ns          403 bytes_per_second=952.64M/s lines_per_second=28.6461M/s tokens_per_second=57.3094M/s
BM_GroupingSymbols/32/32/1                     1701414 ns      1701255 ns          415 bytes_per_second=962.73M/s lines_per_second=28.9228M/s tokens_per_second=58.7802M/s
BM_GroupingSymbols/32/32/2                     1688503 ns      1688121 ns          415 bytes_per_second=957.019M/s lines_per_second=28.7242M/s tokens_per_second=59.2375M/s
BM_GroupingSymbols/32/32/3                     1679701 ns      1679499 ns          419 bytes_per_second=948.651M/s lines_per_second=28.446M/s tokens_per_second=59.5416M/s
BM_GroupingSymbols/32/32/4                     1647815 ns      1647816 ns          427 bytes_per_second=953.342M/s lines_per_second=28.559M/s tokens_per_second=60.6864M/s
BM_GroupingSymbols/32/32/8                     1581283 ns      1581075 ns          450 bytes_per_second=942.39M/s lines_per_second=28.1201M/s tokens_per_second=63.2481M/s
BM_GroupingSymbols/32/32/16                    1445591 ns      1445526 ns          477 bytes_per_second=936.105M/s lines_per_second=27.7442M/s tokens_per_second=69.179M/s
BM_GroupingSymbols/32/32/32                    1296335 ns      1296094 ns          544 bytes_per_second=882.943M/s lines_per_second=25.8276M/s tokens_per_second=77.1549M/s
BM_BlankLines/1                                5774442 ns      5774454 ns          107 bytes_per_second=99.0921M/s lines_per_second=17.3175M/s tokens_per_second=17.3177M/s
BM_BlankLines/4                                8712135 ns      8711977 ns           75 bytes_per_second=98.5198M/s lines_per_second=45.9133M/s tokens_per_second=11.4785M/s
BM_BlankLines/16                              32313782 ns     32307655 ns           22 bytes_per_second=61.9884M/s lines_per_second=49.5234M/s tokens_per_second=3.09524M/s
BM_BlankLines/64                              87562163 ns     87543476 ns            7 bytes_per_second=75.166M/s lines_per_second=73.1058M/s tokens_per_second=1.14229M/s
BM_BlankLines/128                            160336428 ns    160298644 ns            4 bytes_per_second=79.1257M/s lines_per_second=79.8502M/s tokens_per_second=623.836k/s
BM_CommentLines/1/0/0                          7298959 ns      7298427 ns           89 bytes_per_second=117.601M/s lines_per_second=27.4029M/s tokens_per_second=13.7016M/s
BM_CommentLines/4/0/0                         10002223 ns     10002239 ns           67 bytes_per_second=171.622M/s lines_per_second=49.9883M/s tokens_per_second=9.99776M/s
BM_CommentLines/128/0/0                      143356660 ns    143356412 ns            5 bytes_per_second=259.444M/s lines_per_second=89.9846M/s tokens_per_second=697.562k/s
BM_CommentLines/1/30/0                         7421994 ns      7421289 ns           87 bytes_per_second=501.166M/s lines_per_second=26.9492M/s tokens_per_second=13.4747M/s
BM_CommentLines/4/30/0                        11304903 ns     11303972 ns           56 bytes_per_second=1.13696G/s lines_per_second=44.2318M/s tokens_per_second=8.84645M/s
BM_CommentLines/128/30/0                     156244144 ns    156217089 ns            4 bytes_per_second=2.52178G/s lines_per_second=82.5766M/s tokens_per_second=640.135k/s
BM_CommentLines/1/70/0                         7486688 ns      7485340 ns           80 bytes_per_second=1006.49M/s lines_per_second=26.7186M/s tokens_per_second=13.3594M/s
BM_CommentLines/4/70/0                        14762881 ns     14761417 ns           45 bytes_per_second=1.88011G/s lines_per_second=33.8717M/s tokens_per_second=6.77442M/s
BM_CommentLines/128/70/0                     179933089 ns    179903592 ns            4 bytes_per_second=4.84025G/s lines_per_second=71.7044M/s tokens_per_second=555.853k/s
BM_CommentLines/1/0/2                          7473119 ns      7472370 ns           87 bytes_per_second=140.389M/s lines_per_second=26.765M/s tokens_per_second=13.3826M/s
BM_CommentLines/4/0/2                          9702727 ns      9700268 ns           66 bytes_per_second=255.615M/s lines_per_second=51.5445M/s tokens_per_second=10.309M/s
BM_CommentLines/128/0/2                      140504132 ns    140480254 ns            5 bytes_per_second=438.544M/s lines_per_second=91.8269M/s tokens_per_second=711.844k/s
BM_CommentLines/1/30/2                         7530847 ns      7529227 ns           82 bytes_per_second=519.314M/s lines_per_second=26.5629M/s tokens_per_second=13.2816M/s
BM_CommentLines/4/30/2                        11646677 ns     11644972 ns           56 bytes_per_second=1.16764G/s lines_per_second=42.9366M/s tokens_per_second=8.5874M/s
BM_CommentLines/128/30/2                     161292392 ns    161273904 ns            4 bytes_per_second=2.59054G/s lines_per_second=79.9873M/s tokens_per_second=620.063k/s
BM_CommentLines/1/70/2                         7700751 ns      7700365 ns           83 bytes_per_second=1003.16M/s lines_per_second=25.9725M/s tokens_per_second=12.9864M/s
BM_CommentLines/4/70/2                        14133018 ns     14131523 ns           45 bytes_per_second=2.01664G/s lines_per_second=35.3815M/s tokens_per_second=7.07638M/s
BM_CommentLines/128/70/2                     179085026 ns    179057072 ns            4 bytes_per_second=4.99628G/s lines_per_second=72.0433M/s tokens_per_second=558.481k/s
BM_CommentLines/1/0/8                          7792060 ns      7791951 ns           83 bytes_per_second=208.065M/s lines_per_second=25.6673M/s tokens_per_second=12.8338M/s
BM_CommentLines/4/0/8                         10329194 ns     10329006 ns           61 bytes_per_second=461.644M/s lines_per_second=48.4069M/s tokens_per_second=9.68147M/s
BM_CommentLines/128/0/8                      140917902 ns    140917975 ns            5 bytes_per_second=956.927M/s lines_per_second=91.5417M/s tokens_per_second=709.633k/s
BM_CommentLines/1/30/8                         7813308 ns      7811702 ns           82 bytes_per_second=573.784M/s lines_per_second=25.6024M/s tokens_per_second=12.8013M/s
BM_CommentLines/4/30/8                        12052779 ns     12051440 ns           54 bytes_per_second=1.31373G/s lines_per_second=41.4884M/s tokens_per_second=8.29776M/s
BM_CommentLines/128/30/8                     163767409 ns    163753783 ns            4 bytes_per_second=2.9881G/s lines_per_second=78.776M/s tokens_per_second=610.673k/s
BM_CommentLines/1/70/8                         8188137 ns      8187614 ns           80 bytes_per_second=1013.35M/s lines_per_second=24.4269M/s tokens_per_second=12.2136M/s
BM_CommentLines/4/70/8                        15723650 ns     15721559 ns           43 bytes_per_second=1.95485G/s lines_per_second=31.8031M/s tokens_per_second=6.36069M/s
BM_CommentLines/128/70/8                     182579515 ns    182554119 ns            4 bytes_per_second=5.29237G/s lines_per_second=70.6633M/s tokens_per_second=547.783k/s
BM_SpeedOfLightStrCpy                            27009 ns        27006 ns        25887 bytes_per_second=37.594G/s lines_per_second=1.17547G/s tokens_per_second=3.70286G/s
BM_SpeedOfLightDispatch<1>                     1850860 ns      1850660 ns          381 bytes_per_second=561.764M/s lines_per_second=17.1533M/s tokens_per_second=54.0348M/s
BM_SpeedOfLightDispatch<2>                     1876473 ns      1876369 ns          347 bytes_per_second=554.067M/s lines_per_second=16.9183M/s tokens_per_second=53.2944M/s
BM_SpeedOfLightDispatch<4>                     2208441 ns      2208443 ns          314 bytes_per_second=470.755M/s lines_per_second=14.3744M/s tokens_per_second=45.2808M/s
BM_SpeedOfLightDispatch<8>                     2824174 ns      2824137 ns          250 bytes_per_second=368.125M/s lines_per_second=11.2406M/s tokens_per_second=35.409M/s
BM_SpeedOfLightDispatch<16>                    4306633 ns      4306325 ns          165 bytes_per_second=241.42M/s lines_per_second=7.37172M/s tokens_per_second=23.2217M/s
BM_SpeedOfLightDispatch<32>                    6300653 ns      6300102 ns          113 bytes_per_second=165.019M/s lines_per_second=5.03881M/s tokens_per_second=15.8728M/s
BM_SpeedOfLightDispatch<MaxDispatchTargets>    8879663 ns      8878510 ns           79 bytes_per_second=117.096M/s lines_per_second=3.57549M/s tokens_per_second=11.2632M/s
```

After:

```
------------------------------------------------------------------------------------------------------
Benchmark                                            Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------------------------------------
BM_ValidKeywords                               2821833 ns      2821832 ns          247 bytes_per_second=211.642M/s tokens_per_second=35.438M/s
BM_ValidKeywordsAsRawIdentifiers               3204326 ns      3203964 ns          216 bytes_per_second=245.931M/s tokens_per_second=31.2113M/s
BM_RawIdentifierFocus                          6076723 ns      6076107 ns           98 bytes_per_second=257.524M/s tokens_per_second=16.4579M/s
BM_ValidIdentifiers<1, 64, false>              6303093 ns      6302632 ns          101 bytes_per_second=233.682M/s tokens_per_second=15.8664M/s
BM_ValidIdentifiers<1, 1, true>                3687668 ns      3687338 ns          194 bytes_per_second=77.5902M/s tokens_per_second=27.1198M/s
BM_ValidIdentifiers<3, 5, true>                5298920 ns      5298465 ns          106 bytes_per_second=107.994M/s tokens_per_second=18.8734M/s
BM_ValidIdentifiers<3, 16, true>               4880695 ns      4879704 ns          118 bytes_per_second=224.75M/s tokens_per_second=20.493M/s
BM_ValidIdentifiers<12, 64, true>              5413070 ns      5411832 ns          103 bytes_per_second=704.84M/s tokens_per_second=18.478M/s
BM_ValidIdentifiers<16, 16, true>              5052780 ns      5051309 ns          110 bytes_per_second=339.835M/s tokens_per_second=19.7968M/s
BM_ValidIdentifiers<24, 24, true>              5221580 ns      5220851 ns          104 bytes_per_second=474.933M/s tokens_per_second=19.154M/s
BM_ValidIdentifiers<32, 32, true>              5786811 ns      5786806 ns           99 bytes_per_second=560.325M/s tokens_per_second=17.2807M/s
BM_ValidIdentifiers<48, 48, true>              5959506 ns      5959502 ns           96 bytes_per_second=800.129M/s tokens_per_second=16.7799M/s
BM_ValidIdentifiers<64, 64, true>              7831469 ns      7830629 ns           75 bytes_per_second=803.799M/s tokens_per_second=12.7704M/s
BM_ValidIdentifiers<80, 80, true>              7905843 ns      7904727 ns           75 bytes_per_second=989.298M/s tokens_per_second=12.6507M/s
BM_HorizontalWhitespace/1                      5091171 ns      5090660 ns          109 bytes_per_second=112.402M/s tokens_per_second=19.6438M/s
BM_HorizontalWhitespace/4                      5020344 ns      5020344 ns          112 bytes_per_second=170.965M/s tokens_per_second=19.919M/s
BM_HorizontalWhitespace/16                     5846801 ns      5846459 ns           99 bytes_per_second=342.549M/s tokens_per_second=17.1044M/s
BM_HorizontalWhitespace/64                     7803183 ns      7802357 ns           78 bytes_per_second=843.372M/s tokens_per_second=12.8166M/s
BM_HorizontalWhitespace/128                   10600500 ns     10598602 ns           59 bytes_per_second=1.16869G/s tokens_per_second=9.43521M/s
BM_RandomSource                                4824062 ns      4823496 ns          139 bytes_per_second=215.536M/s lines_per_second=6.58133M/s tokens_per_second=20.7319M/s
BM_GroupingSymbols/1/0/0                       4116846 ns      4116549 ns          170 bytes_per_second=207.431M/s lines_per_second=18.2191M/s tokens_per_second=24.2922M/s
BM_GroupingSymbols/2/0/0                       3024336 ns      3024156 ns          236 bytes_per_second=241.144M/s lines_per_second=27.5548M/s tokens_per_second=33.0671M/s
BM_GroupingSymbols/3/0/0                       2789794 ns      2789256 ns          252 bytes_per_second=255.601M/s lines_per_second=31.3704M/s tokens_per_second=35.8519M/s
BM_GroupingSymbols/4/0/0                       2496498 ns      2496237 ns          283 bytes_per_second=296.921M/s lines_per_second=36.0543M/s tokens_per_second=40.0603M/s
BM_GroupingSymbols/8/0/0                       2200846 ns      2200611 ns          313 bytes_per_second=456.487M/s lines_per_second=42.9131M/s tokens_per_second=45.4419M/s
BM_GroupingSymbols/16/0/0                      2415237 ns      2415015 ns          288 bytes_per_second=703.225M/s lines_per_second=40.1873M/s tokens_per_second=41.4076M/s
BM_GroupingSymbols/32/0/0                      3171195 ns      3170504 ns          215 bytes_per_second=1004.97M/s lines_per_second=31.0597M/s tokens_per_second=31.5407M/s
BM_GroupingSymbols/0/1/0                       3627393 ns      3626737 ns          193 bytes_per_second=209.15M/s lines_per_second=6.89325M/s tokens_per_second=27.573M/s
BM_GroupingSymbols/0/2/0                       2501189 ns      2500946 ns          275 bytes_per_second=215.33M/s lines_per_second=6.66388M/s tokens_per_second=39.9849M/s
BM_GroupingSymbols/0/3/0                       2149513 ns      2149282 ns          318 bytes_per_second=198.593M/s lines_per_second=5.8159M/s tokens_per_second=46.5272M/s
BM_GroupingSymbols/0/4/0                       1793658 ns      1793292 ns          384 bytes_per_second=200.59M/s lines_per_second=5.57634M/s tokens_per_second=55.7634M/s
BM_GroupingSymbols/0/8/0                       1302555 ns      1302272 ns          542 bytes_per_second=185.589M/s lines_per_second=4.26562M/s tokens_per_second=76.7889M/s
BM_GroupingSymbols/0/16/0                      1042993 ns      1042818 ns          671 bytes_per_second=165.428M/s lines_per_second=2.82024M/s tokens_per_second=95.8941M/s
BM_GroupingSymbols/0/32/0                       955561 ns       955471 ns          749 bytes_per_second=141.082M/s lines_per_second=1.58561M/s tokens_per_second=104.66M/s
BM_GroupingSymbols/0/0/1                       3659797 ns      3659369 ns          194 bytes_per_second=207.285M/s lines_per_second=6.83178M/s tokens_per_second=27.3271M/s
BM_GroupingSymbols/0/0/2                       2467556 ns      2467190 ns          281 bytes_per_second=218.276M/s lines_per_second=6.75505M/s tokens_per_second=40.5319M/s
BM_GroupingSymbols/0/0/3                       2152274 ns      2151938 ns          326 bytes_per_second=198.348M/s lines_per_second=5.80872M/s tokens_per_second=46.4697M/s
BM_GroupingSymbols/0/0/4                       1805982 ns      1805877 ns          368 bytes_per_second=199.192M/s lines_per_second=5.53747M/s tokens_per_second=55.3747M/s
BM_GroupingSymbols/0/0/8                       1313041 ns      1312833 ns          539 bytes_per_second=184.096M/s lines_per_second=4.23131M/s tokens_per_second=76.1711M/s
BM_GroupingSymbols/0/0/16                      1065565 ns      1065301 ns          659 bytes_per_second=161.937M/s lines_per_second=2.76072M/s tokens_per_second=93.8702M/s
BM_GroupingSymbols/0/0/32                       946630 ns       946514 ns          726 bytes_per_second=142.417M/s lines_per_second=1.60061M/s tokens_per_second=105.651M/s
BM_GroupingSymbols/32/1/0                      2991120 ns      2991121 ns          233 bytes_per_second=1034.6M/s lines_per_second=31.9445M/s tokens_per_second=33.4323M/s
BM_GroupingSymbols/32/2/0                      2893280 ns      2892944 ns          245 bytes_per_second=1039.45M/s lines_per_second=32.085M/s tokens_per_second=34.5669M/s
BM_GroupingSymbols/32/3/0                      2819177 ns      2819024 ns          239 bytes_per_second=1037.81M/s lines_per_second=32.004M/s tokens_per_second=35.4733M/s
BM_GroupingSymbols/32/4/0                      2778376 ns      2778018 ns          249 bytes_per_second=1026.15M/s lines_per_second=31.6107M/s tokens_per_second=35.9969M/s
BM_GroupingSymbols/32/8/0                      2538279 ns      2538279 ns          275 bytes_per_second=1016.65M/s lines_per_second=31.216M/s tokens_per_second=39.3968M/s
BM_GroupingSymbols/32/16/0                     2291819 ns      2291693 ns          305 bytes_per_second=948.937M/s lines_per_second=28.9306M/s tokens_per_second=43.6359M/s
BM_GroupingSymbols/32/32/0                     1943560 ns      1943560 ns          366 bytes_per_second=855.273M/s lines_per_second=25.7183M/s tokens_per_second=51.452M/s
BM_GroupingSymbols/32/32/1                     1902069 ns      1901915 ns          375 bytes_per_second=861.158M/s lines_per_second=25.8713M/s tokens_per_second=52.5786M/s
BM_GroupingSymbols/32/32/2                     1877847 ns      1877752 ns          379 bytes_per_second=860.371M/s lines_per_second=25.8234M/s tokens_per_second=53.2552M/s
BM_GroupingSymbols/32/32/3                     1837280 ns      1837016 ns          381 bytes_per_second=867.308M/s lines_per_second=26.0069M/s tokens_per_second=54.4361M/s
BM_GroupingSymbols/32/32/4                     1841010 ns      1840902 ns          380 bytes_per_second=853.349M/s lines_per_second=25.5636M/s tokens_per_second=54.3212M/s
BM_GroupingSymbols/32/32/8                     1734676 ns      1734437 ns          405 bytes_per_second=859.062M/s lines_per_second=25.6337M/s tokens_per_second=57.6556M/s
BM_GroupingSymbols/32/32/16                    1641169 ns      1640934 ns          422 bytes_per_second=824.63M/s lines_per_second=24.4403M/s tokens_per_second=60.9409M/s
BM_GroupingSymbols/32/32/32                    1506988 ns      1506914 ns          472 bytes_per_second=759.418M/s lines_per_second=22.2143M/s tokens_per_second=66.3608M/s
BM_BlankLines/1                                5658057 ns      5657150 ns          110 bytes_per_second=101.147M/s lines_per_second=17.6766M/s tokens_per_second=17.6767M/s
BM_BlankLines/4                                8346196 ns      8346052 ns           77 bytes_per_second=102.839M/s lines_per_second=47.9264M/s tokens_per_second=11.9817M/s
BM_BlankLines/16                              31147085 ns     31144610 ns           22 bytes_per_second=64.3033M/s lines_per_second=51.3727M/s tokens_per_second=3.21083M/s
BM_BlankLines/64                              83743719 ns     83743762 ns            8 bytes_per_second=78.5765M/s lines_per_second=76.4228M/s tokens_per_second=1.19412M/s
BM_BlankLines/128                            152299627 ns    152274606 ns            4 bytes_per_second=83.2952M/s lines_per_second=84.0578M/s tokens_per_second=656.708k/s
BM_CommentLines/1/0/0                          7535704 ns      7535149 ns           83 bytes_per_second=113.906M/s lines_per_second=26.542M/s tokens_per_second=13.2711M/s
BM_CommentLines/4/0/0                         10107724 ns     10106088 ns           66 bytes_per_second=169.858M/s lines_per_second=49.4746M/s tokens_per_second=9.89503M/s
BM_CommentLines/128/0/0                      130061022 ns    130029826 ns            5 bytes_per_second=286.034M/s lines_per_second=99.207M/s tokens_per_second=769.054k/s
BM_CommentLines/1/30/0                         7773816 ns      7772726 ns           83 bytes_per_second=478.506M/s lines_per_second=25.7307M/s tokens_per_second=12.8655M/s
BM_CommentLines/4/30/0                        11436116 ns     11434452 ns           56 bytes_per_second=1.12398G/s lines_per_second=43.7271M/s tokens_per_second=8.7455M/s
BM_CommentLines/128/30/0                     155047059 ns    155033899 ns            4 bytes_per_second=2.54103G/s lines_per_second=83.2068M/s tokens_per_second=645.02k/s
BM_CommentLines/1/70/0                         8016209 ns      8014861 ns           75 bytes_per_second=939.998M/s lines_per_second=24.9534M/s tokens_per_second=12.4768M/s
BM_CommentLines/4/70/0                        14894752 ns     14891800 ns           44 bytes_per_second=1.86365G/s lines_per_second=33.5752M/s tokens_per_second=6.7151M/s
BM_CommentLines/128/70/0                     176667108 ns    176631061 ns            4 bytes_per_second=4.92993G/s lines_per_second=73.0329M/s tokens_per_second=566.152k/s
BM_CommentLines/1/0/2                          7764475 ns      7763675 ns           84 bytes_per_second=135.121M/s lines_per_second=25.7607M/s tokens_per_second=12.8805M/s
BM_CommentLines/4/0/2                         10238104 ns     10236809 ns           65 bytes_per_second=242.217M/s lines_per_second=48.8429M/s tokens_per_second=9.76867M/s
BM_CommentLines/128/0/2                      130208823 ns    130190640 ns            5 bytes_per_second=473.204M/s lines_per_second=99.0845M/s tokens_per_second=768.104k/s
BM_CommentLines/1/30/2                         7941224 ns      7940584 ns           78 bytes_per_second=492.411M/s lines_per_second=25.1868M/s tokens_per_second=12.5935M/s
BM_CommentLines/4/30/2                        11936879 ns     11934453 ns           56 bytes_per_second=1.13932G/s lines_per_second=41.8951M/s tokens_per_second=8.3791M/s
BM_CommentLines/128/30/2                     156978531 ns    156967142 ns            4 bytes_per_second=2.66162G/s lines_per_second=82.182M/s tokens_per_second=637.076k/s
BM_CommentLines/1/70/2                         8223614 ns      8222923 ns           79 bytes_per_second=939.409M/s lines_per_second=24.322M/s tokens_per_second=12.1611M/s
BM_CommentLines/4/70/2                        15216239 ns     15215047 ns           45 bytes_per_second=1.87303G/s lines_per_second=32.8619M/s tokens_per_second=6.57244M/s
BM_CommentLines/128/70/2                     176810589 ns    176755958 ns            4 bytes_per_second=5.06133G/s lines_per_second=72.9813M/s tokens_per_second=565.752k/s
BM_CommentLines/1/0/8                          7901146 ns      7899003 ns           82 bytes_per_second=205.245M/s lines_per_second=25.3194M/s tokens_per_second=12.6598M/s
BM_CommentLines/4/0/8                         10135919 ns     10134576 ns           66 bytes_per_second=470.501M/s lines_per_second=49.3356M/s tokens_per_second=9.86721M/s
BM_CommentLines/128/0/8                      132206448 ns    132206347 ns            5 bytes_per_second=1019.98M/s lines_per_second=97.5738M/s tokens_per_second=756.393k/s
BM_CommentLines/1/30/8                         7981189 ns      7981202 ns           79 bytes_per_second=561.598M/s lines_per_second=25.0586M/s tokens_per_second=12.5294M/s
BM_CommentLines/4/30/8                        12311389 ns     12309078 ns           54 bytes_per_second=1.28623G/s lines_per_second=40.62M/s tokens_per_second=8.12409M/s
BM_CommentLines/128/30/8                     160432032 ns    160393999 ns            4 bytes_per_second=3.05069G/s lines_per_second=80.4261M/s tokens_per_second=623.465k/s
BM_CommentLines/1/70/8                         8199080 ns      8199078 ns           79 bytes_per_second=1011.93M/s lines_per_second=24.3927M/s tokens_per_second=12.1965M/s
BM_CommentLines/4/70/8                        16087954 ns     16086159 ns           44 bytes_per_second=1.91055G/s lines_per_second=31.0823M/s tokens_per_second=6.21652M/s
BM_CommentLines/128/70/8                     175714908 ns    175675241 ns            4 bytes_per_second=5.4996G/s lines_per_second=73.4302M/s tokens_per_second=569.232k/s
BM_SpeedOfLightStrCpy                            28860 ns        28858 ns        26092 bytes_per_second=35.181G/s lines_per_second=1.10002G/s tokens_per_second=3.46519G/s
BM_SpeedOfLightDispatch<1>                     1823558 ns      1823473 ns          385 bytes_per_second=570.14M/s lines_per_second=17.4091M/s tokens_per_second=54.8404M/s
BM_SpeedOfLightDispatch<2>                     2013453 ns      2013250 ns          343 bytes_per_second=516.396M/s lines_per_second=15.768M/s tokens_per_second=49.6709M/s
BM_SpeedOfLightDispatch<4>                     2225145 ns      2224920 ns          312 bytes_per_second=467.268M/s lines_per_second=14.2679M/s tokens_per_second=44.9454M/s
BM_SpeedOfLightDispatch<8>                     2851730 ns      2851590 ns          251 bytes_per_second=364.581M/s lines_per_second=11.1324M/s tokens_per_second=35.0682M/s
BM_SpeedOfLightDispatch<16>                    4431584 ns      4431156 ns          161 bytes_per_second=234.619M/s lines_per_second=7.16404M/s tokens_per_second=22.5675M/s
BM_SpeedOfLightDispatch<32>                    6229285 ns      6228092 ns          111 bytes_per_second=166.927M/s lines_per_second=5.09707M/s tokens_per_second=16.0563M/s
BM_SpeedOfLightDispatch<MaxDispatchTargets>    8967228 ns      8965583 ns           79 bytes_per_second=115.958M/s lines_per_second=3.54076M/s tokens_per_second=11.1538M/s
```

---------

Co-authored-by: Chandler Carruth <chandlerc@gmail.com>
Jon Ross-Perkins 2 years ago
parent
commit
cafcd88882

+ 1 - 0
language_server/BUILD

@@ -23,6 +23,7 @@ cc_binary(
         "//common:error",
         "//toolchain/base:value_store",
         "//toolchain/diagnostics:null_diagnostics",
+        "//toolchain/lex",
         "//toolchain/lex:tokenized_buffer",
         "//toolchain/parse:node_kind",
         "//toolchain/parse:tree",

+ 2 - 3
language_server/language_server.cpp

@@ -7,7 +7,7 @@
 #include "clang-tools-extra/clangd/Protocol.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
-#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/parse/node_kind.h"
 #include "toolchain/parse/tree.h"
 #include "toolchain/source/source_buffer.h"
@@ -99,8 +99,7 @@ void LanguageServer::OnDocumentSymbol(
               llvm::MemoryBuffer::getMemBufferCopy(files_.at(file)));
 
   auto buf = SourceBuffer::CreateFromFile(vfs, file, NullDiagnosticConsumer());
-  auto lexed =
-      Lex::TokenizedBuffer::Lex(value_stores, *buf, NullDiagnosticConsumer());
+  auto lexed = Lex::Lex(value_stores, *buf, NullDiagnosticConsumer());
   auto parsed = Parse::Tree::Parse(lexed, NullDiagnosticConsumer(), nullptr);
   std::vector<clang::clangd::DocumentSymbol> result;
   for (const auto& node : parsed.postorder()) {

+ 1 - 1
toolchain/driver/BUILD

@@ -26,7 +26,7 @@ cc_library(
         "//toolchain/codegen",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:sorting_diagnostic_consumer",
-        "//toolchain/lex:tokenized_buffer",
+        "//toolchain/lex",
         "//toolchain/lower",
         "//toolchain/parse:tree",
         "//toolchain/sem_ir:file",

+ 3 - 4
toolchain/driver/driver.cpp

@@ -21,7 +21,7 @@
 #include "toolchain/codegen/codegen.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/sorting_diagnostic_consumer.h"
-#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/lower/lower.h"
 #include "toolchain/parse/tree.h"
 #include "toolchain/sem_ir/formatter.h"
@@ -420,9 +420,8 @@ class Driver::CompilationUnit {
     CARBON_VLOG() << "*** SourceBuffer ***\n```\n"
                   << source_->text() << "\n```\n";
 
-    LogCall("Lex::TokenizedBuffer::Lex", [&] {
-      tokens_ = Lex::TokenizedBuffer::Lex(value_stores_, *source_, *consumer_);
-    });
+    LogCall("Lex::Lex",
+            [&] { tokens_ = Lex::Lex(value_stores_, *source_, *consumer_); });
     if (options_.dump_tokens) {
       consumer_->Flush();
       driver_->output_stream_ << tokens_;

+ 23 - 1
toolchain/lex/BUILD

@@ -174,6 +174,25 @@ cc_fuzz_test(
     ],
 )
 
+cc_library(
+    name = "lex",
+    srcs = ["lex.cpp"],
+    hdrs = ["lex.h"],
+    deps = [
+        ":character_set",
+        ":helpers",
+        ":numeric_literal",
+        ":string_literal",
+        ":token_kind",
+        ":tokenized_buffer",
+        "//common:check",
+        "//toolchain/base:value_store",
+        "//toolchain/diagnostics:diagnostic_emitter",
+        "//toolchain/source:source_buffer",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
@@ -200,6 +219,7 @@ cc_library(
     testonly = 1,
     hdrs = ["tokenized_buffer_test_helpers.h"],
     deps = [
+        ":lex",
         ":tokenized_buffer",
         "//common:check",
         "//toolchain/base:value_store",
@@ -213,6 +233,7 @@ cc_test(
     size = "small",
     srcs = ["tokenized_buffer_test.cpp"],
     deps = [
+        ":lex",
         ":tokenized_buffer",
         ":tokenized_buffer_test_helpers",
         "//testing/base:gtest_main",
@@ -232,7 +253,7 @@ cc_fuzz_test(
     srcs = ["tokenized_buffer_fuzzer.cpp"],
     corpus = glob(["fuzzer_corpus/tokenized_buffer/*"]),
     deps = [
-        ":tokenized_buffer",
+        ":lex",
         "//common:check",
         "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
@@ -246,6 +267,7 @@ cc_binary(
     testonly = 1,
     srcs = ["tokenized_buffer_benchmark.cpp"],
     deps = [
+        ":lex",
         ":token_kind",
         ":tokenized_buffer",
         "//common:check",

+ 1315 - 0
toolchain/lex/lex.cpp

@@ -0,0 +1,1315 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/lex/lex.h"
+
+#include <array>
+
+#include "common/check.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Compiler.h"
+#include "toolchain/base/value_store.h"
+#include "toolchain/lex/character_set.h"
+#include "toolchain/lex/helpers.h"
+#include "toolchain/lex/numeric_literal.h"
+#include "toolchain/lex/string_literal.h"
+#include "toolchain/lex/tokenized_buffer.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#define CARBON_USE_SIMD 1
+#elif __x86_64__
+#include <x86intrin.h>
+#define CARBON_USE_SIMD 1
+#else
+#define CARBON_USE_SIMD 0
+#endif
+
+namespace Carbon::Lex {
+
+// Implementation of the lexer logic itself.
+//
+// The design is that lexing can loop over the source buffer, consuming it into
+// tokens by calling into this API. This class handles the state and breaks down
+// the different lexing steps that may be used. It directly updates the provided
+// tokenized buffer with the lexed tokens.
+//
+// We'd typically put this in an anonymous namespace, but it is `friend`-ed by
+// the `TokenizedBuffer`. One of the important benefits of being in an anonymous
+// namespace is having internal linkage. That allows the optimizer to much more
+// aggressively inline away functions that are called in only one place. We keep
+// that benefit for now by using the `internal_linkage` attribute.
+//
+// TODO: Investigate ways to refactor the code that allow moving this into an
+// anonymous namespace without overly exposing implementation details of the
+// `TokenizedBuffer` or undermining the performance constraints of the lexer.
+class [[clang::internal_linkage]] Lexer {
+ public:
+  // Symbolic result of a lexing action. This indicates whether we successfully
+  // lexed a token, or whether other lexing actions should be attempted.
+  //
+  // While it wraps a simple boolean state, its API both helps make the failures
+  // more self documenting, and by consuming the actual token constructively
+  // when one is produced, it helps ensure the correct result is returned.
+  class LexResult {
+   public:
+    // Consumes (and discards) a valid token to construct a result
+    // indicating a token has been produced. Relies on implicit conversions.
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    LexResult(Token /*discarded_token*/) : LexResult(true) {}
+
+    // Returns a result indicating no token was produced.
+    static auto NoMatch() -> LexResult { return LexResult(false); }
+
+    // Tests whether a token was produced by the lexing routine, and
+    // the lexer can continue forming tokens.
+    explicit operator bool() const { return formed_token_; }
+
+   private:
+    explicit LexResult(bool formed_token) : formed_token_(formed_token) {}
+
+    bool formed_token_;
+  };
+
+  Lexer(SharedValueStores& value_stores, SourceBuffer& source,
+        DiagnosticConsumer& consumer)
+      : buffer_(value_stores, source),
+        consumer_(consumer),
+        translator_(&buffer_),
+        emitter_(translator_, consumer_),
+        token_translator_(&buffer_),
+        token_emitter_(token_translator_, consumer_) {}
+
+  // Find all line endings and create the line data structures.
+  //
+  // Explicitly kept out-of-line because this is a significant loop that is
+  // useful to have in the profile and it doesn't simplify by inlining at all.
+  // But because it can, the compiler will flatten this otherwise.
+  [[gnu::noinline]] auto CreateLines(llvm::StringRef source_text) -> void;
+
+  auto current_line() -> Line { return Line(line_index_); }
+
+  auto current_line_info() -> TokenizedBuffer::LineInfo* {
+    return &buffer_.line_infos_[line_index_];
+  }
+
+  auto ComputeColumn(ssize_t position) -> int {
+    CARBON_DCHECK(position >= current_line_info()->start);
+    return position - current_line_info()->start;
+  }
+
+  auto NoteWhitespace() -> void {
+    buffer_.token_infos_.back().has_trailing_space = true;
+  }
+
+  auto SkipHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
+      -> void;
+
+  auto LexHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
+      -> void;
+
+  auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
+      -> void;
+
+  auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
+      -> void;
+
+  auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void;
+
+  auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
+      -> LexResult;
+
+  auto LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
+      -> LexResult;
+
+  auto LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                             ssize_t& position) -> Token;
+
+  auto LexOpeningSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                             ssize_t& position) -> LexResult;
+
+  auto LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                             ssize_t& position) -> LexResult;
+
+  auto LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
+      -> LexResult;
+
+  // Given a word that has already been lexed, determine whether it is a type
+  // literal and if so form the corresponding token.
+  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column) -> LexResult;
+
+  // Closes all open groups that cannot remain open across a closing symbol.
+  // Users may pass `Error` to close all open groups.
+  //
+  // Explicitly kept out-of-line because it's on an error path, and so inlining
+  // would be performance neutral. Keeping it out-of-line makes the generated
+  // code easier to understand when profiling.
+  [[gnu::noinline]] auto CloseInvalidOpenGroups(TokenKind kind,
+                                                ssize_t position) -> void;
+
+  auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
+      -> LexResult;
+
+  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
+                                      ssize_t& position) -> LexResult;
+
+  auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult;
+
+  auto LexStartOfFile(llvm::StringRef source_text, ssize_t& position) -> void;
+
+  auto LexEndOfFile(llvm::StringRef source_text, ssize_t position) -> void;
+
+  // The main entry point for dispatching through the lexer's table. This method
+  // should always fully consume the source text.
+  auto Lex() && -> TokenizedBuffer;
+
+ private:
+  TokenizedBuffer buffer_;
+
+  ssize_t line_index_;
+
+  llvm::SmallVector<Token> open_groups_;
+
+  ErrorTrackingDiagnosticConsumer consumer_;
+
+  TokenizedBuffer::SourceBufferLocationTranslator translator_;
+  LexerDiagnosticEmitter emitter_;
+
+  TokenLocationTranslator token_translator_;
+  TokenDiagnosticEmitter token_emitter_;
+};
+
+// TODO: Move Overload and VariantMatch somewhere more central.
+
+// Form an overload set from a list of functions. For example:
+//
+// ```
+// auto overloaded = Overload{[] (int) {}, [] (float) {}};
+// ```
+template <typename... Fs>
+struct Overload : Fs... {
+  using Fs::operator()...;
+};
+template <typename... Fs>
+Overload(Fs...) -> Overload<Fs...>;
+
+// Pattern-match against the type of the value stored in the variant `V`. Each
+// element of `fs` should be a function that takes one or more of the variant
+// values in `V`.
+template <typename V, typename... Fs>
+auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
+  return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
+}
+
+#if CARBON_USE_SIMD
+namespace {
+#if __ARM_NEON
+using SIMDMaskT = uint8x16_t;
+#elif __x86_64__
+using SIMDMaskT = __m128i;
+#else
+#error "Unsupported SIMD architecture!"
+#endif
+using SIMDMaskArrayT = std::array<SIMDMaskT, sizeof(SIMDMaskT) + 1>;
+}  // namespace
+// A table of masks to include 0-16 bytes of an SSE register.
+static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
+  SIMDMaskArrayT masks = {};
+  for (int i = 1; i < static_cast<int>(masks.size()); ++i) {
+    masks[i] =
+        // The SIMD types and constexpr require a C-style cast.
+        // NOLINTNEXTLINE(google-readability-casting)
+        (SIMDMaskT)(std::numeric_limits<unsigned __int128>::max() >>
+                    ((sizeof(SIMDMaskT) - i) * 8));
+  }
+  return masks;
+}();
+#endif  // CARBON_USE_SIMD
+
+// A table of booleans that we can use to classify bytes as being valid
+// identifier start. This is used by raw identifier detection.
+static constexpr std::array<bool, 256> IsIdStartByteTable = [] {
+  std::array<bool, 256> table = {};
+  for (char c = 'A'; c <= 'Z'; ++c) {
+    table[c] = true;
+  }
+  for (char c = 'a'; c <= 'z'; ++c) {
+    table[c] = true;
+  }
+  table['_'] = true;
+  return table;
+}();
+
+// A table of booleans that we can use to classify bytes as being valid
+// identifier (or keyword) characters. This is used in the generic,
+// non-vectorized fallback code to scan for length of an identifier.
+static constexpr std::array<bool, 256> IsIdByteTable = [] {
+  std::array<bool, 256> table = IsIdStartByteTable;
+  for (char c = '0'; c <= '9'; ++c) {
+    table[c] = true;
+  }
+  return table;
+}();
+
+// Baseline scalar version, also available for scalar-fallback in SIMD code.
+// Uses `ssize_t` for performance when indexing in the loop.
+//
+// TODO: This assumes all Unicode characters are non-identifiers.
+static auto ScanForIdentifierPrefixScalar(llvm::StringRef text, ssize_t i)
+    -> llvm::StringRef {
+  const ssize_t size = text.size();
+  while (i < size && IsIdByteTable[static_cast<unsigned char>(text[i])]) {
+    ++i;
+  }
+
+  return text.substr(0, i);
+}
+
+#if CARBON_USE_SIMD && __x86_64__
+// The SIMD code paths use a scheme derived from the techniques in Geoff
+// Langdale and Daniel Lemire's work on parsing JSON[1]. Specifically, that
+// paper outlines a technique of using two 4-bit indexed in-register look-up
+// tables (LUTs) to classify bytes in a branchless SIMD code sequence.
+//
+// [1]: https://arxiv.org/pdf/1902.08318.pdf
+//
+// The goal is to get a bit mask classifying different sets of bytes. For each
+// input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
+// character. Otherwise, we want the mask bits to be set with the following
+// logic derived by inspecting the high nibble and low nibble of the input:
+// bit0 = 1 for `_`: high `0x5` and low `0xF`
+// bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
+// bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
+// bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
+// bit4 = unused
+// bit5 = unused
+// bit6 = unused
+// bit7 = unused
+//
+// No bits set means definitively non-ID ASCII character.
+//
+// Bits 4-7 remain unused if we need to classify more characters.
+namespace {
+// Struct used to implement the nibble LUT for SIMD implementations.
+//
+// Forced to 16-byte alignment to ensure we can load it easily in SIMD code.
+struct alignas(16) NibbleLUT {
+  auto Load() const -> __m128i {
+    return _mm_load_si128(reinterpret_cast<const __m128i*>(this));
+  }
+
+  uint8_t nibble_0;
+  uint8_t nibble_1;
+  uint8_t nibble_2;
+  uint8_t nibble_3;
+  uint8_t nibble_4;
+  uint8_t nibble_5;
+  uint8_t nibble_6;
+  uint8_t nibble_7;
+  uint8_t nibble_8;
+  uint8_t nibble_9;
+  uint8_t nibble_a;
+  uint8_t nibble_b;
+  uint8_t nibble_c;
+  uint8_t nibble_d;
+  uint8_t nibble_e;
+  uint8_t nibble_f;
+};
+}  // namespace
+
+static constexpr NibbleLUT HighLUT = {
+    .nibble_0 = 0b0000'0000,
+    .nibble_1 = 0b0000'0000,
+    .nibble_2 = 0b0000'0000,
+    .nibble_3 = 0b0000'0010,
+    .nibble_4 = 0b0000'0100,
+    .nibble_5 = 0b0000'1001,
+    .nibble_6 = 0b0000'0100,
+    .nibble_7 = 0b0000'1000,
+    .nibble_8 = 0b1000'0000,
+    .nibble_9 = 0b1000'0000,
+    .nibble_a = 0b1000'0000,
+    .nibble_b = 0b1000'0000,
+    .nibble_c = 0b1000'0000,
+    .nibble_d = 0b1000'0000,
+    .nibble_e = 0b1000'0000,
+    .nibble_f = 0b1000'0000,
+};
+static constexpr NibbleLUT LowLUT = {
+    .nibble_0 = 0b1000'1010,
+    .nibble_1 = 0b1000'1110,
+    .nibble_2 = 0b1000'1110,
+    .nibble_3 = 0b1000'1110,
+    .nibble_4 = 0b1000'1110,
+    .nibble_5 = 0b1000'1110,
+    .nibble_6 = 0b1000'1110,
+    .nibble_7 = 0b1000'1110,
+    .nibble_8 = 0b1000'1110,
+    .nibble_9 = 0b1000'1110,
+    .nibble_a = 0b1000'1100,
+    .nibble_b = 0b1000'0100,
+    .nibble_c = 0b1000'0100,
+    .nibble_d = 0b1000'0100,
+    .nibble_e = 0b1000'0100,
+    .nibble_f = 0b1000'0101,
+};
+
+static auto ScanForIdentifierPrefixX86(llvm::StringRef text)
+    -> llvm::StringRef {
+  const auto high_lut = HighLUT.Load();
+  const auto low_lut = LowLUT.Load();
+
+  // Use `ssize_t` for performance here as we index memory in a tight loop.
+  ssize_t i = 0;
+  const ssize_t size = text.size();
+  while ((i + 16) <= size) {
+    __m128i input =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));
+
+    // The high bits of each byte indicate a non-ASCII character encoded using
+    // UTF-8. Test those and fall back to the scalar code if present. These
+    // bytes will also cause spurious zeros in the LUT results, but we can
+    // ignore that because we track them independently here.
+#if __SSE4_1__
+    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
+      break;
+    }
+#else
+    if (_mm_movemask_epi8(input) != 0) {
+      break;
+    }
+#endif
+
+    // Do two LUT lookups and mask the results together to get the results for
+    // both low and high nibbles. Note that we don't need to mask out the high
+    // bit of input here because we track that above for UTF-8 handling.
+    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
+    // Note that the input needs to be masked to only include the high nibble or
+    // we could end up with bit7 set forcing the result to a zero byte.
+    __m128i input_high =
+        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
+    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
+    __m128i mask = _mm_and_si128(low_mask, high_mask);
+
+    // Now compare to find the completely zero bytes.
+    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
+    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);
+
+    // Check if there are bits in the tail mask, which means zero bytes and the
+    // end of the identifier. We could do this without materializing the scalar
+    // mask on more recent CPUs, but we generally expect the median length we
+    // encounter to be <16 characters and so we avoid the extra instruction in
+    // that case and predict this branch to succeed so it is laid out in a
+    // reasonable way.
+    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
+      // Move past the definitively classified bytes that are part of the
+      // identifier, and return the complete identifier text.
+      i += __builtin_ctz(tail_ascii_mask);
+      return text.substr(0, i);
+    }
+    i += 16;
+  }
+
+  return ScanForIdentifierPrefixScalar(text, i);
+}
+
+#endif  // CARBON_USE_SIMD && __x86_64__
+
+// Scans the provided text and returns the prefix `StringRef` of contiguous
+// identifier characters.
+//
+// This is a performance sensitive function and where profitable uses vectorized
+// code sequences to optimize its scanning. When modifying, the identifier
+// lexing benchmarks should be checked for regressions.
+//
+// Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
+//
+// TODO: Currently, this code does not implement Carbon's design for Unicode
+// characters in identifiers. It does work on UTF-8 code unit sequences, but
+// currently considers non-ASCII characters to be non-identifier characters.
+// Some work has been done to ensure the hot loop, while optimized, retains
+// enough information to add Unicode handling without completely destroying the
+// relevant optimizations.
+static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
+  // Dispatch to an optimized, architecture-specific routine.
+#if CARBON_USE_SIMD && __x86_64__
+  return ScanForIdentifierPrefixX86(text);
+#elif CARBON_USE_SIMD && __ARM_NEON
+  // Somewhat surprisingly, there is basically nothing worth doing in SIMD on
+  // Arm to optimize this scan. The Neon SIMD operations end up requiring you to
+  // move from the SIMD unit to the scalar unit in the critical path of finding
+  // the offset of the end of an identifier. Current ARM cores make the code
+  // sequences here (quite) unpleasant. For example, on Apple M1 and similar
+  // cores, the latency is as much as 10 cycles just to extract from the vector.
+  // SIMD might be more interesting on Neoverse cores, but it'd be nice to avoid
+  // core-specific tunings at this point.
+  //
+  // If this proves problematic and critical to optimize, the current leading
+  // theory is to have the newline searching code also create a bitmask for the
+  // entire source file of identifier and non-identifier bytes, and then use the
+  // bit-counting instructions here to do a fast scan of that bitmask. However,
+  // crossing that bridge will add substantial complexity to the newline
+  // scanner, and so currently we just use a boring scalar loop that pipelines
+  // well.
+#endif
+  return ScanForIdentifierPrefixScalar(text, 0);
+}
+
+using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef source_text,
+                               ssize_t position) -> void;
+using DispatchTableT = std::array<DispatchFunctionT*, 256>;
+
+static constexpr std::array<TokenKind, 256> OneCharTokenKindTable = [] {
+  std::array<TokenKind, 256> table = {};
+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
+  table[(Spelling)[0]] = TokenKind::TokenName;
+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
+  table[(Spelling)[0]] = TokenKind::TokenName;
+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
+  table[(Spelling)[0]] = TokenKind::TokenName;
+#include "toolchain/lex/token_kind.def"
+  return table;
+}();
+
+// We use a collection of static member functions for table-based dispatch to
+// lexer methods. These are named static member functions so that they show up
+// helpfully in profiles and backtraces, but they tend to not contain the
+// interesting logic and simply delegate to the relevant methods. All of their
+// signatures need to be exactly the same however in order to ensure we can
+// build efficient dispatch tables out of them. All of them end by doing a
+// must-tail return call to this routine. It handles continuing the dispatch
+// chain.
+static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
+                         ssize_t position) -> void;
+
+// Define a set of dispatch functions that simply forward to a method that
+// lexes a token. This includes validating that an actual token was produced,
+// and continuing the dispatch.
+#define CARBON_DISPATCH_LEX_TOKEN(LexMethod)                                 \
+  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
+                                  ssize_t position)                          \
+      ->void {                                                               \
+    Lexer::LexResult result = lexer.LexMethod(source_text, position);        \
+    CARBON_CHECK(result) << "Failed to form a token!";                       \
+    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
+  }
+CARBON_DISPATCH_LEX_TOKEN(LexError)
+CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
+CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
+CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
+CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
+CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
+
+// A custom dispatch function that pre-selects the symbol token to lex.
+#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                           \
+  static auto Dispatch##LexMethod##SymbolToken(                               \
+      Lexer& lexer, llvm::StringRef source_text, ssize_t position)            \
+      ->void {                                                                \
+    Lexer::LexResult result = lexer.LexMethod##SymbolToken(                   \
+        source_text, OneCharTokenKindTable[source_text[position]], position); \
+    CARBON_CHECK(result) << "Failed to form a token!";                        \
+    [[clang::musttail]] return DispatchNext(lexer, source_text, position);    \
+  }
+CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
+CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
+CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
+
+// Define a set of non-token dispatch functions that handle things like
+// whitespace and comments.
+#define CARBON_DISPATCH_LEX_NON_TOKEN(LexMethod)                             \
+  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
+                                  ssize_t position)                          \
+      ->void {                                                               \
+    lexer.LexMethod(source_text, position);                                  \
+    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
+  }
+CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
+CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
+CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
+
+// Build a table of function pointers that we can use to dispatch to the
+// correct lexer routine based on the first byte of source text.
+//
+// While it is tempting to simply use a `switch` on the first byte and
+// dispatch with cases into this, in practice that doesn't produce great code.
+// There seem to be two issues that are the root cause.
+//
+// First, there are lots of different values of bytes that dispatch to a
+// fairly small set of routines, and then some byte values that dispatch
+// differently for each byte. This pattern isn't one that the compiler-based
+// lowering of switches works well with -- it tries to balance all the cases,
+// and in doing so emits several compares and other control flow rather than a
+// simple jump table.
+//
+// Second, with a `case`, it isn't as obvious how to create a single, uniform
+// interface that is effective for *every* byte value, and thus makes for a
+// single consistent table-based dispatch. By forcing these to be function
+// pointers, we also coerce the code to use a strictly homogeneous structure
+// that can form a single dispatch table.
+//
+// These two actually interact -- the second issue is part of what makes the
+// non-table lowering in the first one desirable for many switches and cases.
+//
+// Ultimately, when table-based dispatch is such an important technique, we
+// get better results by taking full control and manually creating the
+// dispatch structures.
+//
+// The functions in this table also use tail-recursion to implement the loop
+// of the lexer. This is based on the technique described more fully for any
+// kind of byte-stream loop structure here:
+// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
+static constexpr auto MakeDispatchTable() -> DispatchTableT {
+  DispatchTableT table = {};
+  // First set the table entries to dispatch to our error token handler as the
+  // base case. Everything valid comes from an override below.
+  for (int i = 0; i < 256; ++i) {
+    table[i] = &DispatchLexError;
+  }
+
+  // Symbols have some special dispatching. First, set the first character of
+  // each symbol token spelling to dispatch to the symbol lexer. We don't
+  // provide a pre-computed token here, so the symbol lexer will compute the
+  // exact symbol token kind. We'll override this with more specific dispatch
+  // below.
+#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
+  table[(Spelling)[0]] = &DispatchLexSymbolToken;
+#include "toolchain/lex/token_kind.def"
+
+  // Now special cased single-character symbols that are guaranteed to not
+  // join with another symbol. These are grouping symbols, terminators,
+  // or separators in the grammar and have a good reason to be
+  // orthogonal to any other punctuation. We do this separately because this
+  // needs to override some of the generic handling above, and provide a
+  // custom token.
+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
+  table[(Spelling)[0]] = &DispatchLexOneCharSymbolToken;
+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
+  table[(Spelling)[0]] = &DispatchLexOpeningSymbolToken;
+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
+  table[(Spelling)[0]] = &DispatchLexClosingSymbolToken;
+#include "toolchain/lex/token_kind.def"
+
+  // Override the handling for `/` to consider comments as well as a `/`
+  // symbol.
+  table['/'] = &DispatchLexCommentOrSlash;
+
+  table['_'] = &DispatchLexKeywordOrIdentifier;
+  // Note that we don't use `llvm::seq` because this needs to be `constexpr`
+  // evaluated.
+  for (unsigned char c = 'a'; c <= 'z'; ++c) {
+    table[c] = &DispatchLexKeywordOrIdentifier;
+  }
+  table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
+  for (unsigned char c = 'A'; c <= 'Z'; ++c) {
+    table[c] = &DispatchLexKeywordOrIdentifier;
+  }
+  // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
+  // as whitespace characters should already have been skipped and the
+  // only remaining valid Unicode characters would be part of an
+  // identifier. That code can either accept or reject.
+  for (int i = 0x80; i < 0x100; ++i) {
+    table[i] = &DispatchLexKeywordOrIdentifier;
+  }
+
+  for (unsigned char c = '0'; c <= '9'; ++c) {
+    table[c] = &DispatchLexNumericLiteral;
+  }
+
+  table['\''] = &DispatchLexStringLiteral;
+  table['"'] = &DispatchLexStringLiteral;
+  table['#'] = &DispatchLexStringLiteral;
+
+  table[' '] = &DispatchLexHorizontalWhitespace;
+  table['\t'] = &DispatchLexHorizontalWhitespace;
+  table['\n'] = &DispatchLexVerticalWhitespace;
+
+  return table;
+};
+
+static constexpr DispatchTableT DispatchTable = MakeDispatchTable();
+
+static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
+                         ssize_t position) -> void {
+  if (LLVM_LIKELY(position < static_cast<ssize_t>(source_text.size()))) {
+    // The common case is to tail recurse based on the next character. Note
+    // that because this is a must-tail return, this cannot fail to tail-call
+    // and will not grow the stack. This is in essence a loop with dynamic
+    // tail dispatch to the next stage of the loop.
+    [[clang::musttail]] return DispatchTable[static_cast<unsigned char>(
+        source_text[position])](lexer, source_text, position);
+  }
+
+  // When we finish the source text, stop recursing. We also hint this so that
+  // the tail-dispatch is optimized as that's essentially the loop back-edge
+  // and this is the loop exit.
+  lexer.LexEndOfFile(source_text, position);
+}
+
+// Top-level driver: records line structure, lexes the start-of-file token,
+// runs the tail-dispatch loop over the whole buffer, and returns the finished
+// buffer (marking it as erroneous if the diagnostic consumer saw any error).
+// Note this is `&&`-qualified: it consumes the lexer and moves the buffer out.
+auto Lexer::Lex() && -> TokenizedBuffer {
+  llvm::StringRef source_text = buffer_.source_->text();
+
+  // First build up our line data structures.
+  CreateLines(source_text);
+
+  ssize_t position = 0;
+  LexStartOfFile(source_text, position);
+
+  // Manually enter the dispatch loop. This call will tail-recurse through the
+  // dispatch table until everything from source_text is consumed.
+  DispatchNext(*this, source_text, position);
+
+  if (consumer_.seen_error()) {
+    buffer_.has_errors_ = true;
+  }
+
+  return std::move(buffer_);
+}
+
+// Pre-scans `source_text` to record the start offset and length of every
+// line. Guarantees at least one trailing line entry past any unterminated
+// final line so the lexer never needs a "last line" special case.
+auto Lexer::CreateLines(llvm::StringRef source_text) -> void {
+  // We currently use `memchr` here which typically is well optimized to use
+  // SIMD or other significantly faster than byte-wise scanning. We also use
+  // carefully selected variables and the `ssize_t` type for performance and
+  // code size of this hot loop.
+  //
+  // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
+  // routine here in order to handle CR+LF line endings, as we'll want those
+  // to stay on the fast path. We'll also need to detect and diagnose Unicode
+  // vertical whitespace. Starting with `memchr` should give us a strong
+  // baseline performance target when adding those features.
+  const char* const text = source_text.data();
+  const ssize_t size = source_text.size();
+  ssize_t start = 0;
+  while (const char* nl = reinterpret_cast<const char*>(
+             memchr(&text[start], '\n', size - start))) {
+    ssize_t nl_index = nl - text;
+    // Line length excludes the newline byte itself.
+    buffer_.AddLine(TokenizedBuffer::LineInfo(start, nl_index - start));
+    start = nl_index + 1;
+  }
+  // The last line ends at the end of the file.
+  buffer_.AddLine(TokenizedBuffer::LineInfo(start, size - start));
+
+  // If the last line wasn't empty, the file ends with an unterminated line.
+  // Add an extra blank line so that we never need to handle the special case
+  // of being on the last line inside the lexer and needing to not increment
+  // to the next line.
+  if (start != size) {
+    buffer_.AddLine(TokenizedBuffer::LineInfo(size, 0));
+  }
+
+  // Now that all the infos are allocated, get a fresh pointer to the first
+  // info for use while lexing.
+  line_index_ = 0;
+}
+
+// Advances `position` past a run of spaces and tabs only -- no other bytes
+// are treated as horizontal whitespace here. Stops at end of input.
+auto Lexer::SkipHorizontalWhitespace(llvm::StringRef source_text,
+                                     ssize_t& position) -> void {
+  // Handle adjacent whitespace quickly. This comes up frequently for example
+  // due to indentation. We don't expect *huge* runs, so just use a scalar
+  // loop. While still scalar, this avoids repeated table dispatch and marking
+  // whitespace.
+  while (position < static_cast<ssize_t>(source_text.size()) &&
+         (source_text[position] == ' ' || source_text[position] == '\t')) {
+    ++position;
+  }
+}
+
+// Dispatch target for a space or tab: notes trailing whitespace on the
+// previous token, then consumes the whole run in one pass.
+auto Lexer::LexHorizontalWhitespace(llvm::StringRef source_text,
+                                    ssize_t& position) -> void {
+  CARBON_DCHECK(source_text[position] == ' ' || source_text[position] == '\t');
+  NoteWhitespace();
+  // Skip runs using an optimized code path.
+  SkipHorizontalWhitespace(source_text, position);
+}
+
+// Dispatch target for a newline: advances to the next line recorded by
+// `CreateLines` (which guarantees one exists, via the extra trailing line),
+// then measures and stores that line's indentation.
+auto Lexer::LexVerticalWhitespace(llvm::StringRef source_text,
+                                  ssize_t& position) -> void {
+  NoteWhitespace();
+  ++line_index_;
+  auto* line_info = current_line_info();
+  ssize_t line_start = line_info->start;
+  position = line_start;
+  SkipHorizontalWhitespace(source_text, position);
+  // Indent is the count of leading space/tab bytes on the new line.
+  line_info->indent = position - line_start;
+}
+
+// Dispatch target for '/': lexes either a `//` comment or a slash-introduced
+// symbol token, preferring the comment interpretation via max-munch.
+auto Lexer::LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
+    -> void {
+  CARBON_DCHECK(source_text[position] == '/');
+
+  // Both comments and slash symbols start with a `/`. We disambiguate with a
+  // max-munch rule -- if the next character is another `/` then we lex it as
+  // a comment start. If it isn't, then we lex as a slash. We also optimize
+  // for the comment case as we expect that to be much more important for
+  // overall lexer performance.
+  if (LLVM_LIKELY(position + 1 < static_cast<ssize_t>(source_text.size()) &&
+                  source_text[position + 1] == '/')) {
+    LexComment(source_text, position);
+    return;
+  }
+
+  // This code path should produce a token, make sure that happens.
+  LexResult result = LexSymbolToken(source_text, position);
+  CARBON_CHECK(result) << "Failed to form a token!";
+}
+
+// Lexes a `//` comment. Comments produce no tokens: this only diagnoses
+// invalid forms (trailing comments, missing whitespace after `//`), advances
+// `position` past the comment, and bulk-skips subsequent lines that share the
+// same indent + comment prefix (SIMD-accelerated where available).
+auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
+  CARBON_DCHECK(source_text.substr(position).startswith("//"));
+
+  // Any comment must be the only non-whitespace on the line.
+  const auto* line_info = current_line_info();
+  if (LLVM_UNLIKELY(position != line_info->start + line_info->indent)) {
+    CARBON_DIAGNOSTIC(TrailingComment, Error,
+                      "Trailing comments are not permitted.");
+
+    emitter_.Emit(source_text.begin() + position, TrailingComment);
+
+    // Note that we cannot fall-through here as the logic below doesn't handle
+    // trailing comments. For simplicity, we just consume the trailing comment
+    // itself and let the normal lexer handle the newline as if there weren't
+    // a comment at all.
+    position = line_info->start + line_info->length;
+    return;
+  }
+
+  // The introducer '//' must be followed by whitespace or EOF.
+  bool is_valid_after_slashes = true;
+  if (position + 2 < static_cast<ssize_t>(source_text.size()) &&
+      LLVM_UNLIKELY(!IsSpace(source_text[position + 2]))) {
+    CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
+                      "Whitespace is required after '//'.");
+    emitter_.Emit(source_text.begin() + position + 2,
+                  NoWhitespaceAfterCommentIntroducer);
+
+    // We use this to tweak the lexing of blocks below.
+    is_valid_after_slashes = false;
+  }
+
+  // Skip over this line.
+  ssize_t line_index = line_index_;
+  ++line_index;
+  position = buffer_.line_infos_[line_index].start;
+
+  // A very common pattern is a long block of comment lines all with the same
+  // indent and comment start. We skip these comment blocks in bulk both for
+  // speed and to reduce redundant diagnostics if each line has the same
+  // erroneous comment start like `//!`.
+  //
+  // When we have SIMD support this is even more important for speed, as short
+  // indents can be scanned extremely quickly with SIMD and we expect these to
+  // be the dominant cases.
+  //
+  // TODO: We should extend this to 32-byte SIMD on platforms with support.
+  constexpr int MaxIndent = 13;
+  const int indent = line_info->indent;
+  const ssize_t first_line_start = line_info->start;
+  // The compared prefix covers the indent, both slashes, and -- for a valid
+  // comment -- the whitespace byte that must follow them.
+  ssize_t prefix_size = indent + (is_valid_after_slashes ? 3 : 2);
+  auto skip_to_next_line = [this, indent, &line_index, &position] {
+    // We're guaranteed to have a line here even on a comment on the last line
+    // as we ensure there is an empty line structure at the end of every file.
+    ++line_index;
+    auto* next_line_info = &buffer_.line_infos_[line_index];
+    next_line_info->indent = indent;
+    position = next_line_info->start;
+  };
+  // The SIMD paths need a full 16-byte load, so they also require 16 bytes of
+  // remaining text and a prefix short enough to fit (indent <= MaxIndent).
+  if (CARBON_USE_SIMD &&
+      position + 16 < static_cast<ssize_t>(source_text.size()) &&
+      indent <= MaxIndent) {
+    // Load a mask based on the amount of text we want to compare.
+    auto mask = PrefixMasks[prefix_size];
+#if __ARM_NEON
+    // Load and mask the prefix of the current line.
+    auto prefix = vld1q_u8(reinterpret_cast<const uint8_t*>(source_text.data() +
+                                                            first_line_start));
+    prefix = vandq_u8(mask, prefix);
+    do {
+      // Load and mask the next line to consider's prefix.
+      auto next_prefix = vld1q_u8(
+          reinterpret_cast<const uint8_t*>(source_text.data() + position));
+      next_prefix = vandq_u8(mask, next_prefix);
+      // Compare the two prefixes and if any lanes differ, break.
+      auto compare = vceqq_u8(prefix, next_prefix);
+      if (vminvq_u8(compare) == 0) {
+        break;
+      }
+
+      skip_to_next_line();
+    } while (position + 16 < static_cast<ssize_t>(source_text.size()));
+#elif __x86_64__
+    // Use the current line's prefix as the exemplar to compare against.
+    // We don't mask here as we will mask when doing the comparison.
+    auto prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
+        source_text.data() + first_line_start));
+    do {
+      // Load the next line to consider's prefix.
+      auto next_prefix = _mm_loadu_si128(
+          reinterpret_cast<const __m128i*>(source_text.data() + position));
+      // Compute the difference between the next line and our exemplar. Again,
+      // we don't mask the difference because the comparison below will be
+      // masked.
+      auto prefix_diff = _mm_xor_si128(prefix, next_prefix);
+      // If we have any differences (non-zero bits) within the mask, we can't
+      // skip the next line too.
+      if (!_mm_test_all_zeros(mask, prefix_diff)) {
+        break;
+      }
+
+      skip_to_next_line();
+    } while (position + 16 < static_cast<ssize_t>(source_text.size()));
+#else
+#error "Unsupported SIMD architecture!"
+#endif
+    // TODO: If we finish the loop due to the position approaching the end of
+    // the buffer we may fail to skip the last line in a comment block that
+    // has an invalid initial sequence and thus emit extra diagnostics. We
+    // should really fall through to the generic skipping logic, but the code
+    // organization will need to change significantly to allow that.
+  } else {
+    // Scalar fallback: byte-wise prefix comparison via memcmp.
+    while (position + prefix_size < static_cast<ssize_t>(source_text.size()) &&
+           memcmp(source_text.data() + first_line_start,
+                  source_text.data() + position, prefix_size) == 0) {
+      skip_to_next_line();
+    }
+  }
+
+  // Now compute the indent of this next line before we finish.
+  ssize_t line_start = position;
+  SkipHorizontalWhitespace(source_text, position);
+
+  // Now that we're done scanning, update to the latest line index and indent.
+  line_index_ = line_index;
+  current_line_info()->indent = position - line_start;
+}
+
+// Lexes a numeric literal at `position`, producing an integer token, a real
+// token, or an error token covering the literal's text, depending on the
+// value computed by `NumericLiteral::ComputeValue`. Falls back to the generic
+// error path if no numeric literal can be lexed at all.
+auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
+    -> LexResult {
+  std::optional<NumericLiteral> literal =
+      NumericLiteral::Lex(source_text.substr(position));
+  if (!literal) {
+    return LexError(source_text, position);
+  }
+
+  // Capture the column before advancing past the literal's text.
+  int int_column = ComputeColumn(position);
+  int token_size = literal->text().size();
+  position += token_size;
+
+  return VariantMatch(
+      literal->ComputeValue(emitter_),
+      [&](NumericLiteral::IntegerValue&& value) {
+        auto token = buffer_.AddToken({.kind = TokenKind::IntegerLiteral,
+                                       .token_line = current_line(),
+                                       .column = int_column});
+        buffer_.GetTokenInfo(token).integer_id =
+            buffer_.value_stores_->integers().Add(std::move(value.value));
+        return token;
+      },
+      [&](NumericLiteral::RealValue&& value) {
+        auto token = buffer_.AddToken({.kind = TokenKind::RealLiteral,
+                                       .token_line = current_line(),
+                                       .column = int_column});
+        buffer_.GetTokenInfo(token).real_id =
+            buffer_.value_stores_->reals().Add(Real{
+                .mantissa = value.mantissa,
+                .exponent = value.exponent,
+                .is_decimal = (value.radix == NumericLiteral::Radix::Decimal)});
+        return token;
+      },
+      [&](NumericLiteral::UnrecoverableError) {
+        auto token = buffer_.AddToken({
+            .kind = TokenKind::Error,
+            .token_line = current_line(),
+            .column = int_column,
+            .error_length = token_size,
+        });
+        return token;
+      });
+}
+
+// Lexes a string literal at `position`. Multi-line literals advance the
+// current line tracking and set each covered line's indent to the string's
+// column. Unterminated literals are diagnosed and become error tokens.
+auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
+    -> LexResult {
+  std::optional<StringLiteral> literal =
+      StringLiteral::Lex(source_text.substr(position));
+  if (!literal) {
+    return LexError(source_text, position);
+  }
+
+  // Capture the token's line/column before consuming the literal's text.
+  Line string_line = current_line();
+  int string_column = ComputeColumn(position);
+  ssize_t literal_size = literal->text().size();
+  position += literal_size;
+
+  // Update line and column information.
+  if (literal->is_multi_line()) {
+    while (current_line_info()->start + current_line_info()->length <
+           position) {
+      ++line_index_;
+      current_line_info()->indent = string_column;
+    }
+    // Note that we've updated the current line at this point, but
+    // `set_indent_` is already true from above. That remains correct as the
+    // last line of the multi-line literal *also* has its indent set.
+  }
+
+  if (literal->is_terminated()) {
+    auto string_id = buffer_.value_stores_->string_literals().Add(
+        literal->ComputeValue(buffer_.allocator_, emitter_));
+    auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral,
+                                   .token_line = string_line,
+                                   .column = string_column,
+                                   .string_literal_id = string_id});
+    return token;
+  } else {
+    CARBON_DIAGNOSTIC(UnterminatedString, Error,
+                      "String is missing a terminator.");
+    emitter_.Emit(literal->text().begin(), UnterminatedString);
+    return buffer_.AddToken(
+        {.kind = TokenKind::Error,
+         .token_line = string_line,
+         .column = string_column,
+         .error_length = static_cast<int32_t>(literal_size)});
+  }
+}
+
+// Lexes a single-character symbol token of the given `kind`, whose spelling
+// is verified against the source byte in debug builds. Always succeeds and
+// advances `position` by one.
+auto Lexer::LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                                  ssize_t& position) -> Token {
+  // Verify in a debug build that the incoming token kind is correct.
+  CARBON_DCHECK(kind != TokenKind::Error);
+  CARBON_DCHECK(kind.fixed_spelling().size() == 1);
+  CARBON_DCHECK(source_text[position] == kind.fixed_spelling().front())
+      << "Source text starts with '" << source_text[position]
+      << "' instead of the spelling '" << kind.fixed_spelling()
+      << "' of the incoming token kind '" << kind << "'";
+
+  Token token = buffer_.AddToken({.kind = kind,
+                                  .token_line = current_line(),
+                                  .column = ComputeColumn(position)});
+  ++position;
+  return token;
+}
+
+// Lexes an opening grouping symbol and pushes the new token onto the
+// open-group stack so a later closing symbol can be matched against it.
+auto Lexer::LexOpeningSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                                  ssize_t& position) -> LexResult {
+  Token token = LexOneCharSymbolToken(source_text, kind, position);
+  open_groups_.push_back(token);
+  return token;
+}
+
+// Lexes a closing grouping symbol. Diagnoses a closing symbol with no open
+// group as an error token; recovers from mismatched open groups by closing
+// them first, and finally links the matched opening and closing tokens.
+auto Lexer::LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
+                                  ssize_t& position) -> LexResult {
+  // Shared error path: emit a diagnostic and consume the byte as an error.
+  auto unmatched_error = [&] {
+    CARBON_DIAGNOSTIC(UnmatchedClosing, Error,
+                      "Closing symbol without a corresponding opening symbol.");
+    emitter_.Emit(source_text.begin() + position, UnmatchedClosing);
+    Token token = buffer_.AddToken({.kind = TokenKind::Error,
+                                    .token_line = current_line(),
+                                    .column = ComputeColumn(position),
+                                    .error_length = 1});
+    ++position;
+    return token;
+  };
+
+  // If we have no open groups, this is an error.
+  if (LLVM_UNLIKELY(open_groups_.empty())) {
+    return unmatched_error();
+  }
+
+  Token opening_token = open_groups_.back();
+  // Close any invalid open groups first.
+  if (LLVM_UNLIKELY(buffer_.GetTokenInfo(opening_token).kind !=
+                    kind.opening_symbol())) {
+    CloseInvalidOpenGroups(kind, position);
+    // This may exhaust the open groups so re-check and re-error if needed.
+    if (open_groups_.empty()) {
+      return unmatched_error();
+    }
+    opening_token = open_groups_.back();
+    CARBON_DCHECK(buffer_.GetTokenInfo(opening_token).kind ==
+                  kind.opening_symbol());
+  }
+  open_groups_.pop_back();
+
+  // Now that the groups are all matched up, lex the actual token.
+  Token token = LexOneCharSymbolToken(source_text, kind, position);
+
+  // Note that it is important to get fresh token infos here as lexing the
+  // open token would invalidate any pointers.
+  buffer_.GetTokenInfo(opening_token).closing_token = token;
+  buffer_.GetTokenInfo(token).opening_token = opening_token;
+
+  return token;
+}
+
+// Lexes multi-character symbol tokens with a max-munch StringSwitch over the
+// symbol spellings from token_kind.def; one-character and grouping symbols
+// are excluded here because they have dedicated dispatch entries.
+auto Lexer::LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
+    -> LexResult {
+  // One character symbols and grouping symbols are handled with dedicated
+  // dispatch. We only lex the multi-character tokens here.
+  TokenKind kind = llvm::StringSwitch<TokenKind>(source_text.substr(position))
+#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
+  .StartsWith(Spelling, TokenKind::Name)
+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)
+#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName)
+#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName)
+#include "toolchain/lex/token_kind.def"
+                       .Default(TokenKind::Error);
+  if (kind == TokenKind::Error) {
+    return LexError(source_text, position);
+  }
+
+  Token token = buffer_.AddToken({.kind = kind,
+                                  .token_line = current_line(),
+                                  .column = ComputeColumn(position)});
+  position += kind.fixed_spelling().size();
+  return token;
+}
+
+// Attempts to lex `word` as an `iN`, `uN`, or `fN` type literal token, where
+// the suffix is a decimal width starting with 1-9. Returns NoMatch when the
+// word doesn't have that shape, and an error token when the width is too
+// large to lex as an integer.
+auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
+    -> LexResult {
+  if (word.size() < 2) {
+    // Too short to form one of these tokens.
+    return LexResult::NoMatch();
+  }
+  if (word[1] < '1' || word[1] > '9') {
+    // Doesn't start with a valid initial digit.
+    return LexResult::NoMatch();
+  }
+
+  std::optional<TokenKind> kind;
+  switch (word.front()) {
+    case 'i':
+      kind = TokenKind::IntegerTypeLiteral;
+      break;
+    case 'u':
+      kind = TokenKind::UnsignedIntegerTypeLiteral;
+      break;
+    case 'f':
+      kind = TokenKind::FloatingPointTypeLiteral;
+      break;
+    default:
+      return LexResult::NoMatch();
+  };
+
+  llvm::StringRef suffix = word.substr(1);
+  if (!CanLexInteger(emitter_, suffix)) {
+    return buffer_.AddToken(
+        {.kind = TokenKind::Error,
+         .token_line = current_line(),
+         .column = column,
+         .error_length = static_cast<int32_t>(word.size())});
+  }
+  llvm::APInt suffix_value;
+  // getAsInteger returns true on failure, e.g. non-digit characters.
+  if (suffix.getAsInteger(10, suffix_value)) {
+    return LexResult::NoMatch();
+  }
+
+  auto token = buffer_.AddToken(
+      {.kind = *kind, .token_line = current_line(), .column = column});
+  buffer_.GetTokenInfo(token).integer_id =
+      buffer_.value_stores_->integers().Add(std::move(suffix_value));
+  return token;
+}
+
+// Pops open groups that cannot be matched by the incoming closing `kind`
+// (or by end-of-file when `kind` is Error), diagnosing each mismatch and
+// synthesizing recovery closing tokens to keep the token stream balanced.
+// Stops as soon as the top open group matches `kind`.
+auto Lexer::CloseInvalidOpenGroups(TokenKind kind, ssize_t position) -> void {
+  CARBON_CHECK(kind.is_closing_symbol() || kind == TokenKind::Error);
+  CARBON_CHECK(!open_groups_.empty());
+
+  int column = ComputeColumn(position);
+
+  do {
+    Token opening_token = open_groups_.back();
+    TokenKind opening_kind = buffer_.GetTokenInfo(opening_token).kind;
+    if (kind == opening_kind.closing_symbol()) {
+      return;
+    }
+
+    open_groups_.pop_back();
+    CARBON_DIAGNOSTIC(
+        MismatchedClosing, Error,
+        "Closing symbol does not match most recent opening symbol.");
+    token_emitter_.Emit(opening_token, MismatchedClosing);
+
+    CARBON_CHECK(!buffer_.tokens().empty())
+        << "Must have a prior opening token!";
+    Token prev_token = buffer_.tokens().end()[-1];
+
+    // TODO: do a smarter backwards scan for where to put the closing
+    // token.
+    Token closing_token = buffer_.AddToken(
+        {.kind = opening_kind.closing_symbol(),
+         .has_trailing_space = buffer_.HasTrailingWhitespace(prev_token),
+         .is_recovery = true,
+         .token_line = current_line(),
+         .column = column});
+    buffer_.GetTokenInfo(opening_token).closing_token = closing_token;
+    buffer_.GetTokenInfo(closing_token).opening_token = opening_token;
+  } while (!open_groups_.empty());
+}
+
+// Lexes a word starting at `position` as, in priority order: a type literal
+// (`iN`/`uN`/`fN`), a keyword, or a generic identifier. Bytes above 0x7F
+// (potential Unicode) are currently lexed as errors (see TODO below).
+auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
+                                   ssize_t& position) -> LexResult {
+  if (static_cast<unsigned char>(source_text[position]) > 0x7F) {
+    // TODO: Need to add support for Unicode lexing.
+    return LexError(source_text, position);
+  }
+  CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
+
+  int column = ComputeColumn(position);
+
+  // Take the valid characters off the front of the source buffer.
+  llvm::StringRef identifier_text =
+      ScanForIdentifierPrefix(source_text.substr(position));
+  CARBON_CHECK(!identifier_text.empty()) << "Must have at least one character!";
+  position += identifier_text.size();
+
+  // Check if the text is a type literal, and if so form such a literal.
+  if (LexResult result = LexWordAsTypeLiteralToken(identifier_text, column)) {
+    return result;
+  }
+
+  // Check if the text matches a keyword token, and if so use that.
+  TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
+#define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
+#include "toolchain/lex/token_kind.def"
+                       .Default(TokenKind::Error);
+  if (kind != TokenKind::Error) {
+    return buffer_.AddToken(
+        {.kind = kind, .token_line = current_line(), .column = column});
+  }
+
+  // Otherwise we have a generic identifier.
+  return buffer_.AddToken(
+      {.kind = TokenKind::Identifier,
+       .token_line = current_line(),
+       .column = column,
+       .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
+}
+
+// Dispatch target for 'r': lexes either a raw identifier of the form
+// `r#<identifier>` (which skips keyword and type-literal checks) or, when the
+// `r#` shape isn't present, a normal keyword/identifier beginning with 'r'.
+auto Lexer::LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
+                                           ssize_t& position) -> LexResult {
+  CARBON_CHECK(source_text[position] == 'r');
+  // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
+  // identifier starting with the 'r'.
+  // TODO: Need to add support for Unicode lexing.
+  if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
+                  source_text[position + 1] != '#' ||
+                  !IsIdStartByteTable[source_text[position + 2]])) {
+    // TODO: Should this print a different error when there is `r#`, but it
+    // isn't followed by identifier text? Or is it right to put it back so
+    // that the `#` could be parsed as part of a raw string literal?
+    return LexKeywordOrIdentifier(source_text, position);
+  }
+
+  int column = ComputeColumn(position);
+
+  // Take the valid characters off the front of the source buffer.
+  llvm::StringRef identifier_text =
+      ScanForIdentifierPrefix(source_text.substr(position + 2));
+  CARBON_CHECK(!identifier_text.empty()) << "Must have at least one character!";
+  // The +2 accounts for the `r#` prefix, which isn't part of the identifier.
+  position += identifier_text.size() + 2;
+
+  // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks.
+
+  // Otherwise we have a raw identifier.
+  // TODO: This token doesn't carry any indicator that it's raw, so
+  // diagnostics are unclear.
+  return buffer_.AddToken(
+      {.kind = TokenKind::Identifier,
+       .token_line = current_line(),
+       .column = column,
+       .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
+}
+
+// Consumes a run of unrecognized characters -- at least one byte -- into a
+// single error token and emits an UnrecognizedCharacters diagnostic. The run
+// stops at the first byte that could start a valid token (alphanumeric,
+// underscore, whitespace, or any symbol spelling).
+auto Lexer::LexError(llvm::StringRef source_text, ssize_t& position)
+    -> LexResult {
+  llvm::StringRef error_text =
+      source_text.substr(position).take_while([](char c) {
+        if (IsAlnum(c)) {
+          return false;
+        }
+        switch (c) {
+          case '_':
+          case '\t':
+          case '\n':
+            return false;
+          default:
+            break;
+        }
+        // Stop at any byte that begins a symbol spelling.
+        return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
+#define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
+#include "toolchain/lex/token_kind.def"
+            .Default(true);
+      });
+  if (error_text.empty()) {
+    // TODO: Reimplement this to use the lexer properly. In the meantime,
+    // guarantee that we eat at least one byte.
+    error_text = source_text.substr(position, 1);
+  }
+
+  auto token = buffer_.AddToken(
+      {.kind = TokenKind::Error,
+       .token_line = current_line(),
+       .column = ComputeColumn(position),
+       .error_length = static_cast<int32_t>(error_text.size())});
+  CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
+                    "Encountered unrecognized characters while parsing.");
+  emitter_.Emit(error_text.begin(), UnrecognizedCharacters);
+
+  position += error_text.size();
+  return token;
+}
+
+// Emits the start-of-file token and records the indentation of the first
+// line, skipping any leading horizontal whitespace.
+auto Lexer::LexStartOfFile(llvm::StringRef source_text, ssize_t& position)
+    -> void {
+  // Before lexing any source text, add the start-of-file token so that code
+  // can assume a non-empty token buffer for the rest of lexing. Note that the
+  // start-of-file always has trailing space because it *is* whitespace.
+  buffer_.AddToken({.kind = TokenKind::StartOfFile,
+                    .has_trailing_space = true,
+                    .token_line = current_line(),
+                    .column = 0});
+
+  // Also skip any horizontal whitespace and record the indentation of the
+  // first line.
+  SkipHorizontalWhitespace(source_text, position);
+  auto* line_info = current_line_info();
+  CARBON_CHECK(line_info->start == 0);
+  line_info->indent = position;
+}
+
+// Finalizes lexing at the end of the source text: fixes up the last line's
+// bookkeeping, closes any still-open groups with recovery tokens, and emits
+// the end-of-file token (which is treated as whitespace).
+auto Lexer::LexEndOfFile(llvm::StringRef source_text, ssize_t position)
+    -> void {
+  CARBON_CHECK(position == static_cast<ssize_t>(source_text.size()));
+  // Check if the last line is empty and not the first line (and only). If so,
+  // re-pin the last line to be the prior one so that diagnostics and editors
+  // can treat newlines as terminators even though we internally handle them
+  // as separators in case of a missing newline on the last line. We do this
+  // here instead of detecting this when we see the newline to avoid more
+  // conditions along that fast path.
+  if (position == current_line_info()->start && line_index_ != 0) {
+    --line_index_;
+    --position;
+  } else {
+    // Update the line length as this is also the end of a line.
+    current_line_info()->length = ComputeColumn(position);
+  }
+
+  // The end-of-file token is always considered to be whitespace.
+  NoteWhitespace();
+
+  // Close any open groups. We do this after marking whitespace, it will
+  // preserve that.
+  if (!open_groups_.empty()) {
+    // TokenKind::Error signals "close everything" -- no real closer matches.
+    CloseInvalidOpenGroups(TokenKind::Error, position);
+  }
+
+  buffer_.AddToken({.kind = TokenKind::EndOfFile,
+                    .token_line = current_line(),
+                    .column = ComputeColumn(position)});
+}
+
+// Public entry point (declared in lex.h): constructs a Lexer over `source`
+// and runs it to completion, returning the tokenized buffer.
+auto Lex(SharedValueStores& value_stores, SourceBuffer& source,
+         DiagnosticConsumer& consumer) -> TokenizedBuffer {
+  return Lexer(value_stores, source, consumer).Lex();
+}
+
+}  // namespace Carbon::Lex

+ 24 - 0
toolchain/lex/lex.h

@@ -0,0 +1,24 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_LEX_LEX_H_
+#define CARBON_TOOLCHAIN_LEX_LEX_H_
+
+#include "toolchain/base/value_store.h"
+#include "toolchain/diagnostics/diagnostic_emitter.h"
+#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/source/source_buffer.h"
+
+namespace Carbon::Lex {
+
+// Lexes a buffer of source code into a tokenized buffer.
+//
+// The provided source buffer must outlive any returned `TokenizedBuffer`
+// which will refer into the source.
+//
+// Diagnostics encountered during lexing are reported to `consumer`.
+auto Lex(SharedValueStores& value_stores, SourceBuffer& source,
+         DiagnosticConsumer& consumer) -> TokenizedBuffer;
+
+}  // namespace Carbon::Lex
+
+#endif  // CARBON_TOOLCHAIN_LEX_LEX_H_

+ 0 - 1242
toolchain/lex/tokenized_buffer.cpp

@@ -4,1262 +4,20 @@
 
 #include "toolchain/lex/tokenized_buffer.h"
 
-#include <algorithm>
-#include <array>
 #include <cmath>
 
 #include "common/check.h"
 #include "common/string_helpers.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/lex/character_set.h"
-#include "toolchain/lex/helpers.h"
 #include "toolchain/lex/numeric_literal.h"
 #include "toolchain/lex/string_literal.h"
 
-#if __ARM_NEON
-#include <arm_neon.h>
-#define CARBON_USE_SIMD 1
-#elif __x86_64__
-#include <x86intrin.h>
-#define CARBON_USE_SIMD 1
-#else
-#define CARBON_USE_SIMD 0
-#endif
-
 namespace Carbon::Lex {
 
-// TODO: Move Overload and VariantMatch somewhere more central.
-
-// Form an overload set from a list of functions. For example:
-//
-// ```
-// auto overloaded = Overload{[] (int) {}, [] (float) {}};
-// ```
-template <typename... Fs>
-struct Overload : Fs... {
-  using Fs::operator()...;
-};
-template <typename... Fs>
-Overload(Fs...) -> Overload<Fs...>;
-
-// Pattern-match against the type of the value stored in the variant `V`. Each
-// element of `fs` should be a function that takes one or more of the variant
-// values in `V`.
-template <typename V, typename... Fs>
-auto VariantMatch(V&& v, Fs&&... fs) -> decltype(auto) {
-  return std::visit(Overload{std::forward<Fs&&>(fs)...}, std::forward<V&&>(v));
-}
-
-#if CARBON_USE_SIMD
-namespace {
-#if __ARM_NEON
-using SIMDMaskT = uint8x16_t;
-#elif __x86_64__
-using SIMDMaskT = __m128i;
-#else
-#error "Unsupported SIMD architecture!"
-#endif
-using SIMDMaskArrayT = std::array<SIMDMaskT, sizeof(SIMDMaskT) + 1>;
-}  // namespace
-// A table of masks to include 0-16 bytes of an SSE register.
-static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
-  SIMDMaskArrayT masks = {};
-  for (int i = 1; i < static_cast<int>(masks.size()); ++i) {
-    // The SIMD types and constexpr require a C-style cast.
-    // NOLINTNEXTLINE(google-readability-casting)
-    masks[i] = (SIMDMaskT)(std::numeric_limits<unsigned __int128>::max() >>
-                           ((sizeof(SIMDMaskT) - i) * 8));
-  }
-  return masks;
-}();
-#endif  // CARBON_USE_SIMD
-
-// A table of booleans that we can use to classify bytes as being valid
-// identifier start. This is used by raw identifier detection.
-constexpr std::array<bool, 256> IsIdStartByteTable = [] {
-  std::array<bool, 256> table = {};
-  for (char c = 'A'; c <= 'Z'; ++c) {
-    table[c] = true;
-  }
-  for (char c = 'a'; c <= 'z'; ++c) {
-    table[c] = true;
-  }
-  table['_'] = true;
-  return table;
-}();
-
-// A table of booleans that we can use to classify bytes as being valid
-// identifier (or keyword) characters. This is used in the generic,
-// non-vectorized fallback code to scan for length of an identifier.
-constexpr std::array<bool, 256> IsIdByteTable = [] {
-  std::array<bool, 256> table = IsIdStartByteTable;
-  for (char c = '0'; c <= '9'; ++c) {
-    table[c] = true;
-  }
-  return table;
-}();
-
-// Baseline scalar version, also available for scalar-fallback in SIMD code.
-// Uses `ssize_t` for performance when indexing in the loop.
-//
-// TODO: This assumes all Unicode characters are non-identifiers.
-static auto ScanForIdentifierPrefixScalar(llvm::StringRef text, ssize_t i)
-    -> llvm::StringRef {
-  const ssize_t size = text.size();
-  while (i < size && IsIdByteTable[static_cast<unsigned char>(text[i])]) {
-    ++i;
-  }
-
-  return text.substr(0, i);
-}
-
-#if CARBON_USE_SIMD && __x86_64__
-// The SIMD code paths uses a scheme derived from the techniques in Geoff
-// Langdale and Daniel Lemire's work on parsing JSON[1]. Specifically, that
-// paper outlines a technique of using two 4-bit indexed in-register look-up
-// tables (LUTs) to classify bytes in a branchless SIMD code sequence.
-//
-// [1]: https://arxiv.org/pdf/1902.08318.pdf
-//
-// The goal is to get a bit mask classifying different sets of bytes. For each
-// input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
-// character. Otherwise, we want the mask bits to be set with the following
-// logic derived by inspecting the high nibble and low nibble of the input:
-// bit0 = 1 for `_`: high `0x5` and low `0xF`
-// bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
-// bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
-// bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
-// bit4 = unused
-// bit5 = unused
-// bit6 = unused
-// bit7 = unused
-//
-// No bits set means definitively non-ID ASCII character.
-//
-// Bits 4-7 remain unused if we need to classify more characters.
-namespace {
-// Struct used to implement the nibble LUT for SIMD implementations.
-//
-// Forced to 16-byte alignment to ensure we can load it easily in SIMD code.
-struct alignas(16) NibbleLUT {
-  auto Load() const -> __m128i {
-    return _mm_load_si128(reinterpret_cast<const __m128i*>(this));
-  }
-
-  uint8_t nibble_0;
-  uint8_t nibble_1;
-  uint8_t nibble_2;
-  uint8_t nibble_3;
-  uint8_t nibble_4;
-  uint8_t nibble_5;
-  uint8_t nibble_6;
-  uint8_t nibble_7;
-  uint8_t nibble_8;
-  uint8_t nibble_9;
-  uint8_t nibble_a;
-  uint8_t nibble_b;
-  uint8_t nibble_c;
-  uint8_t nibble_d;
-  uint8_t nibble_e;
-  uint8_t nibble_f;
-};
-}  // namespace
-
-constexpr NibbleLUT HighLUT = {
-    .nibble_0 = 0b0000'0000,
-    .nibble_1 = 0b0000'0000,
-    .nibble_2 = 0b0000'0000,
-    .nibble_3 = 0b0000'0010,
-    .nibble_4 = 0b0000'0100,
-    .nibble_5 = 0b0000'1001,
-    .nibble_6 = 0b0000'0100,
-    .nibble_7 = 0b0000'1000,
-    .nibble_8 = 0b1000'0000,
-    .nibble_9 = 0b1000'0000,
-    .nibble_a = 0b1000'0000,
-    .nibble_b = 0b1000'0000,
-    .nibble_c = 0b1000'0000,
-    .nibble_d = 0b1000'0000,
-    .nibble_e = 0b1000'0000,
-    .nibble_f = 0b1000'0000,
-};
-constexpr NibbleLUT LowLUT = {
-    .nibble_0 = 0b1000'1010,
-    .nibble_1 = 0b1000'1110,
-    .nibble_2 = 0b1000'1110,
-    .nibble_3 = 0b1000'1110,
-    .nibble_4 = 0b1000'1110,
-    .nibble_5 = 0b1000'1110,
-    .nibble_6 = 0b1000'1110,
-    .nibble_7 = 0b1000'1110,
-    .nibble_8 = 0b1000'1110,
-    .nibble_9 = 0b1000'1110,
-    .nibble_a = 0b1000'1100,
-    .nibble_b = 0b1000'0100,
-    .nibble_c = 0b1000'0100,
-    .nibble_d = 0b1000'0100,
-    .nibble_e = 0b1000'0100,
-    .nibble_f = 0b1000'0101,
-};
-
-static auto ScanForIdentifierPrefixX86(llvm::StringRef text)
-    -> llvm::StringRef {
-  const auto high_lut = HighLUT.Load();
-  const auto low_lut = LowLUT.Load();
-
-  // Use `ssize_t` for performance here as we index memory in a tight loop.
-  ssize_t i = 0;
-  const ssize_t size = text.size();
-  while ((i + 16) <= size) {
-    __m128i input =
-        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));
-
-    // The high bits of each byte indicate a non-ASCII character encoded using
-    // UTF-8. Test those and fall back to the scalar code if present. These
-    // bytes will also cause spurious zeros in the LUT results, but we can
-    // ignore that because we track them independently here.
-#if __SSE4_1__
-    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
-      break;
-    }
-#else
-    if (_mm_movemask_epi8(input) != 0) {
-      break;
-    }
-#endif
-
-    // Do two LUT lookups and mask the results together to get the results for
-    // both low and high nibbles. Note that we don't need to mask out the high
-    // bit of input here because we track that above for UTF-8 handling.
-    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
-    // Note that the input needs to be masked to only include the high nibble or
-    // we could end up with bit7 set forcing the result to a zero byte.
-    __m128i input_high =
-        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
-    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
-    __m128i mask = _mm_and_si128(low_mask, high_mask);
-
-    // Now compare to find the completely zero bytes.
-    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
-    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);
-
-    // Check if there are bits in the tail mask, which means zero bytes and the
-    // end of the identifier. We could do this without materializing the scalar
-    // mask on more recent CPUs, but we generally expect the median length we
-    // encounter to be <16 characters and so we avoid the extra instruction in
-    // that case and predict this branch to succeed so it is laid out in a
-    // reasonable way.
-    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
-      // Move past the definitively classified bytes that are part of the
-      // identifier, and return the complete identifier text.
-      i += __builtin_ctz(tail_ascii_mask);
-      return text.substr(0, i);
-    }
-    i += 16;
-  }
-
-  return ScanForIdentifierPrefixScalar(text, i);
-}
-
-#endif  // CARBON_USE_SIMD && __x86_64__
-
-// Scans the provided text and returns the prefix `StringRef` of contiguous
-// identifier characters.
-//
-// This is a performance sensitive function and where profitable uses vectorized
-// code sequences to optimize its scanning. When modifying, the identifier
-// lexing benchmarks should be checked for regressions.
-//
-// Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
-//
-// TODO: Currently, this code does not implement Carbon's design for Unicode
-// characters in identifiers. It does work on UTF-8 code unit sequences, but
-// currently considers non-ASCII characters to be non-identifier characters.
-// Some work has been done to ensure the hot loop, while optimized, retains
-// enough information to add Unicode handling without completely destroying the
-// relevant optimizations.
-static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
-  // Dispatch to an optimized architecture optimized routine.
-#if CARBON_USE_SIMD && __x86_64__
-  return ScanForIdentifierPrefixX86(text);
-#elif CARBON_USE_SIMD && __ARM_NEON
-  // Somewhat surprisingly, there is basically nothing worth doing in SIMD on
-  // Arm to optimize this scan. The Neon SIMD operations end up requiring you to
-  // move from the SIMD unit to the scalar unit in the critical path of finding
-  // the offset of the end of an identifier. Current ARM cores make the code
-  // sequences here (quite) unpleasant. For example, on Apple M1 and similar
-  // cores, the latency is as much as 10 cycles just to extract from the vector.
-  // SIMD might be more interesting on Neoverse cores, but it'd be nice to avoid
-  // core-specific tunings at this point.
-  //
-  // If this proves problematic and critical to optimize, the current leading
-  // theory is to have the newline searching code also create a bitmask for the
-  // entire source file of identifier and non-identifier bytes, and then use the
-  // bit-counting instructions here to do a fast scan of that bitmask. However,
-  // crossing that bridge will add substantial complexity to the newline
-  // scanner, and so currently we just use a boring scalar loop that pipelines
-  // well.
-#endif
-  return ScanForIdentifierPrefixScalar(text, 0);
-}
-
-// Implementation of the lexer logic itself.
-//
-// The design is that lexing can loop over the source buffer, consuming it into
-// tokens by calling into this API. This class handles the state and breaks down
-// the different lexing steps that may be used. It directly updates the provided
-// tokenized buffer with the lexed tokens.
-class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
- public:
-  // Symbolic result of a lexing action. This indicates whether we successfully
-  // lexed a token, or whether other lexing actions should be attempted.
-  //
-  // While it wraps a simple boolean state, its API both helps make the failures
-  // more self documenting, and by consuming the actual token constructively
-  // when one is produced, it helps ensure the correct result is returned.
-  class LexResult {
-   public:
-    // Consumes (and discard) a valid token to construct a result
-    // indicating a token has been produced. Relies on implicit conversions.
-    // NOLINTNEXTLINE(google-explicit-constructor)
-    LexResult(Token /*discarded_token*/) : LexResult(true) {}
-
-    // Returns a result indicating no token was produced.
-    static auto NoMatch() -> LexResult { return LexResult(false); }
-
-    // Tests whether a token was produced by the lexing routine, and
-    // the lexer can continue forming tokens.
-    explicit operator bool() const { return formed_token_; }
-
-   private:
-    explicit LexResult(bool formed_token) : formed_token_(formed_token) {}
-
-    bool formed_token_;
-  };
-
-  Lexer(SharedValueStores& value_stores, SourceBuffer& source,
-        DiagnosticConsumer& consumer)
-      : buffer_(value_stores, source),
-        consumer_(consumer),
-        translator_(&buffer_),
-        emitter_(translator_, consumer_),
-        token_translator_(&buffer_),
-        token_emitter_(token_translator_, consumer_) {}
-
-  // Find all line endings and create the line data structures. Explicitly kept
-  // out-of-line because this is a significant loop that is useful to have in
-  // the profile and it doesn't simplify by inlining at all. But because it can,
-  // the compiler will flatten this otherwise.
-  [[gnu::noinline]] auto CreateLines(llvm::StringRef source_text) -> void {
-    // We currently use `memchr` here which typically is well optimized to use
-    // SIMD or other significantly faster than byte-wise scanning. We also use
-    // carefully selected variables and the `ssize_t` type for performance and
-    // code size of this hot loop.
-    //
-    // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
-    // routine here in order to handle CR+LF line endings, as we'll want those
-    // to stay on the fast path. We'll also need to detect and diagnose Unicode
-    // vertical whitespace. Starting with `memchr` should give us a strong
-    // baseline performance target when adding those features.
-    const char* const text = source_text.data();
-    const ssize_t size = source_text.size();
-    ssize_t start = 0;
-    while (const char* nl = reinterpret_cast<const char*>(
-               memchr(&text[start], '\n', size - start))) {
-      ssize_t nl_index = nl - text;
-      buffer_.AddLine(LineInfo(start, nl_index - start));
-      start = nl_index + 1;
-    }
-    // The last line ends at the end of the file.
-    buffer_.AddLine(LineInfo(start, size - start));
-
-    // If the last line wasn't empty, the file ends with an unterminated line.
-    // Add an extra blank line so that we never need to handle the special case
-    // of being on the last line inside the lexer and needing to not increment
-    // to the next line.
-    if (start != size) {
-      buffer_.AddLine(LineInfo(size, 0));
-    }
-
-    // Now that all the infos are allocated, get a fresh pointer to the first
-    // info for use while lexing.
-    line_index_ = 0;
-  }
-
-  auto current_line() -> Line { return Line(line_index_); }
-
-  auto current_line_info() -> LineInfo* {
-    return &buffer_.line_infos_[line_index_];
-  }
-
-  auto ComputeColumn(ssize_t position) -> int {
-    CARBON_DCHECK(position >= current_line_info()->start);
-    return position - current_line_info()->start;
-  }
-
-  auto NoteWhitespace() -> void {
-    buffer_.token_infos_.back().has_trailing_space = true;
-  }
-
-  auto SkipHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
-      -> void {
-    // Handle adjacent whitespace quickly. This comes up frequently for example
-    // due to indentation. We don't expect *huge* runs, so just use a scalar
-    // loop. While still scalar, this avoids repeated table dispatch and marking
-    // whitespace.
-    while (position < static_cast<ssize_t>(source_text.size()) &&
-           (source_text[position] == ' ' || source_text[position] == '\t')) {
-      ++position;
-    }
-  }
-
-  auto LexHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
-      -> void {
-    CARBON_DCHECK(source_text[position] == ' ' ||
-                  source_text[position] == '\t');
-    NoteWhitespace();
-    // Skip runs using an optimized code path.
-    SkipHorizontalWhitespace(source_text, position);
-  }
-
-  auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
-      -> void {
-    NoteWhitespace();
-    ++line_index_;
-    auto* line_info = current_line_info();
-    ssize_t line_start = line_info->start;
-    position = line_start;
-    SkipHorizontalWhitespace(source_text, position);
-    line_info->indent = position - line_start;
-  }
-
-  auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
-      -> void {
-    CARBON_DCHECK(source_text[position] == '/');
-
-    // Both comments and slash symbols start with a `/`. We disambiguate with a
-    // max-munch rule -- if the next character is another `/` then we lex it as
-    // a comment start. If it isn't, then we lex as a slash. We also optimize
-    // for the comment case as we expect that to be much more important for
-    // overall lexer performance.
-    if (LLVM_LIKELY(position + 1 < static_cast<ssize_t>(source_text.size()) &&
-                    source_text[position + 1] == '/')) {
-      LexComment(source_text, position);
-      return;
-    }
-
-    // This code path should produce a token, make sure that happens.
-    LexResult result = LexSymbolToken(source_text, position);
-    CARBON_CHECK(result) << "Failed to form a token!";
-  }
-
-  auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
-    CARBON_DCHECK(source_text.substr(position).startswith("//"));
-
-    // Any comment must be the only non-whitespace on the line.
-    const auto* line_info = current_line_info();
-    if (LLVM_UNLIKELY(position != line_info->start + line_info->indent)) {
-      CARBON_DIAGNOSTIC(TrailingComment, Error,
-                        "Trailing comments are not permitted.");
-
-      emitter_.Emit(source_text.begin() + position, TrailingComment);
-
-      // Note that we cannot fall-through here as the logic below doesn't handle
-      // trailing comments. For simplicity, we just consume the trailing comment
-      // itself and let the normal lexer handle the newline as if there weren't
-      // a comment at all.
-      position = line_info->start + line_info->length;
-      return;
-    }
-
-    // The introducer '//' must be followed by whitespace or EOF.
-    bool is_valid_after_slashes = true;
-    if (position + 2 < static_cast<ssize_t>(source_text.size()) &&
-        LLVM_UNLIKELY(!IsSpace(source_text[position + 2]))) {
-      CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
-                        "Whitespace is required after '//'.");
-      emitter_.Emit(source_text.begin() + position + 2,
-                    NoWhitespaceAfterCommentIntroducer);
-
-      // We use this to tweak the lexing of blocks below.
-      is_valid_after_slashes = false;
-    }
-
-    // Skip over this line.
-    ssize_t line_index = line_index_;
-    ++line_index;
-    position = buffer_.line_infos_[line_index].start;
-
-    // A very common pattern is a long block of comment lines all with the same
-    // indent and comment start. We skip these comment blocks in bulk both for
-    // speed and to reduce redundant diagnostics if each line has the same
-    // erroneous comment start like `//!`.
-    //
-    // When we have SIMD support this is even more important for speed, as short
-    // indents can be scanned extremely quickly with SIMD and we expect these to
-    // be the dominant cases.
-    //
-    // TODO: We should extend this to 32-byte SIMD on platforms with support.
-    constexpr int MaxIndent = 13;
-    const int indent = line_info->indent;
-    const ssize_t first_line_start = line_info->start;
-    ssize_t prefix_size = indent + (is_valid_after_slashes ? 3 : 2);
-    auto skip_to_next_line = [this, indent, &line_index, &position] {
-      // We're guaranteed to have a line here even on a comment on the last line
-      // as we ensure there is an empty line structure at the end of every file.
-      ++line_index;
-      auto* next_line_info = &buffer_.line_infos_[line_index];
-      next_line_info->indent = indent;
-      position = next_line_info->start;
-    };
-    if (CARBON_USE_SIMD &&
-        position + 16 < static_cast<ssize_t>(source_text.size()) &&
-        indent <= MaxIndent) {
-      // Load a mask based on the amount of text we want to compare.
-      auto mask = PrefixMasks[prefix_size];
-#if __ARM_NEON
-      // Load and mask the prefix of the current line.
-      auto prefix = vld1q_u8(reinterpret_cast<const uint8_t*>(
-          source_text.data() + first_line_start));
-      prefix = vandq_u8(mask, prefix);
-      do {
-        // Load and mask the next line to consider's prefix.
-        auto next_prefix = vld1q_u8(
-            reinterpret_cast<const uint8_t*>(source_text.data() + position));
-        next_prefix = vandq_u8(mask, next_prefix);
-        // Compare the two prefixes and if any lanes differ, break.
-        auto compare = vceqq_u8(prefix, next_prefix);
-        if (vminvq_u8(compare) == 0) {
-          break;
-        }
-
-        skip_to_next_line();
-      } while (position + 16 < static_cast<ssize_t>(source_text.size()));
-#elif __x86_64__
-      // Use the current line's prefix as the exemplar to compare against.
-      // We don't mask here as we will mask when doing the comparison.
-      auto prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
-          source_text.data() + first_line_start));
-      do {
-        // Load the next line to consider's prefix.
-        auto next_prefix = _mm_loadu_si128(
-            reinterpret_cast<const __m128i*>(source_text.data() + position));
-        // Compute the difference between the next line and our exemplar. Again,
-        // we don't mask the difference because the comparison below will be
-        // masked.
-        auto prefix_diff = _mm_xor_si128(prefix, next_prefix);
-        // If we have any differences (non-zero bits) within the mask, we can't
-        // skip the next line too.
-        if (!_mm_test_all_zeros(mask, prefix_diff)) {
-          break;
-        }
-
-        skip_to_next_line();
-      } while (position + 16 < static_cast<ssize_t>(source_text.size()));
-#else
-#error "Unsupported SIMD architecture!"
-#endif
-      // TODO: If we finish the loop due to the position approaching the end of
-      // the buffer we may fail to skip the last line in a comment block that
-      // has an invalid initial sequence and thus emit extra diagnostics. We
-      // should really fall through to the generic skipping logic, but the code
-      // organization will need to change significantly to allow that.
-    } else {
-      while (position + prefix_size <
-                 static_cast<ssize_t>(source_text.size()) &&
-             memcmp(source_text.data() + first_line_start,
-                    source_text.data() + position, prefix_size) == 0) {
-        skip_to_next_line();
-      }
-    }
-
-    // Now compute the indent of this next line before we finish.
-    ssize_t line_start = position;
-    SkipHorizontalWhitespace(source_text, position);
-
-    // Now that we're done scanning, update to the latest line index and indent.
-    line_index_ = line_index;
-    current_line_info()->indent = position - line_start;
-  }
-
-  auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
-      -> LexResult {
-    std::optional<NumericLiteral> literal =
-        NumericLiteral::Lex(source_text.substr(position));
-    if (!literal) {
-      return LexError(source_text, position);
-    }
-
-    int int_column = ComputeColumn(position);
-    int token_size = literal->text().size();
-    position += token_size;
-
-    return VariantMatch(
-        literal->ComputeValue(emitter_),
-        [&](NumericLiteral::IntegerValue&& value) {
-          auto token = buffer_.AddToken({.kind = TokenKind::IntegerLiteral,
-                                         .token_line = current_line(),
-                                         .column = int_column});
-          buffer_.GetTokenInfo(token).integer_id =
-              buffer_.value_stores_->integers().Add(std::move(value.value));
-          return token;
-        },
-        [&](NumericLiteral::RealValue&& value) {
-          auto token = buffer_.AddToken({.kind = TokenKind::RealLiteral,
-                                         .token_line = current_line(),
-                                         .column = int_column});
-          buffer_.GetTokenInfo(token).real_id =
-              buffer_.value_stores_->reals().Add(
-                  Real{.mantissa = value.mantissa,
-                       .exponent = value.exponent,
-                       .is_decimal =
-                           (value.radix == NumericLiteral::Radix::Decimal)});
-          return token;
-        },
-        [&](NumericLiteral::UnrecoverableError) {
-          auto token = buffer_.AddToken({
-              .kind = TokenKind::Error,
-              .token_line = current_line(),
-              .column = int_column,
-              .error_length = token_size,
-          });
-          return token;
-        });
-  }
-
-  auto LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
-      -> LexResult {
-    std::optional<StringLiteral> literal =
-        StringLiteral::Lex(source_text.substr(position));
-    if (!literal) {
-      return LexError(source_text, position);
-    }
-
-    Line string_line = current_line();
-    int string_column = ComputeColumn(position);
-    ssize_t literal_size = literal->text().size();
-    position += literal_size;
-
-    // Update line and column information.
-    if (literal->is_multi_line()) {
-      while (current_line_info()->start + current_line_info()->length <
-             position) {
-        ++line_index_;
-        current_line_info()->indent = string_column;
-      }
-      // Note that we've updated the current line at this point, but
-      // `set_indent_` is already true from above. That remains correct as the
-      // last line of the multi-line literal *also* has its indent set.
-    }
-
-    if (literal->is_terminated()) {
-      auto string_id = buffer_.value_stores_->string_literals().Add(
-          literal->ComputeValue(buffer_.allocator_, emitter_));
-      auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral,
-                                     .token_line = string_line,
-                                     .column = string_column,
-                                     .string_literal_id = string_id});
-      return token;
-    } else {
-      CARBON_DIAGNOSTIC(UnterminatedString, Error,
-                        "String is missing a terminator.");
-      emitter_.Emit(literal->text().begin(), UnterminatedString);
-      return buffer_.AddToken(
-          {.kind = TokenKind::Error,
-           .token_line = string_line,
-           .column = string_column,
-           .error_length = static_cast<int32_t>(literal_size)});
-    }
-  }
-
-  auto LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
-                             ssize_t& position) -> Token {
-    // Verify in a debug build that the incoming token kind is correct.
-    CARBON_DCHECK(kind != TokenKind::Error);
-    CARBON_DCHECK(kind.fixed_spelling().size() == 1);
-    CARBON_DCHECK(source_text[position] == kind.fixed_spelling().front())
-        << "Source text starts with '" << source_text[position]
-        << "' instead of the spelling '" << kind.fixed_spelling()
-        << "' of the incoming token kind '" << kind << "'";
-
-    Token token = buffer_.AddToken({.kind = kind,
-                                    .token_line = current_line(),
-                                    .column = ComputeColumn(position)});
-    ++position;
-    return token;
-  }
-
-  auto LexOpeningSymbolToken(llvm::StringRef source_text, TokenKind kind,
-                             ssize_t& position) -> LexResult {
-    Token token = LexOneCharSymbolToken(source_text, kind, position);
-    open_groups_.push_back(token);
-    return token;
-  }
-
-  auto LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
-                             ssize_t& position) -> LexResult {
-    auto unmatched_error = [&] {
-      CARBON_DIAGNOSTIC(
-          UnmatchedClosing, Error,
-          "Closing symbol without a corresponding opening symbol.");
-      emitter_.Emit(source_text.begin() + position, UnmatchedClosing);
-      Token token = buffer_.AddToken({.kind = TokenKind::Error,
-                                      .token_line = current_line(),
-                                      .column = ComputeColumn(position),
-                                      .error_length = 1});
-      ++position;
-      return token;
-    };
-
-    // If we have no open groups, this is an error.
-    if (LLVM_UNLIKELY(open_groups_.empty())) {
-      return unmatched_error();
-    }
-
-    Token opening_token = open_groups_.back();
-    // Close any invalid open groups first.
-    if (LLVM_UNLIKELY(buffer_.GetTokenInfo(opening_token).kind !=
-                      kind.opening_symbol())) {
-      CloseInvalidOpenGroups(kind, position);
-      // This may exhaust the open groups so re-check and re-error if needed.
-      if (open_groups_.empty()) {
-        return unmatched_error();
-      }
-      opening_token = open_groups_.back();
-      CARBON_DCHECK(buffer_.GetTokenInfo(opening_token).kind ==
-                    kind.opening_symbol());
-    }
-    open_groups_.pop_back();
-
-    // Now that the groups are all matched up, lex the actual token.
-    Token token = LexOneCharSymbolToken(source_text, kind, position);
-
-    // Note that it is important to get fresh token infos here as lexing the
-    // open token would invalidate any pointers.
-    buffer_.GetTokenInfo(opening_token).closing_token = token;
-    buffer_.GetTokenInfo(token).opening_token = opening_token;
-
-    return token;
-  }
-
-  auto LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
-      -> LexResult {
-    // One character symbols and grouping symbols are handled with dedicated
-    // dispatch. We only lex the multi-character tokens here.
-    TokenKind kind = llvm::StringSwitch<TokenKind>(source_text.substr(position))
-#define CARBON_SYMBOL_TOKEN(Name, Spelling) \
-  .StartsWith(Spelling, TokenKind::Name)
-#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)
-#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName)
-#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName)
-#include "toolchain/lex/token_kind.def"
-                         .Default(TokenKind::Error);
-    if (kind == TokenKind::Error) {
-      return LexError(source_text, position);
-    }
-
-    Token token = buffer_.AddToken({.kind = kind,
-                                    .token_line = current_line(),
-                                    .column = ComputeColumn(position)});
-    position += kind.fixed_spelling().size();
-    return token;
-  }
-
-  // Given a word that has already been lexed, determine whether it is a type
-  // literal and if so form the corresponding token.
-  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
-      -> LexResult {
-    if (word.size() < 2) {
-      // Too short to form one of these tokens.
-      return LexResult::NoMatch();
-    }
-    if (word[1] < '1' || word[1] > '9') {
-      // Doesn't start with a valid initial digit.
-      return LexResult::NoMatch();
-    }
-
-    std::optional<TokenKind> kind;
-    switch (word.front()) {
-      case 'i':
-        kind = TokenKind::IntegerTypeLiteral;
-        break;
-      case 'u':
-        kind = TokenKind::UnsignedIntegerTypeLiteral;
-        break;
-      case 'f':
-        kind = TokenKind::FloatingPointTypeLiteral;
-        break;
-      default:
-        return LexResult::NoMatch();
-    };
-
-    llvm::StringRef suffix = word.substr(1);
-    if (!CanLexInteger(emitter_, suffix)) {
-      return buffer_.AddToken(
-          {.kind = TokenKind::Error,
-           .token_line = current_line(),
-           .column = column,
-           .error_length = static_cast<int32_t>(word.size())});
-    }
-    llvm::APInt suffix_value;
-    if (suffix.getAsInteger(10, suffix_value)) {
-      return LexResult::NoMatch();
-    }
-
-    auto token = buffer_.AddToken(
-        {.kind = *kind, .token_line = current_line(), .column = column});
-    buffer_.GetTokenInfo(token).integer_id =
-        buffer_.value_stores_->integers().Add(std::move(suffix_value));
-    return token;
-  }
-
-  // Closes all open groups that cannot remain open across a closing symbol.
-  // Users may pass `Error` to close all open groups.
-  [[gnu::noinline]] auto CloseInvalidOpenGroups(TokenKind kind,
-                                                ssize_t position) -> void {
-    CARBON_CHECK(kind.is_closing_symbol() || kind == TokenKind::Error);
-    CARBON_CHECK(!open_groups_.empty());
-
-    int column = ComputeColumn(position);
-
-    do {
-      Token opening_token = open_groups_.back();
-      TokenKind opening_kind = buffer_.GetTokenInfo(opening_token).kind;
-      if (kind == opening_kind.closing_symbol()) {
-        return;
-      }
-
-      open_groups_.pop_back();
-      CARBON_DIAGNOSTIC(
-          MismatchedClosing, Error,
-          "Closing symbol does not match most recent opening symbol.");
-      token_emitter_.Emit(opening_token, MismatchedClosing);
-
-      CARBON_CHECK(!buffer_.tokens().empty())
-          << "Must have a prior opening token!";
-      Token prev_token = buffer_.tokens().end()[-1];
-
-      // TODO: do a smarter backwards scan for where to put the closing
-      // token.
-      Token closing_token = buffer_.AddToken(
-          {.kind = opening_kind.closing_symbol(),
-           .has_trailing_space = buffer_.HasTrailingWhitespace(prev_token),
-           .is_recovery = true,
-           .token_line = current_line(),
-           .column = column});
-      TokenInfo& opening_token_info = buffer_.GetTokenInfo(opening_token);
-      TokenInfo& closing_token_info = buffer_.GetTokenInfo(closing_token);
-      opening_token_info.closing_token = closing_token;
-      closing_token_info.opening_token = opening_token;
-    } while (!open_groups_.empty());
-  }
-
-  auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
-      -> LexResult {
-    if (static_cast<unsigned char>(source_text[position]) > 0x7F) {
-      // TODO: Need to add support for Unicode lexing.
-      return LexError(source_text, position);
-    }
-    CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
-
-    int column = ComputeColumn(position);
-
-    // Take the valid characters off the front of the source buffer.
-    llvm::StringRef identifier_text =
-        ScanForIdentifierPrefix(source_text.substr(position));
-    CARBON_CHECK(!identifier_text.empty())
-        << "Must have at least one character!";
-    position += identifier_text.size();
-
-    // Check if the text is a type literal, and if so form such a literal.
-    if (LexResult result = LexWordAsTypeLiteralToken(identifier_text, column)) {
-      return result;
-    }
-
-    // Check if the text matches a keyword token, and if so use that.
-    TokenKind kind = llvm::StringSwitch<TokenKind>(identifier_text)
-#define CARBON_KEYWORD_TOKEN(Name, Spelling) .Case(Spelling, TokenKind::Name)
-#include "toolchain/lex/token_kind.def"
-                         .Default(TokenKind::Error);
-    if (kind != TokenKind::Error) {
-      return buffer_.AddToken(
-          {.kind = kind, .token_line = current_line(), .column = column});
-    }
-
-    // Otherwise we have a generic identifier.
-    return buffer_.AddToken(
-        {.kind = TokenKind::Identifier,
-         .token_line = current_line(),
-         .column = column,
-         .ident_id =
-             buffer_.value_stores_->identifiers().Add(identifier_text)});
-  }
-
-  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
-                                      ssize_t& position) -> LexResult {
-    CARBON_CHECK(source_text[position] == 'r');
-    // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
-    // identifier starting with the 'r'.
-    // TODO: Need to add support for Unicode lexing.
-    if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
-                    source_text[position + 1] != '#' ||
-                    !IsIdStartByteTable[source_text[position + 2]])) {
-      // TODO: Should this print a different error when there is `r#`, but it
-      // isn't followed by identifier text? Or is it right to put it back so
-      // that the `#` could be parsed as part of a raw string literal?
-      return LexKeywordOrIdentifier(source_text, position);
-    }
-
-    int column = ComputeColumn(position);
-
-    // Take the valid characters off the front of the source buffer.
-    llvm::StringRef identifier_text =
-        ScanForIdentifierPrefix(source_text.substr(position + 2));
-    CARBON_CHECK(!identifier_text.empty())
-        << "Must have at least one character!";
-    position += identifier_text.size() + 2;
-
-    // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks.
-
-    // Otherwise we have a raw identifier.
-    // TODO: This token doesn't carry any indicator that it's raw, so
-    // diagnostics are unclear.
-    return buffer_.AddToken(
-        {.kind = TokenKind::Identifier,
-         .token_line = current_line(),
-         .column = column,
-         .ident_id =
-             buffer_.value_stores_->identifiers().Add(identifier_text)});
-  }
-
-  auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
-    llvm::StringRef error_text =
-        source_text.substr(position).take_while([](char c) {
-          if (IsAlnum(c)) {
-            return false;
-          }
-          switch (c) {
-            case '_':
-            case '\t':
-            case '\n':
-              return false;
-            default:
-              break;
-          }
-          return llvm::StringSwitch<bool>(llvm::StringRef(&c, 1))
-#define CARBON_SYMBOL_TOKEN(Name, Spelling) .StartsWith(Spelling, false)
-#include "toolchain/lex/token_kind.def"
-              .Default(true);
-        });
-    if (error_text.empty()) {
-      // TODO: Reimplement this to use the lexer properly. In the meantime,
-      // guarantee that we eat at least one byte.
-      error_text = source_text.substr(position, 1);
-    }
-
-    auto token = buffer_.AddToken(
-        {.kind = TokenKind::Error,
-         .token_line = current_line(),
-         .column = ComputeColumn(position),
-         .error_length = static_cast<int32_t>(error_text.size())});
-    CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
-                      "Encountered unrecognized characters while parsing.");
-    emitter_.Emit(error_text.begin(), UnrecognizedCharacters);
-
-    position += error_text.size();
-    return token;
-  }
-
-  auto LexStartOfFile(llvm::StringRef source_text, ssize_t& position) -> void {
-    // Before lexing any source text, add the start-of-file token so that code
-    // can assume a non-empty token buffer for the rest of lexing. Note that the
-    // start-of-file always has trailing space because it *is* whitespace.
-    buffer_.AddToken({.kind = TokenKind::StartOfFile,
-                      .has_trailing_space = true,
-                      .token_line = current_line(),
-                      .column = 0});
-
-    // Also skip any horizontal whitespace and record the indentation of the
-    // first line.
-    SkipHorizontalWhitespace(source_text, position);
-    auto* line_info = current_line_info();
-    CARBON_CHECK(line_info->start == 0);
-    line_info->indent = position;
-  }
-
-  auto LexEndOfFile(llvm::StringRef source_text, ssize_t position) -> void {
-    CARBON_CHECK(position == static_cast<ssize_t>(source_text.size()));
-    // Check if the last line is empty and not the first line (and only). If so,
-    // re-pin the last line to be the prior one so that diagnostics and editors
-    // can treat newlines as terminators even though we internally handle them
-    // as separators in case of a missing newline on the last line. We do this
-    // here instead of detecting this when we see the newline to avoid more
-    // conditions along that fast path.
-    if (position == current_line_info()->start && line_index_ != 0) {
-      --line_index_;
-      --position;
-    } else {
-      // Update the line length as this is also the end of a line.
-      current_line_info()->length = ComputeColumn(position);
-    }
-
-    // The end-of-file token is always considered to be whitespace.
-    NoteWhitespace();
-
-    // Close any open groups. We do this after marking whitespace, it will
-    // preserve that.
-    if (!open_groups_.empty()) {
-      CloseInvalidOpenGroups(TokenKind::Error, position);
-    }
-
-    buffer_.AddToken({.kind = TokenKind::EndOfFile,
-                      .token_line = current_line(),
-                      .column = ComputeColumn(position)});
-  }
-
-  // We use a collection of static member functions for table-based dispatch to
-  // lexer methods. These are named static member functions so that they show up
-  // helpfully in profiles and backtraces, but they tend to not contain the
-  // interesting logic and simply delegate to the relevant methods. All of their
-  // signatures need to be exactly the same however in order to ensure we can
-  // build efficient dispatch tables out of them. All of them end by doing a
-  // must-tail return call to this routine. It handles continuing the dispatch
-  // chain.
-  static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
-                           ssize_t position) -> void {
-    if (LLVM_LIKELY(position < static_cast<ssize_t>(source_text.size()))) {
-      // The common case is to tail recurse based on the next character. Note
-      // that because this is a must-tail return, this cannot fail to tail-call
-      // and will not grow the stack. This is in essence a loop with dynamic
-      // tail dispatch to the next stage of the loop.
-      [[clang::musttail]] return DispatchTable[static_cast<unsigned char>(
-          source_text[position])](lexer, source_text, position);
-    }
-
-    // When we finish the source text, stop recursing. We also hint this so that
-    // the tail-dispatch is optimized as that's essentially the loop back-edge
-    // and this is the loop exit.
-    lexer.LexEndOfFile(source_text, position);
-  }
-
-  // Define a set of dispatch functions that simply forward to a method that
-  // lexes a token. This includes validating that an actual token was produced,
-  // and continuing the dispatch.
-#define CARBON_DISPATCH_LEX_TOKEN(LexMethod)                                 \
-  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
-                                  ssize_t position)                          \
-      ->void {                                                               \
-    LexResult result = lexer.LexMethod(source_text, position);               \
-    CARBON_CHECK(result) << "Failed to form a token!";                       \
-    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
-  }
-  CARBON_DISPATCH_LEX_TOKEN(LexError)
-  CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
-  CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
-  CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
-  CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
-  CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
-
-  // A custom dispatch functions that pre-select the symbol token to lex.
-#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                           \
-  static auto Dispatch##LexMethod##SymbolToken(                               \
-      Lexer& lexer, llvm::StringRef source_text, ssize_t position)            \
-      ->void {                                                                \
-    LexResult result = lexer.LexMethod##SymbolToken(                          \
-        source_text, OneCharTokenKindTable[source_text[position]], position); \
-    CARBON_CHECK(result) << "Failed to form a token!";                        \
-    [[clang::musttail]] return DispatchNext(lexer, source_text, position);    \
-  }
-  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
-  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
-  CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
-
-  // Define a set of non-token dispatch functions that handle things like
-  // whitespace and comments.
-#define CARBON_DISPATCH_LEX_NON_TOKEN(LexMethod)                             \
-  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
-                                  ssize_t position)                          \
-      ->void {                                                               \
-    lexer.LexMethod(source_text, position);                                  \
-    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
-  }
-  CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
-  CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
-  CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
-
-  // The main entry point for dispatching through the lexer's table. This method
-  // should always fully consume the source text.
-  auto Lex() && -> TokenizedBuffer {
-    llvm::StringRef source_text = buffer_.source_->text();
-
-    // First build up our line data structures.
-    CreateLines(source_text);
-
-    ssize_t position = 0;
-    LexStartOfFile(source_text, position);
-
-    // Manually enter the dispatch loop. This call will tail-recurse through the
-    // dispatch table until everything from source_text is consumed.
-    DispatchNext(*this, source_text, position);
-
-    if (consumer_.seen_error()) {
-      buffer_.has_errors_ = true;
-    }
-
-    return std::move(buffer_);
-  }
-
- private:
-  using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef source_text,
-                                 ssize_t position) -> void;
-  using DispatchTableT = std::array<DispatchFunctionT*, 256>;
-
-  // Build a table of function pointers that we can use to dispatch to the
-  // correct lexer routine based on the first byte of source text.
-  //
-  // While it is tempting to simply use a `switch` on the first byte and
-  // dispatch with cases into this, in practice that doesn't produce great code.
-  // There seem to be two issues that are the root cause.
-  //
-  // First, there are lots of different values of bytes that dispatch to a
-  // fairly small set of routines, and then some byte values that dispatch
-  // differently for each byte. This pattern isn't one that the compiler-based
-  // lowering of switches works well with -- it tries to balance all the cases,
-  // and in doing so emits several compares and other control flow rather than a
-  // simple jump table.
-  //
-  // Second, with a `case`, it isn't as obvious how to create a single, uniform
-  // interface that is effective for *every* byte value, and thus makes for a
-  // single consistent table-based dispatch. By forcing these to be function
-  // pointers, we also coerce the code to use a strictly homogeneous structure
-  // that can form a single dispatch table.
-  //
-  // These two actually interact -- the second issue is part of what makes the
-  // non-table lowering in the first one desirable for many switches and cases.
-  //
-  // Ultimately, when table-based dispatch is such an important technique, we
-  // get better results by taking full control and manually creating the
-  // dispatch structures.
-  //
-  // The functions in this table also use tail-recursion to implement the loop
-  // of the lexer. This is based on the technique described more fully for any
-  // kind of byte-stream loop structure here:
-  // https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
-  constexpr static auto MakeDispatchTable() -> DispatchTableT {
-    DispatchTableT table = {};
-    // First set the table entries to dispatch to our error token handler as the
-    // base case. Everything valid comes from an override below.
-    for (int i = 0; i < 256; ++i) {
-      table[i] = &DispatchLexError;
-    }
-
-    // Symbols have some special dispatching. First, set the first character of
-    // each symbol token spelling to dispatch to the symbol lexer. We don't
-    // provide a pre-computed token here, so the symbol lexer will compute the
-    // exact symbol token kind. We'll override this with more specific dispatch
-    // below.
-#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
-  table[(Spelling)[0]] = &DispatchLexSymbolToken;
-#include "toolchain/lex/token_kind.def"
-
-    // Now special cased single-character symbols that are guaranteed to not
-    // join with another symbol. These are grouping symbols, terminators,
-    // or separators in the grammar and have a good reason to be
-    // orthogonal to any other punctuation. We do this separately because this
-    // needs to override some of the generic handling above, and provide a
-    // custom token.
-#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
-  table[(Spelling)[0]] = &DispatchLexOneCharSymbolToken;
-#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
-  table[(Spelling)[0]] = &DispatchLexOpeningSymbolToken;
-#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
-  table[(Spelling)[0]] = &DispatchLexClosingSymbolToken;
-#include "toolchain/lex/token_kind.def"
-
-    // Override the handling for `/` to consider comments as well as a `/`
-    // symbol.
-    table['/'] = &DispatchLexCommentOrSlash;
-
-    table['_'] = &DispatchLexKeywordOrIdentifier;
-    // Note that we don't use `llvm::seq` because this needs to be `constexpr`
-    // evaluated.
-    for (unsigned char c = 'a'; c <= 'z'; ++c) {
-      table[c] = &DispatchLexKeywordOrIdentifier;
-    }
-    table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
-    for (unsigned char c = 'A'; c <= 'Z'; ++c) {
-      table[c] = &DispatchLexKeywordOrIdentifier;
-    }
-    // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
-    // as whitespace characters should already have been skipped and the
-    // only remaining valid Unicode characters would be part of an
-    // identifier. That code can either accept or reject.
-    for (int i = 0x80; i < 0x100; ++i) {
-      table[i] = &DispatchLexKeywordOrIdentifier;
-    }
-
-    for (unsigned char c = '0'; c <= '9'; ++c) {
-      table[c] = &DispatchLexNumericLiteral;
-    }
-
-    table['\''] = &DispatchLexStringLiteral;
-    table['"'] = &DispatchLexStringLiteral;
-    table['#'] = &DispatchLexStringLiteral;
-
-    table[' '] = &DispatchLexHorizontalWhitespace;
-    table['\t'] = &DispatchLexHorizontalWhitespace;
-    table['\n'] = &DispatchLexVerticalWhitespace;
-
-    return table;
-  };
-
-  static const DispatchTableT DispatchTable;
-
-  static const std::array<TokenKind, 256> OneCharTokenKindTable;
-
-  TokenizedBuffer buffer_;
-
-  ssize_t line_index_;
-
-  llvm::SmallVector<Token> open_groups_;
-
-  ErrorTrackingDiagnosticConsumer consumer_;
-
-  SourceBufferLocationTranslator translator_;
-  LexerDiagnosticEmitter emitter_;
-
-  TokenLocationTranslator token_translator_;
-  TokenDiagnosticEmitter token_emitter_;
-};
-
-constexpr TokenizedBuffer::Lexer::DispatchTableT
-    TokenizedBuffer::Lexer::DispatchTable = MakeDispatchTable();
-
-constexpr std::array<TokenKind, 256>
-    TokenizedBuffer::Lexer::OneCharTokenKindTable = [] {
-      std::array<TokenKind, 256> table = {};
-#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
-  table[(Spelling)[0]] = TokenKind::TokenName;
-#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
-  table[(Spelling)[0]] = TokenKind::TokenName;
-#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
-  table[(Spelling)[0]] = TokenKind::TokenName;
-#include "toolchain/lex/token_kind.def"
-      return table;
-    }();
-
-auto TokenizedBuffer::Lex(SharedValueStores& value_stores, SourceBuffer& source,
-                          DiagnosticConsumer& consumer) -> TokenizedBuffer {
-  Lexer lexer(value_stores, source, consumer);
-  return std::move(lexer).Lex();
-}
-
 auto TokenizedBuffer::GetKind(Token token) const -> TokenKind {
   return GetTokenInfo(token).kind;
 }

+ 3 - 14
toolchain/lex/tokenized_buffer.h

@@ -133,13 +133,6 @@ class TokenLocationTranslator : public DiagnosticLocationTranslator<Token> {
 // `HasError` returning true.
 class TokenizedBuffer : public Printable<TokenizedBuffer> {
  public:
-  // Lexes a buffer of source code into a tokenized buffer.
-  //
-  // The provided source buffer must outlive any returned `TokenizedBuffer`
-  // which will refer into the source.
-  static auto Lex(SharedValueStores& value_stores, SourceBuffer& source,
-                  DiagnosticConsumer& consumer) -> TokenizedBuffer;
-
   [[nodiscard]] auto GetKind(Token token) const -> TokenKind;
   [[nodiscard]] auto GetLine(Token token) const -> Line;
 
@@ -243,10 +236,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto filename() const -> llvm::StringRef { return source_->filename(); }
 
  private:
-  // Implementation detail struct implementing the actual lexer logic.
-  class Lexer;
-  friend Lexer;
-
+  friend class Lexer;
   friend class TokenLocationTranslator;
 
   // A diagnostic location translator that maps token locations into source
@@ -335,9 +325,8 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   };
 
   // The constructor is merely responsible for trivial initialization of
-  // members. A working object of this type is built with the `lex` function
-  // above so that its return can indicate if an error was encountered while
-  // lexing.
+  // members. A working object of this type is built with `Lex::Lex` so that its
+  // return can indicate if an error was encountered while lexing.
   explicit TokenizedBuffer(SharedValueStores& value_stores,
                            SourceBuffer& source)
       : value_stores_(&value_stores), source_(&source) {}

+ 3 - 2
toolchain/lex/tokenized_buffer_benchmark.cpp

@@ -14,6 +14,7 @@
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/lex/tokenized_buffer.h"
 
@@ -375,14 +376,14 @@ class LexerBenchHelper {
 
   auto Lex() -> TokenizedBuffer {
     DiagnosticConsumer& consumer = NullDiagnosticConsumer();
-    return TokenizedBuffer::Lex(value_stores_, source_, consumer);
+    return Lex::Lex(value_stores_, source_, consumer);
   }
 
   auto DiagnoseErrors() -> std::string {
     std::string result;
     llvm::raw_string_ostream out(result);
     StreamDiagnosticConsumer consumer(out);
-    auto buffer = TokenizedBuffer::Lex(value_stores_, source_, consumer);
+    auto buffer = Lex::Lex(value_stores_, source_, consumer);
     consumer.Flush();
     CARBON_CHECK(buffer.has_errors())
         << "Asked to diagnose errors but none found!";

+ 2 - 3
toolchain/lex/tokenized_buffer_fuzzer.cpp

@@ -8,7 +8,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
-#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/lex/lex.h"
 
 namespace Carbon::Testing {
 
@@ -35,8 +35,7 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
       SourceBuffer::CreateFromFile(fs, TestFileName, NullDiagnosticConsumer());
 
   SharedValueStores value_stores;
-  auto buffer = Lex::TokenizedBuffer::Lex(value_stores, *source,
-                                          NullDiagnosticConsumer());
+  auto buffer = Lex::Lex(value_stores, *source, NullDiagnosticConsumer());
   if (buffer.has_errors()) {
     return 0;
   }

+ 2 - 1
toolchain/lex/tokenized_buffer_test.cpp

@@ -15,6 +15,7 @@
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/mocks.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/lex/tokenized_buffer_test_helpers.h"
 #include "toolchain/testing/yaml_test_helpers.h"
 
@@ -46,7 +47,7 @@ class LexerTest : public ::testing::Test {
   auto Lex(llvm::StringRef text,
            DiagnosticConsumer& consumer = ConsoleDiagnosticConsumer())
       -> TokenizedBuffer {
-    return TokenizedBuffer::Lex(value_stores_, GetSourceBuffer(text), consumer);
+    return Lex::Lex(value_stores_, GetSourceBuffer(text), consumer);
   }
 
   SharedValueStores value_stores_;

+ 2 - 1
toolchain/parse/BUILD

@@ -74,6 +74,7 @@ cc_test(
         "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:mocks",
+        "//toolchain/lex",
         "//toolchain/lex:tokenized_buffer",
         "//toolchain/testing:yaml_test_helpers",
         "@com_google_googletest//:gtest",
@@ -92,7 +93,7 @@ cc_fuzz_test(
         "//toolchain/base:value_store",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:null_diagnostics",
-        "//toolchain/lex:tokenized_buffer",
+        "//toolchain/lex",
         "@llvm-project//llvm:Support",
     ],
 )

+ 2 - 3
toolchain/parse/parse_fuzzer.cpp

@@ -8,7 +8,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
-#include "toolchain/lex/tokenized_buffer.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/parse/tree.h"
 
 namespace Carbon::Testing {
@@ -33,8 +33,7 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
 
   // Lex the input.
   SharedValueStores value_stores;
-  auto tokens = Lex::TokenizedBuffer::Lex(value_stores, *source,
-                                          NullDiagnosticConsumer());
+  auto tokens = Lex::Lex(value_stores, *source, NullDiagnosticConsumer());
   if (tokens.has_errors()) {
     return 0;
   }

+ 3 - 2
toolchain/parse/tree_test.cpp

@@ -13,6 +13,7 @@
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/mocks.h"
+#include "toolchain/lex/lex.h"
 #include "toolchain/lex/tokenized_buffer.h"
 #include "toolchain/testing/yaml_test_helpers.h"
 
@@ -36,8 +37,8 @@ class TreeTest : public ::testing::Test {
   }
 
   auto GetTokenizedBuffer(llvm::StringRef t) -> Lex::TokenizedBuffer& {
-    token_storage_.push_front(Lex::TokenizedBuffer::Lex(
-        value_stores_, GetSourceBuffer(t), consumer_));
+    token_storage_.push_front(
+        Lex::Lex(value_stores_, GetSourceBuffer(t), consumer_));
     return token_storage_.front();
   }