#!/usr/bin/env python3
"""Script to compute statistics about source code."""

from __future__ import annotations

__copyright__ = """
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""

import argparse
import math
import re
from collections import Counter
from dataclasses import asdict, dataclass, field
from multiprocessing import Pool
from pathlib import Path
from typing import Optional

from alive_progress import alive_bar  # type: ignore
import termplotlib as tpl  # type: ignore

BLANK_RE = re.compile(r"\s*")
COMMENT_RE = re.compile(r"\s*///*\s*")
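
# Illustrative examples (not part of the original script): BLANK_RE
# fullmatches whitespace-only lines such as "" or "    ", while COMMENT_RE
# matches the leading comment marker of a line, e.g. the "  // " in
# "  // TODO: fix".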

LINE_RE = re.compile(
    r"""
    (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
    (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
    (?P<trailing_comment>//.*)|
    (?P<internal_comment>/\*.*\*/)|
    (?P<string_literal>"([^"]|\\")*"|'([^']|\\')*')|
    (?P<float_literal>\b(0[xb][0-9a-fA-F']*|[0-9][0-9']*)\.[0-9a-fA-F']*([eEpP][0-9a-fA-F']*)?)|
    (?P<int_literal>\b(0[xb][0-9a-fA-F']+|[0-9][0-9']*)([eEpP][0-9a-fA-F']*)?)|
    (?P<symbol>[\[\]{}(),.;]|[-+=!@#$%^&*/?|<>]+)|
    (?P<keyword>\b(auto|bool|break|case|catch|char|class|const|continue|default|do|double|else|enum|explicit|extern|false|float|for|friend|goto|if|inline|int|long|mutable|namespace|new|nullptr|operator|private|protected|public|return|short|signed|sizeof|static|struct|switch|template|this|throw|true|try|typedef|union|unsigned|using|virtual|void|while)\b)|
    (?P<id>\b\w+\b)
    """,
    re.X,
)
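
# A hypothetical worked example: on the C++ line `int n = 42;  // count`,
# LINE_RE yields "int" (keyword), "n" (id), "=" (symbol), "42" (int_literal),
# ";" (symbol), and "// count" (trailing_comment). Alternatives are tried in
# order, so a trailing comment wins over the punctuation it contains.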


@dataclass
class Stats:
    """Stats collected while scanning source files."""

    lines: int = 0
    blank_lines: int = 0
    comment_lines: int = 0
    empty_comment_lines: int = 0
    comment_line_widths: Counter[int] = field(default_factory=Counter)
    lines_with_trailing_comments: int = 0
    classes: int = 0
    internal_comments: int = 0
    string_literals: int = 0
    string_literals_per_line: Counter[int] = field(default_factory=Counter)
    int_literals: int = 0
    int_literals_per_line: Counter[int] = field(default_factory=Counter)
    float_literals: int = 0
    float_literals_per_line: Counter[int] = field(default_factory=Counter)
    symbols: int = 0
    symbols_per_line: Counter[int] = field(default_factory=Counter)
    keywords: int = 0
    keywords_per_line: Counter[int] = field(default_factory=Counter)
    identifiers: int = 0
    identifier_widths: Counter[int] = field(default_factory=Counter)
    ids_per_line: Counter[int] = field(default_factory=Counter)
    unique_ids_per_ten_lines: Counter[int] = field(default_factory=Counter)
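
    # The `*_per_line` and `*_widths` fields are histograms: each maps an
    # observed per-line count (or character width) to how many lines had it.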

    def accumulate(self, other: Stats) -> None:
        self.lines += other.lines
        self.blank_lines += other.blank_lines
        self.empty_comment_lines += other.empty_comment_lines
        self.comment_lines += other.comment_lines
        self.comment_line_widths.update(other.comment_line_widths)
        self.lines_with_trailing_comments += (
            other.lines_with_trailing_comments
        )
        self.classes += other.classes
        self.internal_comments += other.internal_comments
        self.string_literals += other.string_literals
        self.string_literals_per_line.update(other.string_literals_per_line)
        self.int_literals += other.int_literals
        self.int_literals_per_line.update(other.int_literals_per_line)
        self.float_literals += other.float_literals
        self.float_literals_per_line.update(other.float_literals_per_line)
        self.symbols += other.symbols
        self.symbols_per_line.update(other.symbols_per_line)
        self.keywords += other.keywords
        self.keywords_per_line.update(other.keywords_per_line)
        self.identifiers += other.identifiers
        self.identifier_widths.update(other.identifier_widths)
        self.ids_per_line.update(other.ids_per_line)
        self.unique_ids_per_ten_lines.update(other.unique_ids_per_ten_lines)
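
    # A minimal usage sketch (file names are hypothetical):
    #   total = Stats()
    #   total.accumulate(scan_file(Path("a.cc")))
    #   total.accumulate(scan_file(Path("b.cc")))
    # Merging is order-independent, which is why main() can consume results
    # from imap_unordered as they complete.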


def scan_file(file: Path) -> Stats:
    """Scans the provided file and accumulates stats."""
    stats = Stats()
    unique_ids = set()
    for line in file.open():
        # Strip off the line endings.
        line = line.rstrip("\r\n")
        # Skip over super long lines that are often URLs or structured data
        # that doesn't match "normal" source code patterns.
        if len(line) > 80:
            continue
        stats.lines += 1
        if re.fullmatch(BLANK_RE, line):
            stats.blank_lines += 1
            continue
        if m := re.match(COMMENT_RE, line):
            stats.comment_lines += 1
            if m.end() == len(line):
                stats.empty_comment_lines += 1
            else:
                stats.comment_line_widths[len(line)] += 1
            continue
        line_string_literals = 0
        line_int_literals = 0
        line_float_literals = 0
        line_symbols = 0
        line_keywords = 0
        line_identifiers = 0
        for m in re.finditer(LINE_RE, line):
            if m.group("trailing_comment"):
                stats.lines_with_trailing_comments += 1
                break
            if m.group("class_intro"):
                stats.classes += 1
                line_keywords += 1
                line_identifiers += 1
                stats.identifier_widths[len(m.group("class_name"))] += 1
            elif m.group("end_open_curly"):
                line_symbols += 1
            elif m.group("internal_comment"):
                stats.internal_comments += 1
            elif m.group("string_literal"):
                line_string_literals += 1
            elif m.group("int_literal"):
                line_int_literals += 1
            elif m.group("float_literal"):
                line_float_literals += 1
            elif m.group("symbol"):
                line_symbols += 1
            elif m.group("keyword"):
                line_keywords += 1
            else:
                assert m.group("id"), "Line is '%s', and match is '%s'" % (
                    line,
                    line[m.start() : m.end()],
                )
                line_identifiers += 1
                stats.identifier_widths[len(m.group("id"))] += 1
                unique_ids.add(m.group("id"))
        stats.string_literals += line_string_literals
        stats.string_literals_per_line[line_string_literals] += 1
        stats.int_literals += line_int_literals
        stats.int_literals_per_line[line_int_literals] += 1
        stats.float_literals += line_float_literals
        stats.float_literals_per_line[line_float_literals] += 1
        stats.symbols += line_symbols
        stats.symbols_per_line[line_symbols] += 1
        stats.keywords += line_keywords
        stats.keywords_per_line[line_keywords] += 1
        stats.identifiers += line_identifiers
        stats.ids_per_line[line_identifiers] += 1
    if stats.lines > 0:
        stats.unique_ids_per_ten_lines[
            math.ceil((len(unique_ids) * 10) / stats.lines)
        ] += 1
    return stats
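
# Note that scan_file skips lines longer than 80 columns entirely, so every
# count (and each fraction printed by main below) covers only "normal" lines.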


def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments and flags."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "files",
        metavar="FILE",
        type=Path,
        nargs="+",
        help="A file to scan while collecting statistics.",
    )
    return parser.parse_args(args=args)
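
# For illustration (hypothetical file names): parse_args(["a.cc", "b.cc"])
# returns a Namespace whose `files` is [Path("a.cc"), Path("b.cc")].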


def main() -> None:
    parsed_args = parse_args()
    stats = Stats()
    with alive_bar(len(parsed_args.files)) as bar:
        with Pool() as p:
            for file_stats in p.imap_unordered(scan_file, parsed_args.files):
                stats.accumulate(file_stats)
                bar()
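
    # Files are scanned in parallel worker processes; results arrive in
    # completion order, which is fine because accumulate() is commutative.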

    print(
        """
## Stats ##
Lines: %(lines)d
Blank lines: %(blank_lines)d
Comment lines: %(comment_lines)d
Empty comment lines: %(empty_comment_lines)d
Lines with trailing comments: %(lines_with_trailing_comments)d
Classes: %(classes)d
Internal comments: %(internal_comments)d
String literals: %(string_literals)d
Int literals: %(int_literals)d
Float literals: %(float_literals)d
Symbols: %(symbols)d
Keywords: %(keywords)d
IDs: %(identifiers)d"""
        % asdict(stats)
    )

    tokens = (
        stats.string_literals
        + stats.int_literals
        + stats.float_literals
        + stats.symbols
        + stats.keywords
        + stats.identifiers
    )
    print(
        f"""
Fraction of blank lines: {stats.blank_lines / stats.lines}
Fraction of comment lines: {stats.comment_lines / stats.lines}
Total counted tokens: {tokens}
Fraction string literals: {stats.string_literals / tokens}
Fraction int literals: {stats.int_literals / tokens}
Fraction float literals: {stats.float_literals / tokens}
Fraction symbols: {stats.symbols / tokens}
Fraction keywords: {stats.keywords / tokens}
Fraction IDs: {stats.identifiers / tokens}
"""
    )
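
    # Note: these fractions assume at least one line and one token were
    # counted; running on empty input would divide by zero.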

    def print_histogram(
        title: str, data: dict[int, int], column_format: str
    ) -> None:
        print()
        key_min = min(data.keys())
        key_max = max(data.keys()) + 1
        values = [data.get(k, 0) for k in range(key_min, key_max)]
        keys = [column_format % k for k in range(key_min, key_max)]
        total = sum(values)
        median = key_min
        p90 = key_min
        p95 = key_min
        p99 = key_min
        count = total
        for k in range(key_min, key_max):
            count -= data.get(k, 0)
            if median == key_min and count <= total / 2:
                median = k
            if p90 == key_min and count <= total / 10:
                p90 = k
            if p95 == key_min and count <= total / 20:
                p95 = k
            if p99 == key_min and count <= total / 100:
                p99 = k
        print(
            title + f" (median: {median}, p90: {p90}, p95: {p95}, p99: {p99})"
        )
        fig = tpl.figure()
        fig.barh(values, keys)
        fig.show()
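
    # In the percentile loop above, after subtracting data.get(k), `count` is
    # the number of samples with a key greater than k, so `median` becomes the
    # smallest k with at least half the samples at or below it (and likewise
    # for p90/p95/p99).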

    print_histogram(
        "## Comment line widths ##", stats.comment_line_widths, "%d columns"
    )
    print_histogram(
        "## String literals per line ##",
        stats.string_literals_per_line,
        "%d literals",
    )
    print_histogram(
        "## Int literals per line ##",
        stats.int_literals_per_line,
        "%d literals",
    )
    print_histogram(
        "## Float literals per line ##",
        stats.float_literals_per_line,
        "%d literals",
    )
    print_histogram(
        "## Symbols per line ##", stats.symbols_per_line, "%d symbols"
    )
    print_histogram(
        "## Keywords per line ##", stats.keywords_per_line, "%d keywords"
    )
    print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
    print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")
    print_histogram(
        "## Unique IDs per 10 lines ##",
        stats.unique_ids_per_ten_lines,
        "%d ids",
    )


if __name__ == "__main__":
    main()
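
# Example invocation (hypothetical paths):
#   ./source_stats.py $(find . -name '*.cpp' -o -name '*.h')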