#!/usr/bin/env python3
"""Script to compute statistics about source code."""
from __future__ import annotations

__copyright__ = """
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""
import argparse
import re
from collections import Counter
from dataclasses import asdict, dataclass, field
from multiprocessing import Pool
from pathlib import Path
from typing import Dict, List, Optional

from alive_progress import alive_bar  # type:ignore
import termplotlib as tpl  # type:ignore
  18. BLANK_RE = re.compile(r"\s*")
  19. COMMENT_RE = re.compile(r"\s*///*\s*")
  20. LINE_RE = re.compile(
  21. r"""
  22. (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
  23. (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
  24. (?P<trailing_comment>//.*)|
  25. (?P<id>\b\w+\b)
  26. """,
  27. re.X,
  28. )
  29. @dataclass
  30. class Stats:
  31. """Stats collected while scanning source files"""
  32. lines: int = 0
  33. blank_lines: int = 0
  34. comment_lines: int = 0
  35. empty_comment_lines: int = 0
  36. comment_line_widths: Counter[int] = field(default_factory=lambda: Counter())
  37. lines_with_trailing_comments: int = 0
  38. classes: int = 0
  39. identifiers: int = 0
  40. identifier_widths: Counter[int] = field(default_factory=lambda: Counter())
  41. ids_per_line: Counter[int] = field(default_factory=lambda: Counter())
  42. def accumulate(self, other: Stats) -> None:
  43. self.lines += other.lines
  44. self.blank_lines += other.blank_lines
  45. self.empty_comment_lines += other.empty_comment_lines
  46. self.comment_lines += other.comment_lines
  47. self.comment_line_widths.update(other.comment_line_widths)
  48. self.lines_with_trailing_comments += other.lines_with_trailing_comments
  49. self.classes += other.classes
  50. self.identifiers += other.identifiers
  51. self.identifier_widths.update(other.identifier_widths)
  52. self.ids_per_line.update(other.ids_per_line)
  53. def scan_file(file: Path) -> Stats:
  54. """Scans the provided file and accumulates stats."""
  55. stats = Stats()
  56. for line in file.open():
  57. # Strip off the line endings.
  58. line = line.rstrip("\r\n")
  59. # Skip over super long lines that are often URLs or structured data that
  60. # doesn't match "normal" source code patterns.
  61. if len(line) > 80:
  62. continue
  63. stats.lines += 1
  64. if re.fullmatch(BLANK_RE, line):
  65. stats.blank_lines += 1
  66. continue
  67. if m := re.match(COMMENT_RE, line):
  68. stats.comment_lines += 1
  69. if m.end() == len(line):
  70. stats.empty_comment_lines += 1
  71. else:
  72. stats.comment_line_widths[len(line)] += 1
  73. continue
  74. line_identifiers = 0
  75. for m in re.finditer(LINE_RE, line):
  76. if m.group("trailing_comment"):
  77. stats.lines_with_trailing_comments += 1
  78. break
  79. if m.group("class_intro"):
  80. stats.classes += 1
  81. line_identifiers += 1
  82. stats.identifier_widths[len(m.group("class_name"))] += 1
  83. elif m.group("end_open_curly"):
  84. pass
  85. else:
  86. assert m.group("id"), "Line is '%s', and match is '%s'" % (
  87. line,
  88. line[m.start() : m.end()],
  89. )
  90. line_identifiers += 1
  91. stats.identifier_widths[len(m.group("id"))] += 1
  92. stats.identifiers += line_identifiers
  93. stats.ids_per_line[line_identifiers] += 1
  94. return stats
  95. def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
  96. """Parsers command-line arguments and flags."""
  97. parser = argparse.ArgumentParser(description=__doc__)
  98. parser.add_argument(
  99. "files",
  100. metavar="FILE",
  101. type=Path,
  102. nargs="+",
  103. help="A file to scan while collecting statistics.",
  104. )
  105. return parser.parse_args(args=args)
  106. def main() -> None:
  107. parsed_args = parse_args()
  108. stats = Stats()
  109. with alive_bar(len(parsed_args.files)) as bar:
  110. with Pool() as p:
  111. for file_stats in p.imap_unordered(scan_file, parsed_args.files):
  112. stats.accumulate(file_stats)
  113. bar()
  114. print(
  115. """
  116. ## Stats ##
  117. Lines: %(lines)d
  118. Blank lines: %(blank_lines)d
  119. Comment lines: %(comment_lines)d
  120. Empty comment lines: %(empty_comment_lines)d
  121. Lines with trailing comments: %(lines_with_trailing_comments)d
  122. Classes: %(classes)d
  123. IDs: %(identifiers)d"""
  124. % asdict(stats)
  125. )
  126. def print_histogram(
  127. title: str, data: Dict[int, int], column_format: str
  128. ) -> None:
  129. print()
  130. print(title)
  131. key_min = min(data.keys())
  132. key_max = max(data.keys()) + 1
  133. values = [data.get(k, 0) for k in range(key_min, key_max)]
  134. keys = [column_format % k for k in range(key_min, key_max)]
  135. fig = tpl.figure()
  136. fig.barh(values, keys)
  137. fig.show()
  138. print_histogram(
  139. "## Comment line widths ##", stats.comment_line_widths, "%d columns"
  140. )
  141. print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
  142. print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")
  143. if __name__ == "__main__":
  144. main()