| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164 |
- #!/usr/bin/env python3
- """Script to compute statistics about source code."""
- from __future__ import annotations
- __copyright__ = """
- Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- Exceptions. See /LICENSE for license information.
- SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- """
- import argparse
- from alive_progress import alive_bar # type:ignore
- from multiprocessing import Pool
- import re
- import termplotlib as tpl # type:ignore
- from pathlib import Path
- from typing import Dict, List, Optional
- from dataclasses import dataclass, field, asdict
- from collections import Counter
- BLANK_RE = re.compile(r"\s*")
- COMMENT_RE = re.compile(r"\s*///*\s*")
- LINE_RE = re.compile(
- r"""
- (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
- (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
- (?P<trailing_comment>//.*)|
- (?P<id>\b\w+\b)
- """,
- re.X,
- )
- @dataclass
- class Stats:
- """Stats collected while scanning source files"""
- lines: int = 0
- blank_lines: int = 0
- comment_lines: int = 0
- empty_comment_lines: int = 0
- comment_line_widths: Counter[int] = field(default_factory=lambda: Counter())
- lines_with_trailing_comments: int = 0
- classes: int = 0
- identifiers: int = 0
- identifier_widths: Counter[int] = field(default_factory=lambda: Counter())
- ids_per_line: Counter[int] = field(default_factory=lambda: Counter())
- def accumulate(self, other: Stats) -> None:
- self.lines += other.lines
- self.blank_lines += other.blank_lines
- self.empty_comment_lines += other.empty_comment_lines
- self.comment_lines += other.comment_lines
- self.comment_line_widths.update(other.comment_line_widths)
- self.lines_with_trailing_comments += other.lines_with_trailing_comments
- self.classes += other.classes
- self.identifiers += other.identifiers
- self.identifier_widths.update(other.identifier_widths)
- self.ids_per_line.update(other.ids_per_line)
- def scan_file(file: Path) -> Stats:
- """Scans the provided file and accumulates stats."""
- stats = Stats()
- for line in file.open():
- # Strip off the line endings.
- line = line.rstrip("\r\n")
- # Skip over super long lines that are often URLs or structured data that
- # doesn't match "normal" source code patterns.
- if len(line) > 80:
- continue
- stats.lines += 1
- if re.fullmatch(BLANK_RE, line):
- stats.blank_lines += 1
- continue
- if m := re.match(COMMENT_RE, line):
- stats.comment_lines += 1
- if m.end() == len(line):
- stats.empty_comment_lines += 1
- else:
- stats.comment_line_widths[len(line)] += 1
- continue
- line_identifiers = 0
- for m in re.finditer(LINE_RE, line):
- if m.group("trailing_comment"):
- stats.lines_with_trailing_comments += 1
- break
- if m.group("class_intro"):
- stats.classes += 1
- line_identifiers += 1
- stats.identifier_widths[len(m.group("class_name"))] += 1
- elif m.group("end_open_curly"):
- pass
- else:
- assert m.group("id"), "Line is '%s', and match is '%s'" % (
- line,
- line[m.start() : m.end()],
- )
- line_identifiers += 1
- stats.identifier_widths[len(m.group("id"))] += 1
- stats.identifiers += line_identifiers
- stats.ids_per_line[line_identifiers] += 1
- return stats
- def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
- """Parsers command-line arguments and flags."""
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "files",
- metavar="FILE",
- type=Path,
- nargs="+",
- help="A file to scan while collecting statistics.",
- )
- return parser.parse_args(args=args)
- def main() -> None:
- parsed_args = parse_args()
- stats = Stats()
- with alive_bar(len(parsed_args.files)) as bar:
- with Pool() as p:
- for file_stats in p.imap_unordered(scan_file, parsed_args.files):
- stats.accumulate(file_stats)
- bar()
- print(
- """
- ## Stats ##
- Lines: %(lines)d
- Blank lines: %(blank_lines)d
- Comment lines: %(comment_lines)d
- Empty comment lines: %(empty_comment_lines)d
- Lines with trailing comments: %(lines_with_trailing_comments)d
- Classes: %(classes)d
- IDs: %(identifiers)d"""
- % asdict(stats)
- )
- def print_histogram(
- title: str, data: Dict[int, int], column_format: str
- ) -> None:
- print()
- print(title)
- key_min = min(data.keys())
- key_max = max(data.keys()) + 1
- values = [data.get(k, 0) for k in range(key_min, key_max)]
- keys = [column_format % k for k in range(key_min, key_max)]
- fig = tpl.figure()
- fig.barh(values, keys)
- fig.show()
- print_histogram(
- "## Comment line widths ##", stats.comment_line_widths, "%d columns"
- )
- print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
- print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")
- if __name__ == "__main__":
- main()
|