Browse Source

Add a script to scan source code for basic stats. (#3150)

This is a rough script that uses regexes to do a simple scan of source
code and extract some basic source code statistics. Things like column
width, comment line density, identifier lengths and densities.

After scanning, it prints out both raw stats and in a few cases renders
a quick histogram to the terminal to help visualize a relevant
distribution.

I threw this together pretty quickly, and this is an area of Python I
have very limited familiarity with, so happy to have any suggestions for
how to better approach this.
Chandler Carruth, 2 years ago
parent
commit
2de7d262b4
1 changed file with 164 additions and 0 deletions
  1. 164 0
      scripts/source_stats.py

+ 164 - 0
scripts/source_stats.py

@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+"""Script to compute statistics about source code."""
+
+from __future__ import annotations
+
+__copyright__ = """
+Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+Exceptions. See /LICENSE for license information.
+SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""
+
+import argparse
+from alive_progress import alive_bar  # type:ignore
+from multiprocessing import Pool
+import re
+import termplotlib as tpl  # type:ignore
+from pathlib import Path
+from typing import Dict, List, Optional
+from dataclasses import dataclass, field, asdict
+from collections import Counter
+
# Matches a line consisting entirely of whitespace (used with fullmatch).
BLANK_RE = re.compile(r"\s*")
# Matches the leading `//` (or `///`, `////`, ...) of a comment line along
# with any surrounding whitespace; `m.end()` therefore points at the first
# character of the comment's text.
COMMENT_RE = re.compile(r"\s*///*\s*")
# Tokenizes a non-blank, non-comment line of C++-like source. The named
# alternations are attempted in order at each position:
#   class_intro:      a `class`/`struct` keyword plus the introduced name
#                     (captured separately as `class_name`),
#   end_open_curly:   an open brace, optionally followed by a trailing
#                     `//` comment (captured as open_curly_trailing_comment),
#   trailing_comment: a `//` comment consuming the rest of the line,
#   id:               any other word-like token.
LINE_RE = re.compile(
    r"""
    (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
    (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
    (?P<trailing_comment>//.*)|
    (?P<id>\b\w+\b)
""",
    re.X,
)
+
+
@dataclass
class Stats:
    """Statistics aggregated while scanning source files.

    Each field is either a plain running count or a `Counter` histogram
    keyed by an integer measurement (e.g. a width in characters).
    """

    lines: int = 0
    blank_lines: int = 0
    comment_lines: int = 0
    empty_comment_lines: int = 0
    comment_line_widths: Counter[int] = field(default_factory=Counter)
    lines_with_trailing_comments: int = 0
    classes: int = 0
    identifiers: int = 0
    identifier_widths: Counter[int] = field(default_factory=Counter)
    ids_per_line: Counter[int] = field(default_factory=Counter)

    # The scalar tallies summed by `accumulate`.
    _COUNT_FIELDS = (
        "lines",
        "blank_lines",
        "comment_lines",
        "empty_comment_lines",
        "lines_with_trailing_comments",
        "classes",
        "identifiers",
    )
    # The histogram tallies merged by `accumulate`.
    _HISTOGRAM_FIELDS = (
        "comment_line_widths",
        "identifier_widths",
        "ids_per_line",
    )

    def accumulate(self, other: Stats) -> None:
        """Folds the tallies from `other` into this instance in place."""
        for count_field in self._COUNT_FIELDS:
            setattr(
                self,
                count_field,
                getattr(self, count_field) + getattr(other, count_field),
            )
        for histogram_field in self._HISTOGRAM_FIELDS:
            getattr(self, histogram_field).update(
                getattr(other, histogram_field)
            )
+
+
def scan_file(file: Path) -> Stats:
    """Scans the provided file and accumulates stats.

    Returns a `Stats` holding the tallies for this single file; callers are
    expected to merge per-file results with `Stats.accumulate`.
    """
    stats = Stats()
    # Open inside a context manager so the handle is closed deterministically;
    # the previous version leaked it until garbage collection.
    with file.open() as source:
        for line in source:
            # Strip off the line endings.
            line = line.rstrip("\r\n")
            # Skip over super long lines that are often URLs or structured
            # data that doesn't match "normal" source code patterns.
            if len(line) > 80:
                continue
            stats.lines += 1
            # Call the pre-compiled patterns' methods directly rather than
            # routing each line back through the module-level `re` helpers.
            if BLANK_RE.fullmatch(line):
                stats.blank_lines += 1
                continue
            if m := COMMENT_RE.match(line):
                stats.comment_lines += 1
                if m.end() == len(line):
                    # The comment marker reaches the end: no comment text.
                    stats.empty_comment_lines += 1
                else:
                    stats.comment_line_widths[len(line)] += 1
                continue
            line_identifiers = 0
            for m in LINE_RE.finditer(line):
                if m.group("trailing_comment"):
                    # `//.*` consumes the rest of the line, so nothing else
                    # can match after this; stop scanning the line.
                    stats.lines_with_trailing_comments += 1
                    break
                if m.group("class_intro"):
                    stats.classes += 1
                    line_identifiers += 1
                    stats.identifier_widths[len(m.group("class_name"))] += 1
                elif m.group("end_open_curly"):
                    pass
                else:
                    # The alternation has no other branches, so `id` must
                    # have matched.
                    assert m.group("id"), "Line is '%s', and match is '%s'" % (
                        line,
                        line[m.start() : m.end()],
                    )
                    line_identifiers += 1
                    stats.identifier_widths[len(m.group("id"))] += 1
            stats.identifiers += line_identifiers
            stats.ids_per_line[line_identifiers] += 1
    return stats
+
+
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments and flags.

    When `args` is None, argparse falls back to `sys.argv`; passing an
    explicit list is mainly useful for testing.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "files",
        metavar="FILE",
        type=Path,
        nargs="+",
        help="A file to scan while collecting statistics.",
    )
    return arg_parser.parse_args(args=args)
+
+
def main() -> None:
    """Scans each file named on the command line and prints aggregate stats."""
    parsed_args = parse_args()
    stats = Stats()
    # The regex scanning is CPU-bound, so fan files out across a process
    # pool and merge per-file results as they complete (order is irrelevant
    # because accumulation is commutative).
    with alive_bar(len(parsed_args.files)) as bar:
        with Pool() as p:
            for file_stats in p.imap_unordered(scan_file, parsed_args.files):
                stats.accumulate(file_stats)
                bar()

    print(
        """
## Stats ##
Lines: %(lines)d
Blank lines: %(blank_lines)d
Comment lines: %(comment_lines)d
Empty comment lines: %(empty_comment_lines)d
Lines with trailing comments: %(lines_with_trailing_comments)d
Classes: %(classes)d
IDs: %(identifiers)d"""
        % asdict(stats)
    )

    def print_histogram(
        title: str, data: Dict[int, int], column_format: str
    ) -> None:
        """Renders `data` as a horizontal bar histogram on the terminal."""
        print()
        print(title)
        if not data:
            # Guard the empty case: min()/max() below raise ValueError on an
            # empty mapping (e.g. scanning files with no comment lines).
            print("(no data)")
            return
        key_min = min(data.keys())
        key_max = max(data.keys()) + 1
        # Densify the key range so gaps render as zero-height bars rather
        # than silently disappearing from the plot.
        values = [data.get(k, 0) for k in range(key_min, key_max)]
        keys = [column_format % k for k in range(key_min, key_max)]
        fig = tpl.figure()
        fig.barh(values, keys)
        fig.show()

    print_histogram(
        "## Comment line widths ##", stats.comment_line_widths, "%d columns"
    )
    print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
    print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")
+
+
# Standard script entry-point guard: run the scan only when executed
# directly, not when this module is imported.
if __name__ == "__main__":
    main()