Просмотр исходного кода

Expand source stats to roughly cover more tokens. (#3277)

This is pretty rough both in code-quality and what it tries to cover.
The coverage approximation is probably fine as it mostly just gives some
vague idea of the distributions we should aim at when evaluating
performance. We still want performance with unusual distributions to be
good.

Happy to improve the code quality if folks want, and especially with
suggestions on what would help. For this kind of quick scripting, it
seemed "fine" to me.
Chandler Carruth 2 лет назад
Родитель
Сommit
0e8e94d8bf
1 измененных файлов с 125 добавлено и 2 удалено
  1. 125 2
      scripts/source_stats.py

+ 125 - 2
scripts/source_stats.py

@@ -27,6 +27,12 @@ LINE_RE = re.compile(
     (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
     (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
     (?P<trailing_comment>//.*)|
+    (?P<internal_comment>/\*.*\*/)|
+    (?P<string_literal>"([^"]|\\")*"|'([^']|\\')*')|
+    (?P<float_literal>\b(0[xb][0-9a-fA-F']*|[0-9][0-9']*)\.[0-9a-fA-F']*([eEpP][0-9a-fA-F']*)?)|
+    (?P<int_literal>\b(0[xb][0-9a-fA-F']+|[0-9][0-9']*)([eEpP][0-9a-fA-F']*)?)|
+    (?P<symbol>[\[\]{}(),.;]|[-+=!@#$%^&*/?|<>]+)|
+    (?P<keyword>\b(auto|bool|break|case|catch|char|class|const|continue|default|do|double|else|enum|explicit|extern|false|float|for|friend|goto|if|inline|int|long|mutable|namespace|new|nullptr|operator|private|protected|public|return|short|signed|sizeof|static|struct|switch|template|this|throw|true|try|typedef|union|unsigned|using|virtual|void|while)\b)|
     (?P<id>\b\w+\b)
 """,
     re.X,
@@ -44,6 +50,23 @@ class Stats:
     comment_line_widths: Counter[int] = field(default_factory=lambda: Counter())
     lines_with_trailing_comments: int = 0
     classes: int = 0
+    internal_comments: int = 0
+    string_literals: int = 0
+    string_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    int_literals: int = 0
+    int_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    float_literals: int = 0
+    float_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    symbols: int = 0
+    symbols_per_line: Counter[int] = field(default_factory=lambda: Counter())
+    keywords: int = 0
+    keywords_per_line: Counter[int] = field(default_factory=lambda: Counter())
     identifiers: int = 0
     identifier_widths: Counter[int] = field(default_factory=lambda: Counter())
     ids_per_line: Counter[int] = field(default_factory=lambda: Counter())
@@ -56,6 +79,17 @@ class Stats:
         self.comment_line_widths.update(other.comment_line_widths)
         self.lines_with_trailing_comments += other.lines_with_trailing_comments
         self.classes += other.classes
+        self.internal_comments += other.internal_comments
+        self.string_literals += other.string_literals
+        self.string_literals_per_line.update(other.string_literals_per_line)
+        self.int_literals += other.int_literals
+        self.int_literals_per_line.update(other.int_literals_per_line)
+        self.float_literals += other.float_literals
+        self.float_literals_per_line.update(other.float_literals_per_line)
+        self.symbols += other.symbols
+        self.symbols_per_line.update(other.symbols_per_line)
+        self.keywords += other.keywords
+        self.keywords_per_line.update(other.keywords_per_line)
         self.identifiers += other.identifiers
         self.identifier_widths.update(other.identifier_widths)
         self.ids_per_line.update(other.ids_per_line)
@@ -82,6 +116,11 @@ def scan_file(file: Path) -> Stats:
             else:
                 stats.comment_line_widths[len(line)] += 1
             continue
+        line_string_literals = 0
+        line_int_literals = 0
+        line_float_literals = 0
+        line_symbols = 0
+        line_keywords = 0
         line_identifiers = 0
         for m in re.finditer(LINE_RE, line):
             if m.group("trailing_comment"):
@@ -89,10 +128,23 @@ def scan_file(file: Path) -> Stats:
                 break
             if m.group("class_intro"):
                 stats.classes += 1
+                line_keywords += 1
                 line_identifiers += 1
                 stats.identifier_widths[len(m.group("class_name"))] += 1
             elif m.group("end_open_curly"):
-                pass
+                line_symbols += 1
+            elif m.group("internal_comment"):
+                stats.internal_comments += 1
+            elif m.group("string_literal"):
+                line_string_literals += 1
+            elif m.group("int_literal"):
+                line_int_literals += 1
+            elif m.group("float_literal"):
+                line_float_literals += 1
+            elif m.group("symbol"):
+                line_symbols += 1
+            elif m.group("keyword"):
+                line_keywords += 1
             else:
                 assert m.group("id"), "Line is '%s', and match is '%s'" % (
                     line,
@@ -100,6 +152,16 @@ def scan_file(file: Path) -> Stats:
                 )
                 line_identifiers += 1
                 stats.identifier_widths[len(m.group("id"))] += 1
+        stats.string_literals += line_string_literals
+        stats.string_literals_per_line[line_string_literals] += 1
+        stats.int_literals += line_int_literals
+        stats.int_literals_per_line[line_int_literals] += 1
+        stats.float_literals += line_float_literals
+        stats.float_literals_per_line[line_float_literals] += 1
+        stats.symbols += line_symbols
+        stats.symbols_per_line[line_symbols] += 1
+        stats.keywords += line_keywords
+        stats.keywords_per_line[line_keywords] += 1
         stats.identifiers += line_identifiers
         stats.ids_per_line[line_identifiers] += 1
     return stats
@@ -136,19 +198,57 @@ Comment lines: %(comment_lines)d
 Empty comment lines: %(empty_comment_lines)d
 Lines with trailing comments: %(lines_with_trailing_comments)d
 Classes: %(classes)d
+Internal comments: %(internal_comments)d
+String literals: %(string_literals)d
+Int literals: %(int_literals)d
+Float literals: %(float_literals)d
+Symbols: %(symbols)d
+Keywords: %(keywords)d
 IDs: %(identifiers)d"""
         % asdict(stats)
     )
 
+    tokens = (
+        stats.string_literals
+        + stats.int_literals
+        + stats.float_literals
+        + stats.symbols
+        + stats.keywords
+        + stats.identifiers
+    )
+    print(
+        f"""
+Fraction of blank lines: {stats.blank_lines / stats.lines}
+Fraction of comment lines: {stats.comment_lines / stats.lines}
+
+Total counted tokens: {tokens}
+Fraction string literals: {stats.string_literals / tokens}
+Fraction int literals: {stats.int_literals / tokens}
+Fraction float literals: {stats.float_literals / tokens}
+Fraction symbols: {stats.symbols / tokens}
+Fraction keywords: {stats.keywords / tokens}
+Fraction IDs: {stats.identifiers / tokens}
+    """
+    )
+
     def print_histogram(
         title: str, data: Dict[int, int], column_format: str
     ) -> None:
         print()
-        print(title)
         key_min = min(data.keys())
         key_max = max(data.keys()) + 1
         values = [data.get(k, 0) for k in range(key_min, key_max)]
         keys = [column_format % k for k in range(key_min, key_max)]
+        total = sum(values)
+        median = key_min
+        count = total
+        for k in range(key_min, key_max):
+            count -= data.get(k, 0)
+            if count <= total / 2:
+                median = k
+                break
+
+        print(title + f" (median: {median})")
         fig = tpl.figure()
         fig.barh(values, keys)
         fig.show()
@@ -156,6 +256,29 @@ IDs: %(identifiers)d"""
     print_histogram(
         "## Comment line widths ##", stats.comment_line_widths, "%d columns"
     )
+
+    print_histogram(
+        "## String literals per line ##",
+        stats.string_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Int literals per line ##",
+        stats.int_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Float literals per line ##",
+        stats.float_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Symbols per line ##", stats.symbols_per_line, "%d symbols"
+    )
+    print_histogram(
+        "## Keywords per line ##", stats.keywords_per_line, "%d keywords"
+    )
+
     print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
     print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")