Jelajahi Sumber

Add transient error retries to bazel integration tests (#6796)

e.g. for failures like
https://github.com/carbon-language/carbon-lang/actions/runs/22418738440/job/64911093548

We already work around this in run_bazel.py; this change mirrors some of
that logic here (sharing it would require work on the Python setup).

Assisted-by: Google Antigravity with Gemini 3 Flash

---------

Co-authored-by: Chandler Carruth <chandlerc@gmail.com>
Jon Ross-Perkins 2 bulan lalu
induk
melakukan
17897bb05d
1 mengubah file dengan 37 tambahan dan 15 penghapusan
  1. 37 15
      examples/bazel_test_runner.py

+ 37 - 15
examples/bazel_test_runner.py

@@ -10,6 +10,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
 import sys
+import time
 import unittest
 from bazel_tools.tools.python.runfiles import runfiles
 from bazel_integration_test.py import test_base
@@ -30,29 +31,50 @@ class BazelExampleTest(test_base.TestBase):
             f"--override_module=carbon_toolchain={self.install_module}"
         ]
 
-    def log_lines(self, lines: list[str]) -> None:
-        for line in lines:
-            print(line, file=sys.stderr)
+    def _run_bazel(self, command: list[str]) -> str:
+        """Runs bazel with retry logic for transient errors."""
+        for attempt in range(5):
+            exit_code, stdout, stderr = self.RunBazel(
+                self.startup_flags + command + self.flags
+            )
 
-    def test_compile_lib(self) -> None:
-        # TODO: Can remove this if we can make linking a binary sufficiently
-        # efficient.
-        exit_code, stdout, stderr = self.RunBazel(
-            self.startup_flags + ["build"] + self.flags + ["//:example_lib"]
-        )
-        self.log_lines(stderr)
+            # Several error codes are reliably permanent, break immediately.
+            # `1`  -- The build failed.
+            # `2`  -- Command line or environment problem.
+            # `3`  -- Tests failed or timed out, we don't retry at this layer
+            #         on execution timeout.
+            # `4`  -- Test command but no tests found.
+            # `8`  -- Explicitly interrupted build.
+            #
+            # Note that `36` is documented as "likely permanent", but we retry
+            # it as most of our transient failures actually produce that error
+            # code.
+            perm_error = (1, 2, 3, 4, 8)
+            if exit_code in perm_error:
+                break
+
+            for line in stderr:
+                print(line, file=sys.stderr)
+
+            if exit_code == 0:
+                return stdout
+
+            # Retry transient errors with a brief delay.
+            print(f"Attempt {attempt + 1} failed with exit code {exit_code}")
+            time.sleep(attempt)
         self.AssertExitCode(exit_code, 0, stderr)
 
+    def test_compile_lib(self) -> None:
+        # TODO: Can remove this in favor of always running `test_run` if we can
+        # make linking a binary sufficiently efficient.
+        self._run_bazel(["build", "//:example_lib"])
+
     @unittest.skipUnless(
         "CARBON_BAZEL_TEST_FULL" in os.environ,
         "Skipping expensive test step for minimal testing",
     )
     def test_run(self) -> None:
-        exit_code, stdout, stderr = self.RunBazel(
-            self.startup_flags + ["run"] + self.flags + ["//:example"]
-        )
-        self.log_lines(stderr)
-        self.AssertExitCode(exit_code, 0, stderr)
+        stdout = self._run_bazel(["run", "//:example"])
         self.assertEqual(stdout, ["Hello World!"])