
Commit 8a97fda

[CI] [GHA] Introduce GHA pipeline rerunner (openvinotoolkit#23865)
### Details:
This PR introduces a GHA pipeline rerunner: it scans failed workflows' logs and re-runs the runs whose failures match known sporadic errors. The rerunner is a Python script driven by a dedicated workflow. The workflow will not run from this PR; it needs to be in `master`. I've checked the workflow and script in the private repo.

### Tickets:
- *136935*
1 parent 03fa47a commit 8a97fda

15 files changed: +556 −0 lines
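Before the per-file diffs, a minimal sketch of the decision the rerunner makes for a failed run, condensed from the script and helpers introduced below. The token, repository name, and run id are placeholders, and the log scanning is reduced to a stub standing in for collect_logs_for_run and LogAnalyzer:

import sys

from github import Auth, Github

# Placeholder inputs: the real script takes these from CLI arguments and the GITHUB_TOKEN env var
TOKEN = '<GITHUB_TOKEN>'
REPOSITORY = 'openvinotoolkit/openvino'
RUN_ID = 123456789

github = Github(auth=Auth.Token(token=TOKEN))
run = github.get_repo(full_name_or_id=REPOSITORY).get_workflow_run(id_=RUN_ID)

# A run with more than one attempt was already retriggered: never loop on retriggers
if run.run_attempt > 1:
    sys.exit(0)

# Stub for downloading the run's log archive and scanning it against errors_to_look_for.json
# (implemented below by collect_logs_for_run and LogAnalyzer)
found_known_sporadic_error = False

if found_known_sporadic_error:
    status = run.rerun()    # ask GitHub to re-run the workflow
    sys.exit(not status)    # exit code 0 if the rerun request was accepted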

.gitattributes (+2)

@@ -65,3 +65,5 @@
 *.vsdx filter=lfs diff=lfs merge=lfs -text
 *.bmp filter=lfs diff=lfs merge=lfs -text
 *.svg filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip filter=lfs diff=lfs merge=lfs -text

.github/scripts/workflow_rerun/__init__.py

Whitespace-only changes.
.github/scripts/workflow_rerun/argument_parser.py (new file, +20)

import argparse
from pathlib import Path


def get_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--repository-name',
                        type=str,
                        required=True,
                        help='Repository name in the OWNER/REPOSITORY format')
    parser.add_argument('--run-id',
                        type=int,
                        required=True,
                        help='Workflow Run ID')
    parser.add_argument('--errors-to-look-for-file',
                        type=str,
                        required=False,
                        help='.json file with the errors to look for in logs',
                        default=Path(__file__).resolve().parent.joinpath('errors_to_look_for.json'))
    return parser.parse_args()
.github/scripts/workflow_rerun/constants.py (new file, +17)

import logging
import os


GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')


def init_logger():
    LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
    logging.basicConfig(level=LOGLEVEL,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d-%Y %H:%M:%S')


init_logger()

LOGGER = logging.getLogger('rerunner')
.github/scripts/workflow_rerun/errors_to_look_for.json (new file, +42)

[
    {
        "error_text": "This is a problem related to network connectivity",
        "ticket": 135929
    },
    {
        "error_text": "Unable to make request",
        "ticket": 135715
    },
    {
        "error_text": "GnuTLS recv error",
        "ticket": 131918
    },
    {
        "error_text": "Connection was reset",
        "ticket": 131818
    },
    {
        "error_text": "Failed to connect to github.com",
        "ticket": 131657
    },
    {
        "error_text": "Could not resolve host: github.com",
        "ticket": 131546
    },
    {
        "error_text": "retrieving gpg key timed out",
        "ticket": 131538
    },
    {
        "error_text": "Retry limit has been reached for chunk",
        "ticket": 131537
    },
    {
        "error_text": "fatal error: downloading",
        "ticket": 131424
    },
    {
        "error_text": "Failure when receiving data from the peer",
        "ticket": 137121
    }
]
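Each entry pairs an error_text substring with the ticket tracking that sporadic failure. The analyzer introduced below does not compare raw strings: both the error_text and every log line are normalized (runs of non-alphanumeric characters collapsed to spaces, lowercased) before the substring check, so punctuation and case differences in the logs do not break matching. A self-contained illustration, using one entry from the list above and a made-up log line:

import re


def clean_up(string: str) -> str:
    # The normalization LogAnalyzer applies to both sides of the comparison
    return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()


error_text = 'Could not resolve host: github.com'    # ticket 131546
log_line = '2024-04-04T12:00:00.000Z fatal: Could not resolve host: github.com'  # made-up log line

assert clean_up(error_text) == 'could not resolve host github com'
assert clean_up(error_text) in clean_up(log_line)    # match found, so the run would be retriggered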
.github/scripts/workflow_rerun/log_analyzer.py (new file, +132)

import json
import re
import tempfile
from pathlib import Path
from typing import TypedDict
from zipfile import ZipFile

from workflow_rerun.constants import LOGGER


class LogFile(TypedDict):
    file_name: str
    path: Path


class ErrorData(TypedDict):
    error_text: str
    ticket: int


class LogAnalyzer:
    def __init__(self,
                 path_to_log_archive: Path,
                 path_to_errors_file: Path) -> None:
        self._path_to_log_archive = path_to_log_archive
        self._path_to_errors_file = path_to_errors_file

        self._errors_to_look_for: list[ErrorData] = []
        self._collect_errors_to_look_for()

        self._log_dir = tempfile.TemporaryDirectory().name

        self._log_files: list[LogFile] = []
        self._collect_log_files()

        all_txt_log_files_pretty = '\n'.join(map(lambda item: str(item['path']), self._log_files))
        LOGGER.info(f'ALL .txt LOG FILES: \n{all_txt_log_files_pretty}')

        self.found_matching_error = False

    def _collect_errors_to_look_for(self) -> None:
        with open(file=self._path_to_errors_file,
                  mode='r',
                  encoding='utf-8') as errors_file:
            errors_data = json.load(errors_file)
            for error_data in errors_data:
                self._errors_to_look_for.append(
                    ErrorData(error_text=error_data['error_text'],
                              ticket=error_data['ticket'])
                )

    def _collect_log_files(self) -> None:
        """
        Collects the .txt log files from the log archive

        The GitHub Actions pipeline logs archive should have the following structure:
            > Job_name_0
                > step_name_0.txt
                > step_name_1.txt
                ...
            > Job_name_1
                > step_name_0.txt
                > step_name_1.txt
                ...
            > Job_name_2
                ...
            ...

        We need to analyze only the `*.txt` files
        """

        with ZipFile(file=self._path_to_log_archive,
                     mode='r') as zip_file:
            zip_file.extractall(self._log_dir)

        for _file in Path(self._log_dir).iterdir():
            if _file.is_dir():
                for log_file in _file.iterdir():
                    self._log_files.append(LogFile(file_name=log_file.name,
                                                   path=log_file.resolve()))

    def _is_error_in_log(self,
                         error_to_look_for: str,
                         log_file_path: Path) -> bool:
        """
        Searches for the error in the provided log
        """

        error_to_look_for = self._clean_up_string(error_to_look_for)

        with open(file=log_file_path,
                  mode='r',
                  encoding='utf-8') as log_file:
            for line in log_file:
                if error_to_look_for in self._clean_up_string(line):
                    return True
        return False

    @staticmethod
    def _clean_up_string(string: str) -> str:
        """
        Replaces special characters with spaces, strips leading and trailing spaces,
        and lowercases the string

        For "Could not resolve host: github.com" returns "could not resolve host github com"

        This cleanup is applied to both the errors to look for and the log lines so that they can be matched
        """
        return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()

    def analyze(self) -> None:
        """
        Iterates over the known errors and tries to find them in the collected log files
        """
        for error in self._errors_to_look_for:

            LOGGER.info(f'LOOKING FOR "{error["error_text"]}" ERROR...')

            for log_file in self._log_files:
                if self._is_error_in_log(error_to_look_for=error['error_text'],
                                         log_file_path=log_file['path']):
                    LOGGER.info(f'FOUND "{error["error_text"]}" ERROR IN {log_file["path"]}. TICKET: {error["ticket"]}')
                    self.found_matching_error = True
                    return


if __name__ == '__main__':
    # Usage example
    log_analyzer = LogAnalyzer(path_to_log_archive=Path('/tmp/logs/log.zip'),
                               path_to_errors_file=Path('/tmp/errors_to_look_for.json'))
    log_analyzer.analyze()
    if log_analyzer.found_matching_error:
        print('found matching error, see logs above')
.github/scripts/workflow_rerun/log_collector.py (new file, +21)

from pathlib import Path

import requests
from github.WorkflowRun import WorkflowRun
from workflow_rerun.constants import GITHUB_TOKEN, LOGGER


def collect_logs_for_run(run: WorkflowRun,
                         log_archive_path: Path) -> Path:
    """
    Collects the log archive for a pipeline
    """
    with open(file=log_archive_path,
              mode='wb') as log_archive:
        LOGGER.info(f'STARTED LOG COLLECTION FOR {run.id} IN {log_archive_path}')
        # PyGitHub does not expose the "/repos/{owner}/{repo}/actions/runs/{run_id}/logs" endpoint so we have to use requests
        log_archive.write(requests.get(url=run.logs_url,
                                       headers={'Authorization': f'Bearer {GITHUB_TOKEN}'}).content)
        LOGGER.info(f'COLLECTED LOGS FOR {run.id} IN {log_archive_path}')

    return log_archive_path
Rerunner entry-point script (new file, +53)

import sys
import tempfile
from pathlib import Path

from github import Github, Auth
from workflow_rerun.argument_parser import get_arguments
from workflow_rerun.constants import GITHUB_TOKEN, LOGGER
from workflow_rerun.log_analyzer import LogAnalyzer
from workflow_rerun.log_collector import collect_logs_for_run

if __name__ == '__main__':

    args = get_arguments()
    run_id = args.run_id
    repository_name = args.repository_name

    github = Github(auth=Auth.Token(token=GITHUB_TOKEN))
    gh_repo = github.get_repo(full_name_or_id=repository_name)
    run = gh_repo.get_workflow_run(id_=run_id)

    LOGGER.info(f'CHECKING IF RERUN IS NEEDED FOR {run.html_url} RUN IN {repository_name}.')

    # Check if the run has already been retriggered;
    # we do not want to fall into a loop of retriggers
    if run.run_attempt > 1:
        LOGGER.info(f'THERE ARE {run.run_attempt} ATTEMPTS ALREADY. NOT CHECKING LOGS AND NOT RETRIGGERING. EXITING')
        sys.exit(0)

    log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)

    collect_logs_for_run(
        run=run,
        log_archive_path=log_archive_path,
    )

    log_analyzer = LogAnalyzer(
        path_to_log_archive=log_archive_path,
        path_to_errors_file=args.errors_to_look_for_file,
    )
    log_analyzer.analyze()

    if log_analyzer.found_matching_error:
        LOGGER.info(f'FOUND MATCHING ERROR, RETRIGGERING {run.html_url}')
        status = run.rerun()
        if status:
            LOGGER.info(f'RUN RETRIGGERED SUCCESSFULLY: {run.html_url}')
        else:
            LOGGER.info('RUN WAS NOT RETRIGGERED, SEE ABOVE')

        # "status" is True if the rerun request succeeded, False otherwise;
        # negate it so the process exit code is 0 on success
        sys.exit(not status)
    else:
        LOGGER.info('NO ERROR WAS FOUND, NOT RETRIGGERING')
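The final negation is easy to misread, so here is the exit-code mapping spelled out; `status` stands in for the boolean returned by `run.rerun()`:

import sys

status = True        # run.rerun() reported success
# not True  -> False -> exit code 0 (the rerun step passes)
# not False -> True  -> exit code 1 (the rerun step fails)
sys.exit(not status)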

.github/scripts/workflow_rerun/tests/__init__.py

Whitespace-only changes.
Git LFS pointer for a test log archive under .github/scripts/workflow_rerun/tests/data/ (new file, +3)

version https://git-lfs.github.com/spec/v1
oid sha256:894d636bcf156a7f3fae09f3c1d61df6b3db89117a917a3079995805c29115b3
size 89247
Git LFS pointer for a test log archive under .github/scripts/workflow_rerun/tests/data/ (new file, +3)

version https://git-lfs.github.com/spec/v1
oid sha256:3f094a737d7ea40dba8d3fb13493275cae243d08e5f1dabce90c316c951a6ac2
size 52047
Integration test in .github/scripts/workflow_rerun/tests/ (new file, +52)

"""
Integration tests
"""

import unittest
from pathlib import Path
from github import Github, Auth
import os
import tempfile


from workflow_rerun.log_analyzer import LogAnalyzer
from workflow_rerun.log_collector import collect_logs_for_run


class IntegrationTest(unittest.TestCase):
    """
    A class for testing integration between LogAnalyzer and log_collector
    """

    def setUp(self) -> None:
        print(f'\nIn test: "{self._testMethodName}"', flush=True)
        self._cwd = Path(__file__).parent
        self.errors_to_look_for_file = self._cwd.parent.joinpath(
            'errors_to_look_for.json'
        )
        self.github = Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN')))
        self.gh_repo = self.github.get_repo(full_name_or_id='openvinotoolkit/openvino')

        # Even if we used "failure" for the status filter we could not guarantee the logs contain any of the known errors,
        # so these tests use the logs of the most recent successful pipeline
        self.wf_run = self.gh_repo.get_workflow_runs(status='success')[0]
        print(f'Workflow run for testing: {self.wf_run}', flush=True)

    def test_log_collection_and_analysis(self) -> None:
        """
        Ensure logs collected by collect_logs_for_run are analyzed by LogAnalyzer
        """

        log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)
        collect_logs_for_run(run=self.wf_run,
                             log_archive_path=log_archive_path)

        analyzer = LogAnalyzer(
            path_to_log_archive=log_archive_path,
            path_to_errors_file=self.errors_to_look_for_file,
        )
        analyzer.analyze()
        self.assertFalse(analyzer.found_matching_error)

    def tearDown(self) -> None:
        self.github.close()
