
Commit 8a97fda

[CI] [GHA] Introduce GHA pipeline rerunner (openvinotoolkit#23865)
### Details:
This PR introduces a GHA pipeline rerunner: it scans failed workflows' logs and re-runs the runs whose failures match known sporadic errors. The rerunner is a Python script driven by a dedicated workflow. The workflow will not run from this PR; it needs to be in `master`. I've checked the workflow and script in the private repo.

### Tickets:
- *136935*
1 parent 03fa47a commit 8a97fda

15 files changed: +556 −0 lines
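Before the per-file diffs, a minimal sketch of the decision the rerunner makes for a failed run, condensed from the script and helpers introduced below. The token, repository name, and run id are placeholders, and the log scanning is reduced to a stub standing in for collect_logs_for_run and LogAnalyzer:

import sys

from github import Auth, Github

# Placeholder inputs: the real script takes these from CLI arguments and the GITHUB_TOKEN env var
TOKEN = '<GITHUB_TOKEN>'
REPOSITORY = 'openvinotoolkit/openvino'
RUN_ID = 123456789

github = Github(auth=Auth.Token(token=TOKEN))
run = github.get_repo(full_name_or_id=REPOSITORY).get_workflow_run(id_=RUN_ID)

# A run with more than one attempt was already retriggered: never loop on retriggers
if run.run_attempt > 1:
    sys.exit(0)

# Stub for downloading the run's log archive and scanning it against errors_to_look_for.json
# (implemented below by collect_logs_for_run and LogAnalyzer)
found_known_sporadic_error = False

if found_known_sporadic_error:
    status = run.rerun()    # ask GitHub to re-run the workflow
    sys.exit(not status)    # exit code 0 if the rerun request was accepted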

.gitattributes (+2)

@@ -65,3 +65,5 @@
 *.vsdx filter=lfs diff=lfs merge=lfs -text
 *.bmp filter=lfs diff=lfs merge=lfs -text
 *.svg filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_with_error.zip filter=lfs diff=lfs merge=lfs -text
+.github/scripts/workflow_rerun/tests/data/log_archive_wo_error.zip filter=lfs diff=lfs merge=lfs -text

.github/scripts/workflow_rerun/__init__.py

Whitespace-only changes.
.github/scripts/workflow_rerun/argument_parser.py (new file, +20)

import argparse
from pathlib import Path


def get_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--repository-name',
                        type=str,
                        required=True,
                        help='Repository name in the OWNER/REPOSITORY format')
    parser.add_argument('--run-id',
                        type=int,
                        required=True,
                        help='Workflow Run ID')
    parser.add_argument('--errors-to-look-for-file',
                        type=str,
                        required=False,
                        help='.json file with the errors to look for in logs',
                        default=Path(__file__).resolve().parent.joinpath('errors_to_look_for.json'))
    return parser.parse_args()
.github/scripts/workflow_rerun/constants.py (new file, +17)

import logging
import os


GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')


def init_logger():
    LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
    logging.basicConfig(level=LOGLEVEL,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d-%Y %H:%M:%S')


init_logger()

LOGGER = logging.getLogger('rerunner')
.github/scripts/workflow_rerun/errors_to_look_for.json (new file, +42)

[
    {
        "error_text": "This is a problem related to network connectivity",
        "ticket": 135929
    },
    {
        "error_text": "Unable to make request",
        "ticket": 135715
    },
    {
        "error_text": "GnuTLS recv error",
        "ticket": 131918
    },
    {
        "error_text": "Connection was reset",
        "ticket": 131818
    },
    {
        "error_text": "Failed to connect to github.com",
        "ticket": 131657
    },
    {
        "error_text": "Could not resolve host: github.com",
        "ticket": 131546
    },
    {
        "error_text": "retrieving gpg key timed out",
        "ticket": 131538
    },
    {
        "error_text": "Retry limit has been reached for chunk",
        "ticket": 131537
    },
    {
        "error_text": "fatal error: downloading",
        "ticket": 131424
    },
    {
        "error_text": "Failure when receiving data from the peer",
        "ticket": 137121
    }
]
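Each entry pairs an error_text substring with the ticket tracking that sporadic failure. The analyzer introduced below does not compare raw strings: both the error_text and every log line are normalized (runs of non-alphanumeric characters collapsed to spaces, lowercased) before the substring check, so punctuation and case differences in the logs do not break matching. A self-contained illustration, using one entry from the list above and a made-up log line:

import re


def clean_up(string: str) -> str:
    # The normalization LogAnalyzer applies to both sides of the comparison
    return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()


error_text = 'Could not resolve host: github.com'    # ticket 131546
log_line = '2024-04-04T12:00:00.000Z fatal: Could not resolve host: github.com'  # made-up log line

assert clean_up(error_text) == 'could not resolve host github com'
assert clean_up(error_text) in clean_up(log_line)    # match found, so the run would be retriggered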
.github/scripts/workflow_rerun/log_analyzer.py (new file, +132)

import json
import re
import tempfile
from pathlib import Path
from typing import TypedDict
from zipfile import ZipFile

from workflow_rerun.constants import LOGGER


class LogFile(TypedDict):
    file_name: str
    path: Path


class ErrorData(TypedDict):
    error_text: str
    ticket: int


class LogAnalyzer:
    def __init__(self,
                 path_to_log_archive: Path,
                 path_to_errors_file: Path) -> None:
        self._path_to_log_archive = path_to_log_archive
        self._path_to_errors_file = path_to_errors_file

        self._errors_to_look_for: list[ErrorData] = []
        self._collect_errors_to_look_for()

        self._log_dir = tempfile.TemporaryDirectory().name

        self._log_files: list[LogFile] = []
        self._collect_log_files()

        all_txt_log_files_pretty = '\n'.join(map(lambda item: str(item['path']), self._log_files))
        LOGGER.info(f'ALL .txt LOG FILES: \n{all_txt_log_files_pretty}')

        self.found_matching_error = False

    def _collect_errors_to_look_for(self) -> None:
        with open(file=self._path_to_errors_file,
                  mode='r',
                  encoding='utf-8') as errors_file:
            errors_data = json.load(errors_file)
            for error_data in errors_data:
                self._errors_to_look_for.append(
                    ErrorData(error_text=error_data['error_text'],
                              ticket=error_data['ticket'])
                )

    def _collect_log_files(self) -> None:
        """
        Collects the .txt log files from the log archive

        The GitHub Actions pipeline logs archive should have the following structure:
            > Job_name_0
                > step_name_0.txt
                > step_name_1.txt
                ...
            > Job_name_1
                > step_name_0.txt
                > step_name_1.txt
                ...
            > Job_name_2
                ...
            ...

        We need to analyze only the `*.txt` files
        """

        with ZipFile(file=self._path_to_log_archive,
                     mode='r') as zip_file:
            zip_file.extractall(self._log_dir)

        for _file in Path(self._log_dir).iterdir():
            if _file.is_dir():
                for log_file in _file.iterdir():
                    self._log_files.append(LogFile(file_name=log_file.name,
                                                   path=log_file.resolve()))

    def _is_error_in_log(self,
                         error_to_look_for: str,
                         log_file_path: Path) -> bool:
        """
        Searches for the error in the provided log
        """

        error_to_look_for = self._clean_up_string(error_to_look_for)

        with open(file=log_file_path,
                  mode='r',
                  encoding='utf-8') as log_file:
            for line in log_file:
                if error_to_look_for in self._clean_up_string(line):
                    return True
        return False

    @staticmethod
    def _clean_up_string(string: str) -> str:
        """
        Replaces special characters with spaces, strips leading and trailing spaces,
        and lowercases the string

        For "Could not resolve host: github.com" returns "could not resolve host github com"

        This cleanup is applied to both the errors to look for and the log lines so that they can be matched
        """
        return re.sub(r'[^A-Za-z0-9]+', ' ', string).lower().strip()

    def analyze(self) -> None:
        """
        Iterates over the known errors and tries to find them in the collected log files
        """
        for error in self._errors_to_look_for:

            LOGGER.info(f'LOOKING FOR "{error["error_text"]}" ERROR...')

            for log_file in self._log_files:
                if self._is_error_in_log(error_to_look_for=error['error_text'],
                                         log_file_path=log_file['path']):
                    LOGGER.info(f'FOUND "{error["error_text"]}" ERROR IN {log_file["path"]}. TICKET: {error["ticket"]}')
                    self.found_matching_error = True
                    return


if __name__ == '__main__':
    # Usage example
    log_analyzer = LogAnalyzer(path_to_log_archive=Path('/tmp/logs/log.zip'),
                               path_to_errors_file=Path('/tmp/errors_to_look_for.json'))
    log_analyzer.analyze()
    if log_analyzer.found_matching_error:
        print('found matching error, see logs above')
.github/scripts/workflow_rerun/log_collector.py (new file, +21)

from pathlib import Path

import requests
from github.WorkflowRun import WorkflowRun
from workflow_rerun.constants import GITHUB_TOKEN, LOGGER


def collect_logs_for_run(run: WorkflowRun,
                         log_archive_path: Path) -> Path:
    """
    Collects the log archive for a pipeline
    """
    with open(file=log_archive_path,
              mode='wb') as log_archive:
        LOGGER.info(f'STARTED LOG COLLECTION FOR {run.id} IN {log_archive_path}')
        # PyGitHub does not expose the "/repos/{owner}/{repo}/actions/runs/{run_id}/logs" endpoint so we have to use requests
        log_archive.write(requests.get(url=run.logs_url,
                                       headers={'Authorization': f'Bearer {GITHUB_TOKEN}'}).content)
        LOGGER.info(f'COLLECTED LOGS FOR {run.id} IN {log_archive_path}')

    return log_archive_path
Rerunner entry-point script (new file, +53)

import sys
import tempfile
from pathlib import Path

from github import Github, Auth
from workflow_rerun.argument_parser import get_arguments
from workflow_rerun.constants import GITHUB_TOKEN, LOGGER
from workflow_rerun.log_analyzer import LogAnalyzer
from workflow_rerun.log_collector import collect_logs_for_run

if __name__ == '__main__':

    args = get_arguments()
    run_id = args.run_id
    repository_name = args.repository_name

    github = Github(auth=Auth.Token(token=GITHUB_TOKEN))
    gh_repo = github.get_repo(full_name_or_id=repository_name)
    run = gh_repo.get_workflow_run(id_=run_id)

    LOGGER.info(f'CHECKING IF RERUN IS NEEDED FOR {run.html_url} RUN IN {repository_name}.')

    # Check if the run has already been retriggered;
    # we do not want to fall into a loop of retriggers
    if run.run_attempt > 1:
        LOGGER.info(f'THERE ARE {run.run_attempt} ATTEMPTS ALREADY. NOT CHECKING LOGS AND NOT RETRIGGERING. EXITING')
        sys.exit(0)

    log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)

    collect_logs_for_run(
        run=run,
        log_archive_path=log_archive_path,
    )

    log_analyzer = LogAnalyzer(
        path_to_log_archive=log_archive_path,
        path_to_errors_file=args.errors_to_look_for_file,
    )
    log_analyzer.analyze()

    if log_analyzer.found_matching_error:
        LOGGER.info(f'FOUND MATCHING ERROR, RETRIGGERING {run.html_url}')
        status = run.rerun()
        if status:
            LOGGER.info(f'RUN RETRIGGERED SUCCESSFULLY: {run.html_url}')
        else:
            LOGGER.info('RUN WAS NOT RETRIGGERED, SEE ABOVE')

        # "status" is True if the rerun request succeeded, False otherwise;
        # negate it so the process exit code is 0 on success
        sys.exit(not status)
    else:
        LOGGER.info('NO ERROR WAS FOUND, NOT RETRIGGERING')
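The final negation is easy to misread, so here is the exit-code mapping spelled out; `status` stands in for the boolean returned by `run.rerun()`:

import sys

status = True        # run.rerun() reported success
# not True  -> False -> exit code 0 (the rerun step passes)
# not False -> True  -> exit code 1 (the rerun step fails)
sys.exit(not status)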

.github/scripts/workflow_rerun/tests/__init__.py

Whitespace-only changes.
Git LFS pointer for a test log archive under .github/scripts/workflow_rerun/tests/data/ (new file, +3)

version https://git-lfs.github.com/spec/v1
oid sha256:894d636bcf156a7f3fae09f3c1d61df6b3db89117a917a3079995805c29115b3
size 89247
Git LFS pointer for a test log archive under .github/scripts/workflow_rerun/tests/data/ (new file, +3)

version https://git-lfs.github.com/spec/v1
oid sha256:3f094a737d7ea40dba8d3fb13493275cae243d08e5f1dabce90c316c951a6ac2
size 52047
Integration test in .github/scripts/workflow_rerun/tests/ (new file, +52)

"""
Integration tests
"""

import unittest
from pathlib import Path
from github import Github, Auth
import os
import tempfile


from workflow_rerun.log_analyzer import LogAnalyzer
from workflow_rerun.log_collector import collect_logs_for_run


class IntegrationTest(unittest.TestCase):
    """
    A class for testing integration between LogAnalyzer and log_collector
    """

    def setUp(self) -> None:
        print(f'\nIn test: "{self._testMethodName}"', flush=True)
        self._cwd = Path(__file__).parent
        self.errors_to_look_for_file = self._cwd.parent.joinpath(
            'errors_to_look_for.json'
        )
        self.github = Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN')))
        self.gh_repo = self.github.get_repo(full_name_or_id='openvinotoolkit/openvino')

        # Even if we used "failure" for the status filter we could not guarantee the logs contain any of the known errors,
        # so these tests use the logs of the most recent successful pipeline
        self.wf_run = self.gh_repo.get_workflow_runs(status='success')[0]
        print(f'Workflow run for testing: {self.wf_run}', flush=True)

    def test_log_collection_and_analysis(self) -> None:
        """
        Ensure logs collected by collect_logs_for_run are analyzed by LogAnalyzer
        """

        log_archive_path = Path(tempfile.NamedTemporaryFile(suffix='.zip').name)
        collect_logs_for_run(run=self.wf_run,
                             log_archive_path=log_archive_path)

        analyzer = LogAnalyzer(
            path_to_log_archive=log_archive_path,
            path_to_errors_file=self.errors_to_look_for_file,
        )
        analyzer.analyze()
        self.assertFalse(analyzer.found_matching_error)

    def tearDown(self) -> None:
        self.github.close()
