From 09f12f6edaaaa3b686fc413d7f65f34afa2c2316 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Mon, 24 Jun 2024 18:49:03 +0100 Subject: [PATCH 1/8] Add ObjectRetrier to CinderaBackupTests This adds the auto-retrier to the cinder client to get past race hazards and other transient errors. --- zaza/openstack/charm_tests/cinder_backup/tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zaza/openstack/charm_tests/cinder_backup/tests.py b/zaza/openstack/charm_tests/cinder_backup/tests.py index 97b365808..5201cbc56 100644 --- a/zaza/openstack/charm_tests/cinder_backup/tests.py +++ b/zaza/openstack/charm_tests/cinder_backup/tests.py @@ -22,6 +22,7 @@ import zaza.model import zaza.openstack.charm_tests.test_utils as test_utils +from zaza.openstack.utilities import ObjectRetrierWraps import zaza.openstack.utilities.ceph as ceph_utils import zaza.openstack.utilities.openstack as openstack_utils @@ -35,8 +36,8 @@ class CinderBackupTest(test_utils.OpenStackBaseTest): def setUpClass(cls): """Run class setup for running Cinder Backup tests.""" super(CinderBackupTest, cls).setUpClass() - cls.cinder_client = openstack_utils.get_cinder_session_client( - cls.keystone_session) + cls.cinder_client = ObjectRetrierWraps( + openstack_utils.get_cinder_session_client(cls.keystone_session)) @property def services(self): From 704059a9968b2b86c23879a7b324fe7e3821035e Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Tue, 25 Jun 2024 12:25:01 +0100 Subject: [PATCH 2/8] Add additional debug for ObjectRetrier It wasn't capturing member variables on the wrapped object that would then be used to make the call; thus, wrap those. This also disables (temporarily) the long running cinder backup test deletion whilst checking whether retries are the problem. --- .../charm_tests/cinder_backup/tests.py | 8 +++-- zaza/openstack/utilities/__init__.py | 36 +++++++++++++++++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/zaza/openstack/charm_tests/cinder_backup/tests.py b/zaza/openstack/charm_tests/cinder_backup/tests.py index 5201cbc56..9314cc0f2 100644 --- a/zaza/openstack/charm_tests/cinder_backup/tests.py +++ b/zaza/openstack/charm_tests/cinder_backup/tests.py @@ -22,7 +22,7 @@ import zaza.model import zaza.openstack.charm_tests.test_utils as test_utils -from zaza.openstack.utilities import ObjectRetrierWraps +from zaza.openstack.utilities import retry_on_connect_failure import zaza.openstack.utilities.ceph as ceph_utils import zaza.openstack.utilities.openstack as openstack_utils @@ -36,8 +36,9 @@ class CinderBackupTest(test_utils.OpenStackBaseTest): def setUpClass(cls): """Run class setup for running Cinder Backup tests.""" super(CinderBackupTest, cls).setUpClass() - cls.cinder_client = ObjectRetrierWraps( - openstack_utils.get_cinder_session_client(cls.keystone_session)) + cls.cinder_client = retry_on_connect_failure( + openstack_utils.get_cinder_session_client(cls.keystone_session), + log=logging.warn) @property def services(self): @@ -73,6 +74,7 @@ def test_410_cinder_vol_create_backup_delete_restore_pool_inspect(self): inspect ceph cinder pool object count as the volume is created and deleted. """ + return unit_name = zaza.model.get_lead_unit_name('ceph-mon') obj_count_samples = [] pool_size_samples = [] diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 5798f2650..68b0ce8b1 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -19,6 +19,25 @@ from keystoneauth1.exceptions.connection import ConnectFailure +NEVER_RETRY_EXCEPTIONS = ( + AssertionError, + AttributeError, + ImportError, + IndexError, + KeyError, + NotImplementedError, + OverflowError, + RecursionError, + ReferenceError, + RuntimeError, + SyntaxError, + IndentationError, + SystemExit, + TypeError, + UnicodeError, + ZeroDivisionError, +) + class ObjectRetrierWraps(object): """An automatic retrier for an object. @@ -94,11 +113,19 @@ def __getattr__(self, name): """Get attribute; delegates to wrapped object.""" # Note the above may generate an attribute error; we expect this and # will fail with an attribute error. + __log = self.__kwargs['log'] + __log(f"__getattr__(..) called with {name}") attr = getattr(self.__obj, name) if callable(attr) or hasattr(attr, "__getattr__"): + __log(f"__getattr__(..): wrapping {attr}") return ObjectRetrierWraps(attr, **self.__kwargs) - else: - return attr + __log( f"__getattr__(): {name} is not callable or has __getattr__") + if isinstance(attr, property): + __log(f"__getattr__(): {name} is a property") + __log(f"__getattr__(): {name} on {self.__obj} is a {type(attr)}") + # return attr + __log(f"__getattr__(): wrapping {attr}") + return ObjectRetrierWraps(attr, **self.__kwargs) # TODO(ajkavanagh): Note detecting a property is a bit trickier. we # can do isinstance(attr, property), but then the act of accessing it # is what calls it. i.e. it would fail at the getattr(self.__obj, @@ -120,12 +147,17 @@ def __call__(self, *args, **kwargs): wait_so_far = 0 while True: try: + log(f"Running {self.__name__}({args}, {kwargs})") return obj(*args, **kwargs) except Exception as e: # if retry_exceptions is not None, or the type of the exception # is not in the list of retries, then raise an exception # immediately. This means that if retry_exceptions is None, # then the method is always retried. + if isinstance(e, NEVER_RETRY_EXCEPTIONS): + log("ObjectRetrierWraps: error {} is never caught" + .format(str(e))) + raise if (retry_exceptions is not None and type(e) not in retry_exceptions): raise From edf55189fb97314bc74c68ff61fd0a3833407ab8 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Wed, 26 Jun 2024 10:07:37 +0100 Subject: [PATCH 3/8] Re-enable test 410 for cinder backups This is to verify that the retries really do work. --- zaza/openstack/charm_tests/cinder_backup/tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zaza/openstack/charm_tests/cinder_backup/tests.py b/zaza/openstack/charm_tests/cinder_backup/tests.py index 9314cc0f2..fc73fdbd6 100644 --- a/zaza/openstack/charm_tests/cinder_backup/tests.py +++ b/zaza/openstack/charm_tests/cinder_backup/tests.py @@ -74,7 +74,6 @@ def test_410_cinder_vol_create_backup_delete_restore_pool_inspect(self): inspect ceph cinder pool object count as the volume is created and deleted. """ - return unit_name = zaza.model.get_lead_unit_name('ceph-mon') obj_count_samples = [] pool_size_samples = [] From 1f0808e238da62ffa801c54261739020ffaa62cc Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Thu, 27 Jun 2024 12:33:36 +0100 Subject: [PATCH 4/8] Fix ObjectRetrierWraps recursive wrapping In order to ensure that an object that contains other objects that are called (e.g. the VolumeManager object on the Cinder client object), the ObjectRetrierWraps class needs to more agressively wrap non builtin classes. --- requirements.txt | 5 +- unit_tests/utilities/test_utilities.py | 24 ++++++++- .../test_zaza_utilities_openstack.py | 3 ++ zaza/openstack/utilities/__init__.py | 50 +++++++++---------- 4 files changed, 54 insertions(+), 28 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1a3b33cc7..992af69e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,7 +50,10 @@ python-novaclient python-octaviaclient python-swiftclient python-watcherclient -tenacity +# Due to https://github.com/jd/tenacity/pull/479 the strategy for mocking out tenacity +# waits/times/etc no longer works. Pin to 8.4.1 until it is solved. +# Bug in tenacity tracking issue: https://github.com/jd/tenacity/issues/482 +tenacity<8.4.2 paramiko # Documentation requirements diff --git a/unit_tests/utilities/test_utilities.py b/unit_tests/utilities/test_utilities.py index 6af6089b3..2f6dcf6e3 100644 --- a/unit_tests/utilities/test_utilities.py +++ b/unit_tests/utilities/test_utilities.py @@ -113,6 +113,28 @@ def func(self): mock_sleep.assert_not_called() + @mock.patch("time.sleep") + def test_object_wrap_multilevel_with_exception(self, mock_sleep): + + class A: + + def func(self): + raise SomeException() + + class B: + + def __init__(self): + self.a = A() + + b = B() + # retry on a specific exception + wrapped_b = utilities.ObjectRetrierWraps( + b, num_retries=1, retry_exceptions=[SomeException]) + with self.assertRaises(SomeException): + wrapped_b.a.func() + + mock_sleep.assert_called_once_with(5) + @mock.patch("time.sleep") def test_log_called(self, mock_sleep): @@ -130,7 +152,7 @@ def func(self): # there should be two calls; one for the single retry and one for the # failure. - self.assertEqual(mock_log.call_count, 2) + self.assertEqual(mock_log.call_count, 6) @mock.patch("time.sleep") def test_back_off_maximum(self, mock_sleep): diff --git a/unit_tests/utilities/test_zaza_utilities_openstack.py b/unit_tests/utilities/test_zaza_utilities_openstack.py index 222aab2f8..e00f713b1 100644 --- a/unit_tests/utilities/test_zaza_utilities_openstack.py +++ b/unit_tests/utilities/test_zaza_utilities_openstack.py @@ -99,6 +99,9 @@ def setUp(self): self.neutronclient.list_agents.return_value = self.agents self.neutronclient.list_bgp_speaker_on_dragent.return_value = \ self.bgp_speakers + self.patch("zaza.openstack.utilities.ObjectRetrierWraps", + name="_object_retrier_wraps", + new=lambda x, *_, **__: x) def test_create_port(self): self.patch_object(openstack_utils, "get_net_uuid") diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 68b0ce8b1..7a8ca104f 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -15,6 +15,7 @@ """Collection of utilities to support zaza tests etc.""" +import logging import time from keystoneauth1.exceptions.connection import ConnectFailure @@ -95,10 +96,19 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0, If a list, then it will only retry if the exception is one of the ones in the list. :type retry_exceptions: List[Exception] + :param log: If False, disable logging; if None (the default) or True, + use logging.warn; otherwise use the passed param `log`. + :type param: None | Boolean | Callable """ # Note we use semi-private variable names that shouldn't clash with any # on the actual object. self.__obj = obj + if log in (None, True): + _log = logging.warning + elif log is False: + _log = lambda *_, **__: None # noqa + else: + _log = log self.__kwargs = { 'num_retries': num_retries, 'initial_interval': initial_interval, @@ -106,32 +116,19 @@ def __init__(self, obj, num_retries=3, initial_interval=5.0, backoff=1.0, 'max_interval': max_interval, 'total_wait': total_wait, 'retry_exceptions': retry_exceptions, - 'log': log or (lambda x: None), + 'log': _log, } + _log(f"ObjectRetrierWraps: wrapping {self.__obj}") def __getattr__(self, name): """Get attribute; delegates to wrapped object.""" - # Note the above may generate an attribute error; we expect this and - # will fail with an attribute error. - __log = self.__kwargs['log'] - __log(f"__getattr__(..) called with {name}") - attr = getattr(self.__obj, name) - if callable(attr) or hasattr(attr, "__getattr__"): - __log(f"__getattr__(..): wrapping {attr}") + obj = self.__obj + attr = getattr(obj, name) + if callable(attr): return ObjectRetrierWraps(attr, **self.__kwargs) - __log( f"__getattr__(): {name} is not callable or has __getattr__") - if isinstance(attr, property): - __log(f"__getattr__(): {name} is a property") - __log(f"__getattr__(): {name} on {self.__obj} is a {type(attr)}") - # return attr - __log(f"__getattr__(): wrapping {attr}") + if attr.__class__.__module__ == 'builtins': + return attr return ObjectRetrierWraps(attr, **self.__kwargs) - # TODO(ajkavanagh): Note detecting a property is a bit trickier. we - # can do isinstance(attr, property), but then the act of accessing it - # is what calls it. i.e. it would fail at the getattr(self.__obj, - # name) stage. The solution is to check first, and if it's a property, - # then treat it like the retrier. However, I think this is too - # complex for the first go, and to use manual retries in that instance. def __call__(self, *args, **kwargs): """Call the object; delegates to the wrapped object.""" @@ -147,7 +144,7 @@ def __call__(self, *args, **kwargs): wait_so_far = 0 while True: try: - log(f"Running {self.__name__}({args}, {kwargs})") + log(f"Running {self}({args}, {kwargs})") return obj(*args, **kwargs) except Exception as e: # if retry_exceptions is not None, or the type of the exception @@ -155,7 +152,7 @@ def __call__(self, *args, **kwargs): # immediately. This means that if retry_exceptions is None, # then the method is always retried. if isinstance(e, NEVER_RETRY_EXCEPTIONS): - log("ObjectRetrierWraps: error {} is never caught" + log("ObjectRetrierWraps: error {} is never caught; raising" .format(str(e))) raise if (retry_exceptions is not None and @@ -163,15 +160,16 @@ def __call__(self, *args, **kwargs): raise retry += 1 if retry > num_retries: - log("{}: exceeded number of retries, so erroring out" - .format(str(obj))) + log("ObjectRetrierWraps: {}: exceeded number of retries, " + "so erroring out" .format(str(obj))) raise e - log("{}: call failed: retrying in {} seconds" - .format(str(obj), wait)) + log("ObjectRetrierWraps: {}: call failed: retrying in {} " + "seconds" .format(str(obj), wait)) time.sleep(wait) wait_so_far += wait if wait_so_far >= total_wait: raise e + print('wait: ', wait, ' backoff:', backoff) wait = wait * backoff if wait > max_interval: wait = max_interval From d71c9b35456b17fda08c87654acfef9ddab20709 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Thu, 27 Jun 2024 15:24:48 +0100 Subject: [PATCH 5/8] Double Cinder backup restore time This accounts for slow ServerStack when using ceph in the model. --- zaza/openstack/charm_tests/cinder_backup/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zaza/openstack/charm_tests/cinder_backup/tests.py b/zaza/openstack/charm_tests/cinder_backup/tests.py index fc73fdbd6..258c7bb77 100644 --- a/zaza/openstack/charm_tests/cinder_backup/tests.py +++ b/zaza/openstack/charm_tests/cinder_backup/tests.py @@ -103,7 +103,7 @@ def test_410_cinder_vol_create_backup_delete_restore_pool_inspect(self): self.cinder_client.volumes, cinder_vol.id, wait_iteration_max_time=180, - stop_after_attempt=15, + stop_after_attempt=30, expected_status='available', msg='ceph-backed cinder volume') @@ -125,7 +125,7 @@ def test_410_cinder_vol_create_backup_delete_restore_pool_inspect(self): self.cinder_client.backups, vol_backup.id, wait_iteration_max_time=180, - stop_after_attempt=15, + stop_after_attempt=30, expected_status='available', msg='Backup volume') From 0bf4e9a55851ad106a4287d4f04e64f4d1cac12c Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Mon, 1 Jul 2024 16:11:54 +0100 Subject: [PATCH 6/8] Add more retries to allow manila backup restore to complete Due to slow 1GiB networking in ServerStack the restore often exceeds the retries. Increase them to enable the restore to succeed. --- zaza/openstack/charm_tests/manila/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zaza/openstack/charm_tests/manila/tests.py b/zaza/openstack/charm_tests/manila/tests.py index ea4073891..3a3f54c00 100644 --- a/zaza/openstack/charm_tests/manila/tests.py +++ b/zaza/openstack/charm_tests/manila/tests.py @@ -359,7 +359,7 @@ def test_manila_share(self): self.manila_client.shares, share.id, wait_iteration_max_time=120, - stop_after_attempt=2, + stop_after_attempt=10, expected_status="available", msg="Waiting for a share to become available") From 12c69fdca3592892bab852001dc99f98e095f2a1 Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Mon, 1 Jul 2024 17:28:16 +0100 Subject: [PATCH 7/8] Reduce ObjectRetrierWraps logging noise during normal use --- zaza/openstack/utilities/__init__.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/zaza/openstack/utilities/__init__.py b/zaza/openstack/utilities/__init__.py index 7a8ca104f..02826ff7f 100644 --- a/zaza/openstack/utilities/__init__.py +++ b/zaza/openstack/utilities/__init__.py @@ -144,7 +144,6 @@ def __call__(self, *args, **kwargs): wait_so_far = 0 while True: try: - log(f"Running {self}({args}, {kwargs})") return obj(*args, **kwargs) except Exception as e: # if retry_exceptions is not None, or the type of the exception @@ -160,16 +159,15 @@ def __call__(self, *args, **kwargs): raise retry += 1 if retry > num_retries: - log("ObjectRetrierWraps: {}: exceeded number of retries, " - "so erroring out" .format(str(obj))) + log("ObjectRetrierWraps: exceeded number of retries, " + "so erroring out") raise e - log("ObjectRetrierWraps: {}: call failed: retrying in {} " - "seconds" .format(str(obj), wait)) + log("ObjectRetrierWraps: call failed: retrying in {} " + "seconds" .format(wait)) time.sleep(wait) wait_so_far += wait if wait_so_far >= total_wait: raise e - print('wait: ', wait, ' backoff:', backoff) wait = wait * backoff if wait > max_interval: wait = max_interval From b7c5b5a54d739a906080aa42233fafd309ab754c Mon Sep 17 00:00:00 2001 From: Alex Kavanagh Date: Mon, 1 Jul 2024 20:30:05 +0100 Subject: [PATCH 8/8] Modify test code to be less brittle The test for log function against ObjectRetrierWraps class is too brittle to the number of log calls. This commit makes it less brittle. --- unit_tests/utilities/test_utilities.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unit_tests/utilities/test_utilities.py b/unit_tests/utilities/test_utilities.py index 2f6dcf6e3..7eb99276d 100644 --- a/unit_tests/utilities/test_utilities.py +++ b/unit_tests/utilities/test_utilities.py @@ -150,9 +150,7 @@ def func(self): with self.assertRaises(SomeException): wrapped_a.func() - # there should be two calls; one for the single retry and one for the - # failure. - self.assertEqual(mock_log.call_count, 6) + mock_log.assert_called() @mock.patch("time.sleep") def test_back_off_maximum(self, mock_sleep):