Merge pull request #127 from ricolin/CLOUDOPS-548
[CLOUDOPS-569] Allow retry on openstack HttpException
diff --git a/requirements.txt b/requirements.txt
index 140798f..c6a3ae4 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@
tooz # Apache-2.0
sherlock>=0.4.1 # MIT
kubernetes # Apache-2.0
+tenacity
diff --git a/staffeln/common/openstack.py b/staffeln/common/openstack.py
index 710d112..f6dc94e 100644
--- a/staffeln/common/openstack.py
+++ b/staffeln/common/openstack.py
@@ -1,14 +1,42 @@
from __future__ import annotations
+import tenacity
from openstack import exceptions, proxy
from oslo_log import log
+from staffeln import conf
from staffeln.common import auth
from staffeln.i18n import _
+CONF = conf.CONF
LOG = log.getLogger(__name__)
+class RetryHTTPError(tenacity.retry_if_exception):
+ """Retry strategy that retries if the exception is an ``HttpException``
+ with an abnormal status code (one not listed in ``skip_retry_codes``).
+ """
+
+ def __init__(self):
+ def is_http_error(exception):
+ # Make sure we don't retry on codes in skip list (default: [404]),
+ # as not found could be an expected status.
+ skip_codes = CONF.openstack.skip_retry_codes
+ result = (
+ isinstance(exception, exceptions.HttpException)
+ and str(exception.status_code) not in skip_codes
+ )
+ if result:
+ LOG.debug(
+ f"Getting HttpException {exception} (status "
+ f"code: {exception.status_code}), "
+ "retry till timeout..."
+ )
+ return result
+
+ super().__init__(predicate=is_http_error)
+
+
class OpenstackSDK:
def __init__(self):
self.conn_list = {}
@@ -26,6 +54,12 @@
self.conn = self.conn_list[project_id]
# user
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_user_id(self):
user_name = self.conn.config.auth["username"]
if "user_domain_id" in self.conn.config.auth:
@@ -38,15 +72,33 @@
user = self.conn.get_user(name_or_id=user_name)
return user.id
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_role_assignments(self, project_id, user_id=None):
filters = {"project": project_id}
if user_id:
filters["user"] = user_id
return self.conn.list_role_assignments(filters=filters)
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_user(self, user_id):
return self.conn.get_user(name_or_id=user_id)
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_project_member_emails(self, project_id):
members = self.get_role_assignments(project_id)
emails = []
@@ -63,9 +115,21 @@
emails.append(user.email)
return emails
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_projects(self):
return self.conn.list_projects()
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_servers(self, project_id=None, all_projects=True, details=True):
if project_id is not None:
return self.conn.compute.servers(
@@ -76,9 +140,21 @@
else:
return self.conn.compute.servers(details=details, all_projects=all_projects)
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_volume(self, uuid, project_id):
return self.conn.get_volume_by_id(uuid)
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_backup(self, uuid, project_id=None):
try:
return self.conn.get_volume_backup(uuid)
@@ -102,6 +178,12 @@
incremental=incremental,
)
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def delete_backup(self, uuid, project_id=None, force=False):
# Note(Alex): v3 is not supporting force delete?
# conn.block_storage.delete_backup(
@@ -115,11 +197,23 @@
except exceptions.ResourceNotFound:
return None
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_backup_quota(self, project_id):
# quota = conn.get_volume_quotas(project_id)
quota = self._get_volume_quotas(project_id)
return quota.backups
+ @tenacity.retry(
+ retry=RetryHTTPError(),
+ wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),
+ reraise=True,
+ stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),
+ )
def get_backup_gigabytes_quota(self, project_id):
# quota = conn.get_volume_quotas(project_id)
quota = self._get_volume_quotas(project_id)
diff --git a/staffeln/conf/conductor.py b/staffeln/conf/conductor.py
index 86407cc..f3d7623 100755
--- a/staffeln/conf/conductor.py
+++ b/staffeln/conf/conductor.py
@@ -10,6 +10,14 @@
title="Conductor Options",
help=_("Options under this group are used " "to define Conductor's configuration."),
)
+openstack_group = cfg.OptGroup(
+ "openstack",
+ title="OpenStack Options",
+ help=_(
+ "Options under this group are used "
+ "to define OpenStack-related configuration."
+ ),
+)
backup_opts = [
cfg.IntOpt(
@@ -74,6 +82,36 @@
),
]
+openstack_opts = [
+ cfg.IntOpt(
+ "retry_timeout",
+ default=300,
+ min=1,
+ help=_(
+ "The timeout for retrying OpenStackSDK HTTP exceptions, "
+ "in seconds."
+ ),
+ ),
+ cfg.IntOpt(
+ "max_retry_interval",
+ default=30,
+ min=0,
+ help=_(
+ "Max interval between retries of OpenStackSDK HTTP exceptions, "
+ "in seconds."
+ ),
+ ),
+ cfg.ListOpt(
+ "skip_retry_codes",
+ default=["404"],
+ help=_(
+ "A list of HTTP codes "
+ "to skip retry on for OpenStackSDK HTTP "
+ "exception."
+ ),
+ ),
+]
+
rotation_opts = [
cfg.IntOpt(
"rotation_workers",
@@ -138,6 +176,7 @@
conf.register_group(conductor_group)
conf.register_opts(backup_opts, group=conductor_group)
conf.register_opts(rotation_opts, group=conductor_group)
+ conf.register_opts(openstack_opts, group=openstack_group)
conf.register_opts(coordination_opts, group=coordination_group)
@@ -145,5 +184,6 @@
return {
"DEFAULT": rotation_opts,
conductor_group: backup_opts,
+ openstack_group: openstack_opts,
coordination_group: coordination_opts,
}
diff --git a/staffeln/tests/common/__init__.py b/staffeln/tests/common/__init__.py
new file mode 100644
index 0000000..953e217
--- /dev/null
+++ b/staffeln/tests/common/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2024 VEXXHOST, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/staffeln/tests/common/test_openstacksdk.py b/staffeln/tests/common/test_openstacksdk.py
index ceeece9..dff6950 100644
--- a/staffeln/tests/common/test_openstacksdk.py
+++ b/staffeln/tests/common/test_openstacksdk.py
@@ -54,9 +54,7 @@
**kwargs,
)
self.assertEqual(status_code, exc.status_code)
- skip_retry_codes = conf.CONF.openstack.skip_retry_codes.replace(" ", "").split(
- ","
- )
+ skip_retry_codes = conf.CONF.openstack.skip_retry_codes
if str(status_code) not in skip_retry_codes:
if call_count == 1:
self.m_sleep.assert_called_once_with(1.0)
@@ -83,17 +81,18 @@
self._test_non_http_error(self.m_c.compute.servers, "get_servers")
def test_get_servers_conf_skip_http_error(self):
- conf.CONF.set_override("skip_retry_codes", "403,", "openstack")
+ conf.CONF.set_override("skip_retry_codes", [403], "openstack")
self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=403)
- self.assertEqual("403,", conf.CONF.openstack.skip_retry_codes)
+ self.assertEqual(["403"], conf.CONF.openstack.skip_retry_codes)
def test_get_servers_conf_skip_http_error_not_hit(self):
- conf.CONF.set_override("skip_retry_codes", "403,", "openstack")
+ conf.CONF.set_override("skip_retry_codes", [403], "openstack")
self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=404)
- self.assertEqual("403,", conf.CONF.openstack.skip_retry_codes)
+ self.assertEqual(["403"], conf.CONF.openstack.skip_retry_codes)
def test_get_servers_404_http_error(self):
self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=404)
+ self.assertEqual(["404"], conf.CONF.openstack.skip_retry_codes)
def test_get_servers_500_http_error(self):
self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=500)
diff --git a/test-requirements.txt b/test-requirements.txt
index 2c28785..afd3e2f 100755
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -10,3 +10,4 @@
stestr>=1.0.0 # Apache-2.0
testtools>=1.4.0 # MIT
pre-commit
+tenacity
diff --git a/tox.ini b/tox.ini
index 2de6fda..2542788 100755
--- a/tox.ini
+++ b/tox.ini
@@ -22,7 +22,8 @@
[testenv:{py3,py38,py39,py310}]
basepython = python3
-deps = -r{toxinidir}/test-requirements.txt
+deps = -r{toxinidir}/requirements.txt
+ -r{toxinidir}/test-requirements.txt
commands = stestr run --slowest {posargs}
[testenv:cover]