[CLOUDOPS-548] Allow retry on openstack HttpException
diff --git a/requirements.txt b/requirements.txt
index 140798f..c6a3ae4 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@
 tooz # Apache-2.0
 sherlock>=0.4.1 # MIT
 kubernetes # Apache-2.0
+tenacity
diff --git a/staffeln/common/openstack.py b/staffeln/common/openstack.py
index 710d112..f6dc94e 100644
--- a/staffeln/common/openstack.py
+++ b/staffeln/common/openstack.py
@@ -1,14 +1,42 @@
 from __future__ import annotations

 

+import tenacity

 from openstack import exceptions, proxy

 from oslo_log import log

 

+from staffeln import conf

 from staffeln.common import auth

 from staffeln.i18n import _

 

+CONF = conf.CONF

 LOG = log.getLogger(__name__)

 

 

+class RetryHTTPError(tenacity.retry_if_exception):

+    """Retry strategy that retries when the exception is an ``HttpException``

+    whose status code is not in the configured ``skip_retry_codes`` list.

+    """

+

+    def __init__(self):

+        def is_http_error(exception):

+            # Make sure we don't retry on codes in skip list (default: [404]),

+            # as not found could be an expected status.

+            skip_codes = CONF.openstack.skip_retry_codes

+            result = (

+                isinstance(exception, exceptions.HttpException)

+                and str(exception.status_code) not in skip_codes

+            )

+            if result:

+                LOG.debug(

+                    f"Getting HttpException {exception} (status "

+                    f"code: {exception.status_code}), "

+                    "retry till timeout..."

+                )

+            return result

+

+        super().__init__(predicate=is_http_error)

+

+

 class OpenstackSDK:

     def __init__(self):

         self.conn_list = {}

@@ -26,6 +54,12 @@
         self.conn = self.conn_list[project_id]

 

     # user

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_user_id(self):

         user_name = self.conn.config.auth["username"]

         if "user_domain_id" in self.conn.config.auth:

@@ -38,15 +72,33 @@
             user = self.conn.get_user(name_or_id=user_name)

         return user.id

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_role_assignments(self, project_id, user_id=None):

         filters = {"project": project_id}

         if user_id:

             filters["user"] = user_id

         return self.conn.list_role_assignments(filters=filters)

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_user(self, user_id):

         return self.conn.get_user(name_or_id=user_id)

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_project_member_emails(self, project_id):

         members = self.get_role_assignments(project_id)

         emails = []

@@ -63,9 +115,21 @@
                         emails.append(user.email)

         return emails

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_projects(self):

         return self.conn.list_projects()

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_servers(self, project_id=None, all_projects=True, details=True):

         if project_id is not None:

             return self.conn.compute.servers(

@@ -76,9 +140,21 @@
         else:

             return self.conn.compute.servers(details=details, all_projects=all_projects)

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_volume(self, uuid, project_id):

         return self.conn.get_volume_by_id(uuid)

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_backup(self, uuid, project_id=None):

         try:

             return self.conn.get_volume_backup(uuid)

@@ -102,6 +178,12 @@
             incremental=incremental,

         )

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def delete_backup(self, uuid, project_id=None, force=False):

         # Note(Alex): v3 is not supporting force delete?

         # conn.block_storage.delete_backup(

@@ -115,11 +197,23 @@
         except exceptions.ResourceNotFound:

             return None

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_backup_quota(self, project_id):

         # quota = conn.get_volume_quotas(project_id)

         quota = self._get_volume_quotas(project_id)

         return quota.backups

 

+    @tenacity.retry(

+        retry=RetryHTTPError(),

+        wait=tenacity.wait_exponential(max=CONF.openstack.max_retry_interval),

+        reraise=True,

+        stop=tenacity.stop_after_delay(CONF.openstack.retry_timeout),

+    )

     def get_backup_gigabytes_quota(self, project_id):

         # quota = conn.get_volume_quotas(project_id)

         quota = self._get_volume_quotas(project_id)

diff --git a/staffeln/conf/conductor.py b/staffeln/conf/conductor.py
index 86407cc..f3d7623 100755
--- a/staffeln/conf/conductor.py
+++ b/staffeln/conf/conductor.py
@@ -10,6 +10,14 @@
     title="Conductor Options",
     help=_("Options under this group are used " "to define Conductor's configuration."),
 )
+openstack_group = cfg.OptGroup(
+    "openstack",
+    title="OpenStack Options",
+    help=_(
+        "Options under this group are used "
+        "to define OpenStack related configuration."
+    ),
+)
 
 backup_opts = [
     cfg.IntOpt(
@@ -74,6 +82,36 @@
     ),
 ]
 
+openstack_opts = [
+    cfg.IntOpt(
+        "retry_timeout",
+        default=300,
+        min=1,
+        help=_(
+            "The timeout for retry OpenStackSDK HTTP exceptions, "
+            "the unit is one second."
+        ),
+    ),
+    cfg.IntOpt(
+        "max_retry_interval",
+        default=30,
+        min=0,
+        help=_(
+            "Max time interval for retry OpenStackSDK HTTP exceptions, "
+            "the unit is one second."
+        ),
+    ),
+    cfg.ListOpt(
+        "skip_retry_codes",
+        default=["404"],
+        help=_(
+            "A list of HTTP codes "
+            "to skip retry on for OpenStackSDK HTTP "
+            "exception."
+        ),
+    ),
+]
+
 rotation_opts = [
     cfg.IntOpt(
         "rotation_workers",
@@ -138,6 +176,7 @@
     conf.register_group(conductor_group)
     conf.register_opts(backup_opts, group=conductor_group)
     conf.register_opts(rotation_opts, group=conductor_group)
+    conf.register_opts(openstack_opts, group=openstack_group)
     conf.register_opts(coordination_opts, group=coordination_group)
 
 
@@ -145,5 +184,6 @@
     return {
         "DEFAULT": rotation_opts,
         conductor_group: backup_opts,
+        openstack_group: openstack_opts,
         coordination_group: coordination_opts,
     }
diff --git a/staffeln/tests/common/__init__.py b/staffeln/tests/common/__init__.py
new file mode 100644
index 0000000..953e217
--- /dev/null
+++ b/staffeln/tests/common/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2024 VEXXHOST, Inc.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/staffeln/tests/common/test_openstacksdk.py b/staffeln/tests/common/test_openstacksdk.py
index ceeece9..dff6950 100644
--- a/staffeln/tests/common/test_openstacksdk.py
+++ b/staffeln/tests/common/test_openstacksdk.py
@@ -54,9 +54,7 @@
             **kwargs,
         )
         self.assertEqual(status_code, exc.status_code)
-        skip_retry_codes = conf.CONF.openstack.skip_retry_codes.replace(" ", "").split(
-            ","
-        )
+        skip_retry_codes = conf.CONF.openstack.skip_retry_codes
         if str(status_code) not in skip_retry_codes:
             if call_count == 1:
                 self.m_sleep.assert_called_once_with(1.0)
@@ -83,17 +81,18 @@
         self._test_non_http_error(self.m_c.compute.servers, "get_servers")
 
     def test_get_servers_conf_skip_http_error(self):
-        conf.CONF.set_override("skip_retry_codes", "403,", "openstack")
+        conf.CONF.set_override("skip_retry_codes", [403], "openstack")
         self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=403)
-        self.assertEqual("403,", conf.CONF.openstack.skip_retry_codes)
+        self.assertEqual(["403"], conf.CONF.openstack.skip_retry_codes)
 
     def test_get_servers_conf_skip_http_error_not_hit(self):
-        conf.CONF.set_override("skip_retry_codes", "403,", "openstack")
+        conf.CONF.set_override("skip_retry_codes", [403], "openstack")
         self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=404)
-        self.assertEqual("403,", conf.CONF.openstack.skip_retry_codes)
+        self.assertEqual(["403"], conf.CONF.openstack.skip_retry_codes)
 
     def test_get_servers_404_http_error(self):
         self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=404)
+        self.assertEqual(["404"], conf.CONF.openstack.skip_retry_codes)
 
     def test_get_servers_500_http_error(self):
         self._test_http_error(self.m_c.compute.servers, "get_servers", status_code=500)
diff --git a/test-requirements.txt b/test-requirements.txt
index 2c28785..afd3e2f 100755
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -10,3 +10,4 @@
 stestr>=1.0.0 # Apache-2.0
 testtools>=1.4.0 # MIT
 pre-commit
+tenacity
diff --git a/tox.ini b/tox.ini
index 2de6fda..2542788 100755
--- a/tox.ini
+++ b/tox.ini
@@ -22,7 +22,8 @@
 
 [testenv:{py3,py38,py39,py310}]
 basepython = python3
-deps = -r{toxinidir}/test-requirements.txt
+deps = -r{toxinidir}/requirements.txt
+       -r{toxinidir}/test-requirements.txt
 commands = stestr run --slowest {posargs}
 
 [testenv:cover]