Mohammed Naser | f3f59a7 | 2023-01-15 21:02:04 -0500 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | |
| 3 | {{/* |
| 4 | Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | you may not use this file except in compliance with the License. |
| 6 | You may obtain a copy of the License at |
| 7 | |
| 8 | http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | |
| 10 | Unless required by applicable law or agreed to in writing, software |
| 11 | distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | See the License for the specific language governing permissions and |
| 14 | limitations under the License. |
| 15 | */}} |
| 16 | |
| 17 | """ |
| 18 | Health probe script for OpenStack agents that uses RPC/unix domain socket for |
| 19 | communication. Sends message to agent through rpc call method and expects a |
| 20 | reply. It is expected to receive a failure from the agent's RPC server as the |
| 21 | method does not exist. |
| 22 | |
| 23 | Script returns failure to Kubernetes only when |
| 24 | a. agent is not reachable or |
| 25 | b. agent times out sending a reply. |
| 26 | |
| 27 | sys.stderr.write() writes to pod's events on failures. |
| 28 | |
| 29 | Usage example for Neutron L3 agent: |
| 30 | # python health-probe.py --config-file /etc/neutron/neutron.conf \ |
| 31 | # --config-file /etc/neutron/l3_agent.ini --agent-queue-name l3_agent |
| 32 | |
| 33 | Usage example for Neutron metadata agent: |
| 34 | # python health-probe.py --config-file /etc/neutron/neutron.conf \ |
| 35 | # --config-file /etc/neutron/metadata_agent.ini |
| 36 | """ |
| 37 | |
| 38 | import httplib2 |
| 39 | from http import client as httplib |
| 40 | import json |
| 41 | import os |
| 42 | import psutil |
| 43 | import signal |
| 44 | import socket |
| 45 | import sys |
| 46 | |
| 47 | from oslo_config import cfg |
| 48 | from oslo_context import context |
| 49 | from oslo_log import log |
| 50 | import oslo_messaging |
| 51 | |
# Timeout handed to the oslo.messaging RPC client; can be overridden with the
# RPC_PROBE_TIMEOUT environment variable (presumably seconds -- confirm).
rpc_timeout = int(os.getenv('RPC_PROBE_TIMEOUT', '60'))
# Retry count handed to the oslo.messaging RPC client; can be overridden with
# the RPC_PROBE_RETRIES environment variable.
rpc_retries = int(os.getenv('RPC_PROBE_RETRIES', '2'))
# NOTE(review): not referenced by any code visible in this file; the RabbitMQ
# ports actually checked are read from the transport URL.
rabbit_port = 5672
# Connection status string that psutil connection entries are compared with.
tcp_established = "ESTABLISHED"
# The level name after "log." is filled in by Helm when the chart is rendered,
# so this line is not valid Python until then.
log.logging.basicConfig(level=log.{{ .Values.health_probe.logging.level }})
| 57 | |
| 58 | |
def _get_hostname(use_fqdn):
    """Return this host's name as reported by the ``socket`` module.

    :param use_fqdn: when truthy the result of ``socket.getfqdn()`` is
                     returned, otherwise that of ``socket.gethostname()``.
    :returns: the host name string.
    """
    lookup = socket.getfqdn if use_fqdn else socket.gethostname
    return lookup()
| 63 | |
def check_agent_status(transport):
    """Verify agent status by sending it an RPC call it cannot serve.

    A call to the method ``pod_health_probe_method_ignore_errors`` is sent to
    the topic ``cfg.CONF.agent_queue_name`` on this host. The outcome is
    reported through the process exit code:

    * remote error saying the endpoint does not support the RPC method or
      version -> exit 0 (the call reached the agent);
    * any other remote error, or a messaging timeout -> exit 1;
    * message delivery failure or any other exception -> exit 0, after
      writing a note to stderr, so a message bus problem does not fail the
      probe.

    If the call completes without raising, the function simply returns.

    :param transport: oslo.messaging transport used to build the RPC client.
    """
    try:
        fqdn_wanted = cfg.CONF.use_fqdn
        rpc_target = oslo_messaging.Target(
            topic=cfg.CONF.agent_queue_name,
            server=_get_hostname(fqdn_wanted))
        # Use the get_rpc_client() factory when this oslo.messaging provides
        # it; otherwise instantiate RPCClient directly.
        if hasattr(oslo_messaging, 'get_rpc_client'):
            build_client = oslo_messaging.get_rpc_client
        else:
            build_client = oslo_messaging.RPCClient
        rpc_client = build_client(transport, rpc_target,
                                  timeout=rpc_timeout,
                                  retry=rpc_retries)
        rpc_client.call(context.RequestContext(),
                        'pod_health_probe_method_ignore_errors')
    except oslo_messaging.exceptions.MessageDeliveryFailure:
        # Log to pod events
        sys.stderr.write("Health probe unable to reach message bus")
        sys.exit(0)  # return success
    except oslo_messaging.rpc.client.RemoteError as remote_err:
        detail = getattr(remote_err, "message", str(remote_err))
        expected_rejections = (
            "Endpoint does not support RPC method",
            "Endpoint does not support RPC version",
        )
        if any(text in detail for text in expected_rejections):
            sys.exit(0)  # Call reached the agent
        sys.stderr.write("Health probe unable to reach agent")
        sys.exit(1)  # return failure
    except oslo_messaging.exceptions.MessagingTimeout:
        sys.stderr.write("Health probe timed out. Agent is down or response "
                         "timed out")
        sys.exit(1)  # return failure
    except Exception as ex:
        detail = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "agent: %s" % detail)
        sys.exit(0)
    except BaseException:
        # Anything that is not an Exception subclass is also reported as
        # success rather than allowed to fail the probe.
        sys.stderr.write("Health probe caught exception sending message to"
                         " agent")
        sys.exit(0)
| 106 | |
| 107 | |
def sriov_readiness_check():
    """Check the SR-IOV configuration of the configured SR-IOV NICs.

    Every ``physical_device_mappings`` setting found in
    ``/etc/neutron/plugins/ml2/sriov_agent.ini`` is split into its
    comma-separated ``<physnet>:<device>`` entries, and for each device the
    file ``/sys/class/net/<device>/device/sriov_numvfs`` is read.

    This function never returns; it always ends the process:

    * exit 0 when at least one device's ``sriov_numvfs`` file yields a
      non-empty line;
    * exit 1 otherwise, including when a mapping setting has no entries or
      an entry is not of the form ``<physnet>:<device>``.

    :raises IOError: if the agent configuration file itself cannot be opened.
    """
    return_status = 1
    with open('/etc/neutron/plugins/ml2/sriov_agent.ini') as nic:
        for raw_line in nic:
            line = raw_line.strip()
            # A commented-out setting is not configuration.
            if line.startswith(('#', ';')):
                continue
            key, separator, value = line.partition('=')
            if not separator or "physical_device_mappings" not in key:
                continue
            # "".split(',') yields [''], never an empty list, so blank
            # entries have to be dropped explicitly for the check below
            # to be able to fire.
            mappings = [entry.strip() for entry in value.split(',')
                        if entry.strip()]
            if not mappings:
                sys.stderr.write("No Physical devices"
                                 " configured as SRIOV NICs")
                sys.exit(1)
            for mapping in mappings:
                try:
                    _physnet, dev = mapping.split(':')
                except ValueError:
                    sys.stderr.write("Malformed physical_device_mappings "
                                     "entry: %s" % mapping)
                    sys.exit(1)
                dev = dev.strip()
                try:
                    with open('/sys/class/net/%s/device/'
                              'sriov_numvfs' % dev) as f:
                        for vf_line in f:
                            # NOTE(review): any non-empty value marks the
                            # NIC as configured, including "0" -- confirm
                            # that this is intended.
                            if vf_line.rstrip('\n'):
                                return_status = 0
                except IOError:
                    # A missing file for one device does not abort the
                    # check; the remaining devices are still inspected.
                    sys.stderr.write("IOError:No sriov_numvfs config file")
    sys.exit(return_status)
| 132 | |
| 133 | |
def get_rabbitmq_ports():
    """Get RabbitMQ ports.

    Parses the transport URL from ``cfg.CONF`` and collects the ``port`` of
    every host listed in it.

    :returns: a set with one entry per distinct port value.
              NOTE(review): a host given without a port may contribute
              ``None`` here -- confirm against oslo.messaging.

    If parsing fails for any reason the error is written to stderr and the
    process exits 0, so a configuration problem does not fail the probe.
    """
    try:
        parsed_url = oslo_messaging.TransportURL.parse(cfg.CONF)
        return {endpoint.port for endpoint in parsed_url.hosts}
    except Exception as ex:
        detail = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception reading "
                         "RabbitMQ ports: %s" % detail)
        sys.exit(0)  # return success
| 150 | |
| 151 | |
def tcp_socket_state_check(agentq):
    """Check if the tcp socket to rabbitmq is in Established state.

    Walks all processes, picks those whose command line contains the name
    fragment associated with ``agentq``, and counts their connections whose
    remote port is one of the RabbitMQ ports and whose status equals
    ``tcp_established``.

    When no such connection is found a message is written to stderr and,
    unless ``cfg.CONF.liveness_probe`` is set, the process exits 1.

    :param agentq: agent queue name; any value other than the three listed
                   below selects the metadata agent fragment.
    """
    # Command-line fragment to look for, keyed by agent queue name.
    fragments = {
        "l3_agent": "neutron-l3-agen",
        "dhcp_agent": "neutron-dhcp-ag",
        "q-agent-notifier-tunnel-update": "neutron-openvsw",
    }
    proc = fragments.get(agentq, "neutron-metadat")

    rabbitmq_ports = get_rabbitmq_ports()

    established = 0
    first_match_pid = 0

    for candidate in psutil.process_iter():
        try:
            with candidate.oneshot():
                if proc not in " ".join(candidate.cmdline()):
                    continue
                if first_match_pid == 0:
                    # Remember the first matching process.
                    first_match_pid = candidate.pid
                elif candidate.ppid() == first_match_pid:
                    # Direct children of the first match are not counted.
                    continue
                for conn in candidate.connections():
                    try:
                        remote_port = conn.raddr[1]
                        state = conn.status
                    except IndexError:
                        # No remote address on this connection entry.
                        continue
                    if remote_port in rabbitmq_ports and \
                            state == tcp_established:
                        established += 1
        except psutil.Error:
            continue

    if established == 0:
        sys.stderr.write("RabbitMQ sockets not Established")
        # Do not kill the pod if RabbitMQ is not reachable/down
        if not cfg.CONF.liveness_probe:
            sys.exit(1)
| 194 | |
| 195 | |
class UnixDomainHTTPConnection(httplib.HTTPConnection):
    """Connection class for HTTP over UNIX domain socket.

    The socket path is read from ``cfg.CONF.metadata_proxy_socket`` when the
    object is constructed; ``host`` and ``port`` are only handed to the base
    class.
    """

    def __init__(self, host, port=None, strict=None, timeout=None,
                 proxy_info=None):
        """Initialise the connection.

        :param host: passed to ``HTTPConnection``.
        :param port: passed to ``HTTPConnection``.
        :param strict: accepted so existing callers keep working; ignored.
        :param timeout: socket timeout applied in :meth:`connect` when truthy.
        :param proxy_info: accepted so existing callers keep working; ignored.
        """
        # The third positional parameter of the Python 3 HTTPConnection is
        # ``timeout``, not ``strict``, so the timeout is passed by keyword
        # instead of letting ``strict`` land in that slot.
        httplib.HTTPConnection.__init__(self, host, port, timeout=timeout)
        self.timeout = timeout
        self.socket_path = cfg.CONF.metadata_proxy_socket

    def connect(self):
        """Open the UNIX domain socket and store it in ``self.sock``.

        ``self.sock`` is only assigned once the connection succeeded; on
        failure the socket is closed and the original exception re-raised.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            if self.timeout:
                sock.settimeout(self.timeout)
            sock.connect(self.socket_path)
        except Exception:
            sock.close()
            raise
        self.sock = sock
| 210 | |
| 211 | |
def test_socket_liveness():
    """Test if agent can respond to message over the socket.

    Parses the command line, checks the RabbitMQ socket state (skipped when
    an ``ovn_metadata_agent.ini`` argument is present) and then sends a GET
    request for ``http://169.254.169.254`` over the metadata proxy UNIX
    socket.

    Outcome, reported through the process exit code:

    * socket error while sending the request -> exit 1;
    * any other exception while sending the request -> exit 0;
    * HTTP status of 500 or above -> exit 1;
    * otherwise the function returns normally.
    """
    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
                                          required=False))
    cfg.CONF.register_cli_opt(cfg.BoolOpt('use-fqdn', default=False,
                                          required=False))
    cfg.CONF(sys.argv[1:])

    if "ovn_metadata_agent.ini" not in ','.join(sys.argv):
        agentq = "metadata_agent"
        tcp_socket_state_check(agentq)

    try:
        # Only accessed to find out whether the option is known.
        cfg.CONF.metadata_proxy_socket
    except cfg.NoSuchOptError:
        cfg.CONF.register_opt(cfg.StrOpt(
            'metadata_proxy_socket',
            default='/var/lib/neutron/openstack-helm/metadata_proxy'))

    headers = {'X-Forwarded-For': '169.254.169.254',
               'X-Neutron-Router-ID': 'pod-health-probe-check-ignore-errors'}

    h = httplib2.Http(timeout=30)

    try:
        resp, content = h.request(
            'http://169.254.169.254',
            method='GET',
            headers=headers,
            connection_type=UnixDomainHTTPConnection)
    except socket.error as se:
        msg = "Socket error: Health probe failed to connect to " \
              "Neutron Metadata agent: "
        # Some socket errors (a timeout, for one) carry neither strerror nor
        # message; fall back to str(se) so the failure is never silent.
        detail = se.strerror or getattr(se, "message", None) or str(se)
        sys.stderr.write(msg + str(detail))
        sys.exit(1)  # return failure
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "Neutron Metadata agent: %s" % message)
        sys.exit(0)  # return success

    if resp.status >= 500:  # Probe expects HTTP error code 404
        msg = "Health probe failed: Neutron Metadata agent failed to" \
              " process request: "
        sys.stderr.write(msg + str(resp.__dict__))
        sys.exit(1)  # return failure
| 261 | |
| 262 | |
def test_rpc_liveness():
    """Test if agent can consume message from queue.

    Registers the probe's command-line options, parses the command line,
    builds the RPC transport, then runs the RabbitMQ socket check followed
    by the RPC agent check.

    The process exits 0 (after writing to stderr) when the transport cannot
    be created or when either the transport URL or the agent queue name is
    missing, so that a probe misconfiguration does not fail the pod.
    """
    oslo_messaging.set_transport_defaults(control_exchange='neutron')

    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
                                title='RabbitMQ options')
    cfg.CONF.register_group(rabbit_group)

    probe_cli_opts = (
        cfg.StrOpt('agent-queue-name'),
        cfg.BoolOpt('liveness-probe', default=False, required=False),
        cfg.BoolOpt('use-fqdn', default=False, required=False),
    )
    for cli_opt in probe_cli_opts:
        cfg.CONF.register_cli_opt(cli_opt)

    cfg.CONF(sys.argv[1:])

    try:
        transport = oslo_messaging.get_rpc_transport(cfg.CONF)
    except Exception as ex:
        detail = getattr(ex, "message", str(ex))
        sys.stderr.write("Message bus driver load error: %s" % detail)
        sys.exit(0)  # return success

    if not (cfg.CONF.transport_url and cfg.CONF.agent_queue_name):
        sys.stderr.write("Both message bus URL and agent queue name are "
                         "required for Health probe to work")
        sys.exit(0)  # return success

    try:
        cfg.CONF.set_override('rabbit_max_retries', 2,
                              group=rabbit_group)  # 3 attempts
    except cfg.NoSuchOptError:
        # The option is unknown here: register it with the same value.
        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
                              group=rabbit_group)

    queue_name = cfg.CONF.agent_queue_name
    tcp_socket_state_check(queue_name)

    check_agent_status(transport)
| 302 | |
def check_pid_running(pid):
    """Tell whether a process with the given pid exists.

    :param pid: process id; converted with ``int()`` before the lookup.
    :returns: ``True`` if psutil reports the pid as existing, else ``False``.
    """
    return bool(psutil.pid_exists(int(pid)))
| 308 | |
if __name__ == "__main__":

    # Liveness and readiness probes keep their state in separate files.
    if "liveness-probe" in ','.join(sys.argv):
        pidfile = "/tmp/liveness.pid"  # nosec
    else:
        pidfile = "/tmp/readiness.pid"  # nosec
    data = {}
    if os.path.isfile(pidfile):
        with open(pidfile, 'r') as f:
            file_content = f.read().strip()
            if file_content:
                try:
                    data = json.loads(file_content)
                except ValueError:
                    # A truncated or corrupt state file must not fail the
                    # probe by itself; start over with empty state.
                    data = {}
                if not isinstance(data, dict):
                    data = {}

    # A previous probe run recorded in the state file is still alive.
    if 'pid' in data and check_pid_running(data['pid']):
        if 'exit_count' in data and data['exit_count'] > 1:
            # Third time in, kill the previous process
            try:
                os.kill(int(data['pid']), signal.SIGTERM)
            except ProcessLookupError:
                # It exited between the existence check and the kill.
                pass
        else:
            # Give the previous run more time: bump the counter and report
            # success without probing again.
            data['exit_count'] = data.get('exit_count', 0) + 1
            with open(pidfile, 'w') as f:
                json.dump(data, f)
            sys.exit(0)

    # Record this run as the current probe process.
    data['pid'] = os.getpid()
    data['exit_count'] = 0
    with open(pidfile, 'w') as f:
        json.dump(data, f)

    # Pick the check from the configuration files named on the command line.
    if "sriov_agent.ini" in ','.join(sys.argv):
        sriov_readiness_check()
    elif "metadata_agent.ini" not in ','.join(sys.argv):
        test_rpc_liveness()
    else:
        test_socket_liveness()

    sys.exit(0)  # return success