blob: b5e170bcfdbf74f80e9c1aa905588f89e6cfc8b9 [file] [log] [blame]
Mohammed Naserf3f59a72023-01-15 21:02:04 -05001#!/usr/bin/env python
2
3{{/*
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/}}
16
17"""
18Health probe script for OpenStack agents that uses RPC/unix domain socket for
19communication. Sends message to agent through rpc call method and expects a
20reply. It is expected to receive a failure from the agent's RPC server as the
21method does not exist.
22
23Script returns failure to Kubernetes only when
24 a. agent is not reachable or
25 b. agent times out sending a reply.
26
27sys.stderr.write() writes to pod's events on failures.
28
29Usage example for Neutron L3 agent:
30# python health-probe.py --config-file /etc/neutron/neutron.conf \
31# --config-file /etc/neutron/l3_agent.ini --agent-queue-name l3_agent
32
33Usage example for Neutron metadata agent:
34# python health-probe.py --config-file /etc/neutron/neutron.conf \
35# --config-file /etc/neutron/metadata_agent.ini
36"""
37
38import httplib2
39from http import client as httplib
40import json
41import os
42import psutil
43import signal
44import socket
45import sys
46
47from oslo_config import cfg
48from oslo_context import context
49from oslo_log import log
50import oslo_messaging
51
52rpc_timeout = int(os.getenv('RPC_PROBE_TIMEOUT', '60'))
53rpc_retries = int(os.getenv('RPC_PROBE_RETRIES', '2'))
54rabbit_port = 5672
55tcp_established = "ESTABLISHED"
56log.logging.basicConfig(level=log.{{ .Values.health_probe.logging.level }})
57
58
59def _get_hostname(use_fqdn):
60 if use_fqdn:
61 return socket.getfqdn()
62 return socket.gethostname()
63
def check_agent_status(transport):
    """Send a probe RPC call to the agent and exit according to the reply.

    The called method does not exist on the agent, so a RemoteError saying
    the endpoint does not support the method/version proves that the agent
    consumed the message.

    :param transport: oslo.messaging RPC transport used to build the client.

    Exit codes (raised through sys.exit from the handlers):
      * 0 - the agent answered with "unsupported method/version", the
            message bus could not be reached, or an unexpected error
            occurred;
      * 1 - the agent answered with another remote error or the call
            timed out.
    If the call returns without raising, the function simply returns.
    """
    # Replies that prove the call made it to the agent's RPC server.
    reached_agent_markers = (
        "Endpoint does not support RPC method",
        "Endpoint does not support RPC version",
    )
    try:
        use_fqdn = cfg.CONF.use_fqdn
        target = oslo_messaging.Target(
            topic=cfg.CONF.agent_queue_name,
            server=_get_hostname(use_fqdn))
        # Use get_rpc_client when the library offers it, otherwise fall
        # back to instantiating RPCClient directly.
        make_client = getattr(oslo_messaging, 'get_rpc_client', None)
        if make_client is None:
            make_client = oslo_messaging.RPCClient
        client = make_client(transport, target,
                             timeout=rpc_timeout,
                             retry=rpc_retries)
        client.call(context.RequestContext(),
                    'pod_health_probe_method_ignore_errors')
    except oslo_messaging.exceptions.MessageDeliveryFailure:
        # Log to pod events
        sys.stderr.write("Health probe unable to reach message bus")
        sys.exit(0)  # return success
    except oslo_messaging.rpc.client.RemoteError as remote_err:
        message = getattr(remote_err, "message", str(remote_err))
        if any(marker in message for marker in reached_agent_markers):
            sys.exit(0)  # Call reached the agent
        sys.stderr.write("Health probe unable to reach agent")
        sys.exit(1)  # return failure
    except oslo_messaging.exceptions.MessagingTimeout:
        sys.stderr.write("Health probe timed out. Agent is down or response "
                         "timed out")
        sys.exit(1)  # return failure
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "agent: %s" % message)
        sys.exit(0)
    except BaseException:
        # Anything that is not an Exception subclass is also reported as
        # success.
        sys.stderr.write("Health probe caught exception sending message to"
                         " agent")
        sys.exit(0)
106
107
def sriov_readiness_check():
    """Check the SR-IOV configuration of the configured SR-IOV NICs.

    Reads every "physical_device_mappings" line of
    /etc/neutron/plugins/ml2/sriov_agent.ini, and for each
    "<physnet>:<device>" entry reads
    /sys/class/net/<device>/device/sriov_numvfs.

    Always terminates the process through sys.exit:
      * 0 - at least one device exposes a non-empty sriov_numvfs value;
      * 1 - no device does, or a mapping line lists no devices.

    Commented-out lines, lines without "=", and entries without ":" are
    skipped instead of crashing the probe with an unpacking/index error.
    """
    return_status = 1
    with open('/etc/neutron/plugins/ml2/sriov_agent.ini') as nic:
        for raw_line in nic:
            if "physical_device_mappings" not in raw_line:
                continue
            # A commented-out option must not be treated as configuration.
            if raw_line.lstrip().startswith(('#', ';')):
                continue
            _, has_value, mappings = raw_line.partition('=')
            if not has_value:
                continue
            # str.split never returns an empty list, so blank items have
            # to be filtered out for the emptiness check to work.
            entries = [item.strip() for item in mappings.split(',')
                       if item.strip()]
            if not entries:
                sys.stderr.write("No Physical devices"
                                 " configured as SRIOV NICs")
                sys.exit(1)
            for entry in entries:
                _, has_dev, dev = entry.partition(':')
                dev = dev.strip()
                if not has_dev or not dev:
                    # Malformed entry: no device name to inspect.
                    continue
                try:
                    with open('/sys/class/net/%s/device/'
                              'sriov_numvfs' % dev) as f:
                        for line in f:
                            numvfs = line.rstrip('\n')
                            # NOTE(review): any non-empty value, including
                            # "0", counts as ready - confirm that is the
                            # intent.
                            if numvfs:
                                return_status = 0
                except IOError:
                    sys.stderr.write("IOError:No sriov_numvfs config file")
    sys.exit(return_status)
132
133
def get_rabbitmq_ports():
    """Return the set of RabbitMQ ports named in the transport URL.

    Hosts whose URL entry carries no explicit port contribute the module
    level default (rabbit_port) instead of None, so that established
    sockets can still be matched against the returned set.

    Any failure while parsing the transport URL is written to stderr and
    the process exits with status 0 (success).
    """
    rabbitmq_ports = set()

    try:
        transport_url = oslo_messaging.TransportURL.parse(cfg.CONF)
        for host in transport_url.hosts:
            # NOTE(review): the fallback is the plain AMQP port; a TLS-only
            # deployment that omits the port may need a different default.
            port = host.port if host.port is not None else rabbit_port
            rabbitmq_ports.add(port)
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception reading "
                         "RabbitMQ ports: %s" % message)
        sys.exit(0)  # return success

    return rabbitmq_ports
150
151
def tcp_socket_state_check(agentq):
    """Verify that the agent holds an ESTABLISHED TCP socket to RabbitMQ.

    :param agentq: agent queue name; selects the command-line fragment used
                   to find the agent's processes (anything unrecognised is
                   treated as the metadata agent).

    Writes a message to stderr when no matching socket is found and, unless
    the liveness-probe option is set, exits with status 1.
    """
    # Queue name -> fragment searched for in each process command line.
    fragment_by_queue = {
        "l3_agent": "neutron-l3-agen",
        "dhcp_agent": "neutron-dhcp-ag",
        "q-agent-notifier-tunnel-update": "neutron-openvsw",
    }
    proc = fragment_by_queue.get(agentq, "neutron-metadat")

    rabbitmq_ports = get_rabbitmq_ports()

    established = 0
    first_match_pid = 0
    for candidate in psutil.process_iter():
        try:
            with candidate.oneshot():
                if proc not in " ".join(candidate.cmdline()):
                    continue
                if first_match_pid == 0:
                    # The first matching process is remembered so that its
                    # children can be skipped below.
                    first_match_pid = candidate.pid
                elif candidate.ppid() == first_match_pid:
                    continue
                for conn in candidate.connections():
                    try:
                        port = conn.raddr[1]
                        status = conn.status
                    except IndexError:
                        # No remote address on this connection.
                        continue
                    if status == tcp_established and port in rabbitmq_ports:
                        established += 1
        except psutil.Error:
            continue

    if established:
        return

    sys.stderr.write("RabbitMQ sockets not Established")
    # Do not kill the pod if RabbitMQ is not reachable/down
    if not cfg.CONF.liveness_probe:
        sys.exit(1)
194
195
class UnixDomainHTTPConnection(httplib.HTTPConnection):
    """Connection class for HTTP over UNIX domain socket.

    The socket path is taken from the metadata_proxy_socket configuration
    option; host and port are only passed on to the base class.
    """

    def __init__(self, host, port=None, strict=None, timeout=None,
                 proxy_info=None):
        """Initialise the connection.

        :param host: host forwarded to the base HTTPConnection.
        :param port: port forwarded to the base HTTPConnection.
        :param strict: accepted for backward compatibility and ignored.
        :param timeout: socket timeout applied in connect() when truthy.
        :param proxy_info: accepted for caller compatibility and ignored.
        """
        # "strict" must not be forwarded positionally: the third positional
        # parameter of the Python 3 base class is the timeout.
        httplib.HTTPConnection.__init__(self, host, port)
        self.timeout = timeout
        self.socket_path = cfg.CONF.metadata_proxy_socket

    def connect(self):
        """Open the UNIX domain socket and store it in self.sock."""
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            if self.timeout:
                sock.settimeout(self.timeout)
            sock.connect(self.socket_path)
        except Exception:
            # Do not leak the descriptor when the agent is unreachable.
            sock.close()
            raise
        self.sock = sock
210
211
def test_socket_liveness():
    """Test if agent can respond to message over the socket.

    Registers the probe's CLI options, parses the command line, checks the
    RabbitMQ sockets (skipped for the OVN metadata agent) and then sends an
    HTTP GET through the metadata proxy UNIX socket.

    Exits with status 1 on a socket error or an HTTP status >= 500, and
    with status 0 on any other exception. Returns normally otherwise.
    """
    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
                                          required=False))
    cfg.CONF.register_cli_opt(cfg.BoolOpt('use-fqdn', default=False,
                                          required=False))
    cfg.CONF(sys.argv[1:])

    if "ovn_metadata_agent.ini" not in ','.join(sys.argv):
        agentq = "metadata_agent"
        tcp_socket_state_check(agentq)

    try:
        # Attribute access only: detects whether the option is registered.
        cfg.CONF.metadata_proxy_socket
    except cfg.NoSuchOptError:
        cfg.CONF.register_opt(cfg.StrOpt(
            'metadata_proxy_socket',
            default='/var/lib/neutron/openstack-helm/metadata_proxy'))

    headers = {'X-Forwarded-For': '169.254.169.254',
               'X-Neutron-Router-ID': 'pod-health-probe-check-ignore-errors'}

    h = httplib2.Http(timeout=30)

    try:
        resp, _ = h.request(
            'http://169.254.169.254',
            method='GET',
            headers=headers,
            connection_type=UnixDomainHTTPConnection)
    except socket.error as se:
        msg = "Socket error: Health probe failed to connect to " \
              "Neutron Metadata agent: "
        # Some socket errors (a timeout, for instance) carry neither
        # strerror nor message; fall back to str() so the failure is
        # never silent.
        detail = se.strerror or getattr(se, "message", None) or str(se)
        sys.stderr.write(msg + detail)
        sys.exit(1)  # return failure
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "Neutron Metadata agent: %s" % message)
        sys.exit(0)  # return success

    if resp.status >= 500:  # Probe expects HTTP error code 404
        msg = "Health probe failed: Neutron Metadata agent failed to" \
              " process request: "
        sys.stderr.write(msg + str(resp.__dict__))
        sys.exit(1)  # return failure
261
262
def test_rpc_liveness():
    """Test if agent can consume message from queue.

    Registers the probe's options, parses the command line, loads the RPC
    transport, verifies the RabbitMQ sockets and finally sends the probe
    call to the agent.

    Exits with status 0 when the transport cannot be loaded or when the
    transport URL / agent queue name is missing; the later checks may exit
    with their own status.
    """
    oslo_messaging.set_transport_defaults(control_exchange='neutron')

    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
                                title='RabbitMQ options')
    cfg.CONF.register_group(rabbit_group)
    cfg.CONF.register_cli_opt(cfg.StrOpt('agent-queue-name'))
    for flag_name in ('liveness-probe', 'use-fqdn'):
        cfg.CONF.register_cli_opt(cfg.BoolOpt(flag_name, default=False,
                                              required=False))

    cfg.CONF(sys.argv[1:])

    try:
        transport = oslo_messaging.get_rpc_transport(cfg.CONF)
    except Exception as load_err:
        message = getattr(load_err, "message", str(load_err))
        sys.stderr.write("Message bus driver load error: %s" % message)
        sys.exit(0)  # return success

    if not (cfg.CONF.transport_url and cfg.CONF.agent_queue_name):
        sys.stderr.write("Both message bus URL and agent queue name are "
                         "required for Health probe to work")
        sys.exit(0)  # return success

    try:
        cfg.CONF.set_override('rabbit_max_retries', 2,
                              group=rabbit_group)  # 3 attempts
    except cfg.NoSuchOptError:
        # The option is not registered yet: register it with the same
        # value as its default instead.
        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
                              group=rabbit_group)

    tcp_socket_state_check(cfg.CONF.agent_queue_name)

    check_agent_status(transport)
302
def check_pid_running(pid):
    """Return True when a process with the given pid exists.

    :param pid: process id; converted with int() before the lookup.
    """
    return True if psutil.pid_exists(int(pid)) else False
308
if __name__ == "__main__":
    # Entry point. A small JSON pidfile ({"pid": ..., "exit_count": ...})
    # tracks a probe that is still running from an earlier invocation:
    # the first two overlapping invocations just bump the counter and
    # report success, the third one terminates the stuck probe and runs
    # a fresh check.

    joined_argv = ','.join(sys.argv)

    if "liveness-probe" in joined_argv:
        pidfile = "/tmp/liveness.pid"  #nosec
    else:
        pidfile = "/tmp/readiness.pid"  #nosec
    data = {}
    if os.path.isfile(pidfile):
        try:
            with open(pidfile, 'r') as f:
                file_content = f.read().strip()
            if file_content:
                data = json.loads(file_content)
        except (IOError, ValueError):
            # An unreadable or partially written pidfile is treated like
            # a missing one instead of failing the probe.
            data = {}
        if not isinstance(data, dict):
            data = {}

    try:
        previous_pid = int(data['pid'])
    except (KeyError, TypeError, ValueError):
        previous_pid = None

    exit_count = data.get('exit_count', 0)
    if not isinstance(exit_count, int):
        exit_count = 0

    if previous_pid is not None and check_pid_running(previous_pid):
        if exit_count > 1:
            # Third time in, kill the previous process
            try:
                os.kill(previous_pid, signal.SIGTERM)
            except ProcessLookupError:
                # The previous probe exited between the check and the kill.
                pass
        else:
            data['exit_count'] = exit_count + 1
            with open(pidfile, 'w') as f:
                json.dump(data, f)
            sys.exit(0)

    data['pid'] = os.getpid()
    data['exit_count'] = 0
    with open(pidfile, 'w') as f:
        json.dump(data, f)

    if "sriov_agent.ini" in joined_argv:
        sriov_readiness_check()
    elif "metadata_agent.ini" not in joined_argv:
        test_rpc_liveness()
    else:
        test_socket_liveness()

    sys.exit(0)  # return success
344 sys.exit(0) # return success