)]}' {"/COMMIT_MSG":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":7,"context_line":"Healthcheck middleware: add request pushback under cpu overloading."},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"When too many requests are directed to one proxy-server by NVSLB,"},{"line_number":10,"context_line":"this prox-server will become cpu overloaded and can\u0027t handle those"},{"line_number":11,"context_line":"requests in time, SREs have to touch its 503 file to prevent requests"},{"line_number":12,"context_line":"from keeping comming to this server."},{"line_number":13,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":1,"id":"baa2c9ac_490185a4","line":10,"range":{"start_line":10,"start_character":0,"end_line":10,"end_character":43},"updated":"2022-08-15 22:20:06.000000000","message":"When it\u0027s overloaded, is it the proxy-server processes in particular that tie up all the CPU, or are there other processes at fault, too?\n\nI guess I\u0027m a little worried about whether we could run into issues where a substantial portion of proxies start returning 503 due to *other* processes tying up CPU... which causes the remaining proxies to become more overloaded... seems like we could trigger a death-spiral of sorts.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"03a0c860_d06e422f","updated":"2022-08-15 22:20:06.000000000","message":"I *love* the idea of having this be more dynamic and driven by the actual state of the system -- I\u0027m a little worried about nodes running more than *just* proxy-server running into trouble, though. I think I might be more OK with it if we could isolate it to just actual CPU time used by proxy-servers? The concern about a death-spiral would probably be mitigated by making the exact threshold configurable.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c167c727921800adef137bc08a054a5841caf001","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"3c778c5e_1b2167ae","updated":"2022-08-16 14:34:24.000000000","message":"I like this this new feature doesn\u0027t add a new dependency. I think if we want to develop a custom /healthcheck middleware it might be easier to add a required dependency on something fancy like psutil\n\n\tclayg@banana:~/Workspace/vagrant-swift-all-in-one/swift$ git diff\n\tdiff --git a/swift/common/middleware/healthcheck.py b/swift/common/middleware/healthcheck.py\n\tindex f9f6b24ea..3b1838d62 100644\n\t--- a/swift/common/middleware/healthcheck.py\n\t+++ b/swift/common/middleware/healthcheck.py\n\t@@ -14,6 +14,8 @@\n\t # limitations under the License.\n\t \n\t import os\n\t+import logging\n\t+import psutil\n\t \n\t from swift.common.swob import Request, Response\n\t \n\t@@ -33,6 +35,9 @@ class HealthCheckMiddleware(object):\n\t\t self.app \u003d app\n\t\t self.disable_path \u003d conf.get(\u0027disable_path\u0027, \u0027\u0027)\n\t \n\t+ def num_connections(self):\n\t+ return sum(len(c.connections()) for c in psutil.Process(os.getpid()).parent().children())\n\t+\n\t def GET(self, req):\n\t\t \"\"\"Returns a 200 response with \"OK\" in the body.\"\"\"\n\t\t return Response(request\u003dreq, body\u003db\"OK\", content_type\u003d\"text/plain\")\n\t@@ -45,6 +50,7 @@ class HealthCheckMiddleware(object):\n\t def __call__(self, env, start_response):\n\t\t req \u003d Request(env)\n\t\t if req.path \u003d\u003d \u0027/healthcheck\u0027:\n\t+ logging.critical(\u0027num conn: %s\u0027, self.num_connections())\n\t\t handler \u003d self.GET\n\t\t if self.disable_path and os.path.exists(self.disable_path):\n\t\t\t handler \u003d self.DISABLED\n\n\n... or we could just troll /proc ourselves - it\u0027s probably not a lot of code.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"}],"swift/common/middleware/healthcheck.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"a8cf80ef2562cac1a5efd4c76c9d5bb877994731","unresolved":true,"context_lines":[{"line_number":39,"context_line":""},{"line_number":40,"context_line":" def _is_cpu_overloading(self):"},{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"}],"source_content_type":"text/x-python","patch_set":1,"id":"1a252086_d61c78f1","line":42,"range":{"start_line":42,"start_character":31,"end_line":42,"end_character":34},"updated":"2022-08-16 07:05:43.000000000","message":"I wonder if this might be an interesting time to take a look at the pythong psutil module. It allows us to probe the processes some more, including looking at the ones running for this process (and children), maybe even see things like zombie states etc.\n\nFurther it supports pulling out cpu_times and cpu_percent (not sure if that\u0027s global for just for this pid/group of pids). IO counters, \nload avg, disk io counters, even network usage. Maybe we block on number of connections, or if the failed rate goes too high, etc. https://psutil.readthedocs.io/en/latest/#\n\nJust thinking out loud. I like the idea of being more adaptive.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":39,"context_line":""},{"line_number":40,"context_line":" def _is_cpu_overloading(self):"},{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"}],"source_content_type":"text/x-python","patch_set":1,"id":"2617f402_92ebf21b","line":42,"range":{"start_line":42,"start_character":34,"end_line":42,"end_character":44},"updated":"2022-08-15 22:20:06.000000000","message":"This is going to include processes in uninterruptible sleep, yeah? Could skew the numbers, perhaps considerably -- especially if we\u0027re running object-servers as well.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":12050,"name":"Charles Hsu","email":"charles0126@gmail.com","username":"charz"},"change_message_id":"d9345f318d91384930188e14deff6d7c9fc259c2","unresolved":true,"context_lines":[{"line_number":39,"context_line":""},{"line_number":40,"context_line":" def _is_cpu_overloading(self):"},{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"}],"source_content_type":"text/x-python","patch_set":1,"id":"839dd1a9_25998afd","line":42,"range":{"start_line":42,"start_character":34,"end_line":42,"end_character":44},"in_reply_to":"2617f402_92ebf21b","updated":"2022-08-16 02:29:32.000000000","message":"Is there a way to monitor the node is running out of socket resources? If so, maybe that\u0027s another overload check we can do.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c167c727921800adef137bc08a054a5841caf001","unresolved":true,"context_lines":[{"line_number":39,"context_line":""},{"line_number":40,"context_line":" def _is_cpu_overloading(self):"},{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"}],"source_content_type":"text/x-python","patch_set":1,"id":"27cfa981_040e1997","line":42,"range":{"start_line":42,"start_character":34,"end_line":42,"end_character":44},"in_reply_to":"839dd1a9_25998afd","updated":"2022-08-16 14:34:24.000000000","message":"dang! That proc filesystem is pretty fast!\n\n\t(nvidia) clayg@banana:~$ python -m timeit -s \u0027import os\u0027 \u0027os.path.exists(\"README.md\")\u0027\n\t200000 loops, best of 5: 1.9 usec per loop\n\t(nvidia) clayg@banana:~$ python -m timeit -s \u0027import os\u0027 \u0027os.getloadavg()\u0027\n\t500000 loops, best of 5: 896 nsec per loop","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":40,"context_line":" def _is_cpu_overloading(self):"},{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"},{"line_number":46,"context_line":" else:"}],"source_content_type":"text/x-python","patch_set":1,"id":"48366c05_f55fd26f","line":43,"range":{"start_line":43,"start_character":21,"end_line":43,"end_character":27},"updated":"2022-08-15 22:20:06.000000000","message":"Is 15m going to allow us to respond quickly enough, or should we consider using load5? Agree that load1 would probably trip more often than would be useful.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"},{"line_number":46,"context_line":" else:"},{"line_number":47,"context_line":" return False"}],"source_content_type":"text/x-python","patch_set":1,"id":"79503f24_da45b0de","line":44,"range":{"start_line":44,"start_character":24,"end_line":44,"end_character":28},"updated":"2022-08-15 22:20:06.000000000","message":"Would make a good config option -- not sure whether it\u0027d be better as\n\n load_threshold \u003d 95\n\nor\n\n load_threshold \u003d .95\n\nthough. Could also rephrase this block as\n\n return cpu_usage \u003e\u003d ...","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c167c727921800adef137bc08a054a5841caf001","unresolved":true,"context_lines":[{"line_number":41,"context_line":" \"\"\"Indicates if cpu usage is overloading.\"\"\""},{"line_number":42,"context_line":" load1, load5, load15 \u003d os.getloadavg()"},{"line_number":43,"context_line":" cpu_usage \u003d (load15 / os.cpu_count()) * 100"},{"line_number":44,"context_line":" if cpu_usage \u003e\u003d 95.0:"},{"line_number":45,"context_line":" return True"},{"line_number":46,"context_line":" else:"},{"line_number":47,"context_line":" return False"}],"source_content_type":"text/x-python","patch_set":1,"id":"c3b7f7b7_13f72145","line":44,"range":{"start_line":44,"start_character":24,"end_line":44,"end_character":28},"in_reply_to":"79503f24_da45b0de","updated":"2022-08-16 14:34:24.000000000","message":"we could have three config options:\n\nload1_threshold\nload5_threshold\nload15_threshold","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":49,"context_line":" def CPU_PUSHED_BACK(self, req):"},{"line_number":50,"context_line":" \"\"\"Returns a 503 response when CPU is overloading.\"\"\""},{"line_number":51,"context_line":" return Response(request\u003dreq, status\u003d503,"},{"line_number":52,"context_line":" body\u003db\"PUSHED BACK BY CPU OVERLOADING\","},{"line_number":53,"context_line":" content_type\u003d\"text/plain\")"},{"line_number":54,"context_line":""},{"line_number":55,"context_line":" def DISABLED(self, req):"}],"source_content_type":"text/x-python","patch_set":1,"id":"bca36329_5c684f42","line":52,"updated":"2022-08-15 22:20:06.000000000","message":"+1, we don\u0027t want to piggy-back off DISABLED(); the separate message is great.","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"40535e51c2eebb2ca6f811df687670a0271c3f0e","unresolved":true,"context_lines":[{"line_number":62,"context_line":" if req.path \u003d\u003d \u0027/healthcheck\u0027:"},{"line_number":63,"context_line":" handler \u003d self.GET"},{"line_number":64,"context_line":" if self._is_cpu_overloading():"},{"line_number":65,"context_line":" handler \u003d self.CPU_PUSHED_BACK"},{"line_number":66,"context_line":" if self.disable_path and os.path.exists(self.disable_path):"},{"line_number":67,"context_line":" handler \u003d self.DISABLED"},{"line_number":68,"context_line":" return handler(req)(env, start_response)"}],"source_content_type":"text/x-python","patch_set":1,"id":"48e62491_9f174255","line":65,"updated":"2022-08-15 22:20:06.000000000","message":"nit: I might move this below the disabled check -- if both conditions trigger, I feel like the disabled-by-file message may be more useful. *shrug*","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"},{"author":{"_account_id":12050,"name":"Charles Hsu","email":"charles0126@gmail.com","username":"charz"},"change_message_id":"d9345f318d91384930188e14deff6d7c9fc259c2","unresolved":true,"context_lines":[{"line_number":62,"context_line":" if req.path \u003d\u003d \u0027/healthcheck\u0027:"},{"line_number":63,"context_line":" handler \u003d self.GET"},{"line_number":64,"context_line":" if self._is_cpu_overloading():"},{"line_number":65,"context_line":" handler \u003d self.CPU_PUSHED_BACK"},{"line_number":66,"context_line":" if self.disable_path and os.path.exists(self.disable_path):"},{"line_number":67,"context_line":" handler \u003d self.DISABLED"},{"line_number":68,"context_line":" return handler(req)(env, start_response)"}],"source_content_type":"text/x-python","patch_set":1,"id":"53b86f66_132eab74","line":65,"in_reply_to":"48e62491_9f174255","updated":"2022-08-16 02:29:32.000000000","message":"Do we want to add an option in the config file to enable it?","commit_id":"935ba385209c243c9e20c2f71fd779c56afd473d"}]}