)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":6926,"name":"Bogdan Dobrelya","email":"bdobreli@redhat.com","username":"bogdando"},"change_message_id":"aac72159ef08e5121149c8d14e78aa0e0def3b67","unresolved":true,"context_lines":[{"line_number":7,"context_line":"Bump haproxy check timeout"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Some services like nova may not respond within 10s for"},{"line_number":10,"context_line":"healthcheck in a resource constrained environment. Let\u0027s bump"},{"line_number":11,"context_line":"this to avoid haproxy marking these services unavailable."},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"\"When performing a healthcheck, the server has timeout"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":2,"id":"dba1d2ba_1439f092","line":10,"range":{"start_line":10,"start_character":57,"end_line":10,"end_character":61},"updated":"2022-02-24 14:43:54.000000000","message":"another solution could be prioritizing healthchecks with the haproxy 1.9+ new feature, like:\n\nhttp-request set-priority-class int(1) if req_health_check\nhttp-request set-priority-class int(10) if req_regular\n\nbut older versions (in Train we still have 1.8.4) would require another solution, like a dedicated backend for processing healthchecks, which looks suboptimal to me...","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":8833,"name":"Rabi Mishra","email":"ramishra@redhat.com","username":"rabi"},"change_message_id":"2d2bb2a8d2d9687276225a74a524713dd55ed3c7","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"4c09c56e_a2c69b1f","updated":"2022-02-17 14:58:19.000000000","message":"I think rdo jobs broken with  https://bugs.launchpad.net/bugs/1961056\n","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"1226305167e2c6622ac5a556e5aea5ac7f063110","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"be32b161_05531745","updated":"2022-02-17 14:36:38.000000000","message":"check-rdo","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"},{"author":{"_account_id":7144,"name":"James Slagle","email":"jslagle@redhat.com","username":"slagle"},"change_message_id":"1ad6e9007ce42545b046b3c3e85b2a3cf09eb5ec","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"2803fcde_76e2e9c5","updated":"2022-02-17 22:06:47.000000000","message":"isn\u0027t there a lot of history here? hasn\u0027t it been 10s for years? why would we now need 30s. I just want to make sure there isn\u0027t an underlying cause that needs to be addressed.","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"},{"author":{"_account_id":8833,"name":"Rabi Mishra","email":"ramishra@redhat.com","username":"rabi"},"change_message_id":"8791527f15666008987d01243864a8b3c02dff01","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"eb4e3449_7db22024","updated":"2022-03-01 04:32:08.000000000","message":"recheck","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"},{"author":{"_account_id":8833,"name":"Rabi Mishra","email":"ramishra@redhat.com","username":"rabi"},"change_message_id":"c143f843d306f1f7718f2b946096ebc728569aff","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"06fc9e25_894c2928","updated":"2022-02-25 05:34:38.000000000","message":"recheck https://review.opendev.org/c/openstack/puppet-tripleo/+/829888 merged","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"},{"author":{"_account_id":8833,"name":"Rabi Mishra","email":"ramishra@redhat.com","username":"rabi"},"change_message_id":"721935f973f380c4af09df5a9113928fd363d202","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"e486a4f1_cf4b75ac","in_reply_to":"2803fcde_76e2e9c5","updated":"2022-02-18 05:06:43.000000000","message":"The only history I know is where we increased the client/server timeout without increasing the check timeout https://github.com/openstack/puppet-tripleo/commit/1bee7bc8fa0ca5ace330e54bc3e64d7f6692d5a7.\n\nWe don\u0027t prioritize healthcheck requests over normal requests. Therefore, if there are more requests than the number of workers and those requests take more than 10 seconds for the backend to process, check requests would timeout and haproxy would mark the services unavailable.\n\nBelow[1] is the traceback from haproxy. GET /v2.1/servers/* requests take 14+s on an average (probably because that involves requests to nova, placement, ironic and what not, there is probably some performance regression here) and then all healthcheck requests fail and the service has been marked unhealthy.\n\nIf we bump timeout for requests to be processed within 2m or more by the backend, we also need to bump check timeout. We may get away by increasing number worker threads to more than the number of concurrent requests, but that\u0027s not a practical solution.\n\nAlso, there is a trade-off with setting an agressive check timeout, with load backend servers get slower, the healthchecks are delayed... until suddenly they all timeout together, HAProxy thinks ALL servers died at once and the entire service goes down. \n\n\n[1] Feb 18 04:23:38 undercloud haproxy[12]: 2620:dead:beef:4::2:57362 [18/Feb/2022:04:23:23.751] nova_osapi~ nova_osapi/undercloud.ctlplane.localdomain 0/0/0/14905/14905 200 4971 - - ---- 228/19/17/18/\n0 0/0 \"GET /v2.1/servers/fbf80eb5-32b3-4ebb-8ad3-f891e4271c4d HTTP/1.1\"\n...\nFeb 18 04:23:38 undercloud haproxy[12]: 2620:dead:beef:4::2:57372 [18/Feb/2022:04:23:24.374] nova_osapi~ nova_osapi/undercloud.ctlplane.localdomain 0/0/0/14324/14324 200 4976 - - ---- 228/19/16/17/0 0/0 \"GET /v2.1/servers/4e17e60f-aa63-4b94-842e-381fdcfe9a92 HTTP/1.1\"\n.....\nnova_osapi/undercloud.ctlplane.localdomain is DOWN, reason: Layer7 timeout, check duration: 10000ms. 0 active and 0 backup servers left. 18 sessions active, 0 requeued, 0 remaining in queue.\nFeb 18 04:23:39 undercloud haproxy[12]: proxy nova_osapi has no server available!","commit_id":"c9ba6a26219dbee9e8fd168f2b76bf7602c00f30"}]}
