)]}'
{"specs/xena/healthcheck-cleanup.rst":[{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"392a2cef857f1ffe4ef38fcfbae0d10ba4f72b7d","unresolved":true,"context_lines":[{"line_number":10,"context_line":""},{"line_number":11,"context_line":"https://blueprints.launchpad.net/tripleo/+spec/clean-container-healthchecks"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"We don\u0027t rely on the container healthcheck results for anything in the"},{"line_number":14,"context_line":"infrastructure. They are time and resource consuming, and their maintenance is"},{"line_number":15,"context_line":"mostly random. We can at least remove the ones that aren\u0027t hitting an actual"},{"line_number":16,"context_line":"API healthcheck endpoint."}],"source_content_type":"text/x-rst","patch_set":2,"id":"269cfdee_f9d20997","line":13,"range":{"start_line":13,"start_character":17,"end_line":13,"end_character":42},"updated":"2021-04-28 07:54:38.000000000","message":"would be nice to explicitly establish *which* healthchecks we are referring to here, i.e. point to https://opendev.org/openstack/tripleo-common/src/branch/master/healthcheck","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"5b1e68bd1365fce912b8abe057bd508cd3def5ee","unresolved":false,"context_lines":[{"line_number":10,"context_line":""},{"line_number":11,"context_line":"https://blueprints.launchpad.net/tripleo/+spec/clean-container-healthchecks"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"We don\u0027t rely on the container healthcheck results for anything in the"},{"line_number":14,"context_line":"infrastructure. They are time and resource consuming, and their maintenance is"},{"line_number":15,"context_line":"mostly random. We can at least remove the ones that aren\u0027t hitting an actual"},{"line_number":16,"context_line":"API healthcheck endpoint."}],"source_content_type":"text/x-rst","patch_set":2,"id":"9fe07630_f04abc83","line":13,"range":{"start_line":13,"start_character":17,"end_line":13,"end_character":42},"in_reply_to":"269cfdee_f9d20997","updated":"2021-04-29 11:59:59.000000000","message":"Adding the link as a ref","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"392a2cef857f1ffe4ef38fcfbae0d10ba4f72b7d","unresolved":true,"context_lines":[{"line_number":48,"context_line":"--------"},{"line_number":49,"context_line":""},{"line_number":50,"context_line":"A first step would be to remove every configuration using just a lame check,"},{"line_number":51,"context_line":"such as the *healtheck_socket*, *healthcheck_port*, and *healthcheck_curl* that"},{"line_number":52,"context_line":"aren\u0027t calling an actual API healthcheck endpoint."},{"line_number":53,"context_line":""},{"line_number":54,"context_line":"This would already drastically reduce the amount of \"podman\" calls, leading"}],"source_content_type":"text/x-rst","patch_set":2,"id":"418a3bda_e09f46e5","line":51,"range":{"start_line":51,"start_character":13,"end_line":51,"end_character":73},"updated":"2021-04-28 07:54:38.000000000","message":"again would be nice to point to those in https://opendev.org/openstack/tripleo-common/src/commit/e944f1f789861a4e7cbd04fedbf3efe7917919ad/healthcheck/common.sh#L28-L83","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":20172,"name":"Michele Baldessari","email":"michele@acksyn.org","username":"michele"},"change_message_id":"653ac03e0bef59f20e08a33d58a91d5410a91a51","unresolved":true,"context_lines":[{"line_number":57,"context_line":"In case an Operator wants to get some status information, they can leverage"},{"line_number":58,"context_line":"an existing validation::"},{"line_number":59,"context_line":""},{"line_number":60,"context_line":"  openstack tripleo validator run --validation service-status"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"This validation can be launched from the Undercloud directly, and will gather"},{"line_number":63,"context_line":"remote status for every OC nodes, then provide a clear summary."}],"source_content_type":"text/x-rst","patch_set":2,"id":"c73d337e_05ba68f7","line":60,"range":{"start_line":60,"start_character":2,"end_line":60,"end_character":61},"updated":"2021-04-23 07:36:29.000000000","message":"https://bugs.launchpad.net/tripleo/+bug/1925754 fwiw it\u0027s broken for me","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"5b1e68bd1365fce912b8abe057bd508cd3def5ee","unresolved":false,"context_lines":[{"line_number":57,"context_line":"In case an Operator wants to get some status information, they can leverage"},{"line_number":58,"context_line":"an existing validation::"},{"line_number":59,"context_line":""},{"line_number":60,"context_line":"  openstack tripleo validator run --validation service-status"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"This validation can be launched from the Undercloud directly, and will gather"},{"line_number":63,"context_line":"remote status for every OC nodes, then provide a clear summary."}],"source_content_type":"text/x-rst","patch_set":2,"id":"b521934b_714bb406","line":60,"range":{"start_line":60,"start_character":2,"end_line":60,"end_character":61},"in_reply_to":"c73d337e_05ba68f7","updated":"2021-04-29 11:59:59.000000000","message":"VF is on it - thanks for the info ^^\u0027.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"392a2cef857f1ffe4ef38fcfbae0d10ba4f72b7d","unresolved":true,"context_lines":[{"line_number":62,"context_line":"This validation can be launched from the Undercloud directly, and will gather"},{"line_number":63,"context_line":"remote status for every OC nodes, then provide a clear summary."},{"line_number":64,"context_line":""},{"line_number":65,"context_line":"A second step would be to add the missing real healthchecks - but we might"},{"line_number":66,"context_line":"want to consider an actual healthcheck service on the host instead of doing"},{"line_number":67,"context_line":"per-container checks (see step 3)."},{"line_number":68,"context_line":""},{"line_number":69,"context_line":"A third step would be to get healthchecks running as a dedicated service"},{"line_number":70,"context_line":"instead of within the container itself - for instance, instead of configuring"},{"line_number":71,"context_line":"the healthcheck in the container creation, we might want to deploy an actual"},{"line_number":72,"context_line":"monitoring tool of some kind, with scheduled runs. It can then just output"}],"source_content_type":"text/x-rst","patch_set":2,"id":"be8f25eb_caff1731","line":69,"range":{"start_line":65,"start_character":6,"end_line":69,"end_character":9},"updated":"2021-04-28 07:54:38.000000000","message":"I think we should get some clarity on these and make it more explicit here what we are planning to do.\n\nRight now it reads like \"we are removing the healthchecks\"... and \"maybe we might do this or that but maybe not\" \nIt seems like we are removing them without really considering how they may be replaced.\n\nIf we don\u0027t want them to be replaced then fine, let\u0027s say that we are OK with the existing checks (\"is container running OK\"). \n\nRight now it is neither one nor the other kind of open ended ;)","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"5b1e68bd1365fce912b8abe057bd508cd3def5ee","unresolved":true,"context_lines":[{"line_number":62,"context_line":"This validation can be launched from the Undercloud directly, and will gather"},{"line_number":63,"context_line":"remote status for every OC nodes, then provide a clear summary."},{"line_number":64,"context_line":""},{"line_number":65,"context_line":"A second step would be to add the missing real healthchecks - but we might"},{"line_number":66,"context_line":"want to consider an actual healthcheck service on the host instead of doing"},{"line_number":67,"context_line":"per-container checks (see step 3)."},{"line_number":68,"context_line":""},{"line_number":69,"context_line":"A third step would be to get healthchecks running as a dedicated service"},{"line_number":70,"context_line":"instead of within the container itself - for instance, instead of configuring"},{"line_number":71,"context_line":"the healthcheck in the container creation, we might want to deploy an actual"},{"line_number":72,"context_line":"monitoring tool of some kind, with scheduled runs. It can then just output"}],"source_content_type":"text/x-rst","patch_set":2,"id":"449dd012_84be90d7","line":69,"range":{"start_line":65,"start_character":6,"end_line":69,"end_character":9},"in_reply_to":"be8f25eb_caff1731","updated":"2021-04-29 11:59:59.000000000","message":"replacement requires more work than \"just\" creating them, especially in the light of the recent issues we\u0027re facing with \"long-running services\" - *apparently* podman\u0027s healthchecks are breaking things on the long-term. It\u0027s still under investigations, but there are good hints the issue we\u0027re seeing (system becomes unresponsive after some days) is linked to broken/zombie processes eating all the memory, and those are probably generated by podman healthchecks....\n\nSo first step is a sanity thing - clean it; then consider options. Since we\u0027re not using them anyway, there\u0027s no real reason to maintain them.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":10969,"name":"Shnaidman Sagi (Sergey)","display_name":"Shnaidman Sagi","email":"sshnaidm@redhat.com","username":"sergsh"},"change_message_id":"46e4e6a17d440ad6ab5eb3e16691ad65762eb85d","unresolved":true,"context_lines":[{"line_number":69,"context_line":"A third step would be to get healthchecks running as a dedicated service"},{"line_number":70,"context_line":"instead of within the container itself - for instance, instead of configuring"},{"line_number":71,"context_line":"the healthcheck in the container creation, we might want to deploy an actual"},{"line_number":72,"context_line":"monitoring tool of some kind, with scheduled runs. It can then just output"},{"line_number":73,"context_line":"the status in a local file an Operator could grab and send to some proper"},{"line_number":74,"context_line":"monitoring infrastructure."},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"bb6210cc_b21c0650","line":72,"range":{"start_line":72,"start_character":0,"end_line":72,"end_character":50},"updated":"2021-04-28 09:50:49.000000000","message":"This seems like out of scope of TripleO?\nIf we are talking about external monitoring tool and actions the can be executed on specific events - that\u0027s for cloudops more, day-2 operations, etc.\nDuring the deployment we don\u0027t need to check service health, since its failure will fail the deployment and we\u0027ll know about that. Validations seem to be helping for that.\nSo actually we do NOT need any alternative, because it was actually useless and validations should give better answer for services availability during the deployment.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":9592,"name":"wes hayutin","email":"whayutin@redhat.com","username":"weshayutin"},"change_message_id":"c400013a2397e4e634f2ece8757ea9582c27a840","unresolved":true,"context_lines":[{"line_number":80,"context_line":"Alternatives"},{"line_number":81,"context_line":"------------"},{"line_number":82,"context_line":""},{"line_number":83,"context_line":"No real alternative: either we keep the existing thing, or we improve it with"},{"line_number":84,"context_line":"the help of all DFGs."},{"line_number":85,"context_line":""},{"line_number":86,"context_line":"Keeping the current situation isn\u0027t good, since we don\u0027t do real checks about"},{"line_number":87,"context_line":"the service status and ability to work."},{"line_number":88,"context_line":""},{"line_number":89,"context_line":"Security Impact"},{"line_number":90,"context_line":"---------------"},{"line_number":91,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"cc8c37e1_5cdb5943","line":88,"range":{"start_line":83,"start_character":0,"end_line":88,"end_character":0},"updated":"2021-04-26 19:39:28.000000000","message":"Several alternatives were discussed, not listing them is not very transparent.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"5b1e68bd1365fce912b8abe057bd508cd3def5ee","unresolved":true,"context_lines":[{"line_number":80,"context_line":"Alternatives"},{"line_number":81,"context_line":"------------"},{"line_number":82,"context_line":""},{"line_number":83,"context_line":"No real alternative: either we keep the existing thing, or we improve it with"},{"line_number":84,"context_line":"the help of all DFGs."},{"line_number":85,"context_line":""},{"line_number":86,"context_line":"Keeping the current situation isn\u0027t good, since we don\u0027t do real checks about"},{"line_number":87,"context_line":"the service status and ability to work."},{"line_number":88,"context_line":""},{"line_number":89,"context_line":"Security Impact"},{"line_number":90,"context_line":"---------------"},{"line_number":91,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"f2e8f34f_58375971","line":88,"range":{"start_line":83,"start_character":0,"end_line":88,"end_character":0},"in_reply_to":"24e28e7d_777a52b9","updated":"2021-04-29 11:59:59.000000000","message":"indeed - the described steps above can be moved here, it will make things clearer and smarter.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"392a2cef857f1ffe4ef38fcfbae0d10ba4f72b7d","unresolved":true,"context_lines":[{"line_number":80,"context_line":"Alternatives"},{"line_number":81,"context_line":"------------"},{"line_number":82,"context_line":""},{"line_number":83,"context_line":"No real alternative: either we keep the existing thing, or we improve it with"},{"line_number":84,"context_line":"the help of all DFGs."},{"line_number":85,"context_line":""},{"line_number":86,"context_line":"Keeping the current situation isn\u0027t good, since we don\u0027t do real checks about"},{"line_number":87,"context_line":"the service status and ability to work."},{"line_number":88,"context_line":""},{"line_number":89,"context_line":"Security Impact"},{"line_number":90,"context_line":"---------------"},{"line_number":91,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"24e28e7d_777a52b9","line":88,"range":{"start_line":83,"start_character":0,"end_line":88,"end_character":0},"in_reply_to":"cc8c37e1_5cdb5943","updated":"2021-04-28 07:54:38.000000000","message":"i guess some of these are touched on in the paragraph above so possibly just some re-arranging needed here (mentions \u0027adding real healthchecks\u0027 or using a \u0027dedicated service\u0027)","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"392a2cef857f1ffe4ef38fcfbae0d10ba4f72b7d","unresolved":true,"context_lines":[{"line_number":150,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":151,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":152,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":153,"context_line":"3. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":154,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"},{"line_number":155,"context_line":"4. Initiate a cross-Team work toward actual healthcheck endpoints for the"},{"line_number":156,"context_line":"   services in need"},{"line_number":157,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"b89a2aa2_4faeabc7","line":154,"range":{"start_line":153,"start_character":3,"end_line":154,"end_character":22},"updated":"2021-04-28 07:54:38.000000000","message":"so it seems this is the planned option i.e. we plan to implement a \u0027proper\u0027 healthcheck service instead.\n\nLet\u0027s commit to this a bit more clearly in the earlier paragraphs?","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"5b1e68bd1365fce912b8abe057bd508cd3def5ee","unresolved":true,"context_lines":[{"line_number":150,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":151,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":152,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":153,"context_line":"3. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":154,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"},{"line_number":155,"context_line":"4. Initiate a cross-Team work toward actual healthcheck endpoints for the"},{"line_number":156,"context_line":"   services in need"},{"line_number":157,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"b14f579c_ffa13c83","line":154,"range":{"start_line":153,"start_character":3,"end_line":154,"end_character":22},"in_reply_to":"b89a2aa2_4faeabc7","updated":"2021-04-29 11:59:59.000000000","message":"Guess this step will be dropped, as well as 4th or, at least, reworded.","commit_id":"9a6abcf2863ed58ca35fad661a8e9fbca38c40bb"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":20,"context_line":"Problem Description"},{"line_number":21,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":22,"context_line":""},{"line_number":23,"context_line":"Since we moved the services in container, first with the docker engine, then"},{"line_number":24,"context_line":"with podman, container healthchecks have been implemented and used."},{"line_number":25,"context_line":""},{"line_number":26,"context_line":"While the very idea of healthchecks isn\u0027t bad, the way we (TripleO) are"}],"source_content_type":"text/x-rst","patch_set":3,"id":"2ca7b4f8_27864e2f","line":23,"range":{"start_line":23,"start_character":28,"end_line":23,"end_character":30},"updated":"2021-05-06 07:38:15.000000000","message":"to","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":20,"context_line":"Problem Description"},{"line_number":21,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":22,"context_line":""},{"line_number":23,"context_line":"Since we moved the services in container, first with the docker engine, then"},{"line_number":24,"context_line":"with podman, container healthchecks have been implemented and used."},{"line_number":25,"context_line":""},{"line_number":26,"context_line":"While the very idea of healthchecks isn\u0027t bad, the way we (TripleO) are"}],"source_content_type":"text/x-rst","patch_set":3,"id":"e7f4e44c_c01f0ea7","line":23,"range":{"start_line":23,"start_character":28,"end_line":23,"end_character":30},"in_reply_to":"2ca7b4f8_27864e2f","updated":"2021-05-06 08:31:23.000000000","message":"Done","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":27,"context_line":"making and using them is mostly wrong:"},{"line_number":28,"context_line":""},{"line_number":29,"context_line":"* no action is taken upon healthcheck failure"},{"line_number":30,"context_line":"* some (most) aren\u0027t actually checking if the service is working or not"},{"line_number":31,"context_line":""},{"line_number":32,"context_line":"The healthchecks such as *`healthcheck_port`_*, *`healthcheck_listen`_*,"},{"line_number":33,"context_line":"*`healthcheck_socket`_* as well as most of the scripts calling"}],"source_content_type":"text/x-rst","patch_set":3,"id":"de51f0ae_a0cbafbe","line":30,"range":{"start_line":30,"start_character":69,"end_line":30,"end_character":71},"updated":"2021-05-06 07:38:15.000000000","message":"\"but merely that the service container is running\"","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":27,"context_line":"making and using them is mostly wrong:"},{"line_number":28,"context_line":""},{"line_number":29,"context_line":"* no action is taken upon healthcheck failure"},{"line_number":30,"context_line":"* some (most) aren\u0027t actually checking if the service is working or not"},{"line_number":31,"context_line":""},{"line_number":32,"context_line":"The healthchecks such as *`healthcheck_port`_*, *`healthcheck_listen`_*,"},{"line_number":33,"context_line":"*`healthcheck_socket`_* as well as most of the scripts calling"}],"source_content_type":"text/x-rst","patch_set":3,"id":"c6912410_f588a917","line":30,"range":{"start_line":30,"start_character":69,"end_line":30,"end_character":71},"in_reply_to":"de51f0ae_a0cbafbe","updated":"2021-05-06 08:31:23.000000000","message":"Done","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":49,"context_line":"--------"},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"A deep cleaning of the current healthcheck is needed, such as the"},{"line_number":52,"context_line":"*healtheck_socket*, *healthcheck_port*, and *healthcheck_curl* that aren\u0027t"},{"line_number":53,"context_line":"calling an actual API healthcheck endpoint. This list isn\u0027t exhaustive."},{"line_number":54,"context_line":""},{"line_number":55,"context_line":"This will drastically reduce the amount of \"podman\" calls, leading"}],"source_content_type":"text/x-rst","patch_set":3,"id":"ff84fade_9e5458a8","line":52,"range":{"start_line":52,"start_character":0,"end_line":52,"end_character":61},"updated":"2021-05-06 07:38:15.000000000","message":"can point to these https://opendev.org/openstack/tripleo-common/src/commit/f7cfed9e5c81cb320d6a5577332c9ad822fda29d/healthcheck/common.sh#L28-L95","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":49,"context_line":"--------"},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"A deep cleaning of the current healthcheck is needed, such as the"},{"line_number":52,"context_line":"*healtheck_socket*, *healthcheck_port*, and *healthcheck_curl* that aren\u0027t"},{"line_number":53,"context_line":"calling an actual API healthcheck endpoint. This list isn\u0027t exhaustive."},{"line_number":54,"context_line":""},{"line_number":55,"context_line":"This will drastically reduce the amount of \"podman\" calls, leading"}],"source_content_type":"text/x-rst","patch_set":3,"id":"d1a21048_9778f0bb","line":52,"range":{"start_line":52,"start_character":0,"end_line":52,"end_character":61},"in_reply_to":"ff84fade_9e5458a8","updated":"2021-05-06 08:31:23.000000000","message":"will just re-use the previous links then.","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":71,"context_line":"------------"},{"line_number":72,"context_line":""},{"line_number":73,"context_line":"There are multiple alternatives we can even implement as a step-by-step"},{"line_number":74,"context_line":"solution:"},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"Replace the listed healthchecks by actual service healthchecks"},{"line_number":77,"context_line":".............................................................."}],"source_content_type":"text/x-rst","patch_set":3,"id":"653c9ae6_f2a39042","line":74,"range":{"start_line":74,"start_character":8,"end_line":74,"end_character":9},"updated":"2021-05-06 07:38:15.000000000","message":"Any of these can be considered as future work and warrants its own discussion and spec submission.","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":71,"context_line":"------------"},{"line_number":72,"context_line":""},{"line_number":73,"context_line":"There are multiple alternatives we can even implement as a step-by-step"},{"line_number":74,"context_line":"solution:"},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"Replace the listed healthchecks by actual service healthchecks"},{"line_number":77,"context_line":".............................................................."}],"source_content_type":"text/x-rst","patch_set":3,"id":"d460d51d_4f903a3d","line":74,"range":{"start_line":74,"start_character":8,"end_line":74,"end_character":9},"in_reply_to":"653c9ae6_f2a39042","updated":"2021-05-06 08:31:23.000000000","message":"Added a word about this.","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":157,"context_line":""},{"line_number":158,"context_line":"1. Triage existing healthcheck, and if they aren\u0027t calling actual endpoint,"},{"line_number":159,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":160,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":161,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":162,"context_line":"3. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":163,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"}],"source_content_type":"text/x-rst","patch_set":3,"id":"db4a4747_b1784fab","line":160,"range":{"start_line":160,"start_character":0,"end_line":160,"end_character":1},"updated":"2021-05-06 07:38:15.000000000","message":"this one will be hard i see you already added a note in the testing but mainly i don\u0027t see how we will quantify that","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":157,"context_line":""},{"line_number":158,"context_line":"1. Triage existing healthcheck, and if they aren\u0027t calling actual endpoint,"},{"line_number":159,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":160,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":161,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":162,"context_line":"3. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":163,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"}],"source_content_type":"text/x-rst","patch_set":3,"id":"0605c4fa_a5e8a63c","line":160,"range":{"start_line":160,"start_character":0,"end_line":160,"end_character":1},"in_reply_to":"db4a4747_b1784fab","updated":"2021-05-06 08:31:23.000000000","message":"Added a note","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"855bfa670487306418f66207d4f341f6b45504f9","unresolved":true,"context_lines":[{"line_number":182,"context_line":""},{"line_number":183,"context_line":"Testing will require different things:"},{"line_number":184,"context_line":""},{"line_number":185,"context_line":"* Proper metrics in order to ensure there\u0027s no negative impact - and that any"},{"line_number":186,"context_line":"  impact is measurable"},{"line_number":187,"context_line":"* Proper insurance the removal of the healthcheck doesn\u0027t affect the services"},{"line_number":188,"context_line":"  in a negative way"},{"line_number":189,"context_line":"* Proper testing of the validations, especially \"service-status\" in order to"}],"source_content_type":"text/x-rst","patch_set":3,"id":"084b6f78_01e7cbcd","line":186,"range":{"start_line":185,"start_character":0,"end_line":186,"end_character":1},"updated":"2021-05-06 07:38:15.000000000","message":"time is the easy one (i.e. we expect it will be faster now)\nstability will be hard - number of bugs? but probably mainly empirical \"we didn\u0027t see any/many unexpected instability in CI jobs after the first x days/weeks\" or something","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":28223,"name":"Cedric Jeanneret","display_name":"cjeanner (Tengu)","email":"cjeanner@redhat.com","username":"cjeanner"},"change_message_id":"6def83329e03217afc66fb3f13a93d7185a27ebd","unresolved":false,"context_lines":[{"line_number":182,"context_line":""},{"line_number":183,"context_line":"Testing will require different things:"},{"line_number":184,"context_line":""},{"line_number":185,"context_line":"* Proper metrics in order to ensure there\u0027s no negative impact - and that any"},{"line_number":186,"context_line":"  impact is measurable"},{"line_number":187,"context_line":"* Proper insurance the removal of the healthcheck doesn\u0027t affect the services"},{"line_number":188,"context_line":"  in a negative way"},{"line_number":189,"context_line":"* Proper testing of the validations, especially \"service-status\" in order to"}],"source_content_type":"text/x-rst","patch_set":3,"id":"90e0986b_7e26728e","line":186,"range":{"start_line":185,"start_character":0,"end_line":186,"end_character":1},"in_reply_to":"084b6f78_01e7cbcd","updated":"2021-05-06 08:31:23.000000000","message":"This is probably something to discuss with CloudOps at some point.","commit_id":"5c8aba4c17098de1a4a46a16f8a2802eebd6e528"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"030d00eccb9fdae5bf00c3d5a16939fd8a004c5b","unresolved":true,"context_lines":[{"line_number":30,"context_line":"* some (most) aren\u0027t actually checking if the service is working, but merely"},{"line_number":31,"context_line":"  that the service container is running"},{"line_number":32,"context_line":""},{"line_number":33,"context_line":"The healthchecks such as `healthcheck_port`_, `healthcheck_listen`_,"},{"line_number":34,"context_line":"`healthcheck_socket`_ as well as most of the scripts calling"},{"line_number":35,"context_line":"`healthcheck_curl`_ are mostly NOT doing anything more than ensuring a"},{"line_number":36,"context_line":"service is running - and we already have this info when the container is"}],"source_content_type":"text/x-rst","patch_set":8,"id":"4c11431b_1e5bbc30","line":33,"updated":"2021-05-19 07:45:33.000000000","message":"thank you this looks and reads great with the links in the finished doc https://ff7549a1676b7e0abf4f-a8638dce9de2f2c46b7008346a40a9c5.ssl.cf5.rackcdn.com/787535/8/check/openstack-tox-docs/40caea3/docs/specs/xena/healthcheck-cleanup.html","commit_id":"3e953a9a8bbf64f4d36fcc27253175b2e5635bef"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"030d00eccb9fdae5bf00c3d5a16939fd8a004c5b","unresolved":true,"context_lines":[{"line_number":76,"context_line":"solution, though any of them would more than probably require their own"},{"line_number":77,"context_line":"specifications and discussions:"},{"line_number":78,"context_line":""},{"line_number":79,"context_line":"Replace the listed healthchecks by actual service healthchecks"},{"line_number":80,"context_line":".............................................................."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"Doing so would allow to get a better understanding of the stack health, but"},{"line_number":83,"context_line":"will not solve the issue with podman calls (hence resource eating and related"},{"line_number":84,"context_line":"things)."},{"line_number":85,"context_line":"Such healchecks can be launched from an external tool, for instance based"},{"line_number":86,"context_line":"on a host\u0027s cron, or an actual service."},{"line_number":87,"context_line":""},{"line_number":88,"context_line":"Call the healthchecks from an external tool"},{"line_number":89,"context_line":"..........................................."},{"line_number":90,"context_line":""},{"line_number":91,"context_line":"Doing so would prevent the potential resource issues with the \"podman exec\""}],"source_content_type":"text/x-rst","patch_set":8,"id":"e93b72fc_bdae431a","line":88,"range":{"start_line":79,"start_character":2,"end_line":88,"end_character":19},"updated":"2021-05-19 07:45:33.000000000","message":"it seems like these first two \u0027alternatives\u0027 are actually your proposed work in this spec\nUnder work items numbers 3 and 4. So it is a bit confusing that these are listed in alternatives.","commit_id":"3e953a9a8bbf64f4d36fcc27253175b2e5635bef"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"030d00eccb9fdae5bf00c3d5a16939fd8a004c5b","unresolved":true,"context_lines":[{"line_number":162,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":163,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":164,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":165,"context_line":"3. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":166,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"},{"line_number":167,"context_line":"4. Initiate a cross-Team work toward actual healthcheck endpoints for the"},{"line_number":168,"context_line":"   services in need"},{"line_number":169,"context_line":""},{"line_number":170,"context_line":"The second work item is more an empirical data on the long term - we currently"}],"source_content_type":"text/x-rst","patch_set":8,"id":"14b80b2e_233a7693","line":167,"range":{"start_line":165,"start_character":0,"end_line":167,"end_character":2},"updated":"2021-05-19 07:45:33.000000000","message":"so really, #1 and #2 are the priority items that are being proposed and will be implemented as soon as this spec lands or sooner whereas 3 and 4 are the followup work for probably next cycle\n\n[EDIT] OK I see you go into more details in dependencies about this","commit_id":"3e953a9a8bbf64f4d36fcc27253175b2e5635bef"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"2e80a96e8a1f3b1518b670a7c0f3bef69d291542","unresolved":true,"context_lines":[{"line_number":162,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":163,"context_line":"2. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":164,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":165,"context_line":"The second work item is more an empirical data on the long term - we currently"},{"line_number":166,"context_line":"don\u0027t have actual data, appart a `Launchpad issue`_ pointing to a problem"},{"line_number":167,"context_line":"maybe caused by the way healthchecks are launched."},{"line_number":168,"context_line":""}],"source_content_type":"text/x-rst","patch_set":10,"id":"be3e126f_973e7ed9","line":165,"range":{"start_line":165,"start_character":0,"end_line":165,"end_character":1},"updated":"2021-06-07 07:27:30.000000000","message":"https://zuul.opendev.org/t/openstack/build/020642b920e94ca48b5ca804fcbfdc32\n\n/home/zuul/src/opendev.org/openstack/tripleo-specs/doc/source/specs/xena/healthcheck-cleanup.rst:165:Enumerated list ends without a blank line; unexpected unindent.","commit_id":"e5c33c17d4d53486e36b0bbbccec22a0a56372b6"},{"author":{"_account_id":8449,"name":"Marios Andreou","email":"marios.andreou@gmail.com","username":"marios"},"change_message_id":"2e80a96e8a1f3b1518b670a7c0f3bef69d291542","unresolved":true,"context_lines":[{"line_number":166,"context_line":"don\u0027t have actual data, appart a `Launchpad issue`_ pointing to a problem"},{"line_number":167,"context_line":"maybe caused by the way healthchecks are launched."},{"line_number":168,"context_line":""},{"line_number":169,"context_line":"Possible future work items"},{"line_number":170,"context_line":".........................."},{"line_number":171,"context_line":"1. Initiate a discussion with CloudOps (metrics team) regarding an dedicated"},{"line_number":172,"context_line":"   healthcheck service, and how to integrate it properly within TripleO"}],"source_content_type":"text/x-rst","patch_set":10,"id":"bcbbdad9_a7a07334","line":169,"updated":"2021-06-07 07:27:30.000000000","message":"ack thanks","commit_id":"e5c33c17d4d53486e36b0bbbccec22a0a56372b6"},{"author":{"_account_id":9816,"name":"Takashi Kajinami","email":"kajinamit@oss.nttdata.com","username":"kajinamit"},"change_message_id":"23858386d8265f49627a2df60a0fe68f711dfea6","unresolved":true,"context_lines":[{"line_number":158,"context_line":"Work Items"},{"line_number":159,"context_line":"----------"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":"#. Triage existing healthcheck, and if they aren\u0027t calling actual endpoint,"},{"line_number":162,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":163,"context_line":"#. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":164,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":165,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"584d8999_5907ddf0","line":162,"range":{"start_line":161,"start_character":0,"end_line":162,"end_character":42},"updated":"2021-06-08 14:09:13.000000000","message":"We can discuss this later, but even if a healtcheck is calling endpoint, if the same endpoint is called by haproxy then I think we can remove it .","commit_id":"01b4c0a1af5329023f0fdaa4671c960fbda95168"},{"author":{"_account_id":14985,"name":"Alex Schultz","email":"aschultz@next-development.com","username":"mwhahaha"},"change_message_id":"38817d151ec6141e4b5c49ae983ed4176f78ba4e","unresolved":true,"context_lines":[{"line_number":158,"context_line":"Work Items"},{"line_number":159,"context_line":"----------"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":"#. Triage existing healthcheck, and if they aren\u0027t calling actual endpoint,"},{"line_number":162,"context_line":"   deactive them in tripleo-heat-templates"},{"line_number":163,"context_line":"#. Ensure the stack stability isn\u0027t degraded by this change, and properly"},{"line_number":164,"context_line":"   document the \"service-status\" validation with the Validation Framework Team"},{"line_number":165,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"63afc1b2_38646c33","line":162,"range":{"start_line":161,"start_character":0,"end_line":162,"end_character":42},"in_reply_to":"584d8999_5907ddf0","updated":"2021-06-08 14:12:33.000000000","message":"we don\u0027t really have good visibility into the backends for haproxy, so i think if we have proper health check available we should run it on the container to help highlight when a backend goes down","commit_id":"01b4c0a1af5329023f0fdaa4671c960fbda95168"}]}
