Gerrit review discussion — openstack/nova-specs: backlog spec "nova-services-graceful-shutdown"
(Change-Id: Ic1d5f039c4f1d9cc1c474d8750bff8c5274fd14e)
{"/COMMIT_MSG":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":9,"context_line":"PoC:"},{"line_number":10,"context_line":"- Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":11,"context_line":"- Results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"Partial implement blueprint nova-services-graceful-shutdown"},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Change-Id: Ic1d5f039c4f1d9cc1c474d8750bff8c5274fd14e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"87d9b98b_b6241b78","line":12,"updated":"2025-11-21 18:52:49.000000000","message":"im not sure we shoudl include google docs links in the spec dir so i asume this is just to share the data priort to writign the spec and it will be removed.\n\nim also not sure we are allowed ot share google docs publicly like this form our work accounts so... ya it might be better to host that elsewhere.\n\nif this is on your personal accunt it proably fine but in either case we probaly shoudl remove this before it the final revsion fo the patch,","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":9,"context_line":"PoC:"},{"line_number":10,"context_line":"- Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":11,"context_line":"- Results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"Partial implement blueprint nova-services-graceful-shutdown"},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Change-Id: Ic1d5f039c4f1d9cc1c474d8750bff8c5274fd14e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"a8ba73a5_92ebbe71","line":12,"in_reply_to":"2d651c78_e015b014","updated":"2025-12-01 21:07:28.000000000","message":"ideally if this is required for the design of the spec to be understood then i should be kept in tree otherwise it should be in the refence section with the important information included in the spec directly.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":true,"context_lines":[{"line_number":9,"context_line":"PoC:"},{"line_number":10,"context_line":"- Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":11,"context_line":"- Results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"Partial implement blueprint nova-services-graceful-shutdown"},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Change-Id: Ic1d5f039c4f1d9cc1c474d8750bff8c5274fd14e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"2d651c78_e015b014","line":12,"in_reply_to":"87d9b98b_b6241b78","updated":"2025-11-29 
04:04:00.000000000","message":"This is from my personal account, and I am not sure why we cannot use Google Docs as long as it is public and has public information for upstream work. Because PoC results were lengthy so I added it to the doc, which is not meant to be removed. I can host it somewhere else, but I find Google Docs easy to use and share.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":9,"context_line":"PoC:"},{"line_number":10,"context_line":"- Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":11,"context_line":"- Results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"Partial implement blueprint nova-services-graceful-shutdown"},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Change-Id: Ic1d5f039c4f1d9cc1c474d8750bff8c5274fd14e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"ebcce795_0ee24dc4","line":12,"in_reply_to":"a8ba73a5_92ebbe71","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"f99a58ff_0ed8e35e","updated":"2025-11-19 17:23:10.000000000","message":"I think this is a good start. I have a bunch of things inline","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"abac7d10_717b516c","updated":"2025-11-29 04:04:00.000000000","message":"Thanks a lot gibi and Sean for review. please check the revised version.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"8ad9bcfd_3886a6dc","updated":"2025-11-20 16:38:50.000000000","message":"Thanks for the responses. Things are clearer now. I keep my -1 mostly for the novncproxy investigation.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"907dc0c3f7f9e3a4c5a5cd938b20368fa2bbf50b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"3932a21f_7dfc3a20","in_reply_to":"f99a58ff_0ed8e35e","updated":"2025-11-20 05:20:33.000000000","message":"Thanks a lot for the detailed review. 
I will try to spin up the new version soon (once I check nova-novncproxy), meanwhile i replied to the comment, please check if anything more to be added.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"e00ccc05_3be97ff1","updated":"2025-12-01 16:20:56.000000000","message":"I\u0027m -1 mostly due to my question in the upgrade impact, but has other clarifying questions as well.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"75c8e845_c7e01df4","updated":"2025-12-02 17:19:26.000000000","message":"Just a couple of comments, but this is pretty much exactly what Gmaan and I discussed and is how I envision us making this change incrementally. So I\u0027d be +2 if not for just a few clarifying comments.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d81f74af7e8e3cf248563dd5177e72ff64ba15ea","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":8,"id":"2c99a072_5d8a0e58","updated":"2025-12-02 20:19:19.000000000","message":"Looks good. I have one clarification about the periodic tasks but nothing serious.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"8db4ca4fe0550f0fbf06ee9b77cf424e88000ff2","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":8,"id":"237857cb_7fb83364","updated":"2025-12-02 18:58:55.000000000","message":"Thanks Dan for review","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"4c0c9a5fbe8696edc0bf0c61eca9a105eb19de31","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":8,"id":"b0567f87_83b5cd24","updated":"2025-12-02 18:50:38.000000000","message":"Thanks, this looks okay to me to kick off the first of the two efforts. 
Hopefully gibi\u0027s concern is addressed (or not actually a concern).","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":10,"id":"a30e8c5b_15c720af","updated":"2025-12-03 09:47:39.000000000","message":"This is still more narrowly scoped then i woudl like but\nthis is the backlog spec so in a cycle or two we can update it again to cover step 3 and 4  as we complete on step 1 so im ok to proceed with this for this cycle.","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"b4cf9a3cbe8cdedfa65994cbc6525ae284756878","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"89ccf5e4_cf817af3","in_reply_to":"65f57760_9b886f11","updated":"2025-12-03 15:26:36.000000000","message":"yeah, we always have scope of improvement.++","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"2186a4bd826f2fa0bd230cc02e426c9167adeb6a","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":10,"id":"65f57760_9b886f11","in_reply_to":"a30e8c5b_15c720af","updated":"2025-12-03 13:27:23.000000000","message":"I like the idea that we have a well defined step1 and maybe step2 and we execute those, learn form them, and based on that we update this spec with more steps if needed.","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"c00a931b77a8c6babfc405b47f21ba944a9a6a6a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":11,"id":"bd40eb92_61f6c7b8","updated":"2025-12-03 18:52:30.000000000","message":"A couple more nits that can be fixed some other time.\n\nI too am +2ing because this is a backlog spec and I assume any commitment to review would be on the actual specs that come from this.","commit_id":"8d374dd715ddb6e83ddd36fb83e471ccff2a0e9e"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"09391c568ccd3ed7c6de98bb2cd9e8ed7612c4e2","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":11,"id":"5de325f7_9b067699","updated":"2025-12-03 18:41:26.000000000","message":"as this is a backlog spec im not going to apply the rule fo committing to review when i +2\ni think this is in a place where we have a direction we can build on so im ok to proceed with this version and focus on the spec ro this cycle.\nwith that said fi i do have capacity i will follow this work as best as i can.","commit_id":"8d374dd715ddb6e83ddd36fb83e471ccff2a0e9e"}],"specs/backlog/approved/nova-services-graceful-shutdown.rst":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This is backlog spec proposing the design of graceful 
shutdown."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Nova services (mainly Nova Compute service ) do not shut down gracefully. When"},{"line_number":16,"context_line":"services are stopped, it also stops all the in-progress operations, which not"},{"line_number":17,"context_line":"only interrupt the in-progress operations, but can leave instances in an"},{"line_number":18,"context_line":"unwanted or unrecoverable state. The idea is to let services stop processing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"aabbc229_a46710ca","line":15,"range":{"start_line":15,"start_character":15,"end_line":15,"end_character":43},"updated":"2025-11-19 17:23:10.000000000","message":"While I agree that the most important piece is nova-compute but I think other services has the same problem as well. E.g.:\n* Does nova-scheduler waits for an in progress select_destination RPC to handler to finish when stopped or the request just dropped?\n* Does nova-conductor finish the ongoing setup of a live-migration before stops if the stop signal comes while the conductor is waiting for the RPC response of the call check_can_live_migrate_destination? [1]\n* Does nova-novncproxy waits for VNC sessions to stop before it stops itself?\n\n\n-- later --\nI saw below that you detailed out things for each service. That answers some of these questions. Like scheduler task tracking in step2. \n\n[1]https://docs.openstack.org/nova/latest/reference/live-migration.html","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":false,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This is backlog spec proposing the design of graceful shutdown."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Nova services (mainly Nova Compute service ) do not shut down gracefully. When"},{"line_number":16,"context_line":"services are stopped, it also stops all the in-progress operations, which not"},{"line_number":17,"context_line":"only interrupt the in-progress operations, but can leave instances in an"},{"line_number":18,"context_line":"unwanted or unrecoverable state. The idea is to let services stop processing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"542a328a_aede9b45","line":15,"range":{"start_line":15,"start_character":15,"end_line":15,"end_character":43},"in_reply_to":"35566069_0e80b6da","updated":"2025-11-20 16:38:50.000000000","message":"Acknowledged","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This is backlog spec proposing the design of graceful shutdown."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Nova services (mainly Nova Compute service ) do not shut down gracefully. When"},{"line_number":16,"context_line":"services are stopped, it also stops all the in-progress operations, which not"},{"line_number":17,"context_line":"only interrupt the in-progress operations, but can leave instances in an"},{"line_number":18,"context_line":"unwanted or unrecoverable state. 
The idea is to let services stop processing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"9d5ed250_b4b8b3e1","line":15,"range":{"start_line":15,"start_character":15,"end_line":15,"end_character":43},"in_reply_to":"542a328a_aede9b45","updated":"2025-11-21 18:52:49.000000000","message":"well i don\u0027t think any of the service really shutdown gracefully \n\nthe current phrasing implies its mostly a problem for the compute agent.\n\nthe fact that conductor and schduelr isntnace are state less help but they equally dont supprot graceful shutdaon of a specific isntace.\n\ni woudl prefer if we updated this to \n```suggestion\nNova services do not shut down gracefully. When\n```","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This is backlog spec proposing the design of graceful shutdown."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Nova services (mainly Nova Compute service ) do not shut down gracefully. When"},{"line_number":16,"context_line":"services are stopped, it also stops all the in-progress operations, which not"},{"line_number":17,"context_line":"only interrupt the in-progress operations, but can leave instances in an"},{"line_number":18,"context_line":"unwanted or unrecoverable state. The idea is to let services stop processing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"704a2c09_d446e8c4","line":15,"range":{"start_line":15,"start_character":15,"end_line":15,"end_character":43},"in_reply_to":"9d5ed250_b4b8b3e1","updated":"2025-11-29 04:04:00.000000000","message":"Done. After my further investigation and testing, I found that nova api/metadata services shut down gracefully, and it is handled by the WSGI-supported server.\n\nAnd console services nova-novncproxy/nova-serialproxy/nova-spicehtml5proxy are also shutdown gracefully and handled by the websockify. I am adding more details in the proposed section","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This is backlog spec proposing the design of graceful shutdown."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Nova services (mainly Nova Compute service ) do not shut down gracefully. When"},{"line_number":16,"context_line":"services are stopped, it also stops all the in-progress operations, which not"},{"line_number":17,"context_line":"only interrupt the in-progress operations, but can leave instances in an"},{"line_number":18,"context_line":"unwanted or unrecoverable state. The idea is to let services stop processing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"35566069_0e80b6da","line":15,"range":{"start_line":15,"start_character":15,"end_line":15,"end_character":43},"in_reply_to":"aabbc229_a46710ca","updated":"2025-11-20 05:19:03.000000000","message":"\u003e While I agree that the most important piece is nova-compute but I think other services has the same problem as well. 
E.g.:\n\nYes, scheduler and conductor services are added to the scope. There is no RPC design change needed there, but yes, task tracking will be done for both services so that service shutdown can wait for in-progress tasks.\n\n\u003e * Does nova-scheduler waits for an in progress select_destination RPC to handler to finish when stopped or the request just dropped?\n\nIf the \u0027select_destination\u0027 request is started by the scheduler worker, then it will finish the tasks and then only will stop but it will stop taking any new request if shutdown is initiated.\n\n\u003e * Does nova-conductor finish the ongoing setup of a live-migration before stops if the stop signal comes while the conductor is waiting for the RPC response of the call check_can_live_migrate_destination? [1]\n\u003e * Does nova-novncproxy waits for VNC sessions to stop before it stops itself?\n\nYes, conductor will wait for ongoing task. For live migration example, if it has initiated the check_can_live_migrate_destination() on the destination compute then it will for the response from reply queue and after that only it will signal to nova.service stop() to proceed the shutdown. \n\n\u003e Does nova-novncproxy waits for VNC sessions to stop before it stops itself?\n\nI did not check the nova-novncproxy, I will check and add it.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":26,"context_line":"for the in-progress operations to be completed. When shutdown is initiated,"},{"line_number":27,"context_line":"services wait for the RPC server to stop and wait so that they can consume all"},{"line_number":28,"context_line":"the existing request messages (RPC call/cast) from the queue, but the service"},{"line_number":29,"context_line":"does not complete the operation."},{"line_number":30,"context_line":""},{"line_number":31,"context_line":"Each Nova compute service has a single worker running and listening on a single"},{"line_number":32,"context_line":"RPC server (topic: compute.\u003chost\u003e). The same RPC server is used for the new"}],"source_content_type":"text/x-rst","patch_set":4,"id":"9a5b3457_c1067245","line":29,"updated":"2025-11-19 17:23:10.000000000","message":"Does it mean that the last RPC handlers is waited to finish or just that the RPC handler picked up the message from the queue but not yet processed it?","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":false,"context_lines":[{"line_number":26,"context_line":"for the in-progress operations to be completed. When shutdown is initiated,"},{"line_number":27,"context_line":"services wait for the RPC server to stop and wait so that they can consume all"},{"line_number":28,"context_line":"the existing request messages (RPC call/cast) from the queue, but the service"},{"line_number":29,"context_line":"does not complete the operation."},{"line_number":30,"context_line":""},{"line_number":31,"context_line":"Each Nova compute service has a single worker running and listening on a single"},{"line_number":32,"context_line":"RPC server (topic: compute.\u003chost\u003e). 
Ghanshyam Maan (2025-11-20):
The RPC handler will always submit the request to the manager, which is taken care of by the oslo.messaging RPC server stop() and wait(). It is then up to the service manager to finish the requests; the RPC handler will not be responsible for, or wait on, that.

The flow is:

- The oslo.messaging listener (on the RPC server) picks the message from the queue.

- If nova service shutdown is initiated, then nova.service.stop() calls *rpcserver.stop()*.

- rpcserver.stop() stops the listener [1] in two steps: 1. it stops consuming any new requests [2]; 2. it waits to hand all the messages already picked from the queue over to the ThreadPool executor (this executor starts a new thread to process the message and dispatch it to the RPCDispatcher [3]) [4][5][6]. This way, the oslo.messaging listener makes sure that all picked-up messages are submitted to the thread pool executor.

- At this point, the listener is stopped, but all requests are in progress on the RPC server side and have not yet reached the compute (or any other service) manager. That is handled by rpcserver.wait().

- After rpcserver.stop(), nova.service.stop() calls *rpcserver.wait()*. rpcserver.wait() makes sure that all the requests present in the thread pool executor are finished before it shuts down [6]. Here the RPCDispatcher sends all the requests on to the compute manager. At that point the RPC handler's work is done.

- If the manager is still processing the requests when the service shutdown comes, then it will interrupt the manager from finishing the task. If it is an RPC call request, it will send the error to the waiting caller via the reply queue.

[1] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/server.py#L447
[2] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/_drivers/amqpdriver.py#L456
[3] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/server.py#L386
[4] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/_drivers/amqpdriver.py#L463
[5] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/_drivers/amqpdriver.py#L446
[6] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/_drivers/base.py#L308
[7] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/server.py#L461
[8] https://github.com/openstack/oslo.messaging/blob/8961eedc7141f79aea2342cba927df584e003d1b/oslo_messaging/rpc/server.py#L155

gibi (2025-11-20):
Thanks for the detailed answer. It is really helpful.
Line 50 (patch set 4), on: "As an operator, I want to be able to keep instances and other resources in a usable state even if service is terminated."

gibi (2025-11-19):
~ if the service is gracefully terminated.

Or do we want to give the bigger guarantee that the instance state is sane even if the service is forcefully terminated? That is a pretty different game.

Ghanshyam Maan (2025-11-20):
Done.

Yeah, with a forceful shutdown it is difficult to make sure the resources are in good shape. The best we can do is log the state of all in-progress operations so that the operator/admin knows how the shutdown interrupted things.

sean mooney (2025-11-21):
This is probably covered later, but I'm interested in the way we will signal the graceful shutdown.

We could do it via a signal handler, a unix socket, a systemd dbus event, a file (inotify) trigger, or even a per-process HTTP endpoint.

I bring up the latter option only because graceful shutdown was one of the possible future work items I wanted to enable via the per-process health check endpoint I proposed a few releases ago:
https://specs.openstack.org/openstack/nova-specs/specs/2024.2/approved/per-process-healthchecks.html

Basically, I was expecting to add /metrics, /logs, and /config endpoints in addition to /health, and was toying with the idea of being able to shut down the server with a POST and then monitor the progress via the other endpoints, i.e. the number of RPCs or tasks in flight via /health or /metrics.

I'm raising this in the use-case section because I think we should make sure we can track whether we are making progress toward the shutdown completing, even if that only ends up being a log message or something like that in the initial version.

Ghanshyam Maan (2025-11-29):
Yes, logging is in the scope of the proposed solution. Step 2, added to the proposed section, is to track all ongoing tasks and log the details of how we are progressing towards a graceful shutdown. If a timeout occurs, we will be able to determine from the logs what actions were taken before the service was terminated and what is pending.

Line 53 (patch set 4), on: "As an operator, I want to be able to take the actual benefits of the k8s pod graceful shutdown when Nova services are running in k8s pods."

sean mooney (2025-11-21):
Do you have a reference for how this works? It looks like SIGTERM by default, but there is alpha support for custom signals:
https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination

Can you make it more explicit that SIGTERM is what you plan to support?

Ghanshyam Maan (2025-11-29):
Yeah, somewhere further down I noted that, but I will explicitly mention it here, just to be clear that the proposal is for SIGTERM only.

Line 56 (patch set 4), on: "As a user, I want in-progress operations to be completed before the shutdown is initiated."

gibi (2025-11-19):
~ before the service exits

Ghanshyam Maan (2025-11-20):
Done.
Line 64 (patch set 4), on: "* When service shutdown is initiated:"

sean mooney (2025-11-21):
```suggestion
* When service shutdown is initiated by SIGTERM:
```

Ghanshyam Maan (2025-11-29):
Done.

Line 71 (patch set 4), on: "* Proper logging of the state of in-progress operations"

gibi (2025-11-19):
+1
If we create some in-progress task tracking within nova, we should make it so that it can be useful for the healthcheck work later:
https://review.opendev.org/q/topic:%22per-process-healthchecks%22

Ghanshyam Maan (2025-11-20):
Yeah, the idea is to make it generic and not very shutdown-specific, so that we can use it for different things like the healthcheck, which is a good example.
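[Editor's note] Since the thread above asks for task tracking that is generic enough to feed both shutdown logging and a later healthcheck endpoint, a minimal sketch of what such a tracker could look like; TaskTracker and its method names are hypothetical, not an existing Nova or oslo API:

```python
import contextlib
import logging
import threading

LOG = logging.getLogger(__name__)


class TaskTracker(object):
    """Tracks in-progress operations so they can be reported on demand."""

    def __init__(self):
        self._lock = threading.Lock()
        self._tasks = {}  # task id -> human-readable description
        self._next_id = 0

    @contextlib.contextmanager
    def track(self, description):
        # Register the operation for the duration of the with-block.
        with self._lock:
            task_id = self._next_id
            self._next_id += 1
            self._tasks[task_id] = description
        try:
            yield
        finally:
            with self._lock:
                del self._tasks[task_id]

    def in_progress(self):
        with self._lock:
            return list(self._tasks.values())

    def log_in_progress(self):
        # Called when shutdown is initiated (or from a healthcheck/
        # periodic logger) to report what is still running.
        for desc in self.in_progress():
            LOG.info('In-progress operation: %s', desc)


# Usage: wrap an operation so shutdown (or a healthcheck) can report it.
tracker = TaskTracker()
with tracker.track('live-migration of instance <uuid>'):
    pass  # perform the operation
```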
Line 78 (patch set 4), on: "but if graceful shutdown times out (due to a configured timeout)"

gibi (2025-11-19):
Will nova support such a timeout, elevating the graceful shutdown request to a forceful one after the timeout expires? Or do we just expect that an external system like k8s will first send a graceful request and later, if the service has not stopped within a time period, send a forceful one?

-- later --

OK, I see below that nova will have its own timeout.

Ghanshyam Maan (2025-11-20):
Yeah, the idea is to have a configurable timeout per service.

sean mooney (2025-11-21):
When run under systemd you can also configure a timeout before it escalates to SIGKILL, and I believe the same is true in k8s.

So while I tend to agree that nova can have a timeout for this, I'm not sure it's actually required, since systemd and k8s will both escalate to a non-graceful shutdown independently, as will docker/podman, I believe.

The relevant systemd settings for this are:

https://www.freedesktop.org/software/systemd/man/latest/systemd.service.html#TimeoutStopSec=
https://www.freedesktop.org/software/systemd/man/latest/systemd.kill.html#KillSignal=
https://www.freedesktop.org/software/systemd/man/latest/systemd.kill.html#KillMode=
https://www.freedesktop.org/software/systemd/man/latest/systemd.kill.html#SendSIGKILL=
https://www.freedesktop.org/software/systemd/man/latest/systemd.kill.html#FinalKillSignal=

And for k8s it's:

https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination

```
Once the grace period has expired, the KILL signal is sent to any remaining processes, and the Pod is then deleted from the API Server.
```

Given that we cannot prevent these external process managers from killing us, and in most cases you are going to stop the compute agent not by using kill directly but by stopping it via systemd or a pod delete, do we really need the complexity of a timeout in nova?

It might be better to not add a config option at all; instead we can add periodic logging of the progress toward stopping, and a final log if we manage to shut down gracefully.

Ghanshyam Maan (2025-11-29) [unresolved]:
We need the timeout for nova services as well; otherwise the nova service will stop before the external system's time is up.

We have 'graceful_shutdown_timeout' in oslo.service, which applies per service. In Nova, we need to take that timeout into account; otherwise oslo.service will terminate the SIGTERM handler. The main point is to document clearly which timeouts need to be coordinated and configured carefully to actually shut down the service gracefully. Something like:

- configure the external system's graceful timeout >= the nova service's 'graceful_shutdown_timeout' (configured in the nova service conf under [oslo.service].'graceful_shutdown_timeout')

NOTE: During this discussion, I found a bug in oslo.service. In the threading backend, it seems 'graceful_shutdown_timeout' was overlooked. As cotyledon.ServiceManager() handles the SIGTERM handler, oslo.service should pass 'graceful_shutdown_timeout' as an argument to cotyledon.ServiceManager() [1]. By default it is 60 sec [2], and we can override the default to a longer time per nova service. That is covered under the timeout section.

The eventlet backend handles it, but we are not going to count that, as it is going away soon.

Even though I am adding more clarification about timeouts in the spec (section 'Graceful Shutdown Timeouts:'), I saw in the comments below that Sean is still concerned about the Nova timeout, so I am keeping this comment unresolved until Sean agrees on the proposed timeouts.

[1] https://github.com/openstack/oslo.service/blob/8969233a0a45dad06c445fdf4a66920bd5f3eef0/oslo_service/backend/_threading/service.py#L148C25-L148C51
[2] https://github.com/sileht/cotyledon/blob/0f80f64aae4b76a79c388c9ae565bc860524b30b/cotyledon/_service_manager.py#L123

gibi (2025-12-01):
external >= internal works for me. This signals the intention that nova will be terminated after the "internal" amount of time. If not, then that is a bug, but the external timeout will handle that bug and nova will be terminated externally.

If the customer wants to rely fully on the external system to kill nova, then they can set graceful_shutdown_timeout to 0, so nova will wait forever for either the graceful shutdown to finish all the tasks, or the external system to time out and kill nova.

sean mooney (2025-12-01):
https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.graceful_shutdown_timeout

```
Specify a timeout after which a gracefully shutdown server will exit. Zero value means endless wait.
```

We should really improve that help text, as I thought it was intended to be an alias for

https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.shutdown_timeout

```
Total time to wait in seconds for an instance to perform a clean shutdown.

It determines the overall period (in seconds) a VM is allowed to perform a clean shutdown. While performing stop, rescue and shelve, rebuild operations, configuring this option gives the VM a chance to perform a controlled shutdown before the instance is powered off. The default timeout is 60 seconds. A value of 0 (zero) means the guest will be powered off immediately with no opportunity for guest OS clean-up.

The timeout value can be overridden on a per image basis by means of os_shutdown_timeout that is an image metadata setting allowing different types of operating systems to specify how much time they need to shut down cleanly.

Possible values:

    A positive integer or 0 (default value is 60).
```

especially when they both default to the same value.

So graceful_shutdown_timeout should really be:

```
Specify a timeout nova will wait for the process to gracefully shut down before terminating. Zero value means endless wait.
```

I think "server" in this context refers to the RPC server or WSGI server, but we should probably see if we can override the description that comes from oslo, or fix it in oslo.

How would we feel about defaulting graceful_shutdown_timeout to 0?

The fact that this will be handled in oslo rather than nova removes some of my concerns about unneeded complexity.

Documenting that you should set the external timeout higher than the internal one helps, but making the internal timeout opt-in may also be warranted.

I'm less annoyed by having two timeouts, since the second one is not a nova-specific thing, so I'm just raising this as a point of discussion rather than advocating for changing the config option to 0 by default as a requirement to move forward.

But I do think it's required to meet the first use case without a task-resumability guarantee.

Ghanshyam Maan (2025-12-02):
Yes, graceful_shutdown_timeout=0 is the way to disable the nova timeout.

I agree on improving the help message, and that is something to do in oslo.service rather than just for nova. I will fix it along with the other bug about graceful_shutdown_timeout not being taken care of in threading mode.
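[Editor's note] To make the "external >= internal" coordination rule concrete, an illustrative pairing of the two knobs discussed above. The values are arbitrary; graceful_shutdown_timeout is shown under [DEFAULT] as in the nova configuration reference linked above, though the thread also mentions an [oslo.service] section, so the section may differ across oslo.service versions:

```ini
# nova.conf: nova's own graceful-shutdown deadline; 0 means wait forever
# and rely solely on the external supervisor to escalate.
[DEFAULT]
graceful_shutdown_timeout = 120
```

```ini
# nova-compute.service (systemd drop-in): deliver SIGTERM and allow more
# time than nova's internal timeout before escalating to SIGKILL.
[Service]
KillSignal=SIGTERM
TimeoutStopSec=150
```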
This will help to recover the system or instances."},{"line_number":81,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"f8a47e31_67d226cd","line":78,"range":{"start_line":78,"start_character":23,"end_line":78,"end_character":54},"in_reply_to":"b37c8deb_e35c40cb","updated":"2025-11-29 04:04:00.000000000","message":"We need the timeout for nova services also otherwise nova service will stop before external system time is up.\n\nWe have the \u0027graceful_shutdown_timeout\u0027 in oslo.service, which applies per service. In Nova, we need to consider that timeout otherwise oslo.service will terminate the SIGTERM handler. The main point is to document clearly what all timeout needs to be coordinated and configured carefully to actually shut down the service gracefully. Something like below:\n\n- configure external system graceful timeout \u003e\u003d \u0027graceful_shutdown_timeout\u0027 os nova service (configured in nova service conf under section [oslo.service].\u0027graceful_shutdown_timeout\u0027)\n\nNOTE: During this discussion, I found one bug in oslo.service. In the threading backend, it seems \u0027graceful_shutdown_timeout\u0027 was missed to consider. As cotyledon.ServiceManager() handles the SIGTERM handler, oslo.service should pass \u0027graceful_shutdown_timeout\u0027 in arg to cotyledon.ServiceManager(). By default it is 60 sec[2] and we can override the default to longer time per nova services. That is covered under the timeout section.\n\neventlet backend handles it, but anyway, we are not going to count that as that is going away soon.\n\nEven though I am adding more clarification about timeout in spec (section \u0027Graceful Shutdown Timeouts:\u0027) but I saw in the comments below that Sean is still concerned about the Nova timeout so I am keeping this comment unresolved until Sean agree on the proposed timeouts. \n\n[1] https://github.com/openstack/oslo.service/blob/8969233a0a45dad06c445fdf4a66920bd5f3eef0/oslo_service/backend/_threading/service.py#L148C25-L148C51\n\n[2] https://github.com/sileht/cotyledon/blob/0f80f64aae4b76a79c388c9ae565bc860524b30b/cotyledon/_service_manager.py#L123","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":75,"context_line":""},{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. This will help to recover the system or instances."},{"line_number":81,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"983b37f5_d03d62d4","line":78,"range":{"start_line":78,"start_character":23,"end_line":78,"end_character":54},"in_reply_to":"f8a47e31_67d226cd","updated":"2025-12-01 16:20:56.000000000","message":"external \u003e\u003d internal works for me. This signals the intention that nova will terminated after \"internal\" amount of time. 
If not, then that is a bug, but the external timeout will handle that bug and nova will be terminated externally.\n\nIf the customer wants to rely fully on the external system to kill nova then they can set graceful_shutdown_timeout to 0, so nova will wait forever until either the graceful shutdown finishes all the tasks, or the external system times out and kills nova.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. This will help to recover the system or instances."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* When service is started again:"},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"25c25b13_b8e3bb6c","line":80,"range":{"start_line":79,"start_character":13,"end_line":80,"end_character":15},"updated":"2025-11-19 17:23:10.000000000","message":"I assume this logging is somewhat the same as L71. If the external system implements the timeout then nova might not have a chance to log anything when the external system times out and kills the service.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. This will help to recover the system or instances."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* When service is started again:"},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"37f8d8dd_6142c970","line":80,"range":{"start_line":79,"start_character":13,"end_line":80,"end_character":15},"in_reply_to":"25c25b13_b8e3bb6c","updated":"2025-11-20 05:19:03.000000000","message":"Yes, it is the same as L71. In the case where nova did not get a chance to log, then we can continuously log things irrespective of shutdown. 
Initially, I was thinking of always tracking the in-progress tasks, but only logging when shutdown is initiated.\n\n- Nova tracks all in-progress tasks periodically.\n- Always log the state, even when shutdown is not initiated.\n- The external system abruptly shuts down nova.\n- From the periodic logging, we will be able to know what the last state of in-progress things on that service was.\n\nDoes that solve that case?","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. This will help to recover the system or instances."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* When service is started again:"},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"4ec78f8b_37a2155e","line":80,"range":{"start_line":79,"start_character":13,"end_line":80,"end_character":15},"in_reply_to":"37f8d8dd_6142c970","updated":"2025-11-20 16:38:50.000000000","message":"I think it is safe to assume that an external system like k8s that has its own graceful shutdown timeout will first send a graceful request, then wait, and then later kill the service if it is not stopped yet. So if nova starts logging when the graceful request is received, that is probably enough.\n\nIf we want, we can have a config knob to always log the ongoing tasks periodically, not just during shutdown. But I\u0027m OK not having that in scope for the first implementation.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. 
This will help to recover the system or instances."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* When service is started again:"},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"74d148fc_d796b124","line":80,"range":{"start_line":79,"start_character":13,"end_line":80,"end_character":15},"in_reply_to":"4ec78f8b_37a2155e","updated":"2025-11-21 18:52:49.000000000","message":"So, as I commented above, both systemd and k8s do have graceful shutdown capabilities.\n\nI don\u0027t think we need a config knob for the logging and should just do it at the INFO level for all shutdowns.\n\nBut I also don\u0027t think we need a config knob for a nova timeout for shutting down;\n\nwe can just delegate that to the service manager that is in control of the lifecycle of the agent.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":76,"context_line":"  * Proper logging of unfinished operations."},{"line_number":77,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":78,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":79,"context_line":"    timeout) then there should be a proper logging of all the unfinished"},{"line_number":80,"context_line":"    operations. This will help to recover the system or instances."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* When service is started again:"},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"2940161d_5536a978","line":80,"range":{"start_line":79,"start_character":13,"end_line":80,"end_character":15},"in_reply_to":"74d148fc_d796b124","updated":"2025-11-29 04:04:00.000000000","message":"I think we all are on the same page for logging.\n\n\nFor the timeout, I commented above. Basically, we automatically get/need to take care of the nova service timeout via oslo.service-\u003e\u0027graceful_shutdown_timeout\u0027","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d48a3fc7_e0a8707f","line":88,"updated":"2025-11-19 17:23:10.000000000","message":"or allowed to be processed by other instances of the same service type. E.g. 
if nova-conductor replica 1 is being shut down, that should not prevent an already queued request from being handled by nova-conductor replica 2.\n\nAs nova-compute always runs as a single worker per host, we expect that the message will wait in the queue for that single replica to be started up again.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"191ac2bb_92c81263","line":88,"in_reply_to":"0d59da8a_00fa6f10","updated":"2025-11-21 18:52:49.000000000","message":"So this, I think, needs more detail.\n\nYou probably go into this later, but today in init_host we have a bunch of logic to cancel ongoing live migrations and other things that may have been interrupted.\n\nIf we are supporting graceful shutdown we probably want a more graceful way to resume those, so it would be good to call out if you are changing any of the existing behavior that we do for the current startup cleanup.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1b471541dacf4769ced0751300c1dac45f65f392","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"50e7a94b_a9f2e9ed","line":88,"in_reply_to":"191ac2bb_92c81263","updated":"2025-11-27 14:40:08.000000000","message":"Good point. Indeed it is a complication. We need to define what actions nova-compute does there and see if we can be more graceful than the current behavior. (I also think that in phase1 we can table this and get back to it in phase2 or phase3 to keep the phase1 scope limited)\n\nI did a quick look at what the compute manager does during init_host and _init_instance. See below. 
Overall it seems we might have a couple of actions to investigate, but overall I don\u0027t see a big chance to be able to meaningfully improve, as a) we cannot be sure that the shutdown was graceful to start with, b) we might not have the data to know what and how to retry/continue, as the original RPC request was lost.\n\n---- details ----\n\n**_destroy_evacuated_instances** \u003d\u003e no need to change\n\n**_error_out_instances_whose_build_was_interrupted**. This ERRORs out instances where a placement allocation exists for this host, but the resource claim hasn\u0027t happened yet and therefore instance.host is not pointing to the compute. If we can assume that the compute was gracefully shut down then such a thing means the build request is still in the rabbit queue, and the compute just starting up can pick it up and do the build successfully. However we cannot guarantee that it is always the case as\n* a) not every shutdown will be graceful. \n* b) the rabbit message might be lost while the compute was down.\n\nDeciding at this point if there is a waiting build_and_run_instance RPC for this instance in the queue is not possible/practical. So we probably need to keep the blind ERROR-out logic here to ensure that instances are not stuck in the BUILD state forever. Even if it means nova might ERROR out instances that could otherwise have been built successfully.\n\n**_complete_partial_deletion** Finishing an interrupted delete will probably happen less frequently with graceful shutdown, but there is nothing to recover here and it is better to keep doing the cleanups to handle non-graceful shutdowns.\n\n**interrupted build or rebuild where the instance.host is already pointing to this host** We set the instance to ERROR. This should happen less frequently with graceful shutdown, but when it happens due to a non-graceful shutdown we have nothing to recover here today, as we don\u0027t know what to clean up before we can cleanly re-try the build. Also we might not have all the information from the original build_and_run_instance RPC to do a proper re-try.\n\n**interrupted image snapshot / image upload / resize_prep** We just remove the task state today. That is strange. Anyhow, we probably cannot easily re-try / restart here. But one can dig into how to resume the action if possible.\n\n\n**interrupted reboot, instance is not running** We already retry that operation today. So nothing extra is needed here.\n\n**interrupted reboot, instance is running** We clean the task; that is probably the right thing to do here, as we are rather OK to miss a reboot than do a double reboot.\n\n**interrupted unpause, poweroff, poweron** We already retry today so no extra change is needed.\n\n**interrupted resize/migration** We revert. I think that is the safest to do. But one can dig into how hard it would be to continue migrating.\n\n**interrupted live migration** We abort it today. Again that is an interesting case if we can continue it or not. But probably not easy.\n\n**_resume_guests_state** The auto start of the instances when nova is so configured. 
Nothing to be changed here.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"e04b0db2_0f7c343c","line":88,"in_reply_to":"37e360f7_5552aae2","updated":"2025-12-02 04:55:32.000000000","message":"\u003e To me, if we want to support graceful shutdown then ongoing operations must be completed or in a resumable state when the agent starts up.\n\nI disagree with your definition of graceful shutdown. A graceful shutdown covers only how the shutdown handles the ongoing tasks, data, and open connections (which is not done in case of an abrupt shutdown). Resuming the system from where it was stopped is not part of a graceful shutdown. It is part of \u0027State preservation\u0027, where the system should be resumable from where it was stopped. You are mixing two things here.\n\n\n\u003e \n\u003e If we cannot achieve that I would not consider it to be a graceful shutdown, as there is an impact to ongoing work. The only real exception to that is timeouts.\n\nOngoing work will be completed if the shutdown is graceful (if the Nova service can finish all in-progress tasks); if not, whether due to a timeout or another reason, then we say \"Nova tried, but this is not a graceful shutdown due to xyz reason, so there is an impact on ongoing work.\"\n\n\u003e \n\u003e I\u0027m OK with putting that out of the spec that will describe what will be done this cycle, but I don\u0027t agree with declaring it out of scope for the backlog spec.\n\u003e https://review.opendev.org/c/openstack/nova-specs/+/967585\n\u003e \n\u003e If we don\u0027t consider the \"complete or resumable\" status of ongoing operations to be in scope for this, I would consider nova to already have graceful shutdown support \n\u003e by virtue of having the stop event handler.\n\nHow does the stop event handler implement the Nova graceful shutdown? We already have the handler in oslo.service and the timeout \u0027graceful_shutdown_timeout\u0027 to wait on in that handler. Can we set \u0027graceful_shutdown_timeout\u0027 to a longer time and achieve the graceful shutdown?\n\nThe main design to achieve the graceful shutdown is adding a 2nd RPC server in the compute service so that we can separate the new and in-progress RPC traffic. Without that, I am not sure how you are considering that Nova already has graceful shutdown support. \n\nI think I explained this in the spec, but writing it again for quick reference. By definition, graceful shutdown has the below parts:\n\n1. Stop accepting the new requests: done by Step 1\n2. Finish in-progress requests: done by Step 2 (Step 1 does it based on a time-based wait)\n3. 
Cleanup and close the connections: we do it as part of cleanup_host etc.\n\nYou are saying we should add \u0027State preservation\u0027 also, which can achieve the \u0027resumable operations\u0027. I am not saying that is an invalid thing and that we should not do it, but my point is we cannot solve everything as part of a graceful shutdown. There are a lot of scenarios which can interrupt the operations, and we cannot handle everything as part of graceful shutdown.\n\nWe can make Nova a \u0027self-resumable\u0027 service by making all those long-running (currently stateless) operations resumable:\n- Recoverable/resumable live migration\n- Recoverable/resumable cold migration\n- Recoverable/resumable resize\n- Recoverable/resumable rebuild\n- Recoverable/resumable evacuate\n- Recoverable/resumable shelve \u0026 unshelve\n- Recoverable/resumable swap volume\n\nBut the question here is: can (should) we do all of these in the graceful shutdown effort? Or can we even make all these operations recoverable/resumable in a single effort? IMO, the answer is no.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"f43327a1_422bea96","line":88,"in_reply_to":"50e7a94b_a9f2e9ed","updated":"2025-11-29 04:04:00.000000000","message":"Thanks, Sean, for bringing up the init_host() example, and gibi for the detailed analysis of each operation during startup.\n\nI checked init_host and cleanup_host, which are called on service start and shutdown, but honestly speaking I considered that out of scope of graceful shutdown. I mean, starting the service and recovering/cleaning up its instances (init_host) should be the same irrespective of how the shutdown was done. If operations can be resumed, then they should be resumed after a non-graceful shutdown also.\n\nIf it is a graceful shutdown, then we should have fewer cases of the host\u0027s instances/operations stuck in between. That is the whole point of graceful shutdown: try to keep/finish the host\u0027s instances/operations before shutdown. For example, if graceful shutdown is completed then resize, migrations (or some more operations for which we decide to wait during graceful shutdown) should be finished. 
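[Editor's note: the three-part definition above maps roughly onto the documented oslo.messaging server lifecycle (start/stop/wait). The sketch below is illustrative only; rpc_server and cleanup_host are placeholders, and the actual proposal additionally splits traffic across a second RPC server so that step 1 only affects new requests.]

```python
def graceful_stop(rpc_server, cleanup_host):
    # Step 1: stop accepting new requests. oslo.messaging's
    # MessageHandlingServer.stop() stops dispatching new messages;
    # unconsumed messages stay in the broker queue for a restarted
    # (or, for conductor, another) worker.
    rpc_server.stop()
    # Step 2: finish in-progress requests. wait() blocks until the
    # handlers that were already dispatched have completed. The
    # surrounding oslo.service machinery (graceful_shutdown_timeout)
    # bounds how long the process is allowed to sit here.
    rpc_server.wait()
    # Step 3: clean up and close connections (DB, hypervisor, etc.),
    # conceptually what cleanup_host does today.
    cleanup_host()
```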
If it is not a graceful shutdown (the service terminated by something other than SIGTERM, or the SIGTERM graceful shutdown was interrupted due to a timeout or so), then init_host should be the same as it is currently.\n\nBasically, I agree on the possibilities of improvement in the recovery/resume of stuck/half-baked/interrupted operations during init_host(), but that should be irrespective of how the shutdown was done.\n\nFor example, if a live migration was interrupted and we want to make it a recoverable operation, then we can make it more stateful with more recovery checkpoints. But I think we should do that improvement irrespective of how the shutdown was done, so that we can resume the migration whenever the service is up. Something like a new feature, \"Recoverable/resumable live migration\". Another example is \"Recoverable/resumable swap volume\", where we need to checkpoint the disk copy progress and resume the same when the service is up. All this needs to be done on a per-operation basis (whichever we want/can) and irrespective of how the service was interrupted. Once we have more operations recoverable, then we can improve things in init_host or cleanup_host.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":false,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"bb06a82e_b185bc1d","line":88,"in_reply_to":"b180335b_299db516","updated":"2025-12-03 09:47:39.000000000","message":"Acknowledged","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"fb9a9d64_2a68464e","line":88,"in_reply_to":"d48a3fc7_e0a8707f","updated":"2025-11-20 05:19:03.000000000","message":"Yeah, I wrote it from the compute perspective; I will correct it and make it generic for all services.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean 
mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"ef587ba3_84228560","line":88,"in_reply_to":"e04b0db2_0f7c343c","updated":"2025-12-02 22:05:34.000000000","message":"not really we are talking about a distribute system with defined task states. \nif during a graceful shutdown we leave the system in an undefined un recoverable sate that was not a graceful shutdown.\n\nwe do not need to have a perfect solution here but backlog specs are meant to describe the future end state we will achieve.\n\ni am holding this use case as the core use case i am comparing your proposal too.\n\n\"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova\nservices so that it will not impact the users\u0027 in-progress operations.\"\n\nto me that requires that the ongoing operation must be in one of its terminal states\nthe would load must not be in error as a result of the shutdown\nand it must not be stuck in a transition state wehre a user cannt mange the instance when the compute service is restarted.\n\nim fine with aborting a live migration for example or any other instance action being aborted or reverted as part of shutdown. so im not asking to resume an operation form the point we left off but moving it to a well defiend \nterminal state.\n\nif we can resume it in an reasonable way the i think we should aim to do that eventualy but we shoudl not leave an instance stuck in the building or scheduling state for example because the conductor that was managing the build request was restarted. it shoudl be burred in cell 0 and put in error with any allction that may have been created for ti cleaned up.\n\nif that means in the future we need a conductor level perodic to look for an clen up those stuck instance then that what we shoudl plan to do in a feature release.\n\nim not asking you to fix all of these in one go but interrupting any of the operations should not leave the instance in error. 
We should be in a well-defined state that can be recovered from without using reset-state or db surgery.\n\nFor any operation, if we have not completed the action by the time the timeout expires, we need a way to report that we did not shut down gracefully, so that operators know they may have to manually resolve the issues that are not handled by init_host today.\n\nThis could take the form of a giant ERROR message on startup stating that the previous shutdown was not completed gracefully, and perhaps listing the impacted instances that need action.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"151540c600a2dff4ee4925a41d509c53178c3ee5","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"b180335b_299db516","line":88,"in_reply_to":"ef587ba3_84228560","updated":"2025-12-02 22:55:05.000000000","message":"Sure, that makes sense. Aborting the operations if they could not be finished and keeping the instance in a usable state makes sense.\n\nFor a timeout or any error during graceful shutdown, logging the state of progress will help operators know which operations were interrupted and might need manual recovery.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"172510aa_50ada4f3","line":88,"in_reply_to":"f43327a1_422bea96","updated":"2025-12-01 16:20:56.000000000","message":"\u003e If it is not a graceful shutdown (the service terminated by something other than SIGTERM, or the SIGTERM graceful shutdown was interrupted due to a timeout or so), then init_host should be the same as it is currently.\n\n\u003e Basically, I agree on the possibilities of improvement in the recovery/resume of stuck/half-baked/interrupted operations during init_host(), but that should be irrespective of how the shutdown was done.\n\n\n\nYeah, this makes it hard to really improve on the implementation of init_host directly. 
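[Editor's note: Sean's "giant ERROR on startup" idea could be prototyped with a sentinel-file check, sketched below; the path and helper names are hypothetical, not the proposed implementation.]

```python
import logging
import os

LOG = logging.getLogger(__name__)

# Hypothetical marker written as the very last step of a fully graceful
# shutdown; its absence at startup implies the previous stop was not clean.
CLEAN_SHUTDOWN_MARKER = "/var/lib/nova/clean_shutdown"


def note_clean_shutdown():
    # Called only after all in-flight work has drained and cleanup finished.
    open(CLEAN_SHUTDOWN_MARKER, "w").close()


def check_previous_shutdown(instances_in_transition):
    # Called at startup, conceptually alongside init_host.
    if os.path.exists(CLEAN_SHUTDOWN_MARKER):
        os.unlink(CLEAN_SHUTDOWN_MARKER)
        return
    LOG.error("Previous shutdown did not complete gracefully; the "
              "following instances may need manual recovery: %s",
              list(instances_in_transition))
```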
\n\nI agree that init_host will observe fewer VMs in transitional states and therefore will do less work when the shutdown was graceful. That is our real gain.\n\nI\u0027m OK to remove this from the scope or push it to phase N+1","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"37e360f7_5552aae2","line":88,"in_reply_to":"f43327a1_422bea96","updated":"2025-12-01 21:07:28.000000000","message":"To me, if we want to support graceful shutdown then ongoing operations must be completed or in a resumable state when the agent starts up.\n\nIf we cannot achieve that I would not consider it to be a graceful shutdown, as there is an impact to ongoing work. The only real exception to that is timeouts.\n\nI\u0027m OK with putting that out of the spec that will describe what will be done this cycle, but I don\u0027t agree with declaring it out of scope for the backlog spec.\nhttps://review.opendev.org/c/openstack/nova-specs/+/967585\n\nIf we don\u0027t consider the \"complete or resumable\" status of ongoing operations to be in scope for this, I would consider nova to already have graceful shutdown support \nby virtue of having the stop event handler.\n\nWithout this we cannot recommend restarting an agent for things like enabling debug mode, general config changes, or password/cert rotation if there are any in-flight operations, as it will not gracefully handle that.\n\nFor example, in the case of a live migration, we can\u0027t assume it will complete before systemd or oslo escalates to SIGKILL semantics.\n\nSo unless we can resume the live migration at the nova level when we start, we have not met my \"complete or resumable\" task state requirement for graceful shutdown.\n\nNote the live migration will continue to progress at the libvirt level while the agent is stopped, and it could actually start post-live-migration (because the VM is now on the dest) before we restart.\n\nNow I\u0027m not saying we need to fully solve that in this spec, but that should be something this spec seeks to enable.\n\n\nFor the config rotation update case, oslo mutable config does have a role to play as well, but most container-based installers can\u0027t leverage that today due to how they do config management, and they rely on container restart instead, 
meaning for those use cases we really do need graceful shutdown and resume.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":85,"context_line":"    already in the queue."},{"line_number":86,"context_line":"  * If the requests were not processed due to the shutdown being initiated,"},{"line_number":87,"context_line":"    then those requests should not be lost and should stay in oslo.messaging"},{"line_number":88,"context_line":"    queues."},{"line_number":89,"context_line":"  * Once service is up, they should be able to consume those requests and"},{"line_number":90,"context_line":"    process them."},{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"}],"source_content_type":"text/x-rst","patch_set":4,"id":"0d59da8a_00fa6f10","line":88,"in_reply_to":"fb9a9d64_2a68464e","updated":"2025-11-20 16:38:50.000000000","message":"OK","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"8ce1b374_e51013e7","line":94,"updated":"2025-11-19 17:23:10.000000000","message":"We need to mention that a request that is kept might become stale while the service is down. Like the VM being deleted while the compute was down, and therefore all queued-up VM lifecycle operation requests picked up later by the compute should be ignored.\n\nI\u0027m not sure that we want to handle any bugs in this area as part of this spec. I\u0027m OK to cover that separately. But we should mention the generic problem about stale messages.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. 
That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d2979e81_53f2fb07","line":94,"in_reply_to":"59e3c3c0_0eee4375","updated":"2025-11-21 18:52:49.000000000","message":"One of the existing bugs we have today that would be good to consider and maybe fix\nis as follows.\n\nIf you have a compute today that has a non-graceful shutdown, i.e. the PSU explodes\nand takes the compute offline, it\u0027s possible for RPCs to start/stop/reboot the instance to pile up in the queue.\n\n\nIf the admin evacuates the instance to a different host and then fixes the original compute,\nthen when the compute comes back up it will process the stop action and mark the desired state for the instance as stopped in the db.\n\nLater, when the power sync runs on the host that has the instance, it will see it should be \"stopped\" and stop the running VM, due to the old RPC that was processed by its old compute service.\n\nThe fix for this is simple: when the compute agent starts up from a graceful or non-graceful stop, it should confirm that the instance the request is for is still managed by this host, and discard it if it\u0027s not.\n\nAt least in the case of an instance action like stop or start.\n\nThis is an existing bug, but I think this spec is a good way to capture the intent to fix it.\n\nThe evacuate and stop interaction was reported as a downstream or upstream bug before, but I don\u0027t remember the launchpad for this off the top of my head","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"59e3c3c0_0eee4375","line":94,"in_reply_to":"711ba8e9_5810d3ff","updated":"2025-11-20 16:38:50.000000000","message":"Just note the fact that the message can be stale. That is enough for now","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. 
That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"711ba8e9_5810d3ff","line":94,"in_reply_to":"8ce1b374_e51013e7","updated":"2025-11-20 05:19:03.000000000","message":"Yeah, this can happen where a delete request is dropped due to a timeout, but other VM operations are picked up by the compute. I am not sure if this spec can handle that, but I will note down this scenario.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"f5b90ada_9ea2fd2e","line":94,"in_reply_to":"9f6d1f99_e137106b","updated":"2025-11-29 04:04:00.000000000","message":"Yeah, even I did not completely understand that example, so fixing it as part of this spec is not a good idea. Let\u0027s keep that as a separate fix/tracker.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1b471541dacf4769ced0751300c1dac45f65f392","unresolved":true,"context_lines":[{"line_number":91,"context_line":"  * If the service is in a stop state for a long time, there is a chance that"},{"line_number":92,"context_line":"    the RPC will timeout the call, or the message broker queue starts dropping"},{"line_number":93,"context_line":"    messages due to timeout. That all depends on how the timeout is configured"},{"line_number":94,"context_line":"    for RPC and the message queue."},{"line_number":95,"context_line":"  * Start processing the new requests in the normal way."},{"line_number":96,"context_line":""},{"line_number":97,"context_line":"As a graceful shutdown goal, we need to do two things:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"9f6d1f99_e137106b","line":94,"in_reply_to":"d2979e81_53f2fb07","updated":"2025-11-27 14:40:08.000000000","message":"@smooney@redhat.com Thanks for the specific example. I agree that it should be fixed. But maybe we need a bug report for it, as I keep forgetting what the exact case is when this can happen, and to decouple it from the spec in tracking. 
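[Editor's note: Sean's ownership-check fix for stale queued RPCs could look roughly like the sketch below; the names are hypothetical placeholders, not the actual compute manager wiring. Before acting on a queued instance action such as stop/start, verify the instance is still managed by this host and drop the message otherwise.]

```python
import logging

LOG = logging.getLogger(__name__)


def handle_instance_action(instance, action, this_host, do_action):
    # Guard against stale queued RPCs: if the instance was evacuated while
    # this compute was down, instance.host no longer points at us, so the
    # old request must be discarded instead of executed.
    if instance.host != this_host:
        LOG.warning("Discarding stale %s request for instance %s: it is "
                    "now managed by host %s, not %s",
                    action, instance.uuid, instance.host, this_host)
        return
    do_action(instance)
```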
I think the bug can be fixed independently from the graceful shutdown effort.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":101,"context_line":""},{"line_number":102,"context_line":"#. Give services enough time to finish the operations. As a first step,"},{"line_number":103,"context_line":"   this is proposed to be done via time-based wait and later with a proper"},{"line_number":104,"context_line":"   tracking mechanism."},{"line_number":105,"context_line":""},{"line_number":106,"context_line":"This spec proposes achieving the above goals in two steps, and they can be"},{"line_number":107,"context_line":"implemented independently."}],"source_content_type":"text/x-rst","patch_set":4,"id":"7a74edda_c7def767","line":104,"updated":"2025-11-19 17:23:10.000000000","message":"There is always a time-based wait when an external system like k8s handles the termination. https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-termination and nova cannot really influence when k8s will send the SIGKILL, even if nova knows that there are still in-progress tasks. \n\nAnyhow, what we can do is document for the operator to align the timeouts between nova and k8s.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":101,"context_line":""},{"line_number":102,"context_line":"#. Give services enough time to finish the operations. As a first step,"},{"line_number":103,"context_line":"   this is proposed to be done via time-based wait and later with a proper"},{"line_number":104,"context_line":"   tracking mechanism."},{"line_number":105,"context_line":""},{"line_number":106,"context_line":"This spec proposes achieving the above goals in two steps, and they can be"},{"line_number":107,"context_line":"implemented independently."}],"source_content_type":"text/x-rst","patch_set":4,"id":"eff85e4d_beef682c","line":104,"in_reply_to":"384d77ec_2db3896f","updated":"2025-11-20 16:38:50.000000000","message":"Cool","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":101,"context_line":""},{"line_number":102,"context_line":"#. Give services enough time to finish the operations. 
As a first step,"},{"line_number":103,"context_line":"   this is proposed to be done via time-based wait and later with a proper"},{"line_number":104,"context_line":"   tracking mechanism."},{"line_number":105,"context_line":""},{"line_number":106,"context_line":"This spec proposes achieving the above goals in two steps, and they can be"},{"line_number":107,"context_line":"implemented independently."}],"source_content_type":"text/x-rst","patch_set":4,"id":"c6969e61_b5c564aa","line":104,"in_reply_to":"51065d6a_f027b413","updated":"2025-11-29 04:04:00.000000000","message":"Sure, I replied about the timeout above and kept that comment unresolved for further discussion, but closing this one as we are good here.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":101,"context_line":""},{"line_number":102,"context_line":"#. Give services enough time to finish the operations. As a first step,"},{"line_number":103,"context_line":"   this is proposed to be done via time-based wait and later with a proper"},{"line_number":104,"context_line":"   tracking mechanism."},{"line_number":105,"context_line":""},{"line_number":106,"context_line":"This spec proposes achieving the above goals in two steps, and they can be"},{"line_number":107,"context_line":"implemented independently."}],"source_content_type":"text/x-rst","patch_set":4,"id":"384d77ec_2db3896f","line":104,"in_reply_to":"7a74edda_c7def767","updated":"2025-11-20 05:19:03.000000000","message":"Yeah, those timeouts need to be properly synchronized. As part of this spec, I am planning to write a document for graceful shutdown, and the external system shutdown timeout is good to capture in \u0027things to take care of for graceful shutdown\u0027","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f289f71cc06d5d87bdf1ee311680ca47ae54037","unresolved":true,"context_lines":[{"line_number":101,"context_line":""},{"line_number":102,"context_line":"#. Give services enough time to finish the operations. As a first step,"},{"line_number":103,"context_line":"   this is proposed to be done via time-based wait and later with a proper"},{"line_number":104,"context_line":"   tracking mechanism."},{"line_number":105,"context_line":""},{"line_number":106,"context_line":"This spec proposes achieving the above goals in two steps, and they can be"},{"line_number":107,"context_line":"implemented independently."}],"source_content_type":"text/x-rst","patch_set":4,"id":"51065d6a_f027b413","line":104,"in_reply_to":"eff85e4d_beef682c","updated":"2025-11-21 18:52:49.000000000","message":"I got as far as here today. 
I\u0027ll continue reviewing this on Monday.\n\nNot to keep repeating myself, but I think we can rely solely on the external timeout rather than adding one ourselves in nova.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":146,"context_line":"* Nova API: No RPC change needed."},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"  No work needed, as this service receives requests over http service, and it"},{"line_number":149,"context_line":"  talks to other service on their RPC server."},{"line_number":150,"context_line":""},{"line_number":151,"context_line":"* Nova scheduler: No RPC change needed."},{"line_number":152,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"4c494aec_cdc7bc7f","line":149,"updated":"2025-11-19 17:23:10.000000000","message":"Yep, nova-api and nova-metadata-api do not implement an RPC server. However, these services implement an http server. Do we know that when the nova-api service is stopped the ongoing http request is still gracefully handled? We have a couple of nova-api calls that synchronously send RPC calls and wait for the response.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":146,"context_line":"* Nova API: No RPC change needed."},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"  No work needed, as this service receives requests over http service, and it"},{"line_number":149,"context_line":"  talks to other service on their RPC server."},{"line_number":150,"context_line":""},{"line_number":151,"context_line":"* Nova scheduler: No RPC change needed."},{"line_number":152,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"a805b0fb_54fa281a","line":149,"in_reply_to":"29e8e2e2_d4c9093f","updated":"2025-11-29 04:04:00.000000000","message":"I was wrong here. I tested nova-api wrongly by stopping it before the request came to the nova API.\n\nThe Nova API service shuts down gracefully, and it is handled by the WSGI server it is running on. I tested it with a devstack env (uWSGI/mod_proxy_uwsgi) and it does not interrupt the in-progress tasks. I will add the details in the spec.\n\n- To observe the wait during the shutdown request, I added a 10 sec sleep in the server list API controller code.\n- Ran \u0027openstack server list\u0027.\n- Waited till the server list request reached the controller (you can see the log from the controller, so it means nova started processing the request).\n- Stopped the nova-api service.\n- The service waits until the server list request is finished and the response is posted to the user.\n  - uWSGI makes sure that each worker running nova API requests is gracefully shut down. 
Only once all workers finish their tasks will it shut down the service.\n  \n  \nSample log of the above finding.\n\n\n*Nov 28 01:34:25 ubyntu-noble2 devstack@n-api.service[280779]: DEBUG nova.api.openstack.wsgi [None req-26b58348-bb73-4687-b341-9154df88cc69 None None] Calling method \u0027\u003cbound method VersionsV2.index of \u003cnova.api.openstack.compute.versions.VersionsV2 object at 0x7648ef3df290\u003e\u003e\u0027 {{(pid\u003d280779) _process_stack /opt/stack/nova/nova/api/openstack/wsgi.py:552}}\nNov 28 01:34:25 ubyntu-noble2 devstack@n-api.service[280779]: INFO nova.api.openstack.requestlog [None req-26b58348-bb73-4687-b341-9154df88cc69 None None] 10.0.2.15 \"GET /compute/v2.1\" status: 200 len: 387 microversion: 2.1 time: 0.006827\nNov 28 01:34:25 ubyntu-noble2 devstack@n-api.service[280779]: [pid: 280779|app: 0|req: 1/1] 10.0.2.15 () {58 vars in 980 bytes} [Fri Nov 28 01:34:25 2025] GET /compute/v2.1 \u003d\u003e generated 387 bytes in 49 msecs (HTTP/1.1 200) 9 headers in 357 bytes (1 switches on core 0)\nNov 28 01:34:26 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG *\n\n**here nova-api received the server list request**\n\n*nova.api.openstack.wsgi [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Calling method \u0027\u003cbound method ServersController.detail of \u003cnova.api.openstack.compute.servers.ServersController object at 0x7648eeaa3c20\u003e\u003e\u0027 {{(pid\u003d280780) _process_stack /opt/stack/nova/nova/api/openstack/wsgi.py:552}}*\n\n**I stopped the service, but uWSGI handles the graceful shutdown. It waits for each worker to finish the ongoing tasks. The server list request is on \u0027worker 2\u0027**\n\n*Nov 28 01:34:33 ubyntu-noble2 devstack@n-api.service[280778]: Fri Nov 28 01:34:33 2025 - graceful shutdown triggered...\nNov 28 01:34:33 ubyntu-noble2 systemd[1]: Stopping devstack@n-api.service - Devstack devstack@n-api.service...\nNov 28 01:34:33 ubyntu-noble2 devstack@n-api.service[280779]: Gracefully killing worker 1 (pid: 280779)...\nNov 28 01:34:33 ubyntu-noble2 devstack@n-api.service[280780]: Gracefully killing worker 2 (pid: 280780)...*\n\n**worker 1 is gone because there is no request on worker 1. 
worker 2 is still running**\n\n*Nov 28 01:34:35 ubyntu-noble2 devstack@n-api.service[280778]: worker 1 buried after 2 seconds\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG nova.compute.api [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Searching by: {\u0027deleted\u0027: False, \u0027project_id\u0027: \u00277b52dd1f59134699b47fc4bef4bcaabd\u0027} {{(pid\u003d280780) get_all /opt/stack/nova/nova/compute/api.py:3059}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Acquiring lock \"00000000-0000-0000-0000-000000000000\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:405}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"00000000-0000-0000-0000-000000000000\" acquired by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: waited 0.002s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:410}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"00000000-0000-0000-0000-000000000000\" \"released\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: held 0.001s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:424}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Acquiring lock \"d60727e2-25fd-47af-8602-121fd337d69c\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:405}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"d60727e2-25fd-47af-8602-121fd337d69c\" acquired by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: waited 0.001s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:410}}\nNov 28 01:34:46 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"d60727e2-25fd-47af-8602-121fd337d69c\" \"released\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: held 0.002s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:424}}\nNov 28 01:34:51 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG nova.compute.multi_cell_list [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Listed batch of 1 results from cell out of 1000 limit. Returned 1 total so far. 
{{(pid\u003d280780) do_query /opt/stack/nova/nova/compute/multi_cell_list.py:378}}\nNov 28 01:34:55 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Acquiring lock \"d60727e2-25fd-47af-8602-121fd337d69c\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:405}}\nNov 28 01:34:55 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"d60727e2-25fd-47af-8602-121fd337d69c\" acquired by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: waited 0.015s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:410}}\nNov 28 01:34:55 ubyntu-noble2 devstack@n-api.service[280780]: DEBUG oslo_concurrency.lockutils [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] Lock \"d60727e2-25fd-47af-8602-121fd337d69c\" \"released\" by \"nova.context.set_target_cell.\u003clocals\u003e.get_or_set_cached_cell_and_set_connections\" :: held 0.014s {{(pid\u003d280780) inner /opt/stack/data/venv/lib/python3.12/site-packages/oslo_concurrency/lockutils.py:424}}\nNov 28 01:34:55 ubyntu-noble2 devstack@n-api.service[280780]: WARNING neutronclient.v2_0.client [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] The python binding code in neutronclient is deprecated in favor of OpenstackSDK, please use that as this will be removed in a future release.\nNov 28 01:34:58 ubyntu-noble2 devstack@n-api.service[280780]: INFO nova.api.openstack.requestlog [None req-ceeb2298-66b9-4234-bd99-c0ce3006b377 admin admin] 10.0.2.15 \"GET /compute/v2.1/servers/detail?deleted\u003dFalse\" status: 200 len: 2370 microversion: 2.100 time: 32.394979\nNov 28 01:34:58 ubyntu-noble2 devstack@n-api.service[280780]: [pid: 280780|app: 0|req: 1/2] 10.0.2.15 () {64 vars in 1341 bytes} [Fri Nov 28 01:34:25 2025] GET /compute/v2.1/servers/detail?deleted\u003dFalse \u003d\u003e generated 2370 bytes in 32397 msecs (HTTP/1.1 200) 9 headers in 362 bytes (1 switches on core 0)*\n\n**Server list is finished and response is sent. 
Now worker 2 is also gone and the service is stopped**\n\n*Nov 28 01:35:04 ubyntu-noble2 devstack@n-api.service[280778]: worker 2 buried after 31 seconds\nNov 28 01:35:04 ubyntu-noble2 devstack@n-api.service[280778]: goodbye to uWSGI.\nNov 28 01:35:04 ubyntu-noble2 systemd[1]: devstack@n-api.service: Deactivated successfully.\nNov 28 01:35:04 ubyntu-noble2 systemd[1]: Stopped devstack@n-api.service - Devstack devstack@n-api.service.\nNov 28 01:35:04 ubyntu-noble2 systemd[1]: devstack@n-api.service: Consumed 48.480s CPU time.*","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
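The uWSGI drain behaviour shown in the log above can be checked end to end with a small script. This is a hedged reproduction sketch, not part of the spec: the URL and unit name are taken from the log, while the token handling is an assumption (a real run would fetch a token from keystone first).

```python
# Reproduction sketch for the graceful shutdown test above. Assumptions:
# a devstack host, the devstack@n-api systemd unit, and a valid keystone
# token in TOKEN; none of this is part of the proposed change.
import subprocess
import threading

import requests

TOKEN = 'replace-with-a-real-token'  # assumption: obtained from keystone
URL = 'http://10.0.2.15/compute/v2.1/servers/detail?deleted=False'
result = {}

def server_list():
    # long-running request that lands on one uWSGI worker
    resp = requests.get(URL, headers={'X-Auth-Token': TOKEN})
    result['status'] = resp.status_code

t = threading.Thread(target=server_list)
t.start()
# stop the API while the request is in flight; per the log, uWSGI keeps the
# busy worker alive until the response is sent and buries the idle one first
subprocess.run(['systemctl', 'stop', 'devstack@n-api'], check=True)
t.join()
assert result.get('status') == 200
```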
{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":146,"context_line":"* Nova API: No RPC change needed."},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"  No work needed, as this service receives requests over http service, and it"},{"line_number":149,"context_line":"  talks to other service on their RPC server."},{"line_number":150,"context_line":""},{"line_number":151,"context_line":"* Nova scheduler: No RPC change needed."},{"line_number":152,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"5098f805_29b22593","line":149,"in_reply_to":"4c494aec_cdc7bc7f","updated":"2025-11-20 05:19:03.000000000","message":"Yeah, I tested it, and if I remember correctly it abruptly stopped the request and gave a 500 error to the user, but let me double check it (my devstack VM is taking time to come up; once it is up I will update here).","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":146,"context_line":"* Nova API: No RPC change needed."},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"  No work needed, as this service receives requests over http service, and it"},{"line_number":149,"context_line":"  talks to other service on their RPC server."},{"line_number":150,"context_line":""},{"line_number":151,"context_line":"* Nova scheduler: No RPC change needed."},{"line_number":152,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"29e8e2e2_d4c9093f","line":149,"in_reply_to":"5098f805_29b22593","updated":"2025-11-20 16:38:50.000000000","message":"Hm, if it is an abrupt HTTP 500 then it is not graceful either. And we might need to add the API to the impacted services.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":false,"context_lines":[{"line_number":170,"context_line":""},{"line_number":171,"context_line":"    We still need to keep the worker up until all the responses are sent via"},{"line_number":172,"context_line":"    the reply queue, and for that, we need to implement the in-progress task"},{"line_number":173,"context_line":"    tracking in scheduler services, but that will be handled in step 2."},{"line_number":174,"context_line":""},{"line_number":175,"context_line":"  This way, stopping a Nova scheduler worker will not impact the RPC"},{"line_number":176,"context_line":"  communication on the scheduler service."}],"source_content_type":"text/x-rst","patch_set":4,"id":"61107bd4_dd5a4bd2","line":173,"updated":"2025-11-19 17:23:10.000000000","message":"Thanks for this note. It resolved some of my questions.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":false,"context_lines":[{"line_number":237,"context_line":"    * nova.service will stop the 1st RPC server so that no new requests are"},{"line_number":238,"context_line":"      picked by the compute. The 2nd RPC server is running and up."},{"line_number":239,"context_line":"    * nova.service will wait for the manager to signal once all in-progress"},{"line_number":240,"context_line":"      operations are finished."},{"line_number":241,"context_line":"    * Once compute signal to nova.service, then it will stop the 2nd RPC server"},{"line_number":242,"context_line":"      and proceed with service shutdown."},{"line_number":243,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"d0ec261d_ac9b3259","line":240,"updated":"2025-11-19 17:23:10.000000000","message":"I guess this will be detailed out later. If not, then we need such details. Basically, we have two RPC servers at this point to `.wait()` on to see if all the RPC handlers for each RPC server are finished. However, that alone won\u0027t be enough if nova-compute waits for an external event (i.e. vif-plugged), because such a wait happens not in an RPC handler thread but in a thread in one of our other thread pools.\n\n\n-- later --\n\nOK, later it is detailed that in step 2 there will be a data structure with tasks maintained and we are waiting on that.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
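The stop/wait ordering described in the quoted spec lines and in this comment maps onto the existing oslo.messaging server API. A minimal sketch under stated assumptions: the `compute-ops` topic name, `HOST`, `endpoints`, and `manager.wait_for_tasks()` (the step-2 tracker) are illustrative, not existing nova code.

```python
# Sketch of the two-RPC-server shutdown ordering, using the real
# oslo.messaging start/stop/wait API.
import oslo_messaging
from oslo_config import cfg

CONF = cfg.CONF
HOST = 'compute-1'          # illustrative host name
endpoints = []              # the compute manager endpoints would go here

transport = oslo_messaging.get_rpc_transport(CONF)
main_server = oslo_messaging.get_rpc_server(
    transport, oslo_messaging.Target(topic='compute', server=HOST),
    endpoints)
ops_server = oslo_messaging.get_rpc_server(
    transport, oslo_messaging.Target(topic='compute-ops', server=HOST),
    endpoints)

def graceful_stop(manager):
    # 1) stop picking new requests from the main topic
    main_server.stop()
    main_server.wait()        # returns once in-flight handlers are done
    # 2) in-progress operations keep talking over the ops topic until the
    #    manager's task tracker (step 2) reports it has drained; this also
    #    covers waits that happen outside RPC handler threads
    manager.wait_for_tasks()  # hypothetical tracker API
    # 3) nothing should need RPC anymore
    ops_server.stop()
    ops_server.wait()
```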
{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":249,"context_line":"      irrespective of the compute finishing the things."},{"line_number":250,"context_line":"    * Its default is 60 sec, which is less for the compute service to finish"},{"line_number":251,"context_line":"      things. The proposal is to change this default value (only for the"},{"line_number":252,"context_line":"      compute service) to 1800 sec."},{"line_number":253,"context_line":""},{"line_number":254,"context_line":"  * RPC client:"},{"line_number":255,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"be1de3d4_2865e5c5","line":252,"updated":"2025-11-19 17:23:10.000000000","message":"As noted above, we might not have a chance to influence when the graceful shutdown becomes forceful due to an external service manager like k8s.\n\nFor me it is OK to have a nova-level timeout, but it is low prio for me as the timeout is a solved problem outside of nova already.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":249,"context_line":"      irrespective of the compute finishing the things."},{"line_number":250,"context_line":"    * Its default is 60 sec, which is less for the compute service to finish"},{"line_number":251,"context_line":"      things. The proposal is to change this default value (only for the"},{"line_number":252,"context_line":"      compute service) to 1800 sec."},{"line_number":253,"context_line":""},{"line_number":254,"context_line":"  * RPC client:"},{"line_number":255,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"cc180739_d089acaa","line":252,"in_reply_to":"92a26123_18a4540b","updated":"2025-11-20 16:38:50.000000000","message":"yeah, I\u0027m not trying to say that the external timeout solves the gracefulness. My point was really that nova\u0027s own timeout to upgrade from the graceful shutdown to the forceful one is not a must for me, as the external system will do that timeout and upgrade anyhow. But I\u0027m not against nova doing the upgrade from graceful to forceful shutdown as well.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":249,"context_line":"      irrespective of the compute finishing the things."},{"line_number":250,"context_line":"    * Its default is 60 sec, which is less for the compute service to finish"},{"line_number":251,"context_line":"      things. The proposal is to change this default value (only for the"},{"line_number":252,"context_line":"      compute service) to 1800 sec."},{"line_number":253,"context_line":""},{"line_number":254,"context_line":"  * RPC client:"},{"line_number":255,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"92a26123_18a4540b","line":252,"in_reply_to":"be1de3d4_2865e5c5","updated":"2025-11-20 05:19:03.000000000","message":"I have not checked the implementation, but how does an external timeout let the Nova service give time to finish things while not accepting new things? 
If new requests keep coming, then no timeout is enough.\n\nThe main idea of the Nova service timeout is how long to wait after we stop accepting new requests.\n\nBut I agree, we should document it by explaining all timeouts, including the external systems, and how they are related.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":249,"context_line":"      irrespective of the compute finishing the things."},{"line_number":250,"context_line":"    * Its default is 60 sec, which is less for the compute service to finish"},{"line_number":251,"context_line":"      things. The proposal is to change this default value (only for the"},{"line_number":252,"context_line":"      compute service) to 1800 sec."},{"line_number":253,"context_line":""},{"line_number":254,"context_line":"  * RPC client:"},{"line_number":255,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"0cef7682_44d64256","line":252,"in_reply_to":"cc180739_d089acaa","updated":"2025-11-29 04:04:00.000000000","message":"Done","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
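How these timeout layers stack can be made concrete. A hedged sketch, not a proposed interface: the function and argument names are illustrative; only graceful_shutdown_timeout is a real oslo.service option, and the external budgets come from systemd (TimeoutStopSec) or k8s (terminationGracePeriodSeconds).

```python
# Illustrative only: the service effectively gets the smallest of the
# stacked budgets, so a nova-level wait longer than the external manager's
# stop timeout buys nothing.
def effective_graceful_window(nova_wait_sec,
                              graceful_shutdown_timeout_sec,
                              external_stop_timeout_sec):
    return min(nova_wait_sec,
               graceful_shutdown_timeout_sec,
               external_stop_timeout_sec)

# e.g. an 1800 sec compute-side budget is capped by systemd's default 90 sec
assert effective_graceful_window(1800, 1800, 90) == 90
```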
{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":259,"context_line":"    * If any RPC cast/call wants to send a message via the 2nd RPC server, they"},{"line_number":260,"context_line":"      need to override the ``topic`` to ``compoute-ops.\u003chost\u003e`` during"},{"line_number":261,"context_line":"      client.prepare() call."},{"line_number":262,"context_line":"    * What all RPC cast/call will be using the 2nd RPC server will be decided"},{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"}],"source_content_type":"text/x-rst","patch_set":4,"id":"472291e0_e8d3f7c0","line":262,"range":{"start_line":262,"start_character":6,"end_line":262,"end_character":61},"updated":"2025-11-19 17:23:10.000000000","message":"Which RPC cast/call will be using the 2nd RPC server","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":259,"context_line":"    * If any RPC cast/call wants to send a message via the 2nd RPC server, they"},{"line_number":260,"context_line":"      need to override the ``topic`` to ``compoute-ops.\u003chost\u003e`` during"},{"line_number":261,"context_line":"      client.prepare() call."},{"line_number":262,"context_line":"    * What all RPC cast/call will be using the 2nd RPC server will be decided"},{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d20fac8b_1005e1d4","line":262,"range":{"start_line":262,"start_character":6,"end_line":262,"end_character":61},"in_reply_to":"472291e0_e8d3f7c0","updated":"2025-11-20 05:19:03.000000000","message":"I kept this list to be decided at the implementation level and to grow eventually. But I can make a list here of the obvious ones, for example, migrations, resize, shelve(?).","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c5adb1909d9fc10679e6f7c06fa0a3d8e99cd2a0","unresolved":false,"context_lines":[{"line_number":259,"context_line":"    * If any RPC cast/call wants to send a message via the 2nd RPC server, they"},{"line_number":260,"context_line":"      need to override the ``topic`` to ``compoute-ops.\u003chost\u003e`` during"},{"line_number":261,"context_line":"      client.prepare() call."},{"line_number":262,"context_line":"    * What all RPC cast/call will be using the 2nd RPC server will be decided"},{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. 
live migration, server external events are two good"}],"source_content_type":"text/x-rst","patch_set":4,"id":"ff371cea_81dc6b58","line":262,"range":{"start_line":262,"start_character":6,"end_line":262,"end_character":61},"in_reply_to":"8c5249a3_740d265d","updated":"2025-11-20 19:36:59.000000000","message":"Acknowledged","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":259,"context_line":"    * If any RPC cast/call wants to send a message via the 2nd RPC server, they"},{"line_number":260,"context_line":"      need to override the ``topic`` to ``compoute-ops.\u003chost\u003e`` during"},{"line_number":261,"context_line":"      client.prepare() call."},{"line_number":262,"context_line":"    * What all RPC cast/call will be using the 2nd RPC server will be decided"},{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"}],"source_content_type":"text/x-rst","patch_set":4,"id":"8c5249a3_740d265d","line":262,"range":{"start_line":262,"start_character":6,"end_line":262,"end_character":61},"in_reply_to":"d20fac8b_1005e1d4","updated":"2025-11-20 16:38:50.000000000","message":"Sorry, I only meant to suggest a rewording here, as \"What all RPC\" sounds strange to me","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. 
live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"3db65c56_bbbe1774","line":266,"updated":"2025-11-19 17:23:10.000000000","message":"I agree that we need to keep some flexibility here to extend the list later during the implementation, but I suggest listing the RPC methods we already identified to be moved to the compute-ops topic in the spec as a starting point.\n\nBased on the live migration sequence[1] we need to move the following:\n* check_can_live_migrate_source\n* live_migrate\n* post_live_migration_at_destination\n* remove_volume_connections\n* rollback_live_migration_at_destination\n\n\n[1]https://docs.openstack.org/nova/latest/reference/live-migration.html","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"fa93ffb3_3e7b375d","line":266,"in_reply_to":"3db65c56_bbbe1774","updated":"2025-11-20 05:19:03.000000000","message":"Yes, I showed the example of live migration in my PoC - https://review.opendev.org/c/openstack/nova/+/967261/7/nova/compute/rpcapi.py\n\nI kept check_can_live_migrate_source to be called on the 1st RPC server because this is a very early stage of live migration, and if shutdown is initiated on either the dest or the source compute, then it should not pick up the request and say yes to the live migration. But that is something we can discuss during the code change.\n\nAlso, I do not want this to be a very hard list here, as it can grow eventually when we decide that some operation should be finished during the shutdown and change it to use the 2nd RPC server.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
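For the topic override these comments discuss, the existing oslo.messaging prepare() API is enough. A hedged sketch of what moving one of the listed methods could look like; the names are illustrative, the real change belongs in nova/compute/rpcapi.py, and the PoC (review 967261) is the authoritative version.

```python
# Sketch: route one live-migration RPC over the 2nd ("ops") topic by
# overriding the topic in client.prepare(); topic= is a real prepare()
# argument. OPS_TOPIC is an assumed spelling of the final topic name.
import oslo_messaging

OPS_TOPIC = 'compute-ops'

class ComputeAPI(object):
    def __init__(self, client):
        self.client = client  # an oslo_messaging.RPCClient

    def live_migration(self, ctxt, host, **kwargs):
        # only this call is redirected to the 2nd RPC server; everything
        # else keeps using the default compute.<host> topic
        cctxt = self.client.prepare(server=host, topic=OPS_TOPIC)
        cctxt.cast(ctxt, 'live_migration', **kwargs)
```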
{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"f64735ea4fe2d0609276bc4a9cdad74e4bfbeb38","unresolved":true,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"f6946f28_39c57379","line":266,"in_reply_to":"dfa0947f_db652599","updated":"2025-11-21 08:06:00.000000000","message":"Sounds good. Thanks!","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c5adb1909d9fc10679e6f7c06fa0a3d8e99cd2a0","unresolved":true,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"dfa0947f_db652599","line":266,"in_reply_to":"e76571bb_6d024da1","updated":"2025-11-20 19:36:59.000000000","message":"Yes, it will be in \u0027error\u0027 state. I checked the possible cases for live migration, and case 2 can be improved to save time. Details are below:\n\n1) If the destination host is shutting down:\n\n- 1.1) If the destination is requested (force live migration):\n        \n The conductor will call check_can_live_migrate_destination()[1] on the destination compute and will get an RPC timeout error. The conductor LiveMigrationTask will raise MigrationPreCheckError[2] to compute, and compute will put the migration in the error state[3].\n      \n- 1.2) If no destination is requested:\n        \n In this case, the conductor will query the scheduler to find a suitable destination. For every destination (selected by the scheduler), the conductor calls check_can_live_migrate_destination()[4], and if any destination is shutting down, then it will get MigrationPreCheckError. In this case, that destination is skipped, and it asks the scheduler to find another destination. So this case is fine, as a shutting-down destination will say no to any new live migration.\n\n\n2) If the source host is shutting down:\n\n- The conductor LiveMigrationTask will call check_can_live_migrate_destination() on the destination compute, and the destination will check the same on the source compute. It will call check_can_live_migrate_source() on the source compute. If the source compute is shutting down, it will raise the RPC timeout error to the destination, which raises MigrationPreCheckError to the conductor[5]. And the compute manager will mark the migration to error.\n\n- NOTE: If the source is shutting down, then every destination check_can_live_migrate_destination will throw an error. That is unnecessary processing for every destination, when we know the source is going to reject the live migration.\n\nI think we can improve case 2. If the source is shutting down, then we do not need to check every destination. Before the conductor checks every possible destination, the source compute should tell the conductor to reject the migration as it is shutting down and will not take any new migration request. 
This is something we can improve in self._check_host_is_up(self.source)[6]. Along with checking that the compute is up, we can make an RPC call to the source compute and pre-check whether it is shutting down.\n\n\n\n[1] via _check_requested_destination() -\u003e_call_livem_checks_on_host() https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/conductor/tasks/live_migrate.py#L134\n\n[2] https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/conductor/tasks/live_migrate.py#L384C29-L384C51\n\n[3] https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/conductor/manager.py#L522\n\n[4] via _call_livem_checks_on_host() https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/conductor/tasks/live_migrate.py#L574\n\n[5] https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/compute/manager.py#L9110C33-L9110C55\n\n[6] https://github.com/openstack/nova/blob/53aadaf967b708bfb03616535d45f6378a21cae0/nova/conductor/tasks/live_migrate.py#L84","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
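The case 2 improvement could hang off the conductor check referenced in [6]. A hedged sketch only: is_shutting_down() is a hypothetical RPC that does not exist today, and the surrounding calls are simplified from nova/conductor/tasks/live_migrate.py.

```python
# Sketch of the proposed fail-fast in LiveMigrationTask._check_host_is_up
# (see [6] above). is_shutting_down() is hypothetical; the rest
# approximates the existing nova code path.
def _check_host_is_up(self, host):
    service = objects.Service.get_by_compute_host(self.context, host)
    if not self.servicegroup_api.service_is_up(service):
        raise exception.ComputeServiceUnavailable(host=host)
    # proposed addition: ask the source directly instead of letting every
    # candidate destination hit an RPC timeout during its pre-checks
    if self.compute_rpcapi.is_shutting_down(self.context, host):
        raise exception.MigrationPreCheckError(
            reason='source host %s is shutting down' % host)
```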
{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"cf3d5434_da77bd1a","line":266,"in_reply_to":"f6946f28_39c57379","updated":"2025-11-29 04:04:00.000000000","message":"Done","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":263,"context_line":"      during implementation, so that we can have a better judgment on what all"},{"line_number":264,"context_line":"      methods are used for the operations we want to finish during shutdown."},{"line_number":265,"context_line":"      For example. live migration, server external events are two good"},{"line_number":266,"context_line":"      candidates."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":""},{"line_number":269,"context_line":"* Timeout and wait time for services to finish the in-progress operations:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"e76571bb_6d024da1","line":266,"in_reply_to":"fa93ffb3_3e7b375d","updated":"2025-11-20 16:38:50.000000000","message":"I\u0027m OK to make it a soft list, no worries.\n\nAbout check_can_live_migrate_source: I\u0027m wondering whether nova will put the migration in a proper error state if check_can_live_migrate_source times out. If it does, then I\u0027m fine keeping it on the 1st RPC topic.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":283,"context_line":"      * conductor service: 60 sec should be enough, but if not, then we can"},{"line_number":284,"context_line":"        change its default for conductor also. For now, no change is proposed."},{"line_number":285,"context_line":"      * scheduler service: 60 sec should be enough, but if not, then we can"},{"line_number":286,"context_line":"        change its default for scheduler also. For now, no change is proposed."},{"line_number":287,"context_line":""},{"line_number":288,"context_line":"* Wait time for services to finish the in-progress operations:"},{"line_number":289,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"35fba536_be2737d9","line":286,"updated":"2025-11-19 17:23:10.000000000","message":"We need to consider nova-novncproxy.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":283,"context_line":"      * conductor service: 60 sec should be enough, but if not, then we can"},{"line_number":284,"context_line":"        change its default for conductor also. For now, no change is proposed."},{"line_number":285,"context_line":"      * scheduler service: 60 sec should be enough, but if not, then we can"},{"line_number":286,"context_line":"        change its default for scheduler also. For now, no change is proposed."},{"line_number":287,"context_line":""},{"line_number":288,"context_line":"* Wait time for services to finish the in-progress operations:"},{"line_number":289,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"25fb2012_a81ee09d","line":286,"in_reply_to":"1974767a_397b2ca6","updated":"2025-11-29 04:04:00.000000000","message":"Done","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":283,"context_line":"      * conductor service: 60 sec should be enough, but if not, then we can"},{"line_number":284,"context_line":"        change its default for conductor also. For now, no change is proposed."},{"line_number":285,"context_line":"      * scheduler service: 60 sec should be enough, but if not, then we can"},{"line_number":286,"context_line":"        change its default for scheduler also. For now, no change is proposed."},{"line_number":287,"context_line":""},{"line_number":288,"context_line":"* Wait time for services to finish the in-progress operations:"},{"line_number":289,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"1974767a_397b2ca6","line":286,"in_reply_to":"35fba536_be2737d9","updated":"2025-11-20 05:19:03.000000000","message":"yeah, I will add that. 
Thanks for pointing that out.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":333,"context_line":""},{"line_number":334,"context_line":".. note:: As per my PoC and manual testing till now, it does not require any"},{"line_number":335,"context_line":"         change on oslo.messaging side."},{"line_number":336,"context_line":""},{"line_number":337,"context_line":"Step 2: Smartly track and wait for the in-progress operations:"},{"line_number":338,"context_line":"--------------------------------------------------------------"},{"line_number":339,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"a8063fb7_6ce23f53","line":336,"updated":"2025-11-19 17:23:10.000000000","message":"It would be good to mention the nova-novncproxy service. That also has long-running proxy operations, and we want to decide what to do with them. \n* a) do the graceful shutdown game of not allowing new proxy connections but keeping the existing connections alive\n* b) as killing a VNC connection should not affect the instance state, we just don\u0027t do a graceful shutdown for the proxy, but simply kill the connections when stopping the service.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":333,"context_line":""},{"line_number":334,"context_line":".. note:: As per my PoC and manual testing till now, it does not require any"},{"line_number":335,"context_line":"         change on oslo.messaging side."},{"line_number":336,"context_line":""},{"line_number":337,"context_line":"Step 2: Smartly track and wait for the in-progress operations:"},{"line_number":338,"context_line":"--------------------------------------------------------------"},{"line_number":339,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"71025bf8_2477f57d","line":336,"in_reply_to":"2d85c6dc_930fc5c3","updated":"2025-11-29 04:04:00.000000000","message":"I finished the investigation and playing around with nova-novncproxy, and the good thing is that those services shut down gracefully, which is handled by the websockify library. I will provide details in the proposed section.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":false,"context_lines":[{"line_number":333,"context_line":""},{"line_number":334,"context_line":".. 
note:: As per my PoC and manual testing till now, it does not require any"},{"line_number":335,"context_line":"         change on oslo.messaging side."},{"line_number":336,"context_line":""},{"line_number":337,"context_line":"Step 2: Smartly track and wait for the in-progress operations:"},{"line_number":338,"context_line":"--------------------------------------------------------------"},{"line_number":339,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"2d85c6dc_930fc5c3","line":336,"in_reply_to":"a8063fb7_6ce23f53","updated":"2025-11-20 05:19:03.000000000","message":"Acknowledged","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"72b066526d4d6099063cd1e58470e42e251ea220","unresolved":true,"context_lines":[{"line_number":353,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":354,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":355,"context_line":"  done under lock."},{"line_number":356,"context_line":"* Once shutdown is initiated:"},{"line_number":357,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":358,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":359,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d7668b04_54eff446","line":356,"updated":"2025-11-19 17:23:10.000000000","message":"Do I understand correctly that the below waiting loop will be running in the TERM signal handler? I read the Python doc about signal handlers; it does not say we cannot have a long-running signal handler, but I\u0027m still a bit afraid of it. It is worth a PoC to see what happens if we block that signal handler by starting to wait.\n\nThe doc says\n\u003e Python signal handlers are always executed in the main Python thread of the main interpreter,\n\nSo I assume what happens is that we have our main thread that basically started when the service was started, and the signal handler is hijacking that thread to run the signal handler code. So whatever that main thread was doing before the signal arrived will be stopped while the signal handler runs. \n\nAnyhow, this needs a bit of attention during implementation, but we can probably ignore it at spec time.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"80eea3ddc2a40d9dc12eecbf0f5ae8a79ae6b93e","unresolved":false,"context_lines":[{"line_number":353,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":354,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":355,"context_line":"  done under lock."},{"line_number":356,"context_line":"* Once shutdown is initiated:"},{"line_number":357,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":358,"context_line":"  them. 
The decision will be made by case, for example, reject the tasks if"},{"line_number":359,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d936ca00_36fe83df","line":356,"in_reply_to":"52e385a1_26918a49","updated":"2025-11-29 04:04:00.000000000","message":"Basically, it will be up to whoever implements the SIGTERM handler and whatever timeout they consider. For oslo.service with the threading backend, it will be handled by cotyledon with the graceful_shutdown_timeout timeout[1].\n\noslo.service provides more flexibility by making graceful_shutdown_timeout configurable per service, so we have a way to configure it differently per service. For example, in our case, we might need it a little higher for the compute service than for the scheduler or conductor.\n\n[1] https://github.com/sileht/cotyledon/blob/0f80f64aae4b76a79c388c9ae565bc860524b30b/cotyledon/_service_manager.py#L385","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"7b49623213d26f44257286d84466897a8cb5b0fe","unresolved":true,"context_lines":[{"line_number":353,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":354,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":355,"context_line":"  done under lock."},{"line_number":356,"context_line":"* Once shutdown is initiated:"},{"line_number":357,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":358,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":359,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"}],"source_content_type":"text/x-rst","patch_set":4,"id":"52e385a1_26918a49","line":356,"in_reply_to":"8e58476b_719f70f9","updated":"2025-11-20 16:38:50.000000000","message":"I\u0027m OK to handle this with a note here and investigate it during the implementation.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"dd51d38040b2baf1226923c5de08735d61b25e0c","unresolved":true,"context_lines":[{"line_number":353,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":354,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":355,"context_line":"  done under lock."},{"line_number":356,"context_line":"* Once shutdown is initiated:"},{"line_number":357,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":358,"context_line":"  them. 
The decision will be made by case, for example, reject the tasks if"},{"line_number":359,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"}],"source_content_type":"text/x-rst","patch_set":4,"id":"8e58476b_719f70f9","line":356,"in_reply_to":"d7668b04_54eff446","updated":"2025-11-20 05:19:03.000000000","message":"I tested with a nova wait time of 180 sec and an oslo.service graceful_shutdown_timeout of 300 sec, and it handled the shutdown gracefully, but yes, I need to check the oslo.service code more to see if there is any limitation on the signal handler.\n\nLet me make a note here about it so that we do not forget to investigate it during implementation.","commit_id":"69e85dd0aa05e06d5d176a53b0924f801e76ad6d"},
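The tracker sketched in the quoted spec lines (add and remove under a lock, then drain on shutdown) is small enough to outline. A hedged sketch with illustrative names; the real tracker would live in the compute manager, and its rejection policy is decided case by case as the spec says.

```python
# Minimal sketch of the step-2 in-progress task tracker: modifications
# happen under a lock, and the shutdown path drains with a time budget.
import threading
import time

class TaskTracker:
    def __init__(self):
        self._lock = threading.Lock()
        self._tasks = set()
        self._draining = False

    def start(self, task_id):
        with self._lock:
            if self._draining:
                # shutdown initiated: reject non-critical new work
                raise RuntimeError('shutting down, task rejected')
            self._tasks.add(task_id)

    def finish(self, task_id):
        with self._lock:
            self._tasks.discard(task_id)

    def drain(self, timeout):
        # called from the shutdown path; returns unfinished task ids so the
        # caller can log them before the service exits
        with self._lock:
            self._draining = True
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            with self._lock:
                if not self._tasks:
                    return set()
            time.sleep(1)
        with self._lock:
            return set(self._tasks)
```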
{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":31,"context_line":"RPC server (topic: compute.\u003chost\u003e). The same RPC server is used for the new"},{"line_number":32,"context_line":"requests as well as for in-progress operations where other compute or conductor"},{"line_number":33,"context_line":"services communicate. When shutdown is initiated, the RPC server is stopped"},{"line_number":34,"context_line":"means it will stop requesting the new request, which is ok, but at the same"},{"line_number":35,"context_line":"time it will stop the communication needed for the in-progress operations. For"},{"line_number":36,"context_line":"example, if live migration is in progress, the source and destination compute"},{"line_number":37,"context_line":"communicate (sync and async way) multiple times with each other. Once the RPC"}],"source_content_type":"text/x-rst","patch_set":6,"id":"2345c8f7_d18fcd3f","line":34,"range":{"start_line":34,"start_character":19,"end_line":34,"end_character":29},"updated":"2025-12-01 16:20:56.000000000","message":"nit: handling","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":31,"context_line":"RPC server (topic: compute.\u003chost\u003e). The same RPC server is used for the new"},{"line_number":32,"context_line":"requests as well as for in-progress operations where other compute or conductor"},{"line_number":33,"context_line":"services communicate. When shutdown is initiated, the RPC server is stopped"},{"line_number":34,"context_line":"means it will stop requesting the new request, which is ok, but at the same"},{"line_number":35,"context_line":"time it will stop the communication needed for the in-progress operations. For"},{"line_number":36,"context_line":"example, if live migration is in progress, the source and destination compute"},{"line_number":37,"context_line":"communicate (sync and async way) multiple times with each other. Once the RPC"}],"source_content_type":"text/x-rst","patch_set":6,"id":"30414ff3_71a3bedb","line":34,"range":{"start_line":34,"start_character":19,"end_line":34,"end_character":29},"in_reply_to":"2345c8f7_d18fcd3f","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":38,"context_line":"server on the compute service is stopped, it cannot communicate with the other"},{"line_number":39,"context_line":"compute and fail the live migration. It will lead the system as well as the"},{"line_number":40,"context_line":"instance to be in an unwanted or unrecoverable state"},{"line_number":41,"context_line":""},{"line_number":42,"context_line":"Use Cases"},{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"b8a3b70d_3760f5c4","line":41,"updated":"2025-12-01 21:07:28.000000000","message":"so there are times today where it would be ok for the source and dest not to be able to communicate.\n\ni.e. once the live migration is started and libvirt is managing the migration\n\nit would be fine for the dest to restart without it impacting the migration.\nit\u0027s not ok for the source to be restarted today, but stopping the dest won\u0027t impact the migration and resuming it should not abort it.\n\nprovided the request in post live migration from the source to dest does not time out, we should be able to tolerate the destination restarting without aborting.\non the other hand, if the source is restarted it should abort all live migrations as part of the shutdown.\n\nso the behavior we want in the future is abort outgoing migrations but allow resuming/continuing incoming live migrations.\n\nif we could resume both eventually that would be even better, but that is a little harder as we would need to be able to pick up where we left off in the migration on the source host.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ff984cc84dd9c88c6380443929f4fc760830bfdd","unresolved":false,"context_lines":[{"line_number":38,"context_line":"server on the compute service is stopped, it cannot communicate with the other"},{"line_number":39,"context_line":"compute and fail the live migration. 
It will lead the system as well as the"},{"line_number":40,"context_line":"instance to be in an unwanted or unrecoverable state"},{"line_number":41,"context_line":""},{"line_number":42,"context_line":"Use Cases"},{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"31cb4c16_4273ef54","line":41,"in_reply_to":"668815cc_4abe64b2","updated":"2025-12-02 04:57:24.000000000","message":"I think there is no change needed here, so I am resolving the comment, but if anything more needs to be added please clarify.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":true,"context_lines":[{"line_number":38,"context_line":"server on the compute service is stopped, it cannot communicate with the other"},{"line_number":39,"context_line":"compute and fail the live migration. It will lead the system as well as the"},{"line_number":40,"context_line":"instance to be in an unwanted or unrecoverable state"},{"line_number":41,"context_line":""},{"line_number":42,"context_line":"Use Cases"},{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"668815cc_4abe64b2","line":41,"in_reply_to":"b8a3b70d_3760f5c4","updated":"2025-12-02 04:55:32.000000000","message":"yes, that is all about making live migration operations more stateful, with recoverable checkpoints.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 in-progress operations."},{"line_number":47,"context_line":""},{"line_number":48,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":49,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":6,"id":"7bcbaa0b_0794558a","line":46,"updated":"2025-12-01 21:07:28.000000000","message":"so just to be clear, to meet this use case you need the \"complete or resumable\" semantic in my view.\n\nif we don\u0027t support resuming the operation, then that forces us to not shut down until all in-progress operations are complete, which means we should not automatically time out and kill the service after 60 seconds, which is the default value of graceful_shutdown_timeout today.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 
in-progress operations."},{"line_number":47,"context_line":""},{"line_number":48,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":49,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":6,"id":"22255756_7b147444","line":46,"in_reply_to":"18bc259e_d92dd968","updated":"2025-12-02 23:27:11.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":true,"context_lines":[{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 in-progress operations."},{"line_number":47,"context_line":""},{"line_number":48,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":49,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":6,"id":"c382f352_4e181ff2","line":46,"in_reply_to":"7bcbaa0b_0794558a","updated":"2025-12-02 04:55:32.000000000","message":"IMO \u0027resumable\u0027 is not a semantic of graceful shutdown; instead, that is a \u0027state preservation\u0027 effort.\n\nThe goal here is to complete the in-progress operations; if that is interrupted by a timeout or another signal, then it will not be a graceful shutdown. The operator needs to consider the timeout carefully. And if it is not a graceful shutdown, then operations will be impacted, which is what a non-graceful shutdown is.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 in-progress operations."},{"line_number":47,"context_line":""},{"line_number":48,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":49,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":6,"id":"f9d0739e_0a28f026","line":46,"in_reply_to":"c382f352_4e181ff2","updated":"2025-12-02 17:19:26.000000000","message":"Yeah, we can be a lot better than we are today without building infrastructure for \"resumable\" operations. I\u0027d rather make progress towards some sort of quiesced exit than worry about resume-ability. 
For example, it would be fine, IMHO, to immediately kill any in-progress operations on SIGTERM so that we can finish and exit cleanly/quickly, even if that means acting like it failed and requiring the same sort of cleanup that would happen after an actual failure.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":43,"context_line":"---------"},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 in-progress operations."},{"line_number":47,"context_line":""},{"line_number":48,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":49,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":6,"id":"18bc259e_d92dd968","line":46,"in_reply_to":"f9d0739e_0a28f026","updated":"2025-12-02 22:05:34.000000000","message":"so to your point later, i would consider aborting an in-progress migration to be completing the operation. if we are doing a source compute node restart, that is a perfectly valid thing to do on shutdown. \n\n\ni don\u0027t want us to do nothing unless it\u0027s perfect, but i don\u0027t want us to make things worse either, or suggest that operators should rely on the graceful shutdowns to not create operational problems.\n\ni.e. if this makes restarting some services easier, great, but it makes the pain of not properly being able to resume tasks on others more acutely visible.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":77,"context_line":"  * Proper logging of unfinished operations."},{"line_number":78,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":79,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":80,"context_line":"    timeout, adding the timoeut details in later section) then there should be"},{"line_number":81,"context_line":"    a proper logging of all the unfinished operations. 
This will help to"},{"line_number":82,"context_line":"    recover the system or instances."},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"8d668ccd_3c30546d","line":80,"range":{"start_line":80,"start_character":24,"end_line":80,"end_character":31},"updated":"2025-12-01 16:20:56.000000000","message":"timeout","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":77,"context_line":"  * Proper logging of unfinished operations."},{"line_number":78,"context_line":"    Ideally, all the in-progress operations should be completed before service"},{"line_number":79,"context_line":"    is terminated, but if graceful shutdown times out (due to a configured"},{"line_number":80,"context_line":"    timeout, adding the timoeut details in later section) then there should be"},{"line_number":81,"context_line":"    a proper logging of all the unfinished operations. This will help to"},{"line_number":82,"context_line":"    recover the system or instances."},{"line_number":83,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"f96dfb94_dcf838b9","line":80,"range":{"start_line":80,"start_character":24,"end_line":80,"end_character":31},"in_reply_to":"8d668ccd_3c30546d","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":91,"context_line":"    * Requests might have been picked by the other worker of that service."},{"line_number":92,"context_line":"      For example, you can run more than one Nova scheduler (or conductor)"},{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."}],"source_content_type":"text/x-rst","patch_set":6,"id":"88509fd4_ca681c87","line":94,"range":{"start_line":94,"start_character":6,"end_line":94,"end_character":12},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n      service the request. 
This is not the case for Nova compute which is\n```\nyou could also say \"serve the request\" but that is more applicable to a web server, so I think `service` is more correct, or you could use a different verb entirely, `fulfil` or `process`, to make it clearer","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":91,"context_line":"    * Requests might have been picked by the other worker of that service."},{"line_number":92,"context_line":"      For example, you can run more than one Nova scheduler (or conductor)"},{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."}],"source_content_type":"text/x-rst","patch_set":6,"id":"f3bd8b4d_f4971d5c","line":94,"range":{"start_line":94,"start_character":6,"end_line":94,"end_character":12},"in_reply_to":"88509fd4_ca681c87","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":92,"context_line":"      For example, you can run more than one Nova scheduler (or conductor)"},{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"}],"source_content_type":"text/x-rst","patch_set":6,"id":"d31e48a2_84a189c6","line":95,"range":{"start_line":95,"start_character":33,"end_line":95,"end_character":40},"updated":"2025-12-01 16:20:56.000000000","message":"nit:compute","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":92,"context_line":"      For example, you can run more than one Nova scheduler (or conductor)"},{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. 
This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"}],"source_content_type":"text/x-rst","patch_set":6,"id":"616c8be4_9ea77e27","line":95,"range":{"start_line":95,"start_character":33,"end_line":95,"end_character":40},"in_reply_to":"d31e48a2_84a189c6","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"}],"source_content_type":"text/x-rst","patch_set":6,"id":"b4531770_2a73930c","line":97,"range":{"start_line":96,"start_character":5,"end_line":97,"end_character":31},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n    * If a service has single worker running, then request can be picked up\n      once service is up again.\n```\n\nThis again tangentially touches on resumability. In this case the task needs to be in a pending state, although here it is pending because it is still waiting in the queue.\nObviously this could time out, but we can\u0027t really do anything about that unless we consider a wider change to remove the RPC timeouts.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":93,"context_line":"      worker. If one of the worker is shutting down, then other worker will"},{"line_number":94,"context_line":"      server the request. 
This is not the case for Nova compute which is"},{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"}],"source_content_type":"text/x-rst","patch_set":6,"id":"3336c5d7_1fa8c6b0","line":97,"range":{"start_line":96,"start_character":5,"end_line":97,"end_character":31},"in_reply_to":"b4531770_2a73930c","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"f1c5742f_1745f82f","line":98,"range":{"start_line":98,"start_character":49,"end_line":98,"end_character":56},"updated":"2025-12-01 16:20:56.000000000","message":"nit:compute","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"4c4b9ed0_59c65cea","line":98,"range":{"start_line":98,"start_character":49,"end_line":98,"end_character":56},"in_reply_to":"f1c5742f_1745f82f","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean 
mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"},{"line_number":102,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"fc6396e9_8f40c1c8","line":99,"range":{"start_line":98,"start_character":6,"end_line":99,"end_character":56},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n    * There is an opertunity for the compute service to cleanup\n      or recover the interrupted operation on instances during init_host().\n      the action taken will depend on the tasks and its status.\n```","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":95,"context_line":"      always a single worker per comoute service on specific host."},{"line_number":96,"context_line":"    * If service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"},{"line_number":102,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"df9910ab_2c419153","line":99,"range":{"start_line":98,"start_character":6,"end_line":99,"end_character":56},"in_reply_to":"fc6396e9_8f40c1c8","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"},{"line_number":102,"context_line":""},{"line_number":103,"context_line":"      * The RPC will timeout the call."},{"line_number":104,"context_line":"      * The message broker queue 
starts dropping messages due to timeout."},{"line_number":105,"context_line":"      * The order of requests and messages can be stale."},{"line_number":106,"context_line":""},{"line_number":107,"context_line":"As a graceful shutdown goal, we need to do two things:"},{"line_number":108,"context_line":""},{"line_number":109,"context_line":"#. A way to stop new requests, but do not interrupt in-progress operations."}],"source_content_type":"text/x-rst","patch_set":6,"id":"67d5b9e6_09b8dea8","line":106,"range":{"start_line":100,"start_character":4,"end_line":106,"end_character":1},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n    * If the service is in the stopped state for a long time, based on the\n      RPC and message queue timeout, there is a chance that:\n\n      * The RPC client or server will timeout the call.\n      * The message broker may drop messages due to timeout.\n      * The order of requests and messages can be stale.\n\n```\nOn the last point, are you suggesting that requests can be interleaved incorrectly?\n\nAlso, if we want to prevent that, I wonder if it would make sense for there to be a global request timeout that is based on the time from which an API call was received until it is complete, i.e. time out the request on its total processing time, not based on RPC timeouts. That would allow us to discard old requests on startup.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There are chances that during init_host(), comoute service might cleanup"},{"line_number":99,"context_line":"      or recover the interrupted operation on instances."},{"line_number":100,"context_line":"    * If the service is in a stop state for a long time and based on the"},{"line_number":101,"context_line":"      RPC and message queue timeout, there is chance that:"},{"line_number":102,"context_line":""},{"line_number":103,"context_line":"      * The RPC will timeout the call."},{"line_number":104,"context_line":"      * The message broker queue starts dropping messages due to timeout."},{"line_number":105,"context_line":"      * The order of requests and messages can be stale."},{"line_number":106,"context_line":""},{"line_number":107,"context_line":"As a graceful shutdown goal, we need to do two things:"},{"line_number":108,"context_line":""},{"line_number":109,"context_line":"#. A way to stop new requests, but do not interrupt in-progress operations."}],"source_content_type":"text/x-rst","patch_set":6,"id":"071bdc85_6a59651d","line":106,"range":{"start_line":100,"start_character":4,"end_line":106,"end_character":1},"in_reply_to":"67d5b9e6_09b8dea8","updated":"2025-12-02 04:55:32.000000000","message":"Done.\n\nThis is the case where two requests were made in order but the first one is lost due to timeout. For example, a user requests start server and then stop server. Both are in the queue, compute has not picked either of them, and shutdown is initiated. 
The queue drops the start request, but before it drops the stop request, compute comes back up and picks up the stop request.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":120,"context_line":"----------------------------------------------------"},{"line_number":121,"context_line":""},{"line_number":122,"context_line":"For the below services, their graceful shutdown is handled by their"},{"line_number":123,"context_line":"deployment servers or used library."},{"line_number":124,"context_line":""},{"line_number":125,"context_line":"* Nova API \u0026 Nova metadata API:"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"00eb6a0b_39d10026","line":123,"updated":"2025-12-01 21:07:28.000000000","message":"Technically, nova-compute, nova-conductor and nova-scheduler also have SIGTERM handlers that are supported by oslo/eventlet, so I don\u0027t really agree with not including them in this section,\n\ngiven we have a kill/stop interface in our base service class.\n\nhttps://github.com/openstack/nova/blob/master/nova/service.py#L267-L295","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":120,"context_line":"----------------------------------------------------"},{"line_number":121,"context_line":""},{"line_number":122,"context_line":"For the below services, their graceful shutdown is handled by their"},{"line_number":123,"context_line":"deployment servers or used library."},{"line_number":124,"context_line":""},{"line_number":125,"context_line":"* Nova API \u0026 Nova metadata API:"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"d33c03dc_8fce06f7","line":123,"in_reply_to":"00eb6a0b_39d10026","updated":"2025-12-02 04:55:32.000000000","message":"The SIGTERM handler does not implement the graceful shutdown. We need a 2nd RPC server for the compute service and task tracking for all these three services to say graceful shutdown is done. The current stop/kill just stops the service abruptly.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":134,"context_line":"  by SIGTERM, the uWSGI server SIGTERM handler check if there are any"},{"line_number":135,"context_line":"  in-progress request on any worker. It wait for all the workers to finish"},{"line_number":136,"context_line":"  the request and then terminates each worker. 
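To make the SIGTERM discussion above concrete, here is a minimal sketch of the two-phase shutdown being debated: the handler only stops intake, then waits (bounded) for in-flight work and logs anything unfinished. This is illustrative only; the class and attribute names are hypothetical, not the actual nova.service.Service code linked above.

```python
# Illustrative two-phase graceful stop; names are hypothetical, the real
# interface is Service.stop()/kill() in nova/service.py (linked above).
import signal
import threading

class GracefulService:
    def __init__(self):
        self._accepting = True
        self._in_flight = 0
        self._cond = threading.Condition()

    def handle_request(self, work):
        with self._cond:
            if not self._accepting:
                raise RuntimeError("shutting down, new request rejected")
            self._in_flight += 1
        try:
            work()
        finally:
            with self._cond:
                self._in_flight -= 1
                self._cond.notify_all()

    def sigterm(self, signum, frame):
        # Phase 1: stop taking new work (e.g. stop the RPC listener).
        with self._cond:
            self._accepting = False
        # Phase 2: wait, bounded by a configured timeout, for in-flight
        # operations to drain; anything left is logged as unfinished.
        with self._cond:
            drained = self._cond.wait_for(
                lambda: self._in_flight == 0, timeout=60)
        if not drained:
            print("unfinished operations:", self._in_flight)

service = GracefulService()
signal.signal(signal.SIGTERM, service.sigterm)
```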
Once all worker are terminated"},{"line_number":137,"context_line":"  then it will terminate the Nova API service."},{"line_number":138,"context_line":""},{"line_number":139,"context_line":"  If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":140,"context_line":"  with \"503 Service Unavailable\" error."}],"source_content_type":"text/x-rst","patch_set":6,"id":"fa8fa6f5_b6c60205","line":137,"updated":"2025-12-01 21:07:28.000000000","message":"What is the behavior for Apache mod_wsgi?\n\nMy understanding is that in that config it intercepts the signals and it will not propagate the SIGTERM to the Python application.\n\nThat means Nova will never notice that it should gracefully shut down in this case,\nas it does not propagate all of them to the application.\n\n\nMy understanding is most installers that use Apache do not use mod_proxy_uwsgi\nto run it.\n\nhttps://github.com/openstack/kolla-ansible/blob/master/ansible/roles/nova/templates/nova-api-wsgi.conf.j2#L34-L49\n\nhttps://github.com/openstack-k8s-operators/nova-operator/blob/main/templates/novaapi/config/httpd.conf#L70-L78\n\nSo we need to consider the semantics of https://modwsgi.readthedocs.io/en/master/\n\nnot just https://httpd.apache.org/docs/2.4/mod/mod_proxy_uwsgi.html\n\n\nLooking quickly, I believe when we run in daemon mode Apache will send the SIGTERM to each child worker and the daemon mode processes.\n\n\nThose children stop accepting new requests and are given a short grace period to finish.\n\n```\nshutdown-timeout\u003dsss\n\n    Defines the maximum number of seconds allowed to pass when waiting for a daemon\n    process to shutdown. When this timeout has been reached the daemon process will\n    be forced to exited even if there are still active requests or it is still\n    running Python exit functions. The shutdown timeout is applied after any\n    graceful restart timeout or eviction timeout if they have been specified.\n    No new requests are accepted during the shutdown timeout is being applied.\n\n    If this option is not defined, then the shutdown timeout will be set to 5\n    seconds. Note that this option does not change the shutdown timeout applied\n    to daemon processes when Apache itself is being stopped or restarted. That\n    timeout value is defined internally to Apache as 3 seconds and cannot be\n    overridden.\n```\n\nSo for mod_wsgi we can really only expect 3-5 seconds before any in-progress API request will be terminated by Apache.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":false,"context_lines":[{"line_number":134,"context_line":"  by SIGTERM, the uWSGI server SIGTERM handler check if there are any"},{"line_number":135,"context_line":"  in-progress request on any worker. It wait for all the workers to finish"},{"line_number":136,"context_line":"  the request and then terminates each worker. 
Once all worker are terminated"},{"line_number":137,"context_line":"  then it will terminate the Nova API service."},{"line_number":138,"context_line":""},{"line_number":139,"context_line":"  If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":140,"context_line":"  with \"503 Service Unavailable\" error."}],"source_content_type":"text/x-rst","patch_set":6,"id":"d75c29c4_3edbed2e","line":137,"in_reply_to":"f80c7984_fee478cb","updated":"2025-12-02 22:05:34.000000000","message":"When Nova is run under Apache, the Apache process being stopped is the primary way that the Nova API is stopped.\n\nSo my point was, for any installation that uses mod_wsgi in daemon mode, for example, the value will be hardcoded to 3 seconds and can\u0027t be extended.\n\nSo for our downstream product or Kolla Ansible, the API will only have 3 seconds maximum to complete the request when the container that is running the API receives a SIGTERM from podman/docker or Kubernetes.\n\nSo we can\u0027t realistically expect most production deployments to have a longer grace time.\n\nGranted, if we can use this functionality to motivate installers to adopt gunicorn or similar, where there is more control over this, then cool, but mod_wsgi is unfortunately the default and uWSGI has been in maintenance mode for the past 3 years, so like eventlet it is something we will eventually have to move away from.\n\n\nhttps://github.com/unbit/uwsgi/commit/5838086dd4490b8a55ff58fc0bf0f108caa4e079\n\nIt\u0027s not all doom because they did do a maintenance release in October, but\nthere are several Python 3.13 bug reports and no official support for 3.13,\n\nso as much as I like uWSGI it should not be the primary WSGI server we design for.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":134,"context_line":"  by SIGTERM, the uWSGI server SIGTERM handler check if there are any"},{"line_number":135,"context_line":"  in-progress request on any worker. It wait for all the workers to finish"},{"line_number":136,"context_line":"  the request and then terminates each worker. Once all worker are terminated"},{"line_number":137,"context_line":"  then it will terminate the Nova API service."},{"line_number":138,"context_line":""},{"line_number":139,"context_line":"  If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":140,"context_line":"  with \"503 Service Unavailable\" error."}],"source_content_type":"text/x-rst","patch_set":6,"id":"f80c7984_fee478cb","line":137,"in_reply_to":"fa8fa6f5_b6c60205","updated":"2025-12-02 04:55:32.000000000","message":"Or unless it is configured with a higher value. 
We are not considering the case of \u0027Apache itself is being stopped or restarted\u0027, but yes, if Apache is stopped, that is out of scope for Nova.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":139,"context_line":"  If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":140,"context_line":"  with \"503 Service Unavailable\" error."},{"line_number":141,"context_line":""},{"line_number":142,"context_line":"  Testing:"},{"line_number":143,"context_line":""},{"line_number":144,"context_line":"  I tested two types of requests:"},{"line_number":145,"context_line":""},{"line_number":146,"context_line":"  #. Sync request: \u0027openstack server list\u0027:"},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"     * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":149,"context_line":"       server list API code."},{"line_number":150,"context_line":"     * Start a API request \u0027request1\u0027: ``openstack server list``"},{"line_number":151,"context_line":"     * Wait till the server list request reaches the Nova API (you can see"},{"line_number":152,"context_line":"       the log from the controller)"},{"line_number":153,"context_line":"     * Because of sleep(10), the server list takes time to finish."},{"line_number":154,"context_line":"     * Initiate the Nova API service shutdown."},{"line_number":155,"context_line":"     * Start a new API request \u0027request2\u0027: ``openstack server list``. This new"},{"line_number":156,"context_line":"       requests came after shutdown is initiated so it should be denied."},{"line_number":157,"context_line":"     * Nova API service will wait because \u0027request1\u0027 is not finished."},{"line_number":158,"context_line":"     * \u0027request1\u0027 will get the response of the server list before the service"},{"line_number":159,"context_line":"       is terminated."},{"line_number":160,"context_line":"     * \u0027request2\u0027 is denied and will receive the error"},{"line_number":161,"context_line":"       \"503 Service Unavailable\""},{"line_number":162,"context_line":""},{"line_number":163,"context_line":"  #. 
Async request: ``openstack server pause \u003cserver\u003e``:"},{"line_number":164,"context_line":""},{"line_number":165,"context_line":"     * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":166,"context_line":"       server pause API code."},{"line_number":167,"context_line":"     * Start a API request \u0027request1\u0027: ``openstack server pause server1``"},{"line_number":168,"context_line":"     * Wait till the pause server request reaches the Nova API (you can see"},{"line_number":169,"context_line":"       the log from the controller)"},{"line_number":170,"context_line":"     * Because of sleep(10), the pause server takes time to finish."},{"line_number":171,"context_line":"     * Initiate the Nova API service shutdown."},{"line_number":172,"context_line":"     * Service will wait because \u0027request1\u0027 is not finished."},{"line_number":173,"context_line":"     * Nova API will make an RPC cast to the Nova compute service and return."},{"line_number":174,"context_line":"     * \u0027request1\u0027 is completed, and the response is returned to the user."},{"line_number":175,"context_line":"     * Nova API service is terminated now."},{"line_number":176,"context_line":"     * Nova compute service is operating the pause server request."},{"line_number":177,"context_line":"     * Check if server is paused ``openstack server list``"},{"line_number":178,"context_line":"     * You can see the server is paused."},{"line_number":179,"context_line":""},{"line_number":180,"context_line":"* Nova console proxy services: nova-novncproxy, nova-serialproxy, and"},{"line_number":181,"context_line":"  nova-spicehtml5proxy:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"befd815d_af654aa7","line":178,"range":{"start_line":142,"start_character":2,"end_line":178,"end_character":40},"updated":"2025-12-01 21:07:28.000000000","message":"I\u0027m conflicted about whether this should be in the spec.\n\nNormally we capture the outcomes of our investigation rather than the process.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":139,"context_line":"  If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":140,"context_line":"  with \"503 Service Unavailable\" error."},{"line_number":141,"context_line":""},{"line_number":142,"context_line":"  Testing:"},{"line_number":143,"context_line":""},{"line_number":144,"context_line":"  I tested two types of requests:"},{"line_number":145,"context_line":""},{"line_number":146,"context_line":"  #. 
Sync request: \u0027openstack server list\u0027:"},{"line_number":147,"context_line":""},{"line_number":148,"context_line":"     * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":149,"context_line":"       server list API code."},{"line_number":150,"context_line":"     * Start a API request \u0027request1\u0027: ``openstack server list``"},{"line_number":151,"context_line":"     * Wait till the server list request reaches the Nova API (you can see"},{"line_number":152,"context_line":"       the log from the controller)"},{"line_number":153,"context_line":"     * Because of sleep(10), the server list takes time to finish."},{"line_number":154,"context_line":"     * Initiate the Nova API service shutdown."},{"line_number":155,"context_line":"     * Start a new API request \u0027request2\u0027: ``openstack server list``. This new"},{"line_number":156,"context_line":"       requests came after shutdown is initiated so it should be denied."},{"line_number":157,"context_line":"     * Nova API service will wait because \u0027request1\u0027 is not finished."},{"line_number":158,"context_line":"     * \u0027request1\u0027 will get the response of the server list before the service"},{"line_number":159,"context_line":"       is terminated."},{"line_number":160,"context_line":"     * \u0027request2\u0027 is denied and will receive the error"},{"line_number":161,"context_line":"       \"503 Service Unavailable\""},{"line_number":162,"context_line":""},{"line_number":163,"context_line":"  #. Async request: ``openstack server pause \u003cserver\u003e``:"},{"line_number":164,"context_line":""},{"line_number":165,"context_line":"     * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":166,"context_line":"       server pause API code."},{"line_number":167,"context_line":"     * Start a API request \u0027request1\u0027: ``openstack server pause server1``"},{"line_number":168,"context_line":"     * Wait till the pause server request reaches the Nova API (you can see"},{"line_number":169,"context_line":"       the log from the controller)"},{"line_number":170,"context_line":"     * Because of sleep(10), the pause server takes time to finish."},{"line_number":171,"context_line":"     * Initiate the Nova API service shutdown."},{"line_number":172,"context_line":"     * Service will wait because \u0027request1\u0027 is not finished."},{"line_number":173,"context_line":"     * Nova API will make an RPC cast to the Nova compute service and return."},{"line_number":174,"context_line":"     * \u0027request1\u0027 is completed, and the response is returned to the user."},{"line_number":175,"context_line":"     * Nova API service is terminated now."},{"line_number":176,"context_line":"     * Nova compute service is operating the pause server request."},{"line_number":177,"context_line":"     * Check if server is paused ``openstack server list``"},{"line_number":178,"context_line":"     * You can see the server is paused."},{"line_number":179,"context_line":""},{"line_number":180,"context_line":"* Nova console proxy services: nova-novncproxy, nova-serialproxy, and"},{"line_number":181,"context_line":"  nova-spicehtml5proxy:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"19ee97db_b43761be","line":178,"range":{"start_line":142,"start_character":2,"end_line":178,"end_character":40},"in_reply_to":"befd815d_af654aa7","updated":"2025-12-02 04:55:32.000000000","message":"Well, this is additional information on how I tested the API service, 
and adding more info about implementation or testing is always beneficial for review.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":198,"context_line":""},{"line_number":199,"context_line":"  This way, user console sessions will be terminated gracefully, and they will"},{"line_number":200,"context_line":"  get \"Disconnected\" message. Once service is up, the user can refresh the"},{"line_number":201,"context_line":"  browser, and the console will be up again (if the token has not expired)."},{"line_number":202,"context_line":""},{"line_number":203,"context_line":"Step 1: Split the new and in-progress requests via RPC:"},{"line_number":204,"context_line":"-------------------------------------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"4d12d4a8_a5310179","line":201,"updated":"2025-12-01 16:20:56.000000000","message":"OK. That is acceptable for me.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":198,"context_line":""},{"line_number":199,"context_line":"  This way, user console sessions will be terminated gracefully, and they will"},{"line_number":200,"context_line":"  get \"Disconnected\" message. 
Once service is up, the user can refresh the"},{"line_number":201,"context_line":"  browser, and the console will be up again (if the token has not expired)."},{"line_number":202,"context_line":""},{"line_number":203,"context_line":"Step 1: Split the new and in-progress requests via RPC:"},{"line_number":204,"context_line":"-------------------------------------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"c470c1e8_69b7dc5d","line":201,"in_reply_to":"4d12d4a8_a5310179","updated":"2025-12-01 21:07:28.000000000","message":"Ya, I think this is OK as well.\n\nFrom a client perspective it won\u0027t really be different from how we now support terminating the connection when the token expires.\n\nhttps://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/enforce-remote-console-session-timeout.html","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":198,"context_line":""},{"line_number":199,"context_line":"  This way, user console sessions will be terminated gracefully, and they will"},{"line_number":200,"context_line":"  get \"Disconnected\" message. Once service is up, the user can refresh the"},{"line_number":201,"context_line":"  browser, and the console will be up again (if the token has not expired)."},{"line_number":202,"context_line":""},{"line_number":203,"context_line":"Step 1: Split the new and in-progress requests via RPC:"},{"line_number":204,"context_line":"-------------------------------------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"1c3fbc77_ee7a7e6c","line":201,"in_reply_to":"c470c1e8_69b7dc5d","updated":"2025-12-02 04:55:32.000000000","message":"Acknowledged","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":223,"context_line":""},{"line_number":224,"context_line":"  * stop():"},{"line_number":225,"context_line":""},{"line_number":226,"context_line":"    * It will stop the listener to the pick any new message from the queue, but"},{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."}],"source_content_type":"text/x-rst","patch_set":6,"id":"81b1744b_0b985514","line":226,"range":{"start_line":226,"start_character":35,"end_line":226,"end_character":38},"updated":"2025-12-01 16:20:56.000000000","message":"nit: remove \"the\"","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":223,"context_line":""},{"line_number":224,"context_line":"  * stop():"},{"line_number":225,"context_line":""},{"line_number":226,"context_line":"    * It will stop the listener to the pick any new message from the queue, but"},{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."}],"source_content_type":"text/x-rst","patch_set":6,"id":"97cd82af_9a82998a","line":226,"range":{"start_line":226,"start_character":35,"end_line":226,"end_character":38},"in_reply_to":"81b1744b_0b985514","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":223,"context_line":""},{"line_number":224,"context_line":"  * stop():"},{"line_number":225,"context_line":""},{"line_number":226,"context_line":"    * It will stop the listener to the pick any new message from the queue, but"},{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."},{"line_number":230,"context_line":"    * It will 
not stop sending RPC clients to send messages to the queue. They"}],"source_content_type":"text/x-rst","patch_set":6,"id":"dfe223f9_0a4a1d92","line":227,"range":{"start_line":226,"start_character":6,"end_line":227,"end_character":65},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n    * It will disable the listener\u0027s ability to pick up any new message from the queue, but\n      will dispatch the already retrieved message to the dispatcher.\n```","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":223,"context_line":""},{"line_number":224,"context_line":"  * stop():"},{"line_number":225,"context_line":""},{"line_number":226,"context_line":"    * It will stop the listener to the pick any new message from the queue, but"},{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."},{"line_number":230,"context_line":"    * It will not stop sending RPC clients to send messages to the queue. They"}],"source_content_type":"text/x-rst","patch_set":6,"id":"b023ca53_2c9a2a98","line":227,"range":{"start_line":226,"start_character":6,"end_line":227,"end_character":65},"in_reply_to":"dfe223f9_0a4a1d92","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."},{"line_number":230,"context_line":"    * It will not stop sending RPC clients to send messages to the queue. 
They"},{"line_number":231,"context_line":"      will not be picked because the consumer and listener are stopped."},{"line_number":232,"context_line":"  * wait():"},{"line_number":233,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"14ae448d_68bde56e","line":230,"range":{"start_line":230,"start_character":23,"end_line":230,"end_character":30},"in_reply_to":"093bb24f_87f1ea45","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":227,"context_line":"      will dispatch the already picked message to the dispatcher."},{"line_number":228,"context_line":"    * It will delete the consumer."},{"line_number":229,"context_line":"    * It will not delete the queues and exchange on the message broker side."},{"line_number":230,"context_line":"    * It will not stop sending RPC clients to send messages to the queue. They"},{"line_number":231,"context_line":"      will not be picked because the consumer and listener are stopped."},{"line_number":232,"context_line":"  * wait():"},{"line_number":233,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"3744c8bd_d15da442","line":230,"range":{"start_line":230,"start_character":23,"end_line":230,"end_character":30},"in_reply_to":"093bb24f_87f1ea45","updated":"2025-12-01 21:07:28.000000000","message":"it already does \n\nhttps://github.com/openstack/nova/blob/master/nova/service.py#L284\n\nthe change here is that today it shut down the eitner rpc server where as this change is plannign to only shutdown part of it.\n\n\n```suggestion\n    * It will not stop RPC clients sending new messages to the queue, however, they\n```","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":237,"context_line":""},{"line_number":238,"context_line":"Analysis per services and the required proposed RPC design change:"},{"line_number":239,"context_line":""},{"line_number":240,"context_line":"* The below services do not use the RPC so no change needed:"},{"line_number":241,"context_line":""},{"line_number":242,"context_line":"  * Nova API"},{"line_number":243,"context_line":"  * Nova metadata API"}],"source_content_type":"text/x-rst","patch_set":6,"id":"ef2d63e4_e02ae392","line":240,"range":{"start_line":240,"start_character":21,"end_line":240,"end_character":39},"updated":"2025-12-01 16:20:56.000000000","message":"I think what you meant here is \"do not implement and RPC server\"\n\nThe services still have RPC clients to call other services like the conductor or the compute.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":237,"context_line":""},{"line_number":238,"context_line":"Analysis per services and the required proposed RPC design change:"},{"line_number":239,"context_line":""},{"line_number":240,"context_line":"* The below services do not use the RPC so no change 
needed:"},{"line_number":241,"context_line":""},{"line_number":242,"context_line":"  * Nova API"},{"line_number":243,"context_line":"  * Nova metadata API"}],"source_content_type":"text/x-rst","patch_set":6,"id":"b16bdbb3_17909cb2","line":240,"range":{"start_line":240,"start_character":21,"end_line":240,"end_character":39},"in_reply_to":"ef2d63e4_e02ae392","updated":"2025-12-01 21:07:28.000000000","message":"+1","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":237,"context_line":""},{"line_number":238,"context_line":"Analysis per services and the required proposed RPC design change:"},{"line_number":239,"context_line":""},{"line_number":240,"context_line":"* The below services do not use the RPC so no change needed:"},{"line_number":241,"context_line":""},{"line_number":242,"context_line":"  * Nova API"},{"line_number":243,"context_line":"  * Nova metadata API"}],"source_content_type":"text/x-rst","patch_set":6,"id":"24333465_2f20eb35","line":240,"range":{"start_line":240,"start_character":21,"end_line":240,"end_character":39},"in_reply_to":"ef2d63e4_e02ae392","updated":"2025-12-02 04:55:32.000000000","message":"right, I mean RPC server here. done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":260,"context_line":"    request."},{"line_number":261,"context_line":""},{"line_number":262,"context_line":"  * Response handling:"},{"line_number":263,"context_line":"    Whenever there is a RPC call, oslo.messaging creates another reply queue"},{"line_number":264,"context_line":"    connected with the unique message id. This reply queue will be used to"},{"line_number":265,"context_line":"    send the RPC call response to the caller. Even RPC server is stopped on"},{"line_number":266,"context_line":"    this worker, it will not impact the reply queue."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":"    We still need to keep the worker up until all the responses are sent via"},{"line_number":269,"context_line":"    the reply queue, and for that, we need to implement the in-progress task"}],"source_content_type":"text/x-rst","patch_set":6,"id":"66e66ef1_f8bbab0e","line":266,"range":{"start_line":263,"start_character":4,"end_line":266,"end_character":52},"updated":"2025-12-01 21:07:28.000000000","message":"```suggestion\n    Whenever there is a RPC call, oslo.messaging creates another reply queue\n    connected with the unique message id. This reply queue will be used to\n    send the RPC call response to the caller. 
Even if the RPC server is stopped on\n    this worker, it will not impact the reply queue.\n```","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":260,"context_line":"    request."},{"line_number":261,"context_line":""},{"line_number":262,"context_line":"  * Response handling:"},{"line_number":263,"context_line":"    Whenever there is a RPC call, oslo.messaging creates another reply queue"},{"line_number":264,"context_line":"    connected with the unique message id. This reply queue will be used to"},{"line_number":265,"context_line":"    send the RPC call response to the caller. Even RPC server is stopped on"},{"line_number":266,"context_line":"    this worker, it will not impact the reply queue."},{"line_number":267,"context_line":""},{"line_number":268,"context_line":"    We still need to keep the worker up until all the responses are sent via"},{"line_number":269,"context_line":"    the reply queue, and for that, we need to implement the in-progress task"}],"source_content_type":"text/x-rst","patch_set":6,"id":"9744f58f_b6213622","line":266,"range":{"start_line":263,"start_character":4,"end_line":266,"end_character":52},"in_reply_to":"66e66ef1_f8bbab0e","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":false,"context_lines":[{"line_number":267,"context_line":""},{"line_number":268,"context_line":"    We still need to keep the worker up until all the responses are sent via"},{"line_number":269,"context_line":"    the reply queue, and for that, we need to implement the in-progress task"},{"line_number":270,"context_line":"    tracking in scheduler services, but that will be handled in step 2."},{"line_number":271,"context_line":""},{"line_number":272,"context_line":"  This way, stopping a Nova scheduler worker will not impact the RPC"},{"line_number":273,"context_line":"  communication on the scheduler service."}],"source_content_type":"text/x-rst","patch_set":6,"id":"d907cd22_2272b500","line":270,"updated":"2025-12-01 21:07:28.000000000","message":"This is a good thing to expose via a healthcheck/metrics endpoint at some future time to improve the observability of Nova.\n\nKnowing how many RPCs are in flight is in general a good indicator of the overall load/performance of the deployment.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change 
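The in-progress task tracking and the in-flight RPC metric mentioned above could be as simple as a shared counter that the graceful wait drains and a healthcheck exports; the sketch below is purely illustrative (the class and method names are made up for the example), not the proposed Nova implementation:

```python
# Purely illustrative in-flight tracker: wraps RPC handler methods,
# lets the shutdown path block until drained, and exposes a gauge a
# future metrics/healthcheck endpoint could report.
import functools
import threading

class InFlightTracker:
    def __init__(self):
        self._count = 0
        self._cond = threading.Condition()

    def track(self, fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            with self._cond:
                self._count += 1
            try:
                return fn(*args, **kwargs)
            finally:
                with self._cond:
                    self._count -= 1
                    self._cond.notify_all()
        return wrapper

    @property
    def in_flight(self):
        # e.g. exported as a gauge for observability
        with self._cond:
            return self._count

    def wait_for_drain(self, timeout=None):
        # returns False if the configured timeout expired first, at
        # which point the remaining operations should be logged
        with self._cond:
            return self._cond.wait_for(lambda: self._count == 0, timeout)
```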
needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"448be53f_428ecd5a","line":284,"updated":"2025-12-01 16:20:56.000000000","message":"What happens in the following scenario:\n1. conductor receives an RPC cast from nova-api in T1 RPC handler thread.\n2. conductor sends an RPC call to a compute and starts waiting for the response in T1.\n3. conductor graceful shutdown is initiated \n4. conductor RPC server is stopped so no new messages can be received by the conductor.\n5. conductor service starts waiting for all the RPC handler threads to finish. So it waits for T1 to finish.\n6. compute sends the RPC response to conductor.\n\nWill T1 receive the RPC call response at step 7 even though the RPC server in the conductor is sopped at step 4?\n\nThere are hints below in the compute section that the RPC reply is not affected by the RPC server shutdown. So probably it is not a problem. I just want to make sure that how I understand the compute section is applicable here as well.\n\n(The difference between conductor and scheduler is that scheduler is just and RPC server, but conductor both an RPC server and an RPC client as well)","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"6335710b_407f4184","line":284,"in_reply_to":"1183c4ff_58382326","updated":"2025-12-02 04:55:32.000000000","message":"Yes, it will be same as compute. Whenever there is a RPC call, oslo.messaging create a separate reply queue with a unique msg id and that queue is used to reply back to the RPC call. 
Even if the RPC server is stopped and will not pick any new request from its queue, the reply queue will still be used to return the response.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"1183c4ff_58382326","line":284,"in_reply_to":"448be53f_428ecd5a","updated":"2025-12-01 21:07:28.000000000","message":"So up to 6 there is no problem.\n\nFor 6 to proceed you need a second conductor process to handle new RPC requests from the compute created during the processing of the cast or call.\n\n\nIf we only have one conductor, however, it will deadlock, because the request from the compute in 6 will not be processed, so the call in 2 will time out.\n\nThat is probably OK in phase 1.\n\nIdeally we would only shut down the ability to start new tasks.\nWe would ideally continue to support all object operations like instance.save() and\nany other DB operation on an object that the compute does.\n\nBasically, I don\u0027t think we can shut down the handling of\n@base.remotable methods on objects.\n\nhttps://github.com/openstack/nova/blob/master/nova/objects/instance.py#L785-L890\n\n\nFor the conductor, the only RPC endpoints that should be shut down are the ComputeTaskAPI ones:\n\nhttps://github.com/openstack/nova/blob/master/nova/conductor/api.py#L82\n\nFor the scheduler, I think it would only be select_destinations:\n\nhttps://github.com/openstack/nova/blob/23b462d77df1a1d09c43d0918bca853ef3af1e3f/nova/scheduler/manager.py#L160\n\nFor the compute service it\u0027s a subset of the public methods in\n\nhttps://github.com/openstack/nova/blob/23b462d77df1a1d09c43d0918bca853ef3af1e3f/nova/compute/rpcapi.py\n\nThose are the RPC endpoints that start new work.\n\nI say subset as there are some APIs like check_can_live_migrate_destination\n\nthat are part of an ongoing move operation but do not start one.\n\nSeparating the compute API will be the harder part, as we need to go function by function.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d81f74af7e8e3cf248563dd5177e72ff64ba15ea","unresolved":false,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"5137d741_630e7ea0","line":284,"in_reply_to":"6335710b_407f4184","updated":"2025-12-02 20:19:19.000000000","message":"Thanks for confirming 
it","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"c3326426_a54ccbd0","line":284,"in_reply_to":"6335710b_407f4184","updated":"2025-12-02 22:05:34.000000000","message":"i don\u0027t see any test related to ensuring that remotbale object method for db operation will be handled by the new rpc server that will not be stopped.\n\ni guess your ok with this dead lockign in the case of only 1 conductor?\n\nif so we shoudl call out that limitation in the spec.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":281,"context_line":"  available workers will proceed with the request."},{"line_number":282,"context_line":""},{"line_number":283,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":284,"context_line":"  the scheduler."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"* Nova compute: RPC design change needed"},{"line_number":287,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"d934695b_84ab864f","line":284,"in_reply_to":"c3326426_a54ccbd0","updated":"2025-12-02 23:27:11.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":306,"context_line":"    * Each compute will have a new 2nd RPC server which will listen to a new"},{"line_number":307,"context_line":"      topic ``compoute-ops.\u003chost\u003e``. 
,{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":306,"context_line":"    * Each compute will have a new 2nd RPC server which will listen to a new"},{"line_number":307,"context_line":"      topic ``compoute-ops.\u003chost\u003e``. ``compoute-ops`` name is used because it"},{"line_number":308,"context_line":"      is mainly for compute operations, but a better name can be used if"},{"line_number":309,"context_line":"      needed."},{"line_number":310,"context_line":"    * It will use the same transport layer/bus and exchange that the 1st RPC"},{"line_number":311,"context_line":"      server uses."},{"line_number":312,"context_line":"    * It will create its own dispatcher, listener, and queue."}],"source_content_type":"text/x-rst","patch_set":6,"id":"b356512f_7ff092aa","line":309,"updated":"2025-12-01 21:07:28.000000000","message":"so this second rpc server will handle calls like detach_volume but not\ncheck_can_live_migrate_source, which will be handled by the first rpc server, right?\n\nonly the second rpc server will be shut down on service stop, correct?","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":306,"context_line":"    * Each compute will have a new 2nd RPC server which will listen to a new"},{"line_number":307,"context_line":"      topic ``compoute-ops.\u003chost\u003e``. ``compoute-ops`` name is used because it"},{"line_number":308,"context_line":"      is mainly for compute operations, but a better name can be used if"},{"line_number":309,"context_line":"      needed."},{"line_number":310,"context_line":"    * It will use the same transport layer/bus and exchange that the 1st RPC"},{"line_number":311,"context_line":"      server uses."},{"line_number":312,"context_line":"    * It will create its own dispatcher, listener, and queue."}],"source_content_type":"text/x-rst","patch_set":6,"id":"74c7e2d4_4c570010","line":309,"in_reply_to":"b356512f_7ff092aa","updated":"2025-12-02 04:55:32.000000000","message":"the first RPC server will be shut down immediately (so no more new requests) whenever shutdown is initiated, and the 2nd RPC server will stay up to finish the in-progress tasks, which might need compute-to-compute calls or a few more.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
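
A sketch of the proposed second server (the topic is rendered here as ``compute-ops``, which the current draft literally spells ``compoute-ops``; the manager stub is illustrative): both servers share one transport, and therefore one bus and exchange, while each get_rpc_server() call builds its own dispatcher, listener, and queue::

    import oslo_messaging as messaging
    from oslo_config import cfg

    transport = messaging.get_rpc_transport(cfg.CONF)  # shared transport/bus
    host = 'compute-1'  # illustrative hostname

    class ComputeManagerStub(object):
        """Hypothetical stand-in for the real compute manager endpoints."""
        def detach_volume(self, ctxt, **kwargs): ...
        def post_live_migration_at_destination(self, ctxt, **kwargs): ...

    manager = ComputeManagerStub()  # the spec says both servers share it

    # 1st (existing) server: API-triggered operations; stopped first.
    server1 = messaging.get_rpc_server(
        transport, messaging.Target(topic='compute', server=host),
        [manager], executor='threading')

    # 2nd (new) server: same manager, new topic; kept up during shutdown.
    server2 = messaging.get_rpc_server(
        transport, messaging.Target(topic='compute-ops', server=host),
        [manager], executor='threading')

    server1.start()
    server2.start()
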
,{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":315,"context_line":"      the same compute manager."},{"line_number":316,"context_line":"    * This server will be mainly used for the compute-to-compute operations and"},{"line_number":317,"context_line":"      server external events. The idea is to keep this RPC server up during"},{"line_number":318,"context_line":"      shutdown so that the in-progress operations can be finished."},{"line_number":319,"context_line":"    * In shutdown, nova.service will wait for the compute to tell if they"},{"line_number":320,"context_line":"      finished all their tasks, so that it can stop the 2nd RPC server and"},{"line_number":321,"context_line":"      finish the shutdown."}],"source_content_type":"text/x-rst","patch_set":6,"id":"9f4e80dd_a9cf9a14","line":318,"updated":"2025-12-01 21:07:28.000000000","message":"if that is the intent then compute-ops is not a good name.\n\ni would expect this to model the RPC functions that directly map to an api action,\n\nnot compute to compute calls.\n\ni think we should specify exactly which rpc calls will be handled by each server so it is clear what they will manage.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":315,"context_line":"      the same compute manager."},{"line_number":316,"context_line":"    * This server will be mainly used for the compute-to-compute operations and"},{"line_number":317,"context_line":"      server external events. The idea is to keep this RPC server up during"},{"line_number":318,"context_line":"      shutdown so that the in-progress operations can be finished."},{"line_number":319,"context_line":"    * In shutdown, nova.service will wait for the compute to tell if they"},{"line_number":320,"context_line":"      finished all their tasks, so that it can stop the 2nd RPC server and"},{"line_number":321,"context_line":"      finish the shutdown."}],"source_content_type":"text/x-rst","patch_set":6,"id":"9e67ba55_3955307c","line":318,"in_reply_to":"9f4e80dd_a9cf9a14","updated":"2025-12-02 04:55:32.000000000","message":"It is more than compute to compute calls. Details are in a further section. I have added a draft list, but we will make it a complete list during implementation.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":332,"context_line":"    * SIGTERM signal is handled by oslo.service, it will call stop on"},{"line_number":333,"context_line":"      nova.service"},{"line_number":334,"context_line":"    * nova.service will stop the 1st RPC server so that no new requests are"},{"line_number":335,"context_line":"      picked by the compute. The 2nd RPC server is running and up."},{"line_number":336,"context_line":"    * nova.service will wait for the manager to signal once all in-progress"},{"line_number":337,"context_line":"      operations are finished."},{"line_number":338,"context_line":"    * Once compute signal to nova.service, then it will stop the 2nd RPC server"}],"source_content_type":"text/x-rst","patch_set":6,"id":"7c182496_022882f5","line":335,"updated":"2025-12-01 21:07:28.000000000","message":"i generally agree with this, each server handling different things.\n\ni.e. detach_volume is an op in my mind. post_live_migration is an internal call that is part of an op, so i was not expecting it to be part of the compute-ops topic.\n\nit would be more appropriate for compute-internal.\u003chost\u003e\n\ni assume you\u0027re proposing it this way so that the majority of the rpc endpoints remain on the topic we use today, presumably for upgrade reasons?","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":332,"context_line":"    * SIGTERM signal is handled by oslo.service, it will call stop on"},{"line_number":333,"context_line":"      nova.service"},{"line_number":334,"context_line":"    * nova.service will stop the 1st RPC server so that no new requests are"},{"line_number":335,"context_line":"      picked by the compute. The 2nd RPC server is running and up."},{"line_number":336,"context_line":"    * nova.service will wait for the manager to signal once all in-progress"},{"line_number":337,"context_line":"      operations are finished."},{"line_number":338,"context_line":"    * Once compute signal to nova.service, then it will stop the 2nd RPC server"}],"source_content_type":"text/x-rst","patch_set":6,"id":"f6ed732b_d1e38568","line":335,"in_reply_to":"7c182496_022882f5","updated":"2025-12-02 04:55:32.000000000","message":"I thought about compute-internal, but it is not used only for compute internal tasks; it covers more than that, for example, server external events, periodic tasks, and compute to compute talk.\n\nI chose *ops* in \"compute-ops\", which denotes \u0027operations\u0027. I am ok with a better name than this, but it is not just for internal things.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":413,"context_line":""},{"line_number":414,"context_line":"    * compute service: 1500 sec, considering long-running operations on compute."},{"line_number":415,"context_line":"    * conductor service: 60 sec should be enough."},{"line_number":416,"context_line":"    * scheduler service: 60 sec should be enough."},{"line_number":417,"context_line":""},{"line_number":418,"context_line":"* PoC:"},{"line_number":419,"context_line":"  This PoC shows the working of the Step 1 proposal."}],"source_content_type":"text/x-rst","patch_set":6,"id":"3788b991_a647e986","line":416,"updated":"2025-12-01 21:07:28.000000000","message":"i would set 0 for all and not have an internal timeout by default.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
proposal."}],"source_content_type":"text/x-rst","patch_set":6,"id":"8a31d7b4_b11beb45","line":416,"in_reply_to":"3788b991_a647e986","updated":"2025-12-02 04:55:32.000000000","message":"this is not the same as graceful timeout, this section is about time-based wait system- how much wait service will do to let service to finish the tasks. This is replaced by tracker system in Step2.\n\n0 for them means \u0027no wait at all and immediately stop service\u0027","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":false,"context_lines":[{"line_number":413,"context_line":""},{"line_number":414,"context_line":"    * compute service: 1500 sec, considering long-running operations on compute."},{"line_number":415,"context_line":"    * conductor service: 60 sec should be enough."},{"line_number":416,"context_line":"    * scheduler service: 60 sec should be enough."},{"line_number":417,"context_line":""},{"line_number":418,"context_line":"* PoC:"},{"line_number":419,"context_line":"  This PoC shows the working of the Step 1 proposal."}],"source_content_type":"text/x-rst","patch_set":6,"id":"8d817192_fd6e25cf","line":416,"in_reply_to":"8a31d7b4_b11beb45","updated":"2025-12-02 22:05:34.000000000","message":"i guess we dont have a -1 for wait for ever or  external termination via sigkill?\n\nthat is the semantics i think we shoudl default to.\n\ni dont think oslo sevice shoudl be orchstrating this in general so i hope there is a way to disable that today.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":false,"context_lines":[{"line_number":463,"context_line":"* The below services need to implement the tracking system:"},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"  * Nova compute"},{"line_number":466,"context_line":"  * Nova conductor"},{"line_number":467,"context_line":"  * Nova scheduler"},{"line_number":468,"context_line":""},{"line_number":469,"context_line":"This proposal is to make the service wait time based on tracking the"},{"line_number":470,"context_line":"in-progress tasks. Once the service finishes the tasks, then they can signal"},{"line_number":471,"context_line":"to nova.service to proceed with shutting down the service. Basically, this"}],"source_content_type":"text/x-rst","patch_set":6,"id":"294ea3d1_3c482a1c","line":468,"range":{"start_line":466,"start_character":0,"end_line":468,"end_character":0},"updated":"2025-12-01 16:20:56.000000000","message":"I\u0027m wondering if just waiting for all the RPC handlers to finish while stopping the RPC server to take up new request would be enough for these services. In my limited understanding a task in these services always maps 1:1 to an executing RPC handler thread.\n\n-- later --\n\nProbably we still want to track the tasks by name to be able to log the task names as in-progress tasks.\n\nSo nothing is needs to change here really. 
,{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":false,"context_lines":[{"line_number":463,"context_line":"* The below services need to implement the tracking system:"},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"  * Nova compute"},{"line_number":466,"context_line":"  * Nova conductor"},{"line_number":467,"context_line":"  * Nova scheduler"},{"line_number":468,"context_line":""},{"line_number":469,"context_line":"This proposal is to make the service wait time based on tracking the"},{"line_number":470,"context_line":"in-progress tasks. Once the service finishes the tasks, then they can signal"},{"line_number":471,"context_line":"to nova.service to proceed with shutting down the service. Basically, this"}],"source_content_type":"text/x-rst","patch_set":6,"id":"294ea3d1_3c482a1c","line":468,"range":{"start_line":466,"start_character":0,"end_line":468,"end_character":0},"updated":"2025-12-01 16:20:56.000000000","message":"I\u0027m wondering if just waiting for all the RPC handlers to finish while stopping the RPC server from taking up new requests would be enough for these services. In my limited understanding a task in these services always maps 1:1 to an executing RPC handler thread.\n\n-- later --\n\nProbably we still want to track the tasks by name to be able to log the task names as in-progress tasks.\n\nSo nothing really needs to change here. Just an observation from me.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":false,"context_lines":[{"line_number":463,"context_line":"* The below services need to implement the tracking system:"},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"  * Nova compute"},{"line_number":466,"context_line":"  * Nova conductor"},{"line_number":467,"context_line":"  * Nova scheduler"},{"line_number":468,"context_line":""},{"line_number":469,"context_line":"This proposal is to make the service wait time based on tracking the"},{"line_number":470,"context_line":"in-progress tasks. Once the service finishes the tasks, then they can signal"},{"line_number":471,"context_line":"to nova.service to proceed with shutting down the service. Basically, this"}],"source_content_type":"text/x-rst","patch_set":6,"id":"b2609a49_c41b78b9","line":468,"range":{"start_line":466,"start_character":0,"end_line":468,"end_character":0},"in_reply_to":"294ea3d1_3c482a1c","updated":"2025-12-02 22:05:34.000000000","message":"or by request_id, perhaps both.\n\nideally we would be able to correlate the ongoing task with the api request it was created from.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":473,"context_line":""},{"line_number":474,"context_line":"* There will be a tracker introduced to track the in-progress tasks."},{"line_number":475,"context_line":"* It will be a singleton object."},{"line_number":476,"context_line":"* It maintains a list of \u0027method names\u0027 and UUID (instance UUID if the task is"},{"line_number":477,"context_line":"  related to the instance). UUID will help to track multiple calls to the same"},{"line_number":478,"context_line":"  method."},{"line_number":479,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."}],"source_content_type":"text/x-rst","patch_set":6,"id":"0e39925f_59b3842c","line":478,"range":{"start_line":476,"start_character":0,"end_line":478,"end_character":9},"updated":"2025-12-01 16:20:56.000000000","message":"I think request-id would be better from the context of the RPC request as it helps correlating the original RPC logs with the in-progress task logging. Also it always exists and is unique.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":473,"context_line":""},{"line_number":474,"context_line":"* There will be a tracker introduced to track the in-progress tasks."},{"line_number":475,"context_line":"* It will be a singleton object."},{"line_number":476,"context_line":"* It maintains a list of \u0027method names\u0027 and UUID (instance UUID if the task is"},{"line_number":477,"context_line":"  related to the instance). UUID will help to track multiple calls to the same"},{"line_number":478,"context_line":"  method."},{"line_number":479,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."}],"source_content_type":"text/x-rst","patch_set":6,"id":"42a6b267_6be8e6be","line":478,"range":{"start_line":476,"start_character":0,"end_line":478,"end_character":9},"in_reply_to":"0e39925f_59b3842c","updated":"2025-12-02 04:55:32.000000000","message":"request-id is a good idea. We can always log tasks with the request-id for uniqueness, and if there are instance-related tasks, then adding the instance UUID along with the request ID will help with quick filtering of a particular instance\u0027s in-progress tasks.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":false,"context_lines":[{"line_number":473,"context_line":""},{"line_number":474,"context_line":"* There will be a tracker introduced to track the in-progress tasks."},{"line_number":475,"context_line":"* It will be a singleton object."},{"line_number":476,"context_line":"* It maintains a list of \u0027method names\u0027 and UUID (instance UUID if the task is"},{"line_number":477,"context_line":"  related to the instance). UUID will help to track multiple calls to the same"},{"line_number":478,"context_line":"  method."},{"line_number":479,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."}],"source_content_type":"text/x-rst","patch_set":6,"id":"04f25d2e_2c8490e4","line":478,"range":{"start_line":476,"start_character":0,"end_line":478,"end_character":9},"in_reply_to":"42a6b267_6be8e6be","updated":"2025-12-02 22:05:34.000000000","message":"+1\n\nagain, in the future i would like to be able to iterate over these and expose it as a list of ongoing tasks in a healthcheck endpoint eventually.\n\nthe request id is the most important info for me, although knowing what operation that request is performing at a glance via the top level method name is also nice to know.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
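
A minimal sketch of the tracker described above, keyed by request-id with an optional instance UUID as agreed in this thread (class and method names are illustrative, not the PoC's): a lock-protected singleton whose snapshot can later feed periodic logging or a healthcheck endpoint::

    import threading

    class InProgressTaskTracker(object):
        """Illustrative singleton tracking in-progress tasks by request-id."""

        _instance = None
        _instance_lock = threading.Lock()

        def __init__(self):
            self._lock = threading.Lock()
            self._tasks = {}  # request_id -> (method_name, instance_uuid or None)

        @classmethod
        def get(cls):
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = cls()
                return cls._instance

        def add(self, request_id, method_name, instance_uuid=None):
            with self._lock:
                self._tasks[request_id] = (method_name, instance_uuid)

        def remove(self, request_id):
            with self._lock:
                self._tasks.pop(request_id, None)

        def snapshot(self):
            # Copy under lock so logging/healthcheck code can iterate safely.
            with self._lock:
                return dict(self._tasks)
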
,{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"4f4f3a3f_2e05c8e0","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"updated":"2025-12-01 16:20:56.000000000","message":"Hm. So far I assumed that tasks in this context only meant RPC requests. But here you mention periodic tasks. Those are not triggered by RPC requests. I\u0027m OK not to run any periodics during graceful shutdown. But if there is one ongoing when the graceful shutdown starts then that periodic should probably finish. I\u0027m not sure how oslo.service handles periodics during its own graceful shutdown logic.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"a032eb66_c042226c","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"in_reply_to":"4f4f3a3f_2e05c8e0","updated":"2025-12-01 21:07:28.000000000","message":"ya, i could see it being either way.\nif we shut them down early it means that the compute agent is more responsive in completing the rpc triggered ones.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d81f74af7e8e3cf248563dd5177e72ff64ba15ea","unresolved":true,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"64b3913b_690c8d14","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"in_reply_to":"5cf15eda_4eb6dd01","updated":"2025-12-02 20:19:19.000000000","message":"OK. I vote for preventing **any new** periodic task from starting during graceful shutdown, but any that is ongoing should be waited for to finish. I suggest this to avoid the situation of periodic tasks making the graceful shutdown never converge. E.g. while waiting for Task A to finish, Task B hits its period and starts, then while waiting for Task B to finish, Task A hits its period and starts, etc.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"66a3a80493bbd602b324a398edf1ceb7a6a65f59","unresolved":true,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"ffbac5c2_b578676e","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"in_reply_to":"64b3913b_690c8d14","updated":"2025-12-02 20:31:14.000000000","message":"Makes sense to me.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"5cf15eda_4eb6dd01","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"in_reply_to":"a032eb66_c042226c","updated":"2025-12-02 04:55:32.000000000","message":"we will go through each one of them and decide. The main idea is to \u0027let compute finish things before shutdown\u0027, and those are mostly RPC calls, but not only; periodic tasks are a good example, and some of them can be good to finish before shutdown. Maybe _sync_power_states() and _cleanup_incomplete_migrations() can be good to finish? And all polling periodic tasks can be denied.\n\noslo.service shutdown logic does not consider whether a task is periodic; it is up to the services to handle those.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"823956f767be6007406f91ccec3ac1674033382c","unresolved":false,"context_lines":[{"line_number":480,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":481,"context_line":"  done under lock."},{"line_number":482,"context_line":"* Once shutdown is initiated:"},{"line_number":483,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":484,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"},{"line_number":485,"context_line":"  they are not critical to handle during shutdown( periodic tasks of purge"},{"line_number":486,"context_line":"  deleted instances). An exact list of tasks which will be rejected"},{"line_number":487,"context_line":"  and accepted will be decided during implementation."},{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"}],"source_content_type":"text/x-rst","patch_set":6,"id":"04d34c45_0c66b2a7","line":486,"range":{"start_line":483,"start_character":65,"end_line":486,"end_character":21},"in_reply_to":"ffbac5c2_b578676e","updated":"2025-12-02 22:18:52.000000000","message":"+1, I think it makes sense just to finish the in-progress periodic tasks and reject any new ones starting during shutdown.\n\nI will mention that in the spec.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
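
A sketch of the periodic-task gating agreed above (names are illustrative): new periodic runs are rejected once shutdown starts, while a run that is already in flight is tracked like any other task and waited for::

    import threading

    shutting_down = threading.Event()
    tracker = InProgressTaskTracker.get()  # the illustrative tracker sketched earlier

    def run_periodic(request_id, name, func):
        if shutting_down.is_set():
            # Reject new periodic runs during graceful shutdown so the
            # wait for in-progress tasks can converge.
            return
        tracker.add(request_id, name)
        try:
            func()
        finally:
            tracker.remove(request_id)
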
,{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"58c5de89_3f928471","line":491,"range":{"start_line":491,"start_character":29,"end_line":491,"end_character":35},"updated":"2025-12-01 16:20:56.000000000","message":"nit:tracker","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":488,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"}],"source_content_type":"text/x-rst","patch_set":6,"id":"c394612f_ae2f569b","line":491,"range":{"start_line":491,"start_character":29,"end_line":491,"end_character":35},"in_reply_to":"58c5de89_3f928471","updated":"2025-12-02 04:55:32.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"78a81645_4a177eea","line":492,"updated":"2025-12-01 16:20:56.000000000","message":"I\u0027m wondering how this fits into the picture of calling RPC server.wait(). As far as I understand, that blocks and waits for all the executing RPC handlers to finish. Will we not call wait() but instead poll the tracker\u0027s state?","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"d2519f43_dab0a40b","line":492,"in_reply_to":"78a81645_4a177eea","updated":"2025-12-02 04:55:32.000000000","message":"we have to do both.\n\nRPCserver.wait() will wait for all the tasks to be completed by the manager, but we can call it only after RPCserver.stop(), otherwise new requests on that RPC server will keep coming. So we will stop the 1st RPC server and wait on it, which will finish the tasks requested on the 1st RPC server, but we will keep the 2nd RPC server up until the nova service task tracker finishes; for example, \u0027live migration\u0027 operations will be adding new requests on the 2nd RPC server. Once the nova service tracker finishes all in-progress tasks, then we will stop() the 2nd RPC server and wait() on it just in case of any pending requests.\n\nThe flow will be like this:\n\n- RPCserver1.stop() - stop accepting new requests\n- RPCserver1.wait() - finish in-progress tasks on RPCserver1\n- wait for manager.finish_tasks() - here the tracker will wait for/log all in-progress tasks to be completed.\n\nNOTE: at this stage we will keep getting new requests (for example live migration back and forth calls from src \u0026 dest compute) on the 2nd RPC server, so it needs to be up until the tracker finishes the in-progress tasks.\n\n- RPCserver2.stop() - stop accepting new requests on the 2nd RPCserver\n- RPCserver2.wait() - this should just exit as the service tracker waited for all tasks to be completed, but in case there are any that we missed in the tracker, it will finish those also.\n\nHere is the code change in my PoC to show how the flow will work-\nhttps://review.opendev.org/c/openstack/nova/+/967261/7/nova/service.py#323","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
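
The stop sequence described above as a condensed Python sketch (the linked PoC in nova/service.py is the authoritative version; manager.finish_tasks() is the name used in this thread)::

    def stop(self):
        # 1. Stop consuming from the public topic: no new API-triggered work.
        self.rpcserver1.stop()
        # 2. Let handlers already dispatched by server 1 run to completion.
        self.rpcserver1.wait()
        # 3. Block until the task tracker reports no in-progress tasks.
        #    Server 2 stays up: ongoing operations (e.g. live migration) keep
        #    exchanging messages on its topic while we wait.
        self.manager.finish_tasks()
        # 4. Nothing should be creating new internal requests any more.
        self.rpcserver2.stop()
        # 5. Should return immediately; catches anything the tracker missed.
        self.rpcserver2.wait()
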
,{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"da4611c2_a9afc61d","line":492,"in_reply_to":"be24e419_aec5c36c","updated":"2025-12-02 22:05:34.000000000","message":"i assume we will spawn a thread to log the in-flight tasks in the task tracker periodically, or perhaps print them on every n additions or removals?\n\nor will the main thread that spawned the rpc server be the one that is polling it?\nmy understanding is the main thread will block when we call wait, so we need something else to do that.\n\nor is that why you\u0027re saying\n\n* manager.finish_tasks()   This is where tracker will wait and log the\n    in-progress tasks.\n\nbut in that case we could be killed before any of the in-progress tasks from RPCserver1.wait() finish and never log anything.\n\ni was expecting this to kind of work like the executor statistics logging that gibi added to help with the eventlet removal work.\n\nsomething that would always be running in the background periodically logging the in-progress tasks even outside of the shutdown case.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d81f74af7e8e3cf248563dd5177e72ff64ba15ea","unresolved":false,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"2e54e39f_d4e35a21","line":492,"in_reply_to":"d2519f43_dab0a40b","updated":"2025-12-02 20:19:19.000000000","message":"OK, that sequence makes sense.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"8db4ca4fe0550f0fbf06ee9b77cf424e88000ff2","unresolved":true,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"be24e419_aec5c36c","line":492,"in_reply_to":"d2519f43_dab0a40b","updated":"2025-12-02 18:58:55.000000000","message":"ditto, I did not mean to resolve this. Please check if this answers your question or if there is anything to be updated here.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"151540c600a2dff4ee4925a41d509c53178c3ee5","unresolved":true,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"f5460e2e_2e3fb1ff","line":492,"in_reply_to":"da4611c2_a9afc61d","updated":"2025-12-02 22:55:05.000000000","message":"ack.\n\nI mentioned it in L489 where we will be tracking the tasks always, irrespective of whether shutdown is initiated or not. Logging always or during shutdown only, that is something we can do via a flag. I do not want to duplicate or overload the logs, so at this time I thought of logging only when shutdown has started.\n\nThe main idea here is that during shutdown the tracker will be logging the snapshot of \"all in-progress\", \"tasks completed\", \"tasks remaining\" etc.\n\n\u003e but in that case we could be killed before any of the in-progress tasks from RPCserver1.wait() finish and never log anything.\n\nFor that, it is ok to start logging at the start of shutdown. As soon as shutdown starts, it will flag the compute as shutting down and then the tracker will start logging. manager.finish_tasks() is more of a wait until the tasks are finished.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
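
A sketch of the logging thread discussed here (interval and formatting are illustrative; it reuses the tracker sketched earlier): a daemon thread that starts dumping the tracker snapshot once shutdown begins, or, for the always-on behaviour suggested above, one that simply skips the initial wait::

    import logging
    import threading
    import time

    LOG = logging.getLogger(__name__)

    def start_inflight_logger(tracker, shutting_down, interval=10):
        """Log the tracker snapshot every `interval` seconds during shutdown.

        shutting_down is a threading.Event set when SIGTERM handling begins.
        """
        def _log_loop():
            shutting_down.wait()  # drop this line for always-on logging
            while True:
                tasks = tracker.snapshot()
                LOG.info('%d in-progress task(s): %s', len(tasks), sorted(tasks))
                if not tasks:
                    break
                time.sleep(interval)
        t = threading.Thread(target=_log_loop, daemon=True)
        t.start()
        return t
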
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":489,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":490,"context_line":"  during shutdown."},{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"c63852eb_cce381be","line":492,"in_reply_to":"f5460e2e_2e3fb1ff","updated":"2025-12-02 23:27:11.000000000","message":"Done","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"a343cf1d7eb569b00e5a6f5ca755d59349a0c64a","unresolved":true,"context_lines":[{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"},{"line_number":496,"context_line":""},{"line_number":497,"context_line":"* Nova service timeout:"},{"line_number":498,"context_line":""},{"line_number":499,"context_line":"  * oslo.service already has the timeout (graceful_shutdown_timeout_)"},{"line_number":500,"context_line":"    which is configurable per service and used to timeout the SIGTERM signal"},{"line_number":501,"context_line":"    handler."},{"line_number":502,"context_line":"  * oslo.service will terminate the Nova service based on"},{"line_number":503,"context_line":"    graceful_shutdown_timeout_, even Nova service graceful shutdown is not"},{"line_number":504,"context_line":"    finished."},{"line_number":505,"context_line":"  * No new configurable timeout will be added for the Nova, instead it will use"},{"line_number":506,"context_line":"    the existing graceful_shutdown_timeout_."},{"line_number":507,"context_line":"  * Its default value is 60 sec, which is less for Nova services. The proposal"},{"line_number":508,"context_line":"    is to override its default value per Nova services:"},{"line_number":509,"context_line":""},{"line_number":510,"context_line":"      * compute service: 1800 sec (Considering the long running tasks)."},{"line_number":511,"context_line":"      * conductor service: 80 sec"},{"line_number":512,"context_line":"      * scheduler service: 80 sec"},{"line_number":513,"context_line":""},{"line_number":514,"context_line":"* External system timeout:"},{"line_number":515,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"a975f7a9_dc4d8311","line":512,"range":{"start_line":494,"start_character":1,"end_line":512,"end_character":33},"updated":"2025-12-01 21:07:28.000000000","message":"this is kind of duplicated with the text around line 414.\n\nyou also said 1800 (or 80) here and 1500s (or 60) there, but i don\u0027t think either is reasonable.\ni don\u0027t think it is reasonable to expect compute to be able to wait for 30 mins when asked to shut down. i was originally thinking 30 seconds was pushing it.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":491,"context_line":"* nova.service will wait for tacker to finish the in-progress tasks until"},{"line_number":492,"context_line":"  timeout."},{"line_number":493,"context_line":""},{"line_number":494,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":495,"context_line":"---------------------------"},{"line_number":496,"context_line":""},{"line_number":497,"context_line":"* Nova service timeout:"},{"line_number":498,"context_line":""},{"line_number":499,"context_line":"  * oslo.service already has the timeout (graceful_shutdown_timeout_)"},{"line_number":500,"context_line":"    which is configurable per service and used to timeout the SIGTERM signal"},{"line_number":501,"context_line":"    handler."},{"line_number":502,"context_line":"  * oslo.service will terminate the Nova service based on"},{"line_number":503,"context_line":"    graceful_shutdown_timeout_, even Nova service graceful shutdown is not"},{"line_number":504,"context_line":"    finished."},{"line_number":505,"context_line":"  * No new configurable timeout will be added for the Nova, instead it will use"},{"line_number":506,"context_line":"    the existing graceful_shutdown_timeout_."},{"line_number":507,"context_line":"  * Its default value is 60 sec, which is less for Nova services. The proposal"},{"line_number":508,"context_line":"    is to override its default value per Nova services:"},{"line_number":509,"context_line":""},{"line_number":510,"context_line":"      * compute service: 1800 sec (Considering the long running tasks)."},{"line_number":511,"context_line":"      * conductor service: 80 sec"},{"line_number":512,"context_line":"      * scheduler service: 80 sec"},{"line_number":513,"context_line":""},{"line_number":514,"context_line":"* External system timeout:"},{"line_number":515,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"4572fbc6_923bdd0f","line":512,"range":{"start_line":494,"start_character":1,"end_line":512,"end_character":33},"in_reply_to":"a975f7a9_dc4d8311","updated":"2025-12-02 04:55:32.000000000","message":"both are different; L414 is the time-based wait, which is for Step 1 and will be replaced by the tracker system in Step 2.\n\nThis is what the graceful timeout is.\n\nI think 1800 sec is too much; I was considering long-running live migrations, but I will say at least 3 min is needed for compute.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
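
If the override-the-default approach survives this discussion, a sketch of how a service entry point could raise oslo.service's graceful_shutdown_timeout default at startup (set_default is the real oslo.config API; the per-binary wiring is illustrative, and the values are the draft's 1800/80/80, which this thread questions)::

    from oslo_config import cfg

    CONF = cfg.CONF

    def adjust_graceful_shutdown_default(binary):
        # Assumes oslo.service has already registered the option. Operators
        # can still override it in nova.conf under [DEFAULT].
        defaults = {
            'nova-compute': 1800,    # draft value; 180 or even 30 also floated
            'nova-conductor': 80,
            'nova-scheduler': 80,
        }
        CONF.set_default('graceful_shutdown_timeout',
                         defaults.get(binary, 60))
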
The external"},{"line_number":521,"context_line":"  system timeout should be higher than graceful_shutdown_timeout_,"},{"line_number":522,"context_line":"  otherwise external system will timeout and will interrupt the Nova graceful"},{"line_number":523,"context_line":"  shutdown."},{"line_number":524,"context_line":""},{"line_number":525,"context_line":"Alternatives"},{"line_number":526,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"2b2860a3_63d9ca18","line":523,"updated":"2025-12-01 21:07:28.000000000","message":"i think we have to assume that there will be by default and design for that constraint rather the assuming that nova can handle the shutdown more gracefully.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":520,"context_line":"  graceful_shutdown_timeout_ should be set accordingly. The external"},{"line_number":521,"context_line":"  system timeout should be higher than graceful_shutdown_timeout_,"},{"line_number":522,"context_line":"  otherwise external system will timeout and will interrupt the Nova graceful"},{"line_number":523,"context_line":"  shutdown."},{"line_number":524,"context_line":""},{"line_number":525,"context_line":"Alternatives"},{"line_number":526,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":6,"id":"56ffe044_30cde98f","line":523,"in_reply_to":"2b2860a3_63d9ca18","updated":"2025-12-02 04:55:32.000000000","message":"Acknowledged","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"7c8c36ec_9569ec09","line":593,"updated":"2025-12-01 16:20:56.000000000","message":"Do I understand correctly that it implies that If we have RPC version_cap in place to old RPC version then even new computes cannot stop their 1st RPC server during graceful shutdown as due to the cap the compute-compute communication will not be sent to the 2nd RPC server.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":true,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. 
,{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":true,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"294591a8_6542b2ce","line":593,"in_reply_to":"01ed6220_51674f22","updated":"2025-12-02 18:14:46.000000000","message":"I did not mean to resolve it until you are ok with the answer or want me to add/explain it in the upgrade section. Reopening this comment.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d81f74af7e8e3cf248563dd5177e72ff64ba15ea","unresolved":true,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"c6ba7bf2_e906d3a8","line":593,"in_reply_to":"01ed6220_51674f22","updated":"2025-12-02 20:19:19.000000000","message":"Sorry, I wasn\u0027t clear enough.\n\n\u003e In summary, if version_cap is an old compute then graceful shutdown will not work even for new computes.\n\nThat is one of my main points. Thanks for confirming that. I think it makes sense to state it in the spec too.\n\n\u003e When shutdown is initiated, the 1st RPC server will be stopped, and then we will check if the 2nd RPC server exists; then it will be stopped also. We will create and stop the RPC servers based on the new compute code even if it is not in use due to version_cap being an old compute.\n\nYeah, I think this is where my mind went into a rabbit hole trying to optimize. We probably don\u0027t need it. But I wrote it down just to see if it makes sense.\n\n-- later --\n\nI tried to write it down, but that helped me figure out why my optimization of the logic in the new compute under an old version cap is not feasible. :) Sorry for the noise. :)","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"}
If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"b943ebf6_87467e4b","line":593,"in_reply_to":"294591a8_6542b2ce","updated":"2025-12-02 18:50:38.000000000","message":"Yes, also I don\u0027t think this is an upgrade issue because today, everyone just shuts down their one and only RPC server immediately. Obviously the continue-until-complete behavior won\u0027t work between two computes unless they\u0027re both updated and following the same rules, but I don\u0027t think this creates an RPC version conflict as proposed.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"01ed6220_51674f22","line":593,"in_reply_to":"7c8c36ec_9569ec09","updated":"2025-12-02 04:55:32.000000000","message":"I am not sure I understood the question completely. By \"new computes cannot stop their 1st RPC server\", do you mean the 2nd RPC server?\n\nIf version_cap is set to an old compute version (configured, or calculated for upgrade_level \u0027auto\u0027), then requests will be sent on topic \u0027compute.\u003chost\u003e\u0027 (the 1st RPC server) even on a new compute. Basically, the 2nd RPC server on a new compute will be idle.\n\nWhen shutdown is initiated, the 1st RPC server will be stopped, and then we will check if the 2nd RPC server exists; if so, it will be stopped as well. We will create and stop the RPC servers based on the new compute code, even if the 2nd server is not in use because version_cap is set to an old compute version.\n\nIn summary, if version_cap is set to an old compute version, then graceful shutdown will not work even for new computes.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":590,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":591,"context_line":"it with RPC versioning. 
If the RPC client detects an old compute (based on"},{"line_number":592,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":593,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":594,"context_line":""},{"line_number":595,"context_line":"Implementation"},{"line_number":596,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":6,"id":"fc5179cc_55c12600","line":593,"in_reply_to":"c6ba7bf2_e906d3a8","updated":"2025-12-02 23:27:11.000000000","message":"I think there is nothing to add here?","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6781b3f2215ce1bed4d06b9b15184cc7a775892f","unresolved":true,"context_lines":[{"line_number":632,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":633,"context_line":""},{"line_number":634,"context_line":"* We cannot write tempest tests for this because tempest will not be able to"},{"line_number":635,"context_line":"  stop the services."},{"line_number":636,"context_line":"* Unit and functional tests will be added."},{"line_number":637,"context_line":""},{"line_number":638,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"f76f8fd9_fda18aa0","line":635,"updated":"2025-12-01 16:20:56.000000000","message":"We have some devstack level destructive tests running after tempest. I suggest adding some coverage there if possible. Unfortunately, the functional test env might not be suitable to test the full graceful shutdown behavior, as every service in the functional test runs in the same process and therefore shares global state.\n\nhttps://github.com/openstack/nova/blob/master/gate/post_test_hook.sh\nhttps://github.com/openstack/nova/blob/master/roles/run-evacuate-hook/files/test_evacuate.sh","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"ce7f87d9aa2d6b007be03f19739a2c39f6f9ffd0","unresolved":false,"context_lines":[{"line_number":632,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":633,"context_line":""},{"line_number":634,"context_line":"* We cannot write tempest tests for this because tempest will not be able to"},{"line_number":635,"context_line":"  stop the services."},{"line_number":636,"context_line":"* Unit and functional tests will be added."},{"line_number":637,"context_line":""},{"line_number":638,"context_line":""}],"source_content_type":"text/x-rst","patch_set":6,"id":"c6c67635_024d188e","line":635,"in_reply_to":"f76f8fd9_fda18aa0","updated":"2025-12-02 04:55:32.000000000","message":"+1, yeah, that is a good place to try the graceful shutdown.","commit_id":"0ffcff8a001085102adae17d625a3582f4ff8169"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":65,"context_line":"* When service shutdown is initiated by SIGTERM:"},{"line_number":66,"context_line":""},{"line_number":67,"context_line":"  * Do not process any new requests"},{"line_number":68,"context_line":"  * New requests should not be lost. 
Once service is started, it should process"},{"line_number":69,"context_line":"    the requests."},{"line_number":70,"context_line":"  * No interruption to the in-progress operations; they should be completed"},{"line_number":71,"context_line":"    before shutdown."}],"source_content_type":"text/x-rst","patch_set":7,"id":"cf1222e0_7780bf78","line":68,"range":{"start_line":68,"start_character":53,"end_line":68,"end_character":60},"updated":"2025-12-02 17:19:26.000000000","message":"say \"restarted\" here for clarity.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":65,"context_line":"* When service shutdown is initiated by SIGTERM:"},{"line_number":66,"context_line":""},{"line_number":67,"context_line":"  * Do not process any new requests"},{"line_number":68,"context_line":"  * New requests should not be lost. Once service is started, it should process"},{"line_number":69,"context_line":"    the requests."},{"line_number":70,"context_line":"  * No interruption to the in-progress operations; they should be completed"},{"line_number":71,"context_line":"    before shutdown."}],"source_content_type":"text/x-rst","patch_set":7,"id":"c1eba55b_e2be9421","line":68,"range":{"start_line":68,"start_character":53,"end_line":68,"end_character":60},"in_reply_to":"cf1222e0_7780bf78","updated":"2025-12-02 18:14:46.000000000","message":"Done","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":68,"context_line":"  * New requests should not be lost. Once service is started, it should process"},{"line_number":69,"context_line":"    the requests."},{"line_number":70,"context_line":"  * No interruption to the in-progress operations; they should be completed"},{"line_number":71,"context_line":"    before shutdown."},{"line_number":72,"context_line":"  * Proper logging of the state of in-progress operations"},{"line_number":73,"context_line":"  * Keep instances or other resources in a usable state"},{"line_number":74,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"10ee15c9_b4610091","line":71,"updated":"2025-12-02 17:19:26.000000000","message":"I think we could argue (as above) that aborting a live migration that we can do cleanly is better than waiting an hour for it to finish. So you could (your choice) make this a bit softer by saying something like \"allow in-progress operations to reach their quickest safe termination point, either completion or abort.\"","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":68,"context_line":"  * New requests should not be lost. 
Once service is started, it should process"},{"line_number":69,"context_line":"    the requests."},{"line_number":70,"context_line":"  * No interruption to the in-progress operations; they should be completed"},{"line_number":71,"context_line":"    before shutdown."},{"line_number":72,"context_line":"  * Proper logging of the state of in-progress operations"},{"line_number":73,"context_line":"  * Keep instances or other resources in a usable state"},{"line_number":74,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"34887869_9383b49b","line":71,"in_reply_to":"10ee15c9_b4610091","updated":"2025-12-02 18:14:46.000000000","message":"Done","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":95,"context_line":"      always a single worker per compute service on specific host."},{"line_number":96,"context_line":"    * If a service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There is an opportuniy for the compute service to cleanup or recover"},{"line_number":99,"context_line":"      the interrupted operation on instances during init_host(). The action"},{"line_number":100,"context_line":"      taken will depends on the tasks and its status."},{"line_number":101,"context_line":"    * If the service is in the stopped state for a long time, based on the"}],"source_content_type":"text/x-rst","patch_set":7,"id":"e8ee472e_e41fdfa4","line":98,"range":{"start_line":98,"start_character":18,"end_line":98,"end_character":28},"updated":"2025-12-02 17:19:26.000000000","message":"\"opportunity\"","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":95,"context_line":"      always a single worker per compute service on specific host."},{"line_number":96,"context_line":"    * If a service has single worker running, then request can be picked up"},{"line_number":97,"context_line":"      once service is up again."},{"line_number":98,"context_line":"    * There is an opportuniy for the compute service to cleanup or recover"},{"line_number":99,"context_line":"      the interrupted operation on instances during init_host(). 
The action"},{"line_number":100,"context_line":"      taken will depends on the tasks and its status."},{"line_number":101,"context_line":"    * If the service is in the stopped state for a long time, based on the"}],"source_content_type":"text/x-rst","patch_set":7,"id":"425ffa23_7ef3f329","line":98,"range":{"start_line":98,"start_character":18,"end_line":98,"end_character":28},"in_reply_to":"e8ee472e_e41fdfa4","updated":"2025-12-02 18:14:46.000000000","message":"Done","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":318,"context_line":"      the same compute manager."},{"line_number":319,"context_line":"    * This server will be mainly used for the compute-to-compute operations and"},{"line_number":320,"context_line":"      server external events. The idea is to keep this RPC server up during"},{"line_number":321,"context_line":"      shutdown so that the in-progress operations can be finished."},{"line_number":322,"context_line":"    * In shutdown, nova.service will wait for the compute to tell if they"},{"line_number":323,"context_line":"      finished all their tasks, so that it can stop the 2nd RPC server and"},{"line_number":324,"context_line":"      finish the shutdown."}],"source_content_type":"text/x-rst","patch_set":7,"id":"80800cc8_09208adb","line":321,"updated":"2025-12-02 17:19:26.000000000","message":"And to be clear, we will wire this other RPC server up to the same `ComputeManager` object so that nothing major needs to change about how we implement things today - we will just have two \"doors\" into the compute manager, one of which we can close, knowing that new requests will stop at the closed door. Correct?","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":318,"context_line":"      the same compute manager."},{"line_number":319,"context_line":"    * This server will be mainly used for the compute-to-compute operations and"},{"line_number":320,"context_line":"      server external events. The idea is to keep this RPC server up during"},{"line_number":321,"context_line":"      shutdown so that the in-progress operations can be finished."},{"line_number":322,"context_line":"    * In shutdown, nova.service will wait for the compute to tell if they"},{"line_number":323,"context_line":"      finished all their tasks, so that it can stop the 2nd RPC server and"},{"line_number":324,"context_line":"      finish the shutdown."}],"source_content_type":"text/x-rst","patch_set":7,"id":"ab23ac4d_6b251ba5","line":321,"in_reply_to":"80800cc8_09208adb","updated":"2025-12-02 18:14:46.000000000","message":"yes, that is correct. it will be connected to the same compute manager. 
We will be using the same endpoints and transport in both RPC servers.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":426,"context_line":""},{"line_number":427,"context_line":"  * Migrations:"},{"line_number":428,"context_line":""},{"line_number":429,"context_line":"    * Migration operations will use the 2nd RPC server."},{"line_number":430,"context_line":""},{"line_number":431,"context_line":"      * If migration is in-progress then the service shutdown will not"},{"line_number":432,"context_line":"        terminate the migration; instead will be able to wait for the migration"}],"source_content_type":"text/x-rst","patch_set":7,"id":"755d12fa_216df5c6","line":429,"range":{"start_line":429,"start_character":40,"end_line":429,"end_character":54},"updated":"2025-12-02 17:19:26.000000000","message":"I think we should define this above as \"the ops RPC channel\" or something to avoid confusion about which is \"first\" and which is being shut down. Let\u0027s pick a good term (ops is fine with me, FWIW) and use that consistently to talk about the always-on RPC channel going forward.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":426,"context_line":""},{"line_number":427,"context_line":"  * Migrations:"},{"line_number":428,"context_line":""},{"line_number":429,"context_line":"    * Migration operations will use the 2nd RPC server."},{"line_number":430,"context_line":""},{"line_number":431,"context_line":"      * If migration is in-progress then the service shutdown will not"},{"line_number":432,"context_line":"        terminate the migration; instead will be able to wait for the migration"}],"source_content_type":"text/x-rst","patch_set":7,"id":"01869f61_08c4cb50","line":429,"range":{"start_line":429,"start_character":40,"end_line":429,"end_character":54},"in_reply_to":"755d12fa_216df5c6","updated":"2025-12-02 18:14:46.000000000","message":"Done","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":448,"context_line":"   As per my PoC and manual testing till now, it does not require any"},{"line_number":449,"context_line":"   change on oslo.messaging side."},{"line_number":450,"context_line":""},{"line_number":451,"context_line":"Step 2: Smartly track and wait for the in-progress operations:"},{"line_number":452,"context_line":"--------------------------------------------------------------"},{"line_number":453,"context_line":""},{"line_number":454,"context_line":"* The below services graceful shutdown is handled by their deployed server or"}],"source_content_type":"text/x-rst","patch_set":7,"id":"e0790191_0790d1a1","line":451,"updated":"2025-12-02 17:19:26.000000000","message":"This will be a different spec in the future, right? 
Maybe just make that clear here as it sort of looks like this means \"Step 2 for this spec\".","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":448,"context_line":"   As per my PoC and manual testing till now, it does not require any"},{"line_number":449,"context_line":"   change on oslo.messaging side."},{"line_number":450,"context_line":""},{"line_number":451,"context_line":"Step 2: Smartly track and wait for the in-progress operations:"},{"line_number":452,"context_line":"--------------------------------------------------------------"},{"line_number":453,"context_line":""},{"line_number":454,"context_line":"* The below services graceful shutdown is handled by their deployed server or"}],"source_content_type":"text/x-rst","patch_set":7,"id":"c72216f1_5d62692e","line":451,"in_reply_to":"e0790191_0790d1a1","updated":"2025-12-02 18:14:46.000000000","message":"Yes, different spec. I divided it into steps, considering each step as a separate spec but let me rename Step -\u003e Spec","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"79352d3483a4ded90076d9f916eef70e26a9cfcb","unresolved":true,"context_lines":[{"line_number":599,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":600,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":601,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":602,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":603,"context_line":""},{"line_number":604,"context_line":"Implementation"},{"line_number":605,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":7,"id":"6877c7d8_19dde249","line":602,"updated":"2025-12-02 17:19:26.000000000","message":"Yep, and the sooner we cleave the RPC queues in two, the better we are positioned to have this upgrade impact be less .. impactful. So I think we should plan to have that server-side patch done ASAP and merged to put a stake in the ground for that RPC version so we can move forward with the rest of the implementation knowing the timer has been started.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"31cf5d1678db8ab27bb24d241c387838eb540cdb","unresolved":false,"context_lines":[{"line_number":599,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":600,"context_line":"it with RPC versioning. 
If the RPC client detects an old compute (based on"},{"line_number":601,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":602,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":603,"context_line":""},{"line_number":604,"context_line":"Implementation"},{"line_number":605,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":7,"id":"9edc490b_22cb5816","line":602,"in_reply_to":"6877c7d8_19dde249","updated":"2025-12-02 18:14:46.000000000","message":"OK, I thought we needed to do the client-side implementation (to start using the 2nd RPC server) and the server version bump at the same time. But doing the server-side RPC version bump + adding the 2nd RPC server first is a good idea. Then the usage of the 2nd RPC server on the client side can come second.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"c813e1d2c919d6f77ef5f950b2f2b6e08cd4a25c","unresolved":false,"context_lines":[{"line_number":599,"context_line":"the new 2nd RPC server listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":600,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":601,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":602,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":603,"context_line":""},{"line_number":604,"context_line":"Implementation"},{"line_number":605,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":7,"id":"f22d2f52_94d98d9d","line":602,"in_reply_to":"9edc490b_22cb5816","updated":"2025-12-02 18:21:03.000000000","message":"Server side and bump need to go together for sure. But yes, the client side needs to be done earlyish as well in order to get the full RPC behavior changed, which can happen before the actual shutdown stuff does. Those two things (server+bump, then client) are the first two things to get done (in that order) and merged as soon as we can since those require the timer, and then work on the actual shutdown bits.","commit_id":"e4ccd04aeda87dac01d8a1cbae79ce618948638b"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":68,"context_line":"  * New requests should not be lost. 
Once service is restarted, it should"},{"line_number":69,"context_line":"    process the requests."},{"line_number":70,"context_line":"  * Allow in-progress operations to reach their quickest safe termination"},{"line_number":71,"context_line":"    point, either completion or abort."},{"line_number":72,"context_line":"  * Proper logging of the state of in-progress operations"},{"line_number":73,"context_line":"  * Keep instances or other resources in a usable state"},{"line_number":74,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"534e806e_806df6ef","line":71,"updated":"2025-12-02 22:05:34.000000000","message":"This is more acceptable to me, as it\u0027s addressing some of my concerns about actions being stuck in intermediary states.\n\nWe don\u0027t have to successfully complete the operation, but we should not leave it in a state that needs reset-state or DB surgery to recover from.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"151540c600a2dff4ee4925a41d509c53178c3ee5","unresolved":false,"context_lines":[{"line_number":68,"context_line":"  * New requests should not be lost. Once service is restarted, it should"},{"line_number":69,"context_line":"    process the requests."},{"line_number":70,"context_line":"  * Allow in-progress operations to reach their quickest safe termination"},{"line_number":71,"context_line":"    point, either completion or abort."},{"line_number":72,"context_line":"  * Proper logging of the state of in-progress operations"},{"line_number":73,"context_line":"  * Keep instances or other resources in a usable state"},{"line_number":74,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"3955aa40_cf880d65","line":71,"in_reply_to":"534e806e_806df6ef","updated":"2025-12-02 22:55:05.000000000","message":"Acknowledged","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"4c0c9a5fbe8696edc0bf0c61eca9a105eb19de31","unresolved":false,"context_lines":[{"line_number":115,"context_line":"   tracking mechanism."},{"line_number":116,"context_line":""},{"line_number":117,"context_line":"This backlog spec proposes achieving the above goals in two steps. 
Each step"},{"line_number":118,"context_line":"will be proposed as a separate spec for a specific release."},{"line_number":119,"context_line":""},{"line_number":120,"context_line":"The Nova services which already gracefully shutdown:"},{"line_number":121,"context_line":"----------------------------------------------------"}],"source_content_type":"text/x-rst","patch_set":8,"id":"503feb65_b0c9a4de","line":118,"updated":"2025-12-02 18:50:38.000000000","message":"++ This makes it clearer","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":284,"context_line":"  available workers will proceed with the request."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":287,"context_line":"  the scheduler."},{"line_number":288,"context_line":""},{"line_number":289,"context_line":"* Nova compute: RPC design change needed"},{"line_number":290,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"d7e14d66_3ba5d8fe","line":287,"updated":"2025-12-02 22:05:34.000000000","message":"this only works if you have multipel conductors per cell and or top level conductors.\n\nim not sure i agree with punting graceful shutdown of single conductor deployment out of scope fo this spec.\n\nim ok with it being out of scope fo the 2026.1 spec but i think we need to keep conducrtor alive to handel rembotable oject calls for comptue db access.\n\nim ok with saying we will shutdown the ComputeTaskAPI\n\nhttps://github.com/openstack/nova/blob/master/nova/conductor/api.py#L82C7-L82C21\n\nbut the object_action handelers shoudl continue to work\n\nhttps://github.com/openstack/nova/blob/master/nova/conductor/manager.py#L133-L201\n\nis that your intent? this is differnt form the schduler as  it is not providign db access vai the object interface so the scdhuler and conductors are not quite the saem.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"2e53bdb0b7a24bb77a9aee534cf1041e94c3d7e5","unresolved":true,"context_lines":[{"line_number":284,"context_line":"  available workers will proceed with the request."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":287,"context_line":"  the scheduler."},{"line_number":288,"context_line":""},{"line_number":289,"context_line":"* Nova compute: RPC design change needed"},{"line_number":290,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"e2b121a5_708d896b","line":287,"in_reply_to":"d7e14d66_3ba5d8fe","updated":"2025-12-02 22:13:00.000000000","message":"To me, the conductors are the least important to do any sort of graceful shutdown work against. If we get compute fully squared away, then we can start thinking about working on conductor in a similar way, but the headless-ness of conductors make this a much less useful and less interesting problem to solve there. IMHO, in the early days we\u0027re talking about the graceful shutdown of individual nodes (mostly compute). 
I think solving the problem for shutting down all (or all but one) conductor worker in the cluster is not something we need to worry about any time soon. Since we can\u0027t address a specific conductor, it doesn\u0027t suffer nearly the same problems as compute, with the exception of reply queues for ongoing long-running tasks.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"151540c600a2dff4ee4925a41d509c53178c3ee5","unresolved":true,"context_lines":[{"line_number":284,"context_line":"  available workers will proceed with the request."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":287,"context_line":"  the scheduler."},{"line_number":288,"context_line":""},{"line_number":289,"context_line":"* Nova compute: RPC design change needed"},{"line_number":290,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"f4174c2d_7b6809d2","line":287,"in_reply_to":"e2b121a5_708d896b","updated":"2025-12-02 22:55:05.000000000","message":"Yes, in the single-conductor case we might need the same approach as compute, and we can separate the traffic of new requests vs in-progress requests. I am not saying we should not do it, but yes, let\u0027s focus on compute and task tracking, which solves most of the graceful shutdown cases.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":284,"context_line":"  available workers will proceed with the request."},{"line_number":285,"context_line":""},{"line_number":286,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":287,"context_line":"  the scheduler."},{"line_number":288,"context_line":""},{"line_number":289,"context_line":"* Nova compute: RPC design change needed"},{"line_number":290,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"f123dbdc_0ec6dbee","line":287,"in_reply_to":"f4174c2d_7b6809d2","updated":"2025-12-02 23:27:11.000000000","message":"Done","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"4c0c9a5fbe8696edc0bf0c61eca9a105eb19de31","unresolved":false,"context_lines":[{"line_number":304,"context_line":"      will be used to finish the in-progress requests and will stay up during"},{"line_number":305,"context_line":"      shutdown."},{"line_number":306,"context_line":"    * \u0027new request RPC server\u0027: This will be used for the current RPC server,"},{"line_number":307,"context_line":"      which is used for the new requests and will be stopped during shutdown."},{"line_number":308,"context_line":""},{"line_number":309,"context_line":"  * \u0027new request RPC server\u0027 per compute:"},{"line_number":310,"context_line":"    No change in this RPC server, but it will be used for all the new requests,"}],"source_content_type":"text/x-rst","patch_set":8,"id":"74757c36_092655ad","line":307,"updated":"2025-12-02 18:50:38.000000000","message":"++ I think this terminology/definition is good, and we 
can argue about the actual names later and amend if we want.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":false,"context_lines":[{"line_number":304,"context_line":"      will be used to finish the in-progress requests and will stay up during"},{"line_number":305,"context_line":"      shutdown."},{"line_number":306,"context_line":"    * \u0027new request RPC server\u0027: This will be used for the current RPC server,"},{"line_number":307,"context_line":"      which is used for the new requests and will be stopped during shutdown."},{"line_number":308,"context_line":""},{"line_number":309,"context_line":"  * \u0027new request RPC server\u0027 per compute:"},{"line_number":310,"context_line":"    No change in this RPC server, but it will be used for all the new requests,"}],"source_content_type":"text/x-rst","patch_set":8,"id":"9243f537_2eabc838","line":307,"in_reply_to":"74757c36_092655ad","updated":"2025-12-02 22:05:34.000000000","message":"Sure, the two classes of RPC server I think make sense, although I think the ops and new request naming is confusing.\n\nI would borrow some terms from the OVN world: I would be tempted to call the one we shut down the northbound RPC server.\n\nIf it was the conductor, it would be the RPC server that handles requests from the API.\n\nThe one we keep running I would call the southbound RPC server.\n\nAgain, in the conductor it would be the RPC server that handles requests from the computes.\n```\n    * \u0027south-bound rpc server\u0027: This will be used for the new RPC server, which\n      will be used to finish the in-progress requests and will stay up during\n      shutdown.\n    * \u0027north-bound rpc server\u0027: This will be used for the current RPC server,\n      which is used for the new requests and will be stopped during shutdown.\n```\n\nThat is building on the common software idiom that a northbound API is external, public or frontend facing, and a southbound API is internal, private and backend facing.\n\nThe reason I bring up the conductor is that I think we should be doing the same split there eventually, so that even if you only have 1 conductor at a cell or super-conductor level, starting its shutdown does not break all DB access for the computes immediately. And if we do that, we should try to use consistent naming to refer to these two types of RPC server.\n\nWith all that said, this is clearer than it was the last time I reviewed this.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"4c0c9a5fbe8696edc0bf0c61eca9a105eb19de31","unresolved":false,"context_lines":[{"line_number":321,"context_line":"      \u0027new request RPC server\u0027 uses."},{"line_number":322,"context_line":"    * It will create its own dispatcher, listener, and queue."},{"line_number":323,"context_line":"    * Both RPC server will be bound to the same endpoints (same compute"},{"line_number":324,"context_line":"      manager), so that requests coming from either server are handled by"},{"line_number":325,"context_line":"      the same compute manager."},{"line_number":326,"context_line":"    * This server will be mainly used for the compute-to-compute operations and"},{"line_number":327,"context_line":"      server external events. 
The idea is to keep this RPC server up during"}],"source_content_type":"text/x-rst","patch_set":8,"id":"904a1c5c_cacfb337","line":324,"updated":"2025-12-02 18:50:38.000000000","message":"Ah sorry, I totally missed the \"same compute manager\" words before, but here they are (and they were before :P)","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":488,"context_line":"  ``request-id`` will help to track multiple calls to the same method."},{"line_number":489,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":490,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":491,"context_line":"  done under lock."},{"line_number":492,"context_line":"* Once shutdown is initiated:"},{"line_number":493,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":494,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"}],"source_content_type":"text/x-rst","patch_set":8,"id":"9ca4c3ce_6b6472e5","line":491,"updated":"2025-12-02 22:05:34.000000000","message":"How exactly do you intend to implement this?\n\nAre we going to decorate the top-level RPC APIs that we consider \"tasks\" with a decorator that uses a try/finally to defer the removal of the task from the singleton, regardless of whether it completes successfully or with an exception?\n\nSince we are using a singleton request tracker, we also need the request tracker to internally synchronize access to this map, as we will be sharing it across threads (green or real), since oslo.messaging will spawn a new thread in an executor for each RPC request.\n\nSo ideally we should hide that behind add_request and remove_request methods which handle the locking for us.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":false,"context_lines":[{"line_number":488,"context_line":"  ``request-id`` will help to track multiple calls to the same method."},{"line_number":489,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":490,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":491,"context_line":"  done under lock."},{"line_number":492,"context_line":"* Once shutdown is initiated:"},{"line_number":493,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":494,"context_line":"  them. 
The decision will be made by case, for example, reject the tasks if"}],"source_content_type":"text/x-rst","patch_set":8,"id":"76ad2d46_00fdbc8f","line":491,"in_reply_to":"79cd8559_5fb9ba75","updated":"2025-12-03 09:47:39.000000000","message":"Ack; as I said, I think my preference would be to decorate all of the request/task handlers and have the task tracker maintain the list of ongoing tasks always, not just during shutdown, so we can have this observability data.\n\nWe could, for example, use it to track things like average time to service a specific request, or other simple metrics.\n\nThis is part of the spec 2 scope, so I\u0027m fine with the actual details of how this will work being deferred until that is written.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"151540c600a2dff4ee4925a41d509c53178c3ee5","unresolved":false,"context_lines":[{"line_number":488,"context_line":"  ``request-id`` will help to track multiple calls to the same method."},{"line_number":489,"context_line":"* Whenever a new request comes to compute, it will add that to the task list"},{"line_number":490,"context_line":"  and remove it once the task is completed. Modification to the tracker will be"},{"line_number":491,"context_line":"  done under lock."},{"line_number":492,"context_line":"* Once shutdown is initiated:"},{"line_number":493,"context_line":"* This tracker will either add the new tasks to the tracker list or reject"},{"line_number":494,"context_line":"  them. The decision will be made by case, for example, reject the tasks if"}],"source_content_type":"text/x-rst","patch_set":8,"id":"79cd8559_5fb9ba75","line":491,"in_reply_to":"9ca4c3ce_6b6472e5","updated":"2025-12-02 22:55:05.000000000","message":"Ack, I have not done the PoC for this yet, but the idea is to use a context manager or decorator. Yes, we can implement add_request/add_tasks and remove_request/remove_tasks methods.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9f2e7de1af4b3e421b9766c33093eaea1228ecf7","unresolved":true,"context_lines":[{"line_number":498,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":499,"context_line":"  they are completed. 
Basically, log the detail view of in-progress things"},{"line_number":500,"context_line":"  during shutdown."},{"line_number":501,"context_line":"* nova.service will wait for tracker to finish the in-progress tasks until"},{"line_number":502,"context_line":"  timeout."},{"line_number":503,"context_line":"* The flow of RPC servers stop, wait, and tacker wait will be something like:"},{"line_number":504,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"067195dd_e6b8b9cc","line":501,"range":{"start_line":501,"start_character":3,"end_line":501,"end_character":74},"updated":"2025-12-02 22:05:34.000000000","message":"nit: `the request tracker` or `the task tracker` or `the rpc tracker`.\n```suggestion\n* nova.service will wait for the task tracker to finish the in-progress tasks until\n```\n\nWe have resource trackers and PCI trackers already, so I would prefer a more descriptive name for this.","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"9acb4e86bdf9522af42a29e04051258411cd2c79","unresolved":false,"context_lines":[{"line_number":498,"context_line":"* Tracker will start logging the tasks which are in progress, and log when"},{"line_number":499,"context_line":"  they are completed. Basically, log the detail view of in-progress things"},{"line_number":500,"context_line":"  during shutdown."},{"line_number":501,"context_line":"* nova.service will wait for tracker to finish the in-progress tasks until"},{"line_number":502,"context_line":"  timeout."},{"line_number":503,"context_line":"* The flow of RPC servers stop, wait, and tacker wait will be something like:"},{"line_number":504,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"e20d6d54_52c02103","line":501,"range":{"start_line":501,"start_character":3,"end_line":501,"end_character":74},"in_reply_to":"067195dd_e6b8b9cc","updated":"2025-12-02 23:27:11.000000000","message":"Done","commit_id":"9e3f4a183c3e3ba9a2922d21737dc290eaa882a1"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":false,"context_lines":[{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services so that it will not impact the users\u0027 in-progress operations or"},{"line_number":47,"context_line":"keep resources in usable state."},{"line_number":48,"context_line":""},{"line_number":49,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":50,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":10,"id":"bf7d4da3_612316e3","line":47,"updated":"2025-12-03 09:47:39.000000000","message":"+1","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"b4cf9a3cbe8cdedfa65994cbc6525ae284756878","unresolved":false,"context_lines":[{"line_number":44,"context_line":""},{"line_number":45,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":46,"context_line":"services 
so that it will not impact the users\u0027 in-progress operations or"},{"line_number":47,"context_line":"keep resources in usable state."},{"line_number":48,"context_line":""},{"line_number":49,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":50,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":10,"id":"9d35ff58_caa23333","line":47,"in_reply_to":"bf7d4da3_612316e3","updated":"2025-12-03 15:26:36.000000000","message":"Acknowledged","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":true,"context_lines":[{"line_number":278,"context_line":""},{"line_number":279,"context_line":"* Nova conductor: No RPC change needed."},{"line_number":280,"context_line":""},{"line_number":281,"context_line":"  Nova conductor runs as multiple workers, each having its own RPC server,"},{"line_number":282,"context_line":"  but all the Nova conductor workers will listen to  the same RPC topic"},{"line_number":283,"context_line":"  and queue ``conductor``. Stopping a Nova conductor worker will not impact"},{"line_number":284,"context_line":"  the RPC communication on the conductor service, and other available workers"},{"line_number":285,"context_line":"  will proceed with the request."},{"line_number":286,"context_line":""},{"line_number":287,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":288,"context_line":"  the scheduler."}],"source_content_type":"text/x-rst","patch_set":10,"id":"df039182_b84e18f8","line":285,"range":{"start_line":281,"start_character":2,"end_line":285,"end_character":32},"updated":"2025-12-03 09:47:39.000000000","message":"nit: workers has two meanings here.\n\nFor each conductor process we start on the CLI, it will spawn n worker threads today; same for the scheduler.\n\nThe other meaning, which I think is what you\u0027re actually implying, is that there can be multiple conductor processes running on many hosts or containers, any of which can service the request, as from a client perspective we don\u0027t care about an individual conductor, but just that there are active conductors in the relevant pool (cell-0, cell-1, etc.).\n\nhttps://docs.openstack.org/nova/latest/configuration/config.html#conductor.workers\n\n\nIf you end up reworking this, I would probably rewrite this as follows:\n\n```suggestion\n  The Nova conductor binary is a stateless service that can spawn multiple worker\n  threads. Each instance of the nova conductor has its own RPC server,\n  but all the Nova conductor instances will listen to  the same RPC topic\n  and queue ``conductor``. This allows the conductor instance to ack as a\n  distributed worker pool such that stopping an individual conductor instance\n  will not impact the RPC communication for the pool of conductor instances,\n  allowing other available workers to process the request. 
Each cell has its own\n  pool of conductors meaning as long as 1 conductor is up for any given cell\n  the RPC communication will continue to function even when one or more conductors\n  are stopped.\n```","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"b4cf9a3cbe8cdedfa65994cbc6525ae284756878","unresolved":false,"context_lines":[{"line_number":278,"context_line":""},{"line_number":279,"context_line":"* Nova conductor: No RPC change needed."},{"line_number":280,"context_line":""},{"line_number":281,"context_line":"  Nova conductor runs as multiple workers, each having its own RPC server,"},{"line_number":282,"context_line":"  but all the Nova conductor workers will listen to  the same RPC topic"},{"line_number":283,"context_line":"  and queue ``conductor``. Stopping a Nova conductor worker will not impact"},{"line_number":284,"context_line":"  the RPC communication on the conductor service, and other available workers"},{"line_number":285,"context_line":"  will proceed with the request."},{"line_number":286,"context_line":""},{"line_number":287,"context_line":"  The request and response handling is done in the same way as mentioned for"},{"line_number":288,"context_line":"  the scheduler."}],"source_content_type":"text/x-rst","patch_set":10,"id":"75ff9a6c_6a0eae95","line":285,"range":{"start_line":281,"start_character":2,"end_line":285,"end_character":32},"in_reply_to":"df039182_b84e18f8","updated":"2025-12-03 15:26:36.000000000","message":"Done","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":true,"context_lines":[{"line_number":291,"context_line":""},{"line_number":292,"context_line":"     This spec does not cover the conductor single worker case. That might"},{"line_number":293,"context_line":"     requires the RPC designing for conductor as well but it need more"},{"line_number":294,"context_line":"     investigation."},{"line_number":295,"context_line":""},{"line_number":296,"context_line":"* Nova compute: RPC design change needed"},{"line_number":297,"context_line":""}],"source_content_type":"text/x-rst","patch_set":10,"id":"2d1af182_f56ca6fa","line":294,"updated":"2025-12-03 09:47:39.000000000","message":"+1","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"b4cf9a3cbe8cdedfa65994cbc6525ae284756878","unresolved":false,"context_lines":[{"line_number":291,"context_line":""},{"line_number":292,"context_line":"     This spec does not cover the conductor single worker case. 
That might"},{"line_number":293,"context_line":"     requires the RPC designing for conductor as well but it need more"},{"line_number":294,"context_line":"     investigation."},{"line_number":295,"context_line":""},{"line_number":296,"context_line":"* Nova compute: RPC design change needed"},{"line_number":297,"context_line":""}],"source_content_type":"text/x-rst","patch_set":10,"id":"8c02cd31_34a1def4","line":294,"in_reply_to":"2d1af182_f56ca6fa","updated":"2025-12-03 15:26:36.000000000","message":"Acknowledged","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"1bd3cd4fe32f4ea3d2596521b0d0dc25b0506795","unresolved":true,"context_lines":[{"line_number":505,"context_line":"    periodic tasks will be finished."},{"line_number":506,"context_line":"  * An exact list of tasks which will be rejected and accepted will be decided"},{"line_number":507,"context_line":"    during implementation."},{"line_number":508,"context_line":"  * Task tracker will start logging the tasks which are in progress, and log"},{"line_number":509,"context_line":"    when they are completed. Basically, log the detail view of in-progress"},{"line_number":510,"context_line":"    things during shutdown."},{"line_number":511,"context_line":"* nova.service will wait for the task tracker to finish the in-progress tasks"}],"source_content_type":"text/x-rst","patch_set":10,"id":"48ca42f8_6051ac09","line":508,"range":{"start_line":508,"start_character":4,"end_line":508,"end_character":76},"updated":"2025-12-03 09:47:39.000000000","message":"nit:\n\n```suggestion\n  * The task tracker will start logging the tasks which are in progress, and log\n```","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"b4cf9a3cbe8cdedfa65994cbc6525ae284756878","unresolved":false,"context_lines":[{"line_number":505,"context_line":"    periodic tasks will be finished."},{"line_number":506,"context_line":"  * An exact list of tasks which will be rejected and accepted will be decided"},{"line_number":507,"context_line":"    during implementation."},{"line_number":508,"context_line":"  * Task tracker will start logging the tasks which are in progress, and log"},{"line_number":509,"context_line":"    when they are completed. Basically, log the detail view of in-progress"},{"line_number":510,"context_line":"    things during shutdown."},{"line_number":511,"context_line":"* nova.service will wait for the task tracker to finish the in-progress tasks"}],"source_content_type":"text/x-rst","patch_set":10,"id":"be6c3f09_d7a233a5","line":508,"range":{"start_line":508,"start_character":4,"end_line":508,"end_character":76},"in_reply_to":"48ca42f8_6051ac09","updated":"2025-12-03 15:26:36.000000000","message":"Done","commit_id":"d57407453ac4b55b0e6b30b3dd1fa235f63e6299"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"c00a931b77a8c6babfc405b47f21ba944a9a6a6a","unresolved":true,"context_lines":[{"line_number":281,"context_line":"  The Nova conductor binary is a stateless service that can spawn multiple"},{"line_number":282,"context_line":"  worker threads. 
Each instance of the Nova conductor has its own RPC server,"},{"line_number":283,"context_line":"  but all the Nova conductor instances will listen to  the same RPC topic"},{"line_number":284,"context_line":"  and queue ``conductor``. This allows the conductor instance to ack as a"},{"line_number":285,"context_line":"  distributed worker pool such that stopping an individual conductor instance"},{"line_number":286,"context_line":"  will not impact the RPC communication for the pool of conductor instances,"},{"line_number":287,"context_line":"  allowing other available workers to process the request. Each cell has its"}],"source_content_type":"text/x-rst","patch_set":11,"id":"f86c1253_db795665","line":284,"range":{"start_line":284,"start_character":65,"end_line":284,"end_character":68},"updated":"2025-12-03 18:52:30.000000000","message":"\"act\"","commit_id":"8d374dd715ddb6e83ddd36fb83e471ccff2a0e9e"},{"author":{"_account_id":4393,"name":"Dan Smith","email":"dms@danplanet.com","username":"danms"},"change_message_id":"c00a931b77a8c6babfc405b47f21ba944a9a6a6a","unresolved":true,"context_lines":[{"line_number":295,"context_line":"  .. note::"},{"line_number":296,"context_line":""},{"line_number":297,"context_line":"     This spec does not cover the conductor single worker case. That might"},{"line_number":298,"context_line":"     requires the RPC designing for conductor as well but it need more"},{"line_number":299,"context_line":"     investigation."},{"line_number":300,"context_line":""},{"line_number":301,"context_line":"* Nova compute: RPC design change needed"}],"source_content_type":"text/x-rst","patch_set":11,"id":"2f98dbc6_20546bbf","line":298,"range":{"start_line":298,"start_character":5,"end_line":298,"end_character":13},"updated":"2025-12-03 18:52:30.000000000","message":"\"require\"","commit_id":"8d374dd715ddb6e83ddd36fb83e471ccff2a0e9e"}]}
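To make the "two doors into the same compute manager" design discussed in the comments above concrete, here is a minimal sketch using oslo.messaging. It is an illustration under assumptions, not the PoC code: RPC_TOPIC and RPC_TOPIC_OPS are the names used in the spec, while the 'compute_ops' topic string, the function names, and the task_tracker object are hypothetical.

```python
# Minimal sketch (not Nova code): two oslo.messaging RPC servers bound to
# the same endpoints, so the compute manager itself is unchanged and only
# the entry point ("door") differs.
import oslo_messaging as messaging
from oslo_config import cfg

RPC_TOPIC = 'compute'          # existing "new request" topic (from the spec)
RPC_TOPIC_OPS = 'compute_ops'  # always-on "ops" topic; string invented here


def create_compute_rpc_servers(host, compute_manager):
    transport = messaging.get_rpc_transport(cfg.CONF)
    # Both servers dispatch into the same manager object.
    endpoints = [compute_manager]
    new_request_server = messaging.get_rpc_server(
        transport, messaging.Target(topic=RPC_TOPIC, server=host), endpoints)
    ops_server = messaging.get_rpc_server(
        transport, messaging.Target(topic=RPC_TOPIC_OPS, server=host),
        endpoints)
    return new_request_server, ops_server


def graceful_stop(new_request_server, ops_server, task_tracker, timeout):
    # Close the "new request" door first; messages queued on the broker
    # stay there and are picked up when the service restarts.
    new_request_server.stop()
    new_request_server.wait()
    # Keep the ops server up until tracked tasks drain (or we time out),
    # so in-progress compute-to-compute operations can finish.
    task_tracker.wait_for_idle(timeout)  # hypothetical tracker API
    ops_server.stop()
    ops_server.wait()
```

The key point from the review is the shutdown ordering: the new-request server stops first, and the ops server stays up until the in-progress work drains, mirroring the flow described in the spec.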

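The task tracker described in the comments (a lock-protected singleton with add_request/remove_request methods, driven by a decorator that uses try/finally) could look roughly like the sketch below. All names here are hypothetical; the actual design is deferred to the follow-up spec.

```python
# Hypothetical task tracker sketch; synchronizes a shared map because
# oslo.messaging dispatches each RPC request on its own (green or native)
# thread.
import functools
import threading


class TaskTracker:
    def __init__(self):
        self._lock = threading.Lock()
        self._tasks = {}  # request-id -> handler name
        self._idle = threading.Event()
        self._idle.set()

    def add_request(self, request_id, name):
        with self._lock:
            self._tasks[request_id] = name
            self._idle.clear()

    def remove_request(self, request_id):
        with self._lock:
            self._tasks.pop(request_id, None)
            if not self._tasks:
                self._idle.set()

    def wait_for_idle(self, timeout):
        # True if every tracked task finished before the timeout expired.
        return self._idle.wait(timeout)


TRACKER = TaskTracker()


def tracked(func):
    # Decorates a top-level RPC handler; try/finally guarantees the task
    # is removed whether the handler returns normally or raises.
    @functools.wraps(func)
    def wrapper(self, context, *args, **kwargs):
        request_id = getattr(context, 'request_id', None) or id(context)
        TRACKER.add_request(request_id, func.__name__)
        try:
            return func(self, context, *args, **kwargs)
        finally:
            TRACKER.remove_request(request_id)
    return wrapper
```

A handler would then be wrapped with @tracked, and the service's shutdown path would call TRACKER.wait_for_idle(graceful_shutdown_timeout) before stopping the ops RPC server, matching the wait-then-stop flow discussed above.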