)]}' {"/PATCHSET_LEVEL":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"c62ba377_cf4630c7","updated":"2025-12-03 14:21:20.000000000","message":"* Lets clarify what is in and out of scope about aborting in flight operations. \n* Lets note that the proposed behavior should work both in eventlet and native threaded mode.\n* Lets clarify that graceful shutdown will only work after the cluster is fully upgraded.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"94086402_b6284211","updated":"2025-12-04 10:47:08.000000000","message":"I still have some concerns but nothing holding me to accept that proposal as I\u0027d really like we could start to implement graceful shutdown during that SLURP release so I\u0027ll keep my comments in mind and we\u0027ll revisit those at implementation phase.\n\nAs agreed at the PTG, I can sign off for reviewing the implementation patches, for sure so let me +2 that spec.\n\nNice work, gmaan.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"f4ab39f9_70ed08bf","updated":"2025-12-04 09:43:20.000000000","message":"Looks good to me. I\u0027m holding my +2 not because technical reasons but because I need to figure out what implementation I will have time to review during the cycle.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"7bbfc894100e0728f090cb4af4642179403b923f","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"0ee75278_e379183d","updated":"2025-12-04 14:39:12.000000000","message":"given gibi and I committed on reviewing the series, let\u0027s accept this patch.\n@gmaan@ghanshyammann.com please provide a follow-up patch for my comments if you want now, or just await it when we will discuss on the implementation series.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"9ee2f9c5_533d0c59","updated":"2025-12-04 16:48:25.000000000","message":"proposed the follow up for comments fixing https://review.opendev.org/c/openstack/nova-specs/+/969813","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"f824ffbe3be8c50251b86eee2e27f78be4ea9485","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"8bdd47bb_5c83ed49","in_reply_to":"f4ab39f9_70ed08bf","updated":"2025-12-04 11:07:48.000000000","message":"my priorities for this cycle is clarified so Im fully committed now.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"}],"specs/2026.1/approved/nova-services-graceful-shutdown-part1.rst":[{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":24,"context_line":""},{"line_number":25,"context_line":"Nova services do not have a way to shutdown gracefully means they do not wait"},{"line_number":26,"context_line":"for the in-progress operations to be completed. When shutdown is initiated,"},{"line_number":27,"context_line":"services wait for the RPC server to stop and wait so that they can consume all"},{"line_number":28,"context_line":"the existing request messages (RPC call/cast) from the queue, but the service"},{"line_number":29,"context_line":"does not complete the operation."},{"line_number":30,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"f477018a_04a6d791","line":27,"range":{"start_line":27,"start_character":40,"end_line":27,"end_character":50},"updated":"2025-12-04 10:47:08.000000000","message":"nit: this is a duplicate ?","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":24,"context_line":""},{"line_number":25,"context_line":"Nova services do not have a way to shutdown gracefully means they do not wait"},{"line_number":26,"context_line":"for the in-progress operations to be completed. When shutdown is initiated,"},{"line_number":27,"context_line":"services wait for the RPC server to stop and wait so that they can consume all"},{"line_number":28,"context_line":"the existing request messages (RPC call/cast) from the queue, but the service"},{"line_number":29,"context_line":"does not complete the operation."},{"line_number":30,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"cb99d163_b5a3bb74","line":27,"range":{"start_line":27,"start_character":40,"end_line":27,"end_character":50},"in_reply_to":"f477018a_04a6d791","updated":"2025-12-04 16:48:25.000000000","message":"this is for RPC server where \u0027service\u0027 (means Nova service) will wait for RPC server to stop/wait","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":37,"context_line":"example, if live migration is in progress, the source and destination compute"},{"line_number":38,"context_line":"communicate (sync and async way) multiple times with each other. Once the RPC"},{"line_number":39,"context_line":"server on the compute service is stopped, it cannot communicate with the other"},{"line_number":40,"context_line":"compute and fail the live migration. It will lead the system as well as the"},{"line_number":41,"context_line":"instance to be in an unwanted or unrecoverable state"},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"Use Cases"}],"source_content_type":"text/x-rst","patch_set":4,"id":"8bb9a892_c8480e03","line":40,"range":{"start_line":40,"start_character":12,"end_line":40,"end_character":16},"updated":"2025-12-04 10:47:08.000000000","message":"fails","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":37,"context_line":"example, if live migration is in progress, the source and destination compute"},{"line_number":38,"context_line":"communicate (sync and async way) multiple times with each other. Once the RPC"},{"line_number":39,"context_line":"server on the compute service is stopped, it cannot communicate with the other"},{"line_number":40,"context_line":"compute and fail the live migration. It will lead the system as well as the"},{"line_number":41,"context_line":"instance to be in an unwanted or unrecoverable state"},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"Use Cases"}],"source_content_type":"text/x-rst","patch_set":4,"id":"7e8828f1_309ee2bf","line":40,"range":{"start_line":40,"start_character":12,"end_line":40,"end_character":16},"in_reply_to":"8bb9a892_c8480e03","updated":"2025-12-04 16:48:25.000000000","message":"Done","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":45,"context_line":""},{"line_number":46,"context_line":"As an operator, I want to be able to gracefully shut down (SIGTERM) the Nova"},{"line_number":47,"context_line":"services so that it will not impact the users\u0027 in-progress operations or"},{"line_number":48,"context_line":"keep resources in usable state."},{"line_number":49,"context_line":""},{"line_number":50,"context_line":"As an operator, I want to be able to keep instances and other resources in a"},{"line_number":51,"context_line":"usable state even if service is gracefully terminated (SIGTERM)."}],"source_content_type":"text/x-rst","patch_set":4,"id":"b3304297_a9d6c405","line":48,"updated":"2025-12-04 10:47:08.000000000","message":"++","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":59,"context_line":"Proposed change"},{"line_number":60,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"For detailed context, refer to the graceful shutdown `backlog spec`_."},{"line_number":63,"context_line":""},{"line_number":64,"context_line":"Split the new and in-progress requests via RPC:"},{"line_number":65,"context_line":"-----------------------------------------------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"c93c5903_a20ccc83","line":62,"updated":"2025-12-04 10:47:08.000000000","message":"thanks for the move of the details into the backlog spec, I just read them.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":113,"context_line":" * Requests handling:"},{"line_number":114,"context_line":" Nova scheduler service runs as multiple workers, each having its own RPC"},{"line_number":115,"context_line":" server, but all the Nova scheduler workers will listen to the same RPC"},{"line_number":116,"context_line":" topic and queue ``scheduler`` with fanout way."},{"line_number":117,"context_line":""},{"line_number":118,"context_line":" Currently, nova.service.py-\u003estop() calls stop() and wait() on RPC server."},{"line_number":119,"context_line":" Once RPC server is stopped, it will stop listening to any new messages."}],"source_content_type":"text/x-rst","patch_set":4,"id":"9dc27bd4_56a47491","line":116,"updated":"2025-12-04 10:47:08.000000000","message":"correct","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":117,"context_line":""},{"line_number":118,"context_line":" Currently, nova.service.py-\u003estop() calls stop() and wait() on RPC server."},{"line_number":119,"context_line":" Once RPC server is stopped, it will stop listening to any new messages."},{"line_number":120,"context_line":" But it will not impact anything on the other scheduler worker, and they"},{"line_number":121,"context_line":" continue listening to the same queue and process the request. If any of"},{"line_number":122,"context_line":" the scheduler worker is stopped, then the other workers will process the"},{"line_number":123,"context_line":" request."}],"source_content_type":"text/x-rst","patch_set":4,"id":"a4f4aafb_8dd01741","line":120,"range":{"start_line":120,"start_character":59,"end_line":120,"end_character":65},"updated":"2025-12-04 10:47:08.000000000","message":"workers","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":117,"context_line":""},{"line_number":118,"context_line":" Currently, nova.service.py-\u003estop() calls stop() and wait() on RPC server."},{"line_number":119,"context_line":" Once RPC server is stopped, it will stop listening to any new messages."},{"line_number":120,"context_line":" But it will not impact anything on the other scheduler worker, and they"},{"line_number":121,"context_line":" continue listening to the same queue and process the request. If any of"},{"line_number":122,"context_line":" the scheduler worker is stopped, then the other workers will process the"},{"line_number":123,"context_line":" request."}],"source_content_type":"text/x-rst","patch_set":4,"id":"cd8b63d7_a0508719","line":120,"range":{"start_line":120,"start_character":59,"end_line":120,"end_character":65},"in_reply_to":"a4f4aafb_8dd01741","updated":"2025-12-04 16:48:25.000000000","message":"Done","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":130,"context_line":""},{"line_number":131,"context_line":" We still need to keep the worker up until all the responses are sent via"},{"line_number":132,"context_line":" the reply queue, and for that, we need to implement the in-progress task"},{"line_number":133,"context_line":" tracking in scheduler services, but that will be handled in step 2."},{"line_number":134,"context_line":""},{"line_number":135,"context_line":" This way, stopping a Nova scheduler worker will not impact the RPC"},{"line_number":136,"context_line":" communication on the scheduler service."}],"source_content_type":"text/x-rst","patch_set":4,"id":"7e623fca_7cd0fb73","line":133,"updated":"2025-12-04 10:47:08.000000000","message":"yup, agreed on the phased approach","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":137,"context_line":""},{"line_number":138,"context_line":"* Nova conductor: No RPC change needed."},{"line_number":139,"context_line":""},{"line_number":140,"context_line":" The Nova conductor binary is a stateless service that can spawn multiple"},{"line_number":141,"context_line":" worker threads. Each instance of the Nova conductor has its own RPC server,"},{"line_number":142,"context_line":" but all the Nova conductor instances will listen to the same RPC topic"},{"line_number":143,"context_line":" and queue ``conductor``. This allows the conductor instance to ack as a"}],"source_content_type":"text/x-rst","patch_set":4,"id":"f4aba488_6f262913","line":140,"range":{"start_line":140,"start_character":33,"end_line":140,"end_character":50},"updated":"2025-12-04 10:47:08.000000000","message":"a stateless service that manages stateful objects but I hear you 😊","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":137,"context_line":""},{"line_number":138,"context_line":"* Nova conductor: No RPC change needed."},{"line_number":139,"context_line":""},{"line_number":140,"context_line":" The Nova conductor binary is a stateless service that can spawn multiple"},{"line_number":141,"context_line":" worker threads. Each instance of the Nova conductor has its own RPC server,"},{"line_number":142,"context_line":" but all the Nova conductor instances will listen to the same RPC topic"},{"line_number":143,"context_line":" and queue ``conductor``. This allows the conductor instance to ack as a"}],"source_content_type":"text/x-rst","patch_set":4,"id":"12591c42_70bfb487","line":140,"range":{"start_line":140,"start_character":33,"end_line":140,"end_character":50},"in_reply_to":"f4aba488_6f262913","updated":"2025-12-04 16:48:25.000000000","message":"Acknowledged","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":140,"context_line":" The Nova conductor binary is a stateless service that can spawn multiple"},{"line_number":141,"context_line":" worker threads. Each instance of the Nova conductor has its own RPC server,"},{"line_number":142,"context_line":" but all the Nova conductor instances will listen to the same RPC topic"},{"line_number":143,"context_line":" and queue ``conductor``. This allows the conductor instance to ack as a"},{"line_number":144,"context_line":" distributed worker pool such that stopping an individual conductor instance"},{"line_number":145,"context_line":" will not impact the RPC communication for the pool of conductor instances,"},{"line_number":146,"context_line":" allowing other available workers to process the request. Each cell has its"}],"source_content_type":"text/x-rst","patch_set":4,"id":"a87b93f4_627f7f7d","line":143,"range":{"start_line":143,"start_character":65,"end_line":143,"end_character":68},"updated":"2025-12-04 10:47:08.000000000","message":"act","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":140,"context_line":" The Nova conductor binary is a stateless service that can spawn multiple"},{"line_number":141,"context_line":" worker threads. Each instance of the Nova conductor has its own RPC server,"},{"line_number":142,"context_line":" but all the Nova conductor instances will listen to the same RPC topic"},{"line_number":143,"context_line":" and queue ``conductor``. This allows the conductor instance to ack as a"},{"line_number":144,"context_line":" distributed worker pool such that stopping an individual conductor instance"},{"line_number":145,"context_line":" will not impact the RPC communication for the pool of conductor instances,"},{"line_number":146,"context_line":" allowing other available workers to process the request. Each cell has its"}],"source_content_type":"text/x-rst","patch_set":4,"id":"c349038b_39c5ff86","line":143,"range":{"start_line":143,"start_character":65,"end_line":143,"end_character":68},"in_reply_to":"a87b93f4_627f7f7d","updated":"2025-12-04 16:48:25.000000000","message":"Done","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":146,"context_line":" allowing other available workers to process the request. Each cell has its"},{"line_number":147,"context_line":" own pool of conductors meaning as long as one conductor is up for any given"},{"line_number":148,"context_line":" cell the RPC communication will continue to function even when one or more"},{"line_number":149,"context_line":" conductors are stopped."},{"line_number":150,"context_line":""},{"line_number":151,"context_line":" The request and response handling is done in the same way as mentioned for"},{"line_number":152,"context_line":" the scheduler."}],"source_content_type":"text/x-rst","patch_set":4,"id":"a007abad_dfc16977","line":149,"updated":"2025-12-04 10:47:08.000000000","message":"that\u0027s correct","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":155,"context_line":""},{"line_number":156,"context_line":" This spec does not cover the conductor single worker case. That might"},{"line_number":157,"context_line":" requires the RPC designing for conductor as well but it need more"},{"line_number":158,"context_line":" investigation."},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"* Nova compute: RPC design change needed"},{"line_number":161,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"7a702809_837332d5","line":158,"updated":"2025-12-04 10:47:08.000000000","message":"I don\u0027t think we should recommend running a production environment with a single conductor worker, so I don\u0027t really see a priority here.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":155,"context_line":""},{"line_number":156,"context_line":" This spec does not cover the conductor single worker case. That might"},{"line_number":157,"context_line":" requires the RPC designing for conductor as well but it need more"},{"line_number":158,"context_line":" investigation."},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"* Nova compute: RPC design change needed"},{"line_number":161,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"3985134a_5f33058a","line":158,"in_reply_to":"7a702809_837332d5","updated":"2025-12-04 16:48:25.000000000","message":"Acknowledged","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":166,"context_line":" operations on the same RPC server. To achieve the graceful shutdown, we"},{"line_number":167,"context_line":" need to separate communication for the new requests and in-progress"},{"line_number":168,"context_line":" operations. This will be done by adding a new RPC server in the compute"},{"line_number":169,"context_line":" service."},{"line_number":170,"context_line":""},{"line_number":171,"context_line":" For easy readability, we will be using a different term for each RPC"},{"line_number":172,"context_line":" server:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"5c3dc356_ba5ff4a9","line":169,"updated":"2025-12-04 10:47:08.000000000","message":"+100","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":186,"context_line":""},{"line_number":187,"context_line":" * Each compute will have a new \u0027ops RPC server\u0027 which will listen to a new"},{"line_number":188,"context_line":" topic ``compute-ops.\u003chost\u003e``. ``compute-ops`` name is used because it"},{"line_number":189,"context_line":" is mainly for compute operations, but a better name can be used if"},{"line_number":190,"context_line":" needed."},{"line_number":191,"context_line":" * It will use the same transport layer/bus and exchange that the"},{"line_number":192,"context_line":" \u0027new request RPC server\u0027 uses."},{"line_number":193,"context_line":" * It will create its own dispatcher, listener, and queue."}],"source_content_type":"text/x-rst","patch_set":4,"id":"f7c38830_8d806d49","line":190,"range":{"start_line":189,"start_character":43,"end_line":190,"end_character":13},"updated":"2025-12-04 10:47:08.000000000","message":"yeah, we could and should bikeshed on the naming, particularly given operators will need to clearly understand the reasoning behind that new topic and RPC server.\n\nAs an analogy, some operators still sometimes struggle with understand what \u0027cell0\u0027 is and think that\u0027s the first and default cell we have.\n\nThat being said, I don\u0027t want that spec to be hold on a naming bikeshed, so let\u0027s do the naming discussion during the implementation phase (and we could amend that spec if the naming changes)","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":186,"context_line":""},{"line_number":187,"context_line":" * Each compute will have a new \u0027ops RPC server\u0027 which will listen to a new"},{"line_number":188,"context_line":" topic ``compute-ops.\u003chost\u003e``. ``compute-ops`` name is used because it"},{"line_number":189,"context_line":" is mainly for compute operations, but a better name can be used if"},{"line_number":190,"context_line":" needed."},{"line_number":191,"context_line":" * It will use the same transport layer/bus and exchange that the"},{"line_number":192,"context_line":" \u0027new request RPC server\u0027 uses."},{"line_number":193,"context_line":" * It will create its own dispatcher, listener, and queue."}],"source_content_type":"text/x-rst","patch_set":4,"id":"9993d23c_e635f3a1","line":190,"range":{"start_line":189,"start_character":43,"end_line":190,"end_character":13},"in_reply_to":"f7c38830_8d806d49","updated":"2025-12-04 16:48:25.000000000","message":"agree.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":216,"context_line":" requests are picked by the compute. The \u0027ops RPC server\u0027 is running and"},{"line_number":217,"context_line":" up."},{"line_number":218,"context_line":" * nova.service will wait for the manager to signal once all in-progress"},{"line_number":219,"context_line":" operations are finished."},{"line_number":220,"context_line":" * Once compute signal to nova.service, then it will stop the"},{"line_number":221,"context_line":" \u0027ops RPC server\u0027 and proceed with service shutdown."},{"line_number":222,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"1903d0f7_e2e40144","line":219,"updated":"2025-12-04 10:47:08.000000000","message":"long-running tasks can be very long (heh, pun not intended) so operators have to understand that the service could possibly be not stopped for a while except by sending SIGKILL to the process if they really want to stop nova-compute.\n\nlater: oh, this is commented later in the spec.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":216,"context_line":" requests are picked by the compute. The \u0027ops RPC server\u0027 is running and"},{"line_number":217,"context_line":" up."},{"line_number":218,"context_line":" * nova.service will wait for the manager to signal once all in-progress"},{"line_number":219,"context_line":" operations are finished."},{"line_number":220,"context_line":" * Once compute signal to nova.service, then it will stop the"},{"line_number":221,"context_line":" \u0027ops RPC server\u0027 and proceed with service shutdown."},{"line_number":222,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"a21cbe6d_de7a4989","line":219,"in_reply_to":"1903d0f7_e2e40144","updated":"2025-12-04 16:48:25.000000000","message":"Acknowledged","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":227,"context_line":" sent via \u0027new request RPC server\u0027."},{"line_number":228,"context_line":" * If any RPC cast/call wants to send a message via the \u0027ops RPC server\u0027,"},{"line_number":229,"context_line":" they need to override the ``topic`` to ``compute-ops.\u003chost\u003e`` during"},{"line_number":230,"context_line":" client.prepare() call."},{"line_number":231,"context_line":" * If the RPC client detects an old compute (based on version_cap), then it"},{"line_number":232,"context_line":" will fall back to send the message to the \u0027new request RPC server\u0027 topic"},{"line_number":233,"context_line":" ``compute.\u003chost\u003e``."}],"source_content_type":"text/x-rst","patch_set":4,"id":"96e695e9_a0d8471e","line":230,"updated":"2025-12-04 10:47:08.000000000","message":"I like that Adapter pattern, seems to me the cleanest.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":230,"context_line":" client.prepare() call."},{"line_number":231,"context_line":" * If the RPC client detects an old compute (based on version_cap), then it"},{"line_number":232,"context_line":" will fall back to send the message to the \u0027new request RPC server\u0027 topic"},{"line_number":233,"context_line":" ``compute.\u003chost\u003e``."},{"line_number":234,"context_line":" * Which RPC cast/call will be using the \u0027ops RPC server\u0027 will be decided"},{"line_number":235,"context_line":" during implementation, so that we can have a better judgment on what all"},{"line_number":236,"context_line":" methods are used for the operations we want to finish during shutdown."}],"source_content_type":"text/x-rst","patch_set":4,"id":"b62da4a6_f0ba3984","line":233,"updated":"2025-12-04 10:47:08.000000000","message":"yup, seems easily managed, and we do that already with old RPC major version bumps.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":238,"context_line":""},{"line_number":239,"context_line":" .. note::"},{"line_number":240,"context_line":""},{"line_number":241,"context_line":" This is draft list and can be changed during implementation."},{"line_number":242,"context_line":""},{"line_number":243,"context_line":" * Migrations:"},{"line_number":244,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"b92a3106_003fd0bc","line":241,"updated":"2025-12-04 10:47:08.000000000","message":"ack, we should reflect the implementation changes in the spec as a follow-up spec patch once we agree on the exhaustive list.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":true,"context_lines":[{"line_number":238,"context_line":""},{"line_number":239,"context_line":" .. note::"},{"line_number":240,"context_line":""},{"line_number":241,"context_line":" This is draft list and can be changed during implementation."},{"line_number":242,"context_line":""},{"line_number":243,"context_line":" * Migrations:"},{"line_number":244,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"2af2f1c3_efeec037","line":241,"in_reply_to":"b92a3106_003fd0bc","updated":"2025-12-04 16:48:25.000000000","message":"yeah, we can amend the list here once we finalize it in implementation. I will keep this comment as unresolved as a reminder.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":251,"context_line":" check_can_live_migrate_source methods, as this is the very initial"},{"line_number":252,"context_line":" phase where the compute service has not started the live"},{"line_number":253,"context_line":" migration. If shutdown is initiated before live migration request,"},{"line_number":254,"context_line":" came then migration should be rejected."},{"line_number":255,"context_line":""},{"line_number":256,"context_line":" - pre_live_migration()"},{"line_number":257,"context_line":" - live_migration()"}],"source_content_type":"text/x-rst","patch_set":4,"id":"d0700202_6942f810","line":254,"updated":"2025-12-04 10:47:08.000000000","message":"++","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":282,"context_line":" configurable time-based waiting for services to complete their operations."},{"line_number":283,"context_line":" * The wait time should be less than global graceful shutdown timeout. So that"},{"line_number":284,"context_line":" external system or oslo.service does not shut down the service before the"},{"line_number":285,"context_line":" service wait time is over."},{"line_number":286,"context_line":""},{"line_number":287,"context_line":"* Some specific examples of the shutdown issues which will be solved by this"},{"line_number":288,"context_line":" proposal:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"41b07949_a11d5c62","line":285,"updated":"2025-12-04 10:47:08.000000000","message":"as I said earlier, that timeout has to be correctly defined based on the fact live migrations can be very long to run.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":282,"context_line":" configurable time-based waiting for services to complete their operations."},{"line_number":283,"context_line":" * The wait time should be less than global graceful shutdown timeout. So that"},{"line_number":284,"context_line":" external system or oslo.service does not shut down the service before the"},{"line_number":285,"context_line":" service wait time is over."},{"line_number":286,"context_line":""},{"line_number":287,"context_line":"* Some specific examples of the shutdown issues which will be solved by this"},{"line_number":288,"context_line":" proposal:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"c03db7bb_9153a2b1","line":285,"in_reply_to":"41b07949_a11d5c62","updated":"2025-12-04 16:48:25.000000000","message":"Acknowledged","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":295,"context_line":" terminate the migration; instead will be able to wait for the migration"},{"line_number":296,"context_line":" to complete."},{"line_number":297,"context_line":" * Later, we will make long running migration to abort but that is out of"},{"line_number":298,"context_line":" scope from this spec."},{"line_number":299,"context_line":""},{"line_number":300,"context_line":" * Instance boot:"},{"line_number":301,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"2b101453_4b26be8b","line":298,"updated":"2025-12-04 10:47:08.000000000","message":"ack","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":295,"context_line":" terminate the migration; instead will be able to wait for the migration"},{"line_number":296,"context_line":" to complete."},{"line_number":297,"context_line":" * Later, we will make long running migration to abort but that is out of"},{"line_number":298,"context_line":" scope from this spec."},{"line_number":299,"context_line":""},{"line_number":300,"context_line":" * Instance boot:"},{"line_number":301,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"dc5f25de_1c0a7135","line":298,"in_reply_to":"2b101453_4b26be8b","updated":"2025-12-04 16:48:25.000000000","message":"Acknowledged","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":306,"context_line":" shutdown will wait for compute to boot them successfully."},{"line_number":307,"context_line":" * The instance external event will be received during graceful shutdown;"},{"line_number":308,"context_line":" therefore, an instance boot request will not be blocked for the"},{"line_number":309,"context_line":" external event."},{"line_number":310,"context_line":" * If a new instance boot request arrives after the shutdown is initiated,"},{"line_number":311,"context_line":" then it will stay in the queue, and the compute will handle it once it"},{"line_number":312,"context_line":" is started again."}],"source_content_type":"text/x-rst","patch_set":4,"id":"09d5caa0_d30b6a8e","line":309,"updated":"2025-12-04 10:47:08.000000000","message":"++","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":317,"context_line":""},{"line_number":318,"context_line":" As per testing till now (eventlet mode), it does not require any change in"},{"line_number":319,"context_line":" oslo.messaging but we need to test it by running compute in native thread"},{"line_number":320,"context_line":" mode (with oslo.service threading backend )."},{"line_number":321,"context_line":""},{"line_number":322,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":323,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"0794ce94_ca6f5da8","line":320,"updated":"2025-12-04 10:47:08.000000000","message":"++","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":336,"context_line":" is not finished."},{"line_number":337,"context_line":" * Its default value is 60 seconds, which is less for Nova services. The"},{"line_number":338,"context_line":" proposal is to override its default value to 180 sec for all the"},{"line_number":339,"context_line":" Nova services."},{"line_number":340,"context_line":" * The operator can override this value per Nova services."},{"line_number":341,"context_line":""},{"line_number":342,"context_line":" #. Timeout for Nova service to finish the in-progress tasks:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"f881af02_b7407048","line":339,"updated":"2025-12-04 10:47:08.000000000","message":"yup, 180 seconds seems realistic to me... except for live-migrations 😊","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":336,"context_line":" is not finished."},{"line_number":337,"context_line":" * Its default value is 60 seconds, which is less for Nova services. The"},{"line_number":338,"context_line":" proposal is to override its default value to 180 sec for all the"},{"line_number":339,"context_line":" Nova services."},{"line_number":340,"context_line":" * The operator can override this value per Nova services."},{"line_number":341,"context_line":""},{"line_number":342,"context_line":" #. Timeout for Nova service to finish the in-progress tasks:"}],"source_content_type":"text/x-rst","patch_set":4,"id":"cf6fb8cf_c28a72fa","line":339,"in_reply_to":"f881af02_b7407048","updated":"2025-12-04 16:48:25.000000000","message":"yeah, live migration are long one and it is configurable so operator have a way to put live migration into the graceful shutodwn window if they want.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":true,"context_lines":[{"line_number":350,"context_line":" takes more time to finish in-progress tasks, then oslo.service"},{"line_number":351,"context_line":" graceful_shutdown_timeout_ will not let cleanup_host() to be executed."},{"line_number":352,"context_line":" * We need to add this configurable timeout option per the Nova services"},{"line_number":353,"context_line":" and their default value should be lower than graceful_shutdown_timeout_,"},{"line_number":354,"context_line":""},{"line_number":355,"context_line":"* External system timeout:"},{"line_number":356,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"c0a5ce54_79a0a850","line":353,"updated":"2025-12-04 09:43:20.000000000","message":"I\u0027m afraid of these timeout handlings as it can be significantly different between eventlet and threading. But you noted the need of testing with threading. So I\u0027m OK to land this spec and figure out the implementation as we go.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":350,"context_line":" takes more time to finish in-progress tasks, then oslo.service"},{"line_number":351,"context_line":" graceful_shutdown_timeout_ will not let cleanup_host() to be executed."},{"line_number":352,"context_line":" * We need to add this configurable timeout option per the Nova services"},{"line_number":353,"context_line":" and their default value should be lower than graceful_shutdown_timeout_,"},{"line_number":354,"context_line":""},{"line_number":355,"context_line":"* External system timeout:"},{"line_number":356,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"08444651_cf3d4082","line":353,"in_reply_to":"4955df7a_96ed5a22","updated":"2025-12-04 16:48:25.000000000","message":"yeah, we will be going qwith the threading mode implementation on this and if there are any change in proposed timeout then I will amend the spec to reflect the implementation.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":350,"context_line":" takes more time to finish in-progress tasks, then oslo.service"},{"line_number":351,"context_line":" graceful_shutdown_timeout_ will not let cleanup_host() to be executed."},{"line_number":352,"context_line":" * We need to add this configurable timeout option per the Nova services"},{"line_number":353,"context_line":" and their default value should be lower than graceful_shutdown_timeout_,"},{"line_number":354,"context_line":""},{"line_number":355,"context_line":"* External system timeout:"},{"line_number":356,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"4955df7a_96ed5a22","line":353,"in_reply_to":"c0a5ce54_79a0a850","updated":"2025-12-04 10:47:08.000000000","message":"that option value being lower than the global shutdown value makes me a bit afraid, but like gibi, I want to leave that for implementation discussions.","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":false,"context_lines":[{"line_number":433,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":434,"context_line":"server (listening to ``compute.\u003chost\u003e``); and therefore graceful shutdown will"},{"line_number":435,"context_line":"not work on new compute nodes until all the computes are upgraded and the RPC"},{"line_number":436,"context_line":"version_cap is removed."},{"line_number":437,"context_line":""},{"line_number":438,"context_line":"Implementation"},{"line_number":439,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":4,"id":"659278be_5611b2e1","line":436,"updated":"2025-12-04 10:47:08.000000000","message":"++","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"0bbfa6a56b7db1e4cc0c81b0acae3a1510fd2f47","unresolved":true,"context_lines":[{"line_number":452,"context_line":"Feature Liaison"},{"line_number":453,"context_line":"---------------"},{"line_number":454,"context_line":""},{"line_number":455,"context_line":"gmaan"},{"line_number":456,"context_line":""},{"line_number":457,"context_line":"Work Items"},{"line_number":458,"context_line":"----------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"7c26bb64_b2543ed7","line":455,"updated":"2025-12-04 10:47:08.000000000","message":"nit: not necessary","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"47510d95153c7160d4b0c4f58f092e0b7e40ff3a","unresolved":false,"context_lines":[{"line_number":452,"context_line":"Feature Liaison"},{"line_number":453,"context_line":"---------------"},{"line_number":454,"context_line":""},{"line_number":455,"context_line":"gmaan"},{"line_number":456,"context_line":""},{"line_number":457,"context_line":"Work Items"},{"line_number":458,"context_line":"----------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"36565067_6ed016c9","line":455,"in_reply_to":"7c26bb64_b2543ed7","updated":"2025-12-04 16:48:25.000000000","message":"Done","commit_id":"4e95c63b60b4f7d12903d2cfbd6febdd84991c49"}],"specs/2026.1/approved/nova-services-graceful-shutdown-spec1.rst":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"217150af168485cf08bac2d03fa2e98e0bc51101","unresolved":true,"context_lines":[{"line_number":11,"context_line":"https://blueprints.launchpad.net/nova/+spec/nova-services-graceful-shutdown-spec1"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This proposes the spec 1 of the graceful shutdown backlog spec for the"},{"line_number":14,"context_line":"2026.1 cycle."},{"line_number":15,"context_line":""},{"line_number":16,"context_line":"Nova services do not shut down gracefully. When services are stopped, it also"},{"line_number":17,"context_line":"stops all the in-progress operations, which not only interrupt the in-progress"}],"source_content_type":"text/x-rst","patch_set":2,"id":"f0ba7491_77e2e720","line":14,"updated":"2025-12-02 22:09:37.000000000","message":"i have not had time to review this yet but skiming it i see a lot of duplication of context that is covered by the background spec.\n\ndo we need to do that or can we just make this fmore focused on the proposed changes and less on the background context?","commit_id":"4f9e871716bf427d3e97e7f3da40e4e33b0e7fb5"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":11,"context_line":"https://blueprints.launchpad.net/nova/+spec/nova-services-graceful-shutdown-spec1"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This proposes the spec 1 of the graceful shutdown backlog spec for the"},{"line_number":14,"context_line":"2026.1 cycle."},{"line_number":15,"context_line":""},{"line_number":16,"context_line":"Nova services do not shut down gracefully. When services are stopped, it also"},{"line_number":17,"context_line":"stops all the in-progress operations, which not only interrupt the in-progress"}],"source_content_type":"text/x-rst","patch_set":2,"id":"b25d70c8_697c16b2","line":14,"in_reply_to":"41967feb_fd841784","updated":"2025-12-03 22:55:31.000000000","message":"Acknowledged","commit_id":"4f9e871716bf427d3e97e7f3da40e4e33b0e7fb5"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"f638664e46f0e385fd0c6a9ee881ae283d8acdc5","unresolved":true,"context_lines":[{"line_number":11,"context_line":"https://blueprints.launchpad.net/nova/+spec/nova-services-graceful-shutdown-spec1"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This proposes the spec 1 of the graceful shutdown backlog spec for the"},{"line_number":14,"context_line":"2026.1 cycle."},{"line_number":15,"context_line":""},{"line_number":16,"context_line":"Nova services do not shut down gracefully. When services are stopped, it also"},{"line_number":17,"context_line":"stops all the in-progress operations, which not only interrupt the in-progress"}],"source_content_type":"text/x-rst","patch_set":2,"id":"f83c7b5c_fdf2294c","line":14,"in_reply_to":"f0ba7491_77e2e720","updated":"2025-12-02 22:58:28.000000000","message":"This is exactly the same content copied from the backlog spec. I think this is the way to propose things/part from backlog specs, but this is the first time I am doing as a backlog spec. If there is any other better way to do it, please let me know.","commit_id":"4f9e871716bf427d3e97e7f3da40e4e33b0e7fb5"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":11,"context_line":"https://blueprints.launchpad.net/nova/+spec/nova-services-graceful-shutdown-spec1"},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This proposes the spec 1 of the graceful shutdown backlog spec for the"},{"line_number":14,"context_line":"2026.1 cycle."},{"line_number":15,"context_line":""},{"line_number":16,"context_line":"Nova services do not shut down gracefully. When services are stopped, it also"},{"line_number":17,"context_line":"stops all the in-progress operations, which not only interrupt the in-progress"}],"source_content_type":"text/x-rst","patch_set":2,"id":"41967feb_fd841784","line":14,"in_reply_to":"f83c7b5c_fdf2294c","updated":"2025-12-03 15:02:37.000000000","message":"well that actully what i was objectign too\ncopy pastign the content.\nwhen you have a backlog spec and release sepc you are ment to read them togehter by reading the backlog spec first then going into more detail in teh release sepcific one\n\ni dont think it should be duplciate betwen the backlog spec and actual spec\nor rather the duplicaiton shoudl be minimised where possibel.\n\nthe backlog spec is ment to be hte hegher level design reqrueiemnt and direction i.e. the 10,000 feet view\n\nthe release spescif spec shoudl focus on the detailed design of the subset that will be adressed at the 1000 foot view.","commit_id":"4f9e871716bf427d3e97e7f3da40e4e33b0e7fb5"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":69,"context_line":" * New requests should not be lost. Once service is restarted, it should"},{"line_number":70,"context_line":" process the requests."},{"line_number":71,"context_line":" * Allow in-progress operations to reach their quickest safe termination"},{"line_number":72,"context_line":" point, either completion or abort."},{"line_number":73,"context_line":" * Proper logging of the state of in-progress operations"},{"line_number":74,"context_line":" * Keep instances or other resources in a usable state"},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"69fd7304_3b535767","line":72,"updated":"2025-12-03 14:21:20.000000000","message":"I guess in practice this means we want to abort any non post-copy live migration that are in flight as we don\u0027t know how long it would take to finish them but we know that abort will happen fairly quickly. (We cannot do that with migration already in post-copy state as abort there means we loose the running VM).\n\nI\u0027m wondering what other operations we want to abort by this principle? \n\nA cold migration, resize, snapshot, or shelve_offload that is still copying data is probably abortable without loosing anything other than the actual request (so the user needs to retry later). \n\nDo we have other long running operations where we want to abort?\n\nAre the above aborts part of the scope of this spec? or will be handed by a later spec? (I personally would not put the abort into the scope of step1 just to limit the scope, and put it into a later step)","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":69,"context_line":" * New requests should not be lost. Once service is restarted, it should"},{"line_number":70,"context_line":" process the requests."},{"line_number":71,"context_line":" * Allow in-progress operations to reach their quickest safe termination"},{"line_number":72,"context_line":" point, either completion or abort."},{"line_number":73,"context_line":" * Proper logging of the state of in-progress operations"},{"line_number":74,"context_line":" * Keep instances or other resources in a usable state"},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"b5efea16_10021cd9","line":72,"in_reply_to":"69fd7304_3b535767","updated":"2025-12-03 15:02:37.000000000","message":"we can abort post copy live migraton form the souce side if it has not entered post_live_migration i.e. strated runing o the dest.\n\nbut if it has then yes force_complte would be more correct.\n\nthe imporant thing is once we have hti post_live_migratoinif we are then triggered to terminate the instnace.host woudl idealy be updated to the dest as soon as possibel ebcause we want to avoid a hard reboot creating a second vm on the source node and possible corrupt guest data.\n\ndo you know if we heal that today form the destination node side? i dont think we do.\n\n\"cold migration, resize, snapshot, or shelve_offload\" i think should all be abortable yes.\n\n\none thing i was thinking we could do eventually is set a global boolean when we receive sigterm and then add checkpoint in some of the longer runnign function paths that woudl raise a ShutdownAbort exception to trigger early abort of the operation rather then trying to continue and complete the operation.\n\n\ni also think this makes sense for spawn or shorter running operation like itnerface/volume attach/detach although we would have to verify if they are at an abortable state. for spawn we already have Build Failed or build abort exceptions\n,i dont recall the name, that could be reused.\n\ni wonder if defining abort semantics for each of our instance operations shoudl be factored into its own spec that coudl be progresed in paralle with this RPC work.\n\ni.e. making sure we have good abort checkpoitn so we can shutdown faster.\n\nswap volume for example is one of the opertion im most concerned with to make sure we do not leak host volume attachments. so havign a way to recover form that or abort it would be a good future improvement.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":69,"context_line":" * New requests should not be lost. Once service is restarted, it should"},{"line_number":70,"context_line":" process the requests."},{"line_number":71,"context_line":" * Allow in-progress operations to reach their quickest safe termination"},{"line_number":72,"context_line":" point, either completion or abort."},{"line_number":73,"context_line":" * Proper logging of the state of in-progress operations"},{"line_number":74,"context_line":" * Keep instances or other resources in a usable state"},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"ce57f104_7bc64a17","line":72,"in_reply_to":"b5efea16_10021cd9","updated":"2025-12-03 22:55:31.000000000","message":"yeah \"cold migration, resize, snapshot, or shelve_offload\" are good candidate to abort but live migration is not easy to just abort those based on what stage it is.\n\nI did not plan this part for Spec 1 but for later. Yes, this part is little critical and need operation wise investigation and logic to abort. For that we might need to inject more checkpoints based on operation. And I agree it need a separate spec to investigate and add the details per operations.\n\nTo clarify, let me add this as a separate spec/step in backlog spec.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"e21edd37e6eaf7e2ef1944a67b91ed0ea10b8e51","unresolved":false,"context_lines":[{"line_number":69,"context_line":" * New requests should not be lost. Once service is restarted, it should"},{"line_number":70,"context_line":" process the requests."},{"line_number":71,"context_line":" * Allow in-progress operations to reach their quickest safe termination"},{"line_number":72,"context_line":" point, either completion or abort."},{"line_number":73,"context_line":" * Proper logging of the state of in-progress operations"},{"line_number":74,"context_line":" * Keep instances or other resources in a usable state"},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"d909124a_4a39023f","line":72,"in_reply_to":"ce57f104_7bc64a17","updated":"2025-12-03 23:55:29.000000000","message":"added it in backlog spec as spec3 https://review.opendev.org/c/openstack/nova-specs/+/969543","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":false,"context_lines":[{"line_number":69,"context_line":" * New requests should not be lost. Once service is restarted, it should"},{"line_number":70,"context_line":" process the requests."},{"line_number":71,"context_line":" * Allow in-progress operations to reach their quickest safe termination"},{"line_number":72,"context_line":" point, either completion or abort."},{"line_number":73,"context_line":" * Proper logging of the state of in-progress operations"},{"line_number":74,"context_line":" * Keep instances or other resources in a usable state"},{"line_number":75,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"916f4b81_32270544","line":72,"in_reply_to":"d909124a_4a39023f","updated":"2025-12-04 09:43:20.000000000","message":"Sounds good to me.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":138,"context_line":" If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":139,"context_line":" with \"503 Service Unavailable\" error."},{"line_number":140,"context_line":""},{"line_number":141,"context_line":" Testing:"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":" I tested two types of requests:"},{"line_number":144,"context_line":""},{"line_number":145,"context_line":" #. Sync request: \u0027openstack server list\u0027:"},{"line_number":146,"context_line":""},{"line_number":147,"context_line":" * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":148,"context_line":" server list API code."},{"line_number":149,"context_line":" * Start a API request \u0027request1\u0027: ``openstack server list``"},{"line_number":150,"context_line":" * Wait till the server list request reaches the Nova API (you can see"},{"line_number":151,"context_line":" the log from the controller)"},{"line_number":152,"context_line":" * Because of sleep(10), the server list takes time to finish."},{"line_number":153,"context_line":" * Initiate the Nova API service shutdown."},{"line_number":154,"context_line":" * Start a new API request \u0027request2\u0027: ``openstack server list``. This new"},{"line_number":155,"context_line":" requests came after shutdown is initiated so it should be denied."},{"line_number":156,"context_line":" * Nova API service will wait because \u0027request1\u0027 is not finished."},{"line_number":157,"context_line":" * \u0027request1\u0027 will get the response of the server list before the service"},{"line_number":158,"context_line":" is terminated."},{"line_number":159,"context_line":" * \u0027request2\u0027 is denied and will receive the error"},{"line_number":160,"context_line":" \"503 Service Unavailable\""},{"line_number":161,"context_line":""},{"line_number":162,"context_line":" #. Async request: ``openstack server pause \u003cserver\u003e``:"},{"line_number":163,"context_line":""},{"line_number":164,"context_line":" * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":165,"context_line":" server pause API code."},{"line_number":166,"context_line":" * Start a API request \u0027request1\u0027: ``openstack server pause server1``"},{"line_number":167,"context_line":" * Wait till the pause server request reaches the Nova API (you can see"},{"line_number":168,"context_line":" the log from the controller)"},{"line_number":169,"context_line":" * Because of sleep(10), the pause server takes time to finish."},{"line_number":170,"context_line":" * Initiate the Nova API service shutdown."},{"line_number":171,"context_line":" * Service will wait because \u0027request1\u0027 is not finished."},{"line_number":172,"context_line":" * Nova API will make an RPC cast to the Nova compute service and return."},{"line_number":173,"context_line":" * \u0027request1\u0027 is completed, and the response is returned to the user."},{"line_number":174,"context_line":" * Nova API service is terminated now."},{"line_number":175,"context_line":" * Nova compute service is operating the pause server request."},{"line_number":176,"context_line":" * Check if server is paused ``openstack server list``"},{"line_number":177,"context_line":" * You can see the server is paused."},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"* Nova console proxy services: nova-novncproxy, nova-serialproxy, and"},{"line_number":180,"context_line":" nova-spicehtml5proxy:"},{"line_number":181,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"9d3b3435_5e21bfbb","line":178,"range":{"start_line":141,"start_character":0,"end_line":178,"end_character":1},"updated":"2025-12-03 15:02:37.000000000","message":"i questiend the value of including this in the backlog spec and i think\nits even less useful here.\n\nyou have already defiend the behvior that will be observed in lines 138-139\n\nso this is really just adding more lenght to the spec without adding value","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":138,"context_line":" If any new request comes after the shutdown is initiated, it will be rejected"},{"line_number":139,"context_line":" with \"503 Service Unavailable\" error."},{"line_number":140,"context_line":""},{"line_number":141,"context_line":" Testing:"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":" I tested two types of requests:"},{"line_number":144,"context_line":""},{"line_number":145,"context_line":" #. Sync request: \u0027openstack server list\u0027:"},{"line_number":146,"context_line":""},{"line_number":147,"context_line":" * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":148,"context_line":" server list API code."},{"line_number":149,"context_line":" * Start a API request \u0027request1\u0027: ``openstack server list``"},{"line_number":150,"context_line":" * Wait till the server list request reaches the Nova API (you can see"},{"line_number":151,"context_line":" the log from the controller)"},{"line_number":152,"context_line":" * Because of sleep(10), the server list takes time to finish."},{"line_number":153,"context_line":" * Initiate the Nova API service shutdown."},{"line_number":154,"context_line":" * Start a new API request \u0027request2\u0027: ``openstack server list``. This new"},{"line_number":155,"context_line":" requests came after shutdown is initiated so it should be denied."},{"line_number":156,"context_line":" * Nova API service will wait because \u0027request1\u0027 is not finished."},{"line_number":157,"context_line":" * \u0027request1\u0027 will get the response of the server list before the service"},{"line_number":158,"context_line":" is terminated."},{"line_number":159,"context_line":" * \u0027request2\u0027 is denied and will receive the error"},{"line_number":160,"context_line":" \"503 Service Unavailable\""},{"line_number":161,"context_line":""},{"line_number":162,"context_line":" #. Async request: ``openstack server pause \u003cserver\u003e``:"},{"line_number":163,"context_line":""},{"line_number":164,"context_line":" * To observe the graceful shutdown, I added 10 seconds of sleep in the"},{"line_number":165,"context_line":" server pause API code."},{"line_number":166,"context_line":" * Start a API request \u0027request1\u0027: ``openstack server pause server1``"},{"line_number":167,"context_line":" * Wait till the pause server request reaches the Nova API (you can see"},{"line_number":168,"context_line":" the log from the controller)"},{"line_number":169,"context_line":" * Because of sleep(10), the pause server takes time to finish."},{"line_number":170,"context_line":" * Initiate the Nova API service shutdown."},{"line_number":171,"context_line":" * Service will wait because \u0027request1\u0027 is not finished."},{"line_number":172,"context_line":" * Nova API will make an RPC cast to the Nova compute service and return."},{"line_number":173,"context_line":" * \u0027request1\u0027 is completed, and the response is returned to the user."},{"line_number":174,"context_line":" * Nova API service is terminated now."},{"line_number":175,"context_line":" * Nova compute service is operating the pause server request."},{"line_number":176,"context_line":" * Check if server is paused ``openstack server list``"},{"line_number":177,"context_line":" * You can see the server is paused."},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"* Nova console proxy services: nova-novncproxy, nova-serialproxy, and"},{"line_number":180,"context_line":" nova-spicehtml5proxy:"},{"line_number":181,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"72ceae07_7eed0450","line":178,"range":{"start_line":141,"start_character":0,"end_line":178,"end_character":1},"in_reply_to":"9d3b3435_5e21bfbb","updated":"2025-12-03 22:55:31.000000000","message":"Done","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":199,"context_line":" get \"Disconnected\" message. Once service is up, the user can refresh the"},{"line_number":200,"context_line":" browser, and the console will be up again (if the token has not expired)."},{"line_number":201,"context_line":""},{"line_number":202,"context_line":"Spec 1: Split the new and in-progress requests via RPC:"},{"line_number":203,"context_line":"-------------------------------------------------------"},{"line_number":204,"context_line":""},{"line_number":205,"context_line":"RPC communication is an important part of services to finish a particular"}],"source_content_type":"text/x-rst","patch_set":3,"id":"e1abe9eb_e367d6a3","line":202,"updated":"2025-12-03 14:21:20.000000000","message":"I guess we don\u0027t have Spec 2 described in this doc any more so we can just remove \"Spec 1\" from the heading","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":199,"context_line":" get \"Disconnected\" message. Once service is up, the user can refresh the"},{"line_number":200,"context_line":" browser, and the console will be up again (if the token has not expired)."},{"line_number":201,"context_line":""},{"line_number":202,"context_line":"Spec 1: Split the new and in-progress requests via RPC:"},{"line_number":203,"context_line":"-------------------------------------------------------"},{"line_number":204,"context_line":""},{"line_number":205,"context_line":"RPC communication is an important part of services to finish a particular"}],"source_content_type":"text/x-rst","patch_set":3,"id":"900950e8_fddf6aa1","line":202,"in_reply_to":"b0e0f286_95ded64d","updated":"2025-12-03 22:55:31.000000000","message":"Done","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":199,"context_line":" get \"Disconnected\" message. Once service is up, the user can refresh the"},{"line_number":200,"context_line":" browser, and the console will be up again (if the token has not expired)."},{"line_number":201,"context_line":""},{"line_number":202,"context_line":"Spec 1: Split the new and in-progress requests via RPC:"},{"line_number":203,"context_line":"-------------------------------------------------------"},{"line_number":204,"context_line":""},{"line_number":205,"context_line":"RPC communication is an important part of services to finish a particular"}],"source_content_type":"text/x-rst","patch_set":3,"id":"b0e0f286_95ded64d","line":202,"in_reply_to":"e1abe9eb_e367d6a3","updated":"2025-12-03 15:02:37.000000000","message":"+1","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":352,"context_line":" * There is an existing graceful_shutdown_timeout_ config option present"},{"line_number":353,"context_line":" on oslo.service which can be set per service."},{"line_number":354,"context_line":" * That is honoured to timeout the service stop, and it will stop service"},{"line_number":355,"context_line":" irrespective of the compute finishing the things."},{"line_number":356,"context_line":""},{"line_number":357,"context_line":" * RPC client:"},{"line_number":358,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"ff43a46f_b1c3eed2","line":355,"updated":"2025-12-03 14:21:20.000000000","message":"Does oslo.service just kill the SIGTERM handler when the graceful_shutdown_timeout is reached? Or the nova SIGTERM handler implementation needs to take care of detecting if graceful_shutdown_timeout seconds are passes since it started executing and stop?\n\nThis is especially interesting in native threading mode where there is no easy way to inject a timeout in to an already executing thread. (In eventlet at each yield point the hub can inject a timeout exception into the greenlet)","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"857f5ba0b5e45de816dd377e5ca273ab91b7f107","unresolved":false,"context_lines":[{"line_number":352,"context_line":" * There is an existing graceful_shutdown_timeout_ config option present"},{"line_number":353,"context_line":" on oslo.service which can be set per service."},{"line_number":354,"context_line":" * That is honoured to timeout the service stop, and it will stop service"},{"line_number":355,"context_line":" irrespective of the compute finishing the things."},{"line_number":356,"context_line":""},{"line_number":357,"context_line":" * RPC client:"},{"line_number":358,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"44b002a1_4d1ce29d","line":355,"in_reply_to":"666624ee_c4fbad11","updated":"2025-12-03 23:22:24.000000000","message":"I have defined all the timeout things in a single section \u0027Graceful Shutdown Timeouts:\u0027","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":352,"context_line":" * There is an existing graceful_shutdown_timeout_ config option present"},{"line_number":353,"context_line":" on oslo.service which can be set per service."},{"line_number":354,"context_line":" * That is honoured to timeout the service stop, and it will stop service"},{"line_number":355,"context_line":" irrespective of the compute finishing the things."},{"line_number":356,"context_line":""},{"line_number":357,"context_line":" * RPC client:"},{"line_number":358,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"666624ee_c4fbad11","line":355,"in_reply_to":"ff43a46f_b1c3eed2","updated":"2025-12-03 22:55:31.000000000","message":"oslo.service take care of it. once graceful_shutdown_timeout is reached then SIGTERM handler will perform os._exit(1) in both case eventlet or threading mode.\n\nNow I am re-thinking that we need this Timeout (L57) to be a another permanent timeout in Nova. because if we only consider graceful_shutdown_timeout then oslo.service will abruptly exit and it will not allow nova to perform other operation (manager.cleanup_host()) after in-progress operations are waited. We need to timeout in-progress tasks wait/tracking before graceful_shutdown_timeout so that cleanup_host() can get some time to finish before it reach to graceful_shutdown_timeout and oslo.service stop the service.\n\nI will update it in backlog spec also","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":361,"context_line":" sent via \u0027new request RPC server\u0027."},{"line_number":362,"context_line":" * If any RPC cast/call wants to send a message via the \u0027ops RPC server\u0027,"},{"line_number":363,"context_line":" they need to override the ``topic`` to ``compute-ops.\u003chost\u003e`` during"},{"line_number":364,"context_line":" client.prepare() call."},{"line_number":365,"context_line":" * Which RPC cast/call will be using the \u0027ops RPC server\u0027 will be decided"},{"line_number":366,"context_line":" during implementation, so that we can have a better judgment on what all"},{"line_number":367,"context_line":" methods are used for the operations we want to finish during shutdown."}],"source_content_type":"text/x-rst","patch_set":3,"id":"ba285c05_a0743ced","line":364,"updated":"2025-12-03 14:21:20.000000000","message":"We have a single compute RPC API definition used by multiple services, nova-conductor, nova-api, and even nova-compute itself. I assume that we change the logic in this RPC API client code to start sending the RPC messages to the proper topic based on the\n* method: like build_and_run_instance will remain on RPC1, while external_event condiditionaly moves to RPC2\n* the RPC version_cap: If the version_cap is old then we keep sending all RPC methods to RPC1 as today.\n\nAm I correct?","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":false,"context_lines":[{"line_number":361,"context_line":" sent via \u0027new request RPC server\u0027."},{"line_number":362,"context_line":" * If any RPC cast/call wants to send a message via the \u0027ops RPC server\u0027,"},{"line_number":363,"context_line":" they need to override the ``topic`` to ``compute-ops.\u003chost\u003e`` during"},{"line_number":364,"context_line":" client.prepare() call."},{"line_number":365,"context_line":" * Which RPC cast/call will be using the \u0027ops RPC server\u0027 will be decided"},{"line_number":366,"context_line":" during implementation, so that we can have a better judgment on what all"},{"line_number":367,"context_line":" methods are used for the operations we want to finish during shutdown."}],"source_content_type":"text/x-rst","patch_set":3,"id":"3d6eccdb_0866fab1","line":364,"in_reply_to":"6c2eb5bb_7e2a966b","updated":"2025-12-04 09:43:20.000000000","message":"cool","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":361,"context_line":" sent via \u0027new request RPC server\u0027."},{"line_number":362,"context_line":" * If any RPC cast/call wants to send a message via the \u0027ops RPC server\u0027,"},{"line_number":363,"context_line":" they need to override the ``topic`` to ``compute-ops.\u003chost\u003e`` during"},{"line_number":364,"context_line":" client.prepare() call."},{"line_number":365,"context_line":" * Which RPC cast/call will be using the \u0027ops RPC server\u0027 will be decided"},{"line_number":366,"context_line":" during implementation, so that we can have a better judgment on what all"},{"line_number":367,"context_line":" methods are used for the operations we want to finish during shutdown."}],"source_content_type":"text/x-rst","patch_set":3,"id":"6c2eb5bb_7e2a966b","line":364,"in_reply_to":"ba285c05_a0743ced","updated":"2025-12-03 22:55:31.000000000","message":"yes, we will use the RPC client to send the msg to new topic. Based on version cap, we will fallback to the old topic (1st RPC server)\n\n- https://review.opendev.org/c/openstack/nova/+/967261/7/nova/compute/rpcapi.py#591","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":" .. note::"},{"line_number":408,"context_line":""},{"line_number":409,"context_line":" The time based waiting is a temporary solution in spec 1. In spec 2,"},{"line_number":410,"context_line":" it will be replaced by the proper tracking of in-progress tasks."},{"line_number":411,"context_line":""},{"line_number":412,"context_line":" * To make the graceful shutdown less complicated, spec 1 proposes to"}],"source_content_type":"text/x-rst","patch_set":3,"id":"66c43e36_c1e289a7","line":409,"range":{"start_line":409,"start_character":63,"end_line":409,"end_character":73},"updated":"2025-12-03 14:21:20.000000000","message":"\"In later specs\" Just to give use a bit of freedom :)","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":" .. note::"},{"line_number":408,"context_line":""},{"line_number":409,"context_line":" The time based waiting is a temporary solution in spec 1. In spec 2,"},{"line_number":410,"context_line":" it will be replaced by the proper tracking of in-progress tasks."},{"line_number":411,"context_line":""},{"line_number":412,"context_line":" * To make the graceful shutdown less complicated, spec 1 proposes to"}],"source_content_type":"text/x-rst","patch_set":3,"id":"13dd9e6e_c9b54e49","line":409,"range":{"start_line":409,"start_character":63,"end_line":409,"end_character":73},"in_reply_to":"66c43e36_c1e289a7","updated":"2025-12-03 15:02:37.000000000","message":"+1\nor we can just refecne the backlog sepc","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"857f5ba0b5e45de816dd377e5ca273ab91b7f107","unresolved":false,"context_lines":[{"line_number":414,"context_line":" * The wait time should be less than global graceful shutdown timeout. So that"},{"line_number":415,"context_line":" external system or oslo.service does not shut down the service before the"},{"line_number":416,"context_line":" service wait time is over."},{"line_number":417,"context_line":" * It will be configurable per service."},{"line_number":418,"context_line":" * Proposal for the default value:"},{"line_number":419,"context_line":""},{"line_number":420,"context_line":" * compute service: 150 sec, considering long-running operations on compute."},{"line_number":421,"context_line":" * conductor service: 60 sec should be enough."},{"line_number":422,"context_line":" * scheduler service: 60 sec should be enough."},{"line_number":423,"context_line":""},{"line_number":424,"context_line":"* PoC:"},{"line_number":425,"context_line":" This PoC shows the working of the spec 1 proposal."}],"source_content_type":"text/x-rst","patch_set":3,"id":"99e132ad_6fb17a44","line":422,"range":{"start_line":417,"start_character":0,"end_line":422,"end_character":49},"updated":"2025-12-03 23:22:24.000000000","message":"I have moved it in \u0027Graceful Shutdown Timeouts:\u0027 section","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":421,"context_line":" * conductor service: 60 sec should be enough."},{"line_number":422,"context_line":" * scheduler service: 60 sec should be enough."},{"line_number":423,"context_line":""},{"line_number":424,"context_line":"* PoC:"},{"line_number":425,"context_line":" This PoC shows the working of the spec 1 proposal."},{"line_number":426,"context_line":""},{"line_number":427,"context_line":" * Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":428,"context_line":" * PoC results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":429,"context_line":""},{"line_number":430,"context_line":"* Some specific examples of the shutdown issues which will be solved by this"},{"line_number":431,"context_line":" proposal:"},{"line_number":432,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"b55471be_c03c22b9","line":429,"range":{"start_line":424,"start_character":0,"end_line":429,"end_character":1},"updated":"2025-12-03 15:02:37.000000000","message":"i would remove this its already in the reference section.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":421,"context_line":" * conductor service: 60 sec should be enough."},{"line_number":422,"context_line":" * scheduler service: 60 sec should be enough."},{"line_number":423,"context_line":""},{"line_number":424,"context_line":"* PoC:"},{"line_number":425,"context_line":" This PoC shows the working of the spec 1 proposal."},{"line_number":426,"context_line":""},{"line_number":427,"context_line":" * Code change: https://review.opendev.org/c/openstack/nova/+/967261"},{"line_number":428,"context_line":" * PoC results: https://docs.google.com/document/d/1wd_VSw4fBYCXgyh5qwnjvjticNa8AnghzRmRH3H8pu4/"},{"line_number":429,"context_line":""},{"line_number":430,"context_line":"* Some specific examples of the shutdown issues which will be solved by this"},{"line_number":431,"context_line":" proposal:"},{"line_number":432,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"29d17011_09c7b021","line":429,"range":{"start_line":424,"start_character":0,"end_line":429,"end_character":1},"in_reply_to":"b55471be_c03c22b9","updated":"2025-12-03 22:55:31.000000000","message":"Done","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":436,"context_line":""},{"line_number":437,"context_line":" * If migration is in-progress then the service shutdown will not"},{"line_number":438,"context_line":" terminate the migration; instead will be able to wait for the migration"},{"line_number":439,"context_line":" to complete."},{"line_number":440,"context_line":" * Instance boot:"},{"line_number":441,"context_line":""},{"line_number":442,"context_line":" * Instance boot operations will continue to use the"}],"source_content_type":"text/x-rst","patch_set":3,"id":"d6ac3ec1_e780e9c1","line":439,"updated":"2025-12-03 14:21:20.000000000","message":"Somewhere here we need to make a statement about will we start aborting migrations or it is out of scope for this spec? (See my longer comment on the use case section above)","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":false,"context_lines":[{"line_number":436,"context_line":""},{"line_number":437,"context_line":" * If migration is in-progress then the service shutdown will not"},{"line_number":438,"context_line":" terminate the migration; instead will be able to wait for the migration"},{"line_number":439,"context_line":" to complete."},{"line_number":440,"context_line":" * Instance boot:"},{"line_number":441,"context_line":""},{"line_number":442,"context_line":" * Instance boot operations will continue to use the"}],"source_content_type":"text/x-rst","patch_set":3,"id":"bfe32c12_1c09f61f","line":439,"in_reply_to":"cba72b5f_99472efd","updated":"2025-12-04 09:43:20.000000000","message":"cool","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":436,"context_line":""},{"line_number":437,"context_line":" * If migration is in-progress then the service shutdown will not"},{"line_number":438,"context_line":" terminate the migration; instead will be able to wait for the migration"},{"line_number":439,"context_line":" to complete."},{"line_number":440,"context_line":" * Instance boot:"},{"line_number":441,"context_line":""},{"line_number":442,"context_line":" * Instance boot operations will continue to use the"}],"source_content_type":"text/x-rst","patch_set":3,"id":"cba72b5f_99472efd","line":439,"in_reply_to":"d6ac3ec1_e780e9c1","updated":"2025-12-03 22:55:31.000000000","message":"It is out of this spec, I will make it clear in backlog spec and here also","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":443,"context_line":" \u0027new request RPC server\u0027. Otherwise, we will not be able to stop the"},{"line_number":444,"context_line":" new requests."},{"line_number":445,"context_line":" * If instance boot requests are in progress by compute services, then"},{"line_number":446,"context_line":" shutdown will wait for compute to boot them successfully."},{"line_number":447,"context_line":" * If a new instance boot request arrives after the shutdown is initiated,"},{"line_number":448,"context_line":" then it will stay in the queue, and the compute will handle it once it"},{"line_number":449,"context_line":" is started again."}],"source_content_type":"text/x-rst","patch_set":3,"id":"09f48044_e34f127e","line":446,"updated":"2025-12-03 14:21:20.000000000","message":"Here I would add in that: And external_event are will be still received during graceful shutdown, therefor a boot request will be able to make progress and eventually succeed even if it needs to wait for such extrenal events arriving via RPC.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":443,"context_line":" \u0027new request RPC server\u0027. Otherwise, we will not be able to stop the"},{"line_number":444,"context_line":" new requests."},{"line_number":445,"context_line":" * If instance boot requests are in progress by compute services, then"},{"line_number":446,"context_line":" shutdown will wait for compute to boot them successfully."},{"line_number":447,"context_line":" * If a new instance boot request arrives after the shutdown is initiated,"},{"line_number":448,"context_line":" then it will stay in the queue, and the compute will handle it once it"},{"line_number":449,"context_line":" is started again."}],"source_content_type":"text/x-rst","patch_set":3,"id":"44d5b353_b66f5dbe","line":446,"in_reply_to":"09f48044_e34f127e","updated":"2025-12-03 15:02:37.000000000","message":"this is a place where we coudl abort the build although that problay shoudl be\nleft to a later spec","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":443,"context_line":" \u0027new request RPC server\u0027. Otherwise, we will not be able to stop the"},{"line_number":444,"context_line":" new requests."},{"line_number":445,"context_line":" * If instance boot requests are in progress by compute services, then"},{"line_number":446,"context_line":" shutdown will wait for compute to boot them successfully."},{"line_number":447,"context_line":" * If a new instance boot request arrives after the shutdown is initiated,"},{"line_number":448,"context_line":" then it will stay in the queue, and the compute will handle it once it"},{"line_number":449,"context_line":" is started again."}],"source_content_type":"text/x-rst","patch_set":3,"id":"bf02df4c_675bf05f","line":446,"in_reply_to":"44d5b353_b66f5dbe","updated":"2025-12-03 22:55:31.000000000","message":"Done","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":453,"context_line":".. note::"},{"line_number":454,"context_line":""},{"line_number":455,"context_line":" As per my PoC and manual testing till now, it does not require any"},{"line_number":456,"context_line":" change on oslo.messaging side."},{"line_number":457,"context_line":""},{"line_number":458,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":459,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":3,"id":"14b0176a_c284e4ca","line":456,"updated":"2025-12-03 14:21:20.000000000","message":"Hopefully this is true when we run this with the native threading oslo.service backend too. If you want you can test you poc together with native threaded compute by pulling the https://review.opendev.org/c/openstack/nova/+/965467 review (and its ancestors) into your env and using the OS_NOVA_DISABLE_EVENTLET_PATCHING\u003dtrue env var to switch nova-compute to native threaded move (including the oslo.service backend)","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":453,"context_line":".. note::"},{"line_number":454,"context_line":""},{"line_number":455,"context_line":" As per my PoC and manual testing till now, it does not require any"},{"line_number":456,"context_line":" change on oslo.messaging side."},{"line_number":457,"context_line":""},{"line_number":458,"context_line":"Graceful Shutdown Timeouts:"},{"line_number":459,"context_line":"---------------------------"}],"source_content_type":"text/x-rst","patch_set":3,"id":"c6c0797b_88dd10d0","line":456,"in_reply_to":"14b0176a_c284e4ca","updated":"2025-12-03 22:55:31.000000000","message":"sure, I can test that, I test compute with eventlet mode only. There is already change needed in oslo.service threading mode for graceful_shutdown_timeout so there are deps on oslo lib, I will note that here.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":468,"context_line":" finished."},{"line_number":469,"context_line":" * No new configurable timeout will be added for the Nova, instead it will use"},{"line_number":470,"context_line":" the existing graceful_shutdown_timeout_."},{"line_number":471,"context_line":" * Its default value is 60 sec, which is less for Nova services. The proposal"},{"line_number":472,"context_line":" is to override its default value per Nova services:"},{"line_number":473,"context_line":""},{"line_number":474,"context_line":" * compute service: 180 sec (Considering the long running tasks)."},{"line_number":475,"context_line":" * conductor service: 80 sec"},{"line_number":476,"context_line":" * scheduler service: 80 sec"},{"line_number":477,"context_line":""},{"line_number":478,"context_line":"* External system timeout:"},{"line_number":479,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"3eb74111_f7844d82","line":476,"range":{"start_line":471,"start_character":2,"end_line":476,"end_character":33},"updated":"2025-12-03 15:02:37.000000000","message":"you shoudl define how yoru goign to do this\n\ni assume you will do this form teh cmd moduels for each binary\n\ncalling set_default\n\nhttps://docs.openstack.org/oslo.config/latest/reference/locations.html\nhttps://docs.openstack.org/oslo.config/latest/reference/api/oslo_config.html#oslo_config.cfg.set_defaults\nhttps://docs.openstack.org/oslo.config/latest/reference/api/oslo_config.html#oslo_config.fixture.Config.set_default\n\nthat will allow per service overeid but still give precendce to a user value\n\nthe main concern i have with tha tis the effect on the help text a s we woudl need to overight that as well to doument the per service defaults.\n\nso we need to be carful to ensure this is doucmented properly.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":468,"context_line":" finished."},{"line_number":469,"context_line":" * No new configurable timeout will be added for the Nova, instead it will use"},{"line_number":470,"context_line":" the existing graceful_shutdown_timeout_."},{"line_number":471,"context_line":" * Its default value is 60 sec, which is less for Nova services. The proposal"},{"line_number":472,"context_line":" is to override its default value per Nova services:"},{"line_number":473,"context_line":""},{"line_number":474,"context_line":" * compute service: 180 sec (Considering the long running tasks)."},{"line_number":475,"context_line":" * conductor service: 80 sec"},{"line_number":476,"context_line":" * scheduler service: 80 sec"},{"line_number":477,"context_line":""},{"line_number":478,"context_line":"* External system timeout:"},{"line_number":479,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"ca83b0cf_7cef74a9","line":476,"range":{"start_line":471,"start_character":2,"end_line":476,"end_character":33},"in_reply_to":"3eb74111_f7844d82","updated":"2025-12-03 22:55:31.000000000","message":"We need to do litlte more that oslo_config.set_defaults(). Nova cannot directly change its default as that config option is owned by the oslo.service. We have to implement a set_defaults() in oslo.service which will take opts graceful_shutdown_timeout is present and graceful_shutdown_timeout_\u003d\u003cnew default value\u003e. Then oslo.service.set_defaults() will first register the config and then set new default in oslo.config.\n\nSomething like below:\n\noslo.service:\n\n def set_service_opts_defaults(conf, **kwargs):\n\n conf.register_opts(service_opts )\n\n if kwargs:\n cfg.set_defaults(service_opts , **kwargs)\n\n\nNova will call it:\n\n oslo.service-\u003eset_defaults(conf, graceful_shutdown_timeout\u003d180)\n\nAs you mentioned, it will not improve the help message. As it is in default section, Nova config shows it in default and there we will be having difficulties to separate this per nova service inhttps://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.graceful_shutdown_timeout\n\nI thought of registering the graceful_shutdown_timeout in nova per service, for example nova.compute.graceful_shutdown_timeout nova.conductor.graceful_shutdown_timeout but that would not work as oslo.service has the entry point for SIGTERM handler and their graceful_shutdown_timeout will anyways come into the pic.\n\nI think we can have a single default value overridden say 180 for all nova services which will be less confusing for users. They can anyways override it per nova service.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":554,"context_line":"the new \u0027ops RPC server\u0027 listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":555,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":556,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":557,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":558,"context_line":""},{"line_number":559,"context_line":"Implementation"},{"line_number":560,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"39f08b06_db94e6ea","line":557,"updated":"2025-12-03 14:21:20.000000000","message":"and therefore graceful shutdown will not work on new compute nodes until all the computes are upgraded and the RPC version_cap is removed.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":554,"context_line":"the new \u0027ops RPC server\u0027 listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":555,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":556,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":557,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":558,"context_line":""},{"line_number":559,"context_line":"Implementation"},{"line_number":560,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"b9bf9e00_492c6203","line":557,"in_reply_to":"22579fa0_e37a710c","updated":"2025-12-03 22:55:31.000000000","message":"Instead of error it will fallback to the 1st RPC server (old topic \u0027compute.\u003chost\u003e\u0027) and that way it will be\n- no graceful shutdown until all computes are new or version_cap is at new computes\n- in mixed env, things will continue working as per old compute","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":554,"context_line":"the new \u0027ops RPC server\u0027 listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":555,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":556,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":557,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":558,"context_line":""},{"line_number":559,"context_line":"Implementation"},{"line_number":560,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"22579fa0_e37a710c","line":557,"in_reply_to":"39f08b06_db94e6ea","updated":"2025-12-03 15:02:37.000000000","message":"ack ya i was wonderign how we were going to handel the upgrade case.\n\nso we are not going to supprot mixed upgraded and non upgraded ndoes wehre some use one approch and the ohter do not. we are going to docuemtn that you have to pin the rpc version\n\n\ni think only supprotign this when all comptue are upgraded makes sesne but if we are doing that shoudl we not also have a compute service version bump and rasie an error if you try to use the new rpc version if we have comptues below the min verion?","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c0fd8c8f6a1826a9ae169f8d0612cda62904fe1","unresolved":false,"context_lines":[{"line_number":554,"context_line":"the new \u0027ops RPC server\u0027 listening on topic RPC_TOPIC_OPS, so we need to handle"},{"line_number":555,"context_line":"it with RPC versioning. If the RPC client detects an old compute (based on"},{"line_number":556,"context_line":"version_cap), then it will fall back to send the message to the original RPC"},{"line_number":557,"context_line":"server (listening to RPC_TOPIC)."},{"line_number":558,"context_line":""},{"line_number":559,"context_line":"Implementation"},{"line_number":560,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"606827d1_83e03369","line":557,"in_reply_to":"b9bf9e00_492c6203","updated":"2025-12-04 09:43:20.000000000","message":"thanks for the added not. Looks good","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d93b7f75a74e12ad1cbf0a68e31fa8bdeb5a1bdc","unresolved":true,"context_lines":[{"line_number":587,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":588,"context_line":""},{"line_number":589,"context_line":"* No dependency as of now, but we will see during implementation if any change"},{"line_number":590,"context_line":" is needed in oslo.messaging."},{"line_number":591,"context_line":""},{"line_number":592,"context_line":""},{"line_number":593,"context_line":"Testing"}],"source_content_type":"text/x-rst","patch_set":3,"id":"eeb433fa_07e8edc6","line":590,"updated":"2025-12-03 14:21:20.000000000","message":"We expect that this feature works both in eventlet and in native threading mode. But nova-compute native threading support is not landed yet. We need to be careful to at least cross test between the two parallel work to detect if adjustment is needed.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":587,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":588,"context_line":""},{"line_number":589,"context_line":"* No dependency as of now, but we will see during implementation if any change"},{"line_number":590,"context_line":" is needed in oslo.messaging."},{"line_number":591,"context_line":""},{"line_number":592,"context_line":""},{"line_number":593,"context_line":"Testing"}],"source_content_type":"text/x-rst","patch_set":3,"id":"8565260c_261a30a6","line":590,"in_reply_to":"797c8510_cf81b9dd","updated":"2025-12-03 22:55:31.000000000","message":"yeah, agree to land the eventlet first. I will be testing it in threading more first.\n\nAlso, it has oslo.service deps where graceful_shutdown_timeout is not taken care for threading mdoe.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":587,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":588,"context_line":""},{"line_number":589,"context_line":"* No dependency as of now, but we will see during implementation if any change"},{"line_number":590,"context_line":" is needed in oslo.messaging."},{"line_number":591,"context_line":""},{"line_number":592,"context_line":""},{"line_number":593,"context_line":"Testing"}],"source_content_type":"text/x-rst","patch_set":3,"id":"797c8510_cf81b9dd","line":590,"in_reply_to":"eeb433fa_07e8edc6","updated":"2025-12-03 15:02:37.000000000","message":"i think we need to require that.\n\nif this woudl only work in eventlet mode we shoudl not progress with it until we have a way for it ot work in threaded mode.\ni woudl laso be ok with progressing with this if it only worked in threaded mode.\n\nwe just cant allow this to delay the eventlet removal or cause a regression.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"d220a01e9f8c5db85feb4be384896d70a849b745","unresolved":true,"context_lines":[{"line_number":604,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":605,"context_line":""},{"line_number":606,"context_line":"Graceful shutdown working will be documented along with other considerations,"},{"line_number":607,"context_line":"for example, timeout or wait time considered for the graceful shutdown."},{"line_number":608,"context_line":""},{"line_number":609,"context_line":"References"},{"line_number":610,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"9cc6a939_8f0ee25b","line":607,"updated":"2025-12-03 15:02:37.000000000","message":"this shoudl inlcude ensuring the graceful_shutdown_timeout help text is overriden\nform oslos default to include the per sevice defaults and prefereably better wording in general.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"},{"author":{"_account_id":8556,"name":"Ghanshyam Maan","display_name":"Ghanshyam Maan","email":"gmaan.os14@gmail.com","username":"ghanshyam"},"change_message_id":"c2a12e43bb08d54a2488bf0e7fdf194bd3a450ed","unresolved":false,"context_lines":[{"line_number":604,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":605,"context_line":""},{"line_number":606,"context_line":"Graceful shutdown working will be documented along with other considerations,"},{"line_number":607,"context_line":"for example, timeout or wait time considered for the graceful shutdown."},{"line_number":608,"context_line":""},{"line_number":609,"context_line":"References"},{"line_number":610,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":3,"id":"2edf4452_535313f3","line":607,"in_reply_to":"9cc6a939_8f0ee25b","updated":"2025-12-03 22:55:31.000000000","message":"AFAIK, we cannot override the help msg. I replied it in another comment to have a single default for nova. that is easy and safe.","commit_id":"d8fe9b73cf8a6aa8270edf1d6ea973eb8e77bd23"}]}