)]}' {"/PATCHSET_LEVEL":[{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"436ad07d2cd0858c3c4d05ddbad5540b3c3376e2","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"9db6e3f7_f4af842d","updated":"2025-01-08 15:23:29.000000000","message":"we said in the upstream nova meeting we would prefer to first accept https://blueprints.launchpad.net/nova/+spec/distributed-host-discovery","commit_id":"f9d0d54f2dea5ca19b2e5a1f4e433ff123a04636"}],"specs/2025.1/approved/host-discovery-distributed-lock.rst":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":37,"context_line":"* Database conflicts from simultaneous host mapping attempts"},{"line_number":38,"context_line":"* Requires operator intervention to manually fix host mappings or"},{"line_number":39,"context_line":" force re-discovery when hosts are missed"},{"line_number":40,"context_line":""},{"line_number":41,"context_line":"Use Cases"},{"line_number":42,"context_line":"---------"},{"line_number":43,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"fef935cf_23d13920","line":40,"updated":"2024-11-27 19:45:38.000000000","message":"this problem statement is a well known (with in the nova comuntity) limitation\nand somethign currently unsupproted.\n\n\nwe support but do not recommend using the periodic.\nwhen the periodic is used it is recommended that its only used on one of the scheduler instances.\n\n\nwe can make this better but currently our recommenadion is to only run discover host as a ad hoc prodecure when new compute nodes are added.\n\nwe can do better but in the past adding a distributed lock manager was considered too heavy weait a runtim requirement given the load the perodic create and the scalablity implciates that creates.\n\nin general we do not recomemdn using the current perodic in production at large scale unless it run infrequetly.\n\nthe periodic is most usually in a highly dynmaic environment where hosts are added frequently. in such an environment running the perodic with a large interval defeats the benefit.\n\n\nso while i agree we can do better to adress this pain point, any solution should have minimal overhead and ideally minimal dependencies.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"a9d195d7633557872faa99861742249ffb97bbd3","unresolved":true,"context_lines":[{"line_number":37,"context_line":"* Database conflicts from simultaneous host mapping attempts"},{"line_number":38,"context_line":"* Requires operator intervention to manually fix host mappings or"},{"line_number":39,"context_line":" force re-discovery when hosts are missed"},{"line_number":40,"context_line":""},{"line_number":41,"context_line":"Use Cases"},{"line_number":42,"context_line":"---------"},{"line_number":43,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"d7fae871_4b97ca4e","line":40,"in_reply_to":"fef935cf_23d13920","updated":"2024-12-05 13:55:04.000000000","message":"we should defiantly have a solution for the problem, this can be any acceptable one, we need a decision here how to handle this issue and need it to be formal","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":71,"context_line":""},{"line_number":72,"context_line":" * Initialize coordination when enabled"},{"line_number":73,"context_line":" * Acquire a distributed lock before executing host discovery"},{"line_number":74,"context_line":" * Release the lock after completion"},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"The implementation will use the `tooz` library which Nova already depends"},{"line_number":77,"context_line":"on for other coordination needs."}],"source_content_type":"text/x-rst","patch_set":2,"id":"42d00683_3ab6964d","line":74,"updated":"2024-11-27 19:45:38.000000000","message":"so this requires carful handling in error cases to ensure we do not deadlock forever.\n\ni.e. if you interupt the cli or the process executing the perodic is terminated abruptly.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"a9d195d7633557872faa99861742249ffb97bbd3","unresolved":true,"context_lines":[{"line_number":71,"context_line":""},{"line_number":72,"context_line":" * Initialize coordination when enabled"},{"line_number":73,"context_line":" * Acquire a distributed lock before executing host discovery"},{"line_number":74,"context_line":" * Release the lock after completion"},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"The implementation will use the `tooz` library which Nova already depends"},{"line_number":77,"context_line":"on for other coordination needs."}],"source_content_type":"text/x-rst","patch_set":2,"id":"539999c3_9bee4d8b","line":74,"in_reply_to":"42d00683_3ab6964d","updated":"2024-12-05 13:55:04.000000000","message":"can also have a TTL on lock, so DB (one that is used for locking) itself will expire it","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":102,"context_line":""},{"line_number":103,"context_line":"2. Database-level locking:"},{"line_number":104,"context_line":""},{"line_number":105,"context_line":" * Add database table for discovery locking"},{"line_number":106,"context_line":" * Use database transactions for coordination"},{"line_number":107,"context_line":" * Handle lock timeouts at the database level"},{"line_number":108,"context_line":" * Integrate with existing database operations"}],"source_content_type":"text/x-rst","patch_set":2,"id":"ffe6f006_105d5023","line":105,"updated":"2024-11-27 19:45:38.000000000","message":"i think any approch that need a new db table is likely to heavey weight of a solution to this problem.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":103,"context_line":"2. Database-level locking:"},{"line_number":104,"context_line":""},{"line_number":105,"context_line":" * Add database table for discovery locking"},{"line_number":106,"context_line":" * Use database transactions for coordination"},{"line_number":107,"context_line":" * Handle lock timeouts at the database level"},{"line_number":108,"context_line":" * Integrate with existing database operations"},{"line_number":109,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"d532aaaa_35eb6f7e","line":106,"updated":"2024-11-27 19:45:38.000000000","message":"we already use write transactions for coordination but that combined with our unique constraints is what leads to the current conflicts.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":116,"context_line":" * Disadvantages:"},{"line_number":117,"context_line":""},{"line_number":118,"context_line":" * Adds database migration"},{"line_number":119,"context_line":" * Increased database load under contention"},{"line_number":120,"context_line":" * May impact other database operations"},{"line_number":121,"context_line":""},{"line_number":122,"context_line":"3. Graceful conflict handling:"}],"source_content_type":"text/x-rst","patch_set":2,"id":"380871d8_c6eb791d","line":119,"updated":"2024-11-27 19:45:38.000000000","message":"the perodic already increase database load more then this would so thats a valid concern but if you are enabling the perodic its small in comparison to the load created by the perodic itself","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":124,"context_line":" * Handle database conflicts gracefully"},{"line_number":125,"context_line":" * Continue with remaining mappings when conflicts occur"},{"line_number":126,"context_line":" * Downgrade duplicate mapping errors to debug level"},{"line_number":127,"context_line":" * Trust that missed mappings will be picked up in the next run"},{"line_number":128,"context_line":""},{"line_number":129,"context_line":" * Advantages:"},{"line_number":130,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"742e737c_77106999","line":127,"updated":"2024-11-27 19:45:38.000000000","message":"this is not nessiarly an alternitive as it could complement any of the other aprpcohes. i.e. better error handling and robustness is orthogonal to the other issues.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"a9d195d7633557872faa99861742249ffb97bbd3","unresolved":true,"context_lines":[{"line_number":124,"context_line":" * Handle database conflicts gracefully"},{"line_number":125,"context_line":" * Continue with remaining mappings when conflicts occur"},{"line_number":126,"context_line":" * Downgrade duplicate mapping errors to debug level"},{"line_number":127,"context_line":" * Trust that missed mappings will be picked up in the next run"},{"line_number":128,"context_line":""},{"line_number":129,"context_line":" * Advantages:"},{"line_number":130,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"924ec70b_70234d9e","line":127,"in_reply_to":"742e737c_77106999","updated":"2024-12-05 13:55:04.000000000","message":"Agree, can be a point on all alternatives","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":143,"context_line":""},{"line_number":144,"context_line":" * Deprecate periodic host discovery tasks"},{"line_number":145,"context_line":" * Recommend usage of CLI-based host discovery"},{"line_number":146,"context_line":" * Recommend usage of external scheduling for concurrent host discovery"},{"line_number":147,"context_line":""},{"line_number":148,"context_line":" * Advantages:"},{"line_number":149,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"ef26be39_bef58bbb","line":146,"updated":"2024-11-27 19:45:38.000000000","message":"this would be documenting oru current best practices.\n\ni think even if we supported parallel perodic we would still recommend that you do not use them with large clouds to minimise db load.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"98ac04a1b05ac7a590658bc7918e7fe275bc0e86","unresolved":true,"context_lines":[{"line_number":143,"context_line":""},{"line_number":144,"context_line":" * Deprecate periodic host discovery tasks"},{"line_number":145,"context_line":" * Recommend usage of CLI-based host discovery"},{"line_number":146,"context_line":" * Recommend usage of external scheduling for concurrent host discovery"},{"line_number":147,"context_line":""},{"line_number":148,"context_line":" * Advantages:"},{"line_number":149,"context_line":""}],"source_content_type":"text/x-rst","patch_set":2,"id":"214fc9be_0bf41516","line":146,"in_reply_to":"ef26be39_bef58bbb","updated":"2024-11-27 19:56:41.000000000","message":"explicitly documenting is also an ok solution, what is bad is the uncertainty how concurrency problem should be solved in what place and by whom","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":156,"context_line":" * Doesn\u0027t solve the underlying technical issue"},{"line_number":157,"context_line":" * Requires manual/external coordination"},{"line_number":158,"context_line":""},{"line_number":159,"context_line":"5. Rendezvous hashing-based distribution:"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":" * Use rendezvous hashing to distribute mapping tasks between schedulers"},{"line_number":162,"context_line":" * Handle database conflicts gracefully and proceed with remaining mappings"}],"source_content_type":"text/x-rst","patch_set":2,"id":"fe3aa251_70cc7d31","line":159,"updated":"2024-11-27 19:45:38.000000000","message":"this is one of the approchs i was thinking of\n\nthe other way of useing Rendezvous hashing-based distribution\nwas actully do do leader election\n\nbasically the scduler are all part of a service group currently and we report the \"up\" status as part fo group memebrship\n\nwe coudl provide a way to get a list fo the \"up\" group member and the use a hash\nto elect a leader form the up set.\n\neach schduler would run teh perod an exit at the top of the loop if it was not the current leader.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"54ee79f5b49f02544f0db72e64a2d9d23759c109","unresolved":true,"context_lines":[{"line_number":156,"context_line":" * Doesn\u0027t solve the underlying technical issue"},{"line_number":157,"context_line":" * Requires manual/external coordination"},{"line_number":158,"context_line":""},{"line_number":159,"context_line":"5. Rendezvous hashing-based distribution:"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":" * Use rendezvous hashing to distribute mapping tasks between schedulers"},{"line_number":162,"context_line":" * Handle database conflicts gracefully and proceed with remaining mappings"}],"source_content_type":"text/x-rst","patch_set":2,"id":"33033b22_72f84e11","line":159,"in_reply_to":"5e2ad4bd_54aa226b","updated":"2025-01-06 21:29:06.000000000","message":"i have only tested this in unit tests buti belive \n\nhttps://review.opendev.org/c/openstack/nova/+/938523\n\nwould work\nlets see what the ci says.\n\n\nthat is a very minimal increase in complexity to enable the usecase.\ncollisions coudl still happing if ran form nova-manage and the perodic at the same time but i think thats accpetable.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"3b7a557767e032d2c88c2994f5d2f585373c2d83","unresolved":true,"context_lines":[{"line_number":156,"context_line":" * Doesn\u0027t solve the underlying technical issue"},{"line_number":157,"context_line":" * Requires manual/external coordination"},{"line_number":158,"context_line":""},{"line_number":159,"context_line":"5. Rendezvous hashing-based distribution:"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":" * Use rendezvous hashing to distribute mapping tasks between schedulers"},{"line_number":162,"context_line":" * Handle database conflicts gracefully and proceed with remaining mappings"}],"source_content_type":"text/x-rst","patch_set":2,"id":"5e2ad4bd_54aa226b","line":159,"in_reply_to":"fd652615_1eb60e37","updated":"2024-12-05 14:00:47.000000000","message":"```\nservices \u003d objects.ServiceList.get_by_binary(ctxt, \u0027nova-scheduler\u0027)\nleader \u003d sorted([service.uuid for service in services if service.up])[0]\n\nif self.service_ref.uuid !\u003d leader\n return \n\n... do current perodic.\n```","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"a9d195d7633557872faa99861742249ffb97bbd3","unresolved":true,"context_lines":[{"line_number":156,"context_line":" * Doesn\u0027t solve the underlying technical issue"},{"line_number":157,"context_line":" * Requires manual/external coordination"},{"line_number":158,"context_line":""},{"line_number":159,"context_line":"5. Rendezvous hashing-based distribution:"},{"line_number":160,"context_line":""},{"line_number":161,"context_line":" * Use rendezvous hashing to distribute mapping tasks between schedulers"},{"line_number":162,"context_line":" * Handle database conflicts gracefully and proceed with remaining mappings"}],"source_content_type":"text/x-rst","patch_set":2,"id":"fd652615_1eb60e37","line":159,"in_reply_to":"fe3aa251_70cc7d31","updated":"2024-12-05 13:55:04.000000000","message":"I think this is what you\u0027ve suggested in IRC\nhttps://paste.opendev.org/show/bFFLh0pVeBZAOYXoNjKs/\nin https://github.com/openstack/nova/blob/master/nova/scheduler/manager.py#L111\n\n(putting here for context)","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"277c4cc530de5af1d92d85822b41a2e9b1b44c75","unresolved":true,"context_lines":[{"line_number":214,"context_line":"* Minor latency increase from lock acquisition/release"},{"line_number":215,"context_line":"* Coordination backend must be appropriately sized for deployment scale"},{"line_number":216,"context_line":"* When enabled, coordination backend availability is critical for"},{"line_number":217,"context_line":" host discovery operations"},{"line_number":218,"context_line":""},{"line_number":219,"context_line":"Other deployer impact"},{"line_number":220,"context_line":"---------------------"}],"source_content_type":"text/x-rst","patch_set":2,"id":"2fd65b8f_e6a574fe","line":217,"updated":"2024-11-27 19:45:38.000000000","message":"so its not just that.\n\nthe scheduler runs the periodic in a green thread.\n\nOnly one green tread can execute at a time, so while the periodic is running, that scheduler instance is unavailable to service scheduling requests.\n\nif we yield execution while holding the lock via an eventlet context switch then other schedulers are prevented from acquiring the lock while we are servicing the the scheduling request.\n\nso the periodic impacts the performance of scheduling just by being enabled\nand by using a lock, we ensure that only one scheduler can progress at a time on the execution of the current periodic run.\n\nto minimize that effect we would have to keep the critical section as small as possible which means either acquiring the lock for each host, or computing a batch \nof updates which we apply in the lock after eliminating duplicates.","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"},{"author":{"_account_id":37442,"name":"Serhii Ivanov","email":"evasive.gyron@gmail.com","username":"s3rj1k"},"change_message_id":"a9d195d7633557872faa99861742249ffb97bbd3","unresolved":true,"context_lines":[{"line_number":214,"context_line":"* Minor latency increase from lock acquisition/release"},{"line_number":215,"context_line":"* Coordination backend must be appropriately sized for deployment scale"},{"line_number":216,"context_line":"* When enabled, coordination backend availability is critical for"},{"line_number":217,"context_line":" host discovery operations"},{"line_number":218,"context_line":""},{"line_number":219,"context_line":"Other deployer impact"},{"line_number":220,"context_line":"---------------------"}],"source_content_type":"text/x-rst","patch_set":2,"id":"44c00991_1989f984","line":217,"in_reply_to":"2fd65b8f_e6a574fe","updated":"2024-12-05 13:55:04.000000000","message":"lock is not only about periodic, CLI is also locked in main proposal\n(CLI can be a cron job for example)","commit_id":"e9ed10aaa3c493a0a5b39d51eeeba44a323e6ab2"}]}