{"/COMMIT_MSG":[{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"81774120fe683396a94f0a19a996b79bb6898494","unresolved":true,"context_lines":[{"line_number":22,"context_line":"instance\u0027s host record across the RPC bus."},{"line_number":23,"context_line":""},{"line_number":24,"context_line":"What we explicitly need to do is when we rebalance, we need to"},{"line_number":25,"context_line":"reconcile the host record entries to parallel who is actively"},{"line_number":26,"context_line":"managing the node based upon the hash ring."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"This requires us to know when the hash ring has changed,"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"f1d39368_cf656adb","line":25,"range":{"start_line":25,"start_character":37,"end_line":25,"end_character":45},"updated":"2021-11-12 22:15:53.000000000","message":"suggestion: \"reflect\"","commit_id":"2b7a2acefe8bcc49ebaef86322c238361c24f9bc"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"11df8138d14a1a7ae85815ca8d7a41ea5163059a","unresolved":false,"context_lines":[{"line_number":22,"context_line":"instance\u0027s host record across the RPC bus."},{"line_number":23,"context_line":""},{"line_number":24,"context_line":"What we explicitly need to do is when we rebalance, we need to"},{"line_number":25,"context_line":"reconcile the host record entries to parallel who is actively"},{"line_number":26,"context_line":"managing the node based upon the hash ring."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"This requires us to know when the hash ring has changed,"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"7a8abfd7_78232e67","line":25,"range":{"start_line":25,"start_character":37,"end_line":25,"end_character":45},"in_reply_to":"f1d39368_cf656adb","updated":"2022-02-16 21:24:09.000000000","message":"Done","commit_id":"2b7a2acefe8bcc49ebaef86322c238361c24f9bc"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"81774120fe683396a94f0a19a996b79bb6898494","unresolved":true,"context_lines":[{"line_number":29,"context_line":"and then efficently handle:"},{"line_number":30,"context_line":""},{"line_number":31,"context_line":"* If we should manage the node because the hash ring says so."},{"line_number":32,"context_line":"* If that node is the same compute node we\u0027re on, then nothing to"},{"line_number":33,"context_line":"  see here, carry on."},{"line_number":34,"context_line":"* If the hash ring is indicating the node is managed by *us* and"},{"line_number":35,"context_line":"  the prior host is online, then do *nothing*. The requests can still"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"44e38d71_434c995c","line":32,"range":{"start_line":32,"start_character":27,"end_line":32,"end_character":48},"updated":"2021-11-12 22:15:53.000000000","message":"That\u0027s be true in Ironic\u0027s case, can it? 
Shouldn\u0027t we rephrase this as \"If that node is a node that we already manage\"?","commit_id":"2b7a2acefe8bcc49ebaef86322c238361c24f9bc"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"11df8138d14a1a7ae85815ca8d7a41ea5163059a","unresolved":false,"context_lines":[{"line_number":29,"context_line":"and then efficently handle:"},{"line_number":30,"context_line":""},{"line_number":31,"context_line":"* If we should manage the node because the hash ring says so."},{"line_number":32,"context_line":"* If that node is the same compute node we\u0027re on, then nothing to"},{"line_number":33,"context_line":"  see here, carry on."},{"line_number":34,"context_line":"* If the hash ring is indicating the node is managed by *us* and"},{"line_number":35,"context_line":"  the prior host is online, then do *nothing*. The requests can still"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"b864f2fc_09714f3f","line":32,"range":{"start_line":32,"start_character":27,"end_line":32,"end_character":48},"in_reply_to":"1a5548b6_4e5dfe30","updated":"2022-02-16 21:24:09.000000000","message":"I\u0027m revising the text because after a couple months, it does seem to need more clarity.","commit_id":"2b7a2acefe8bcc49ebaef86322c238361c24f9bc"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"dc4f731488e488f9114043a46519bbbe4031d364","unresolved":true,"context_lines":[{"line_number":29,"context_line":"and then efficently handle:"},{"line_number":30,"context_line":""},{"line_number":31,"context_line":"* If we should manage the node because the hash ring says so."},{"line_number":32,"context_line":"* If that node is the same compute node we\u0027re on, then nothing to"},{"line_number":33,"context_line":"  see here, carry on."},{"line_number":34,"context_line":"* If the hash ring is indicating the node is managed by *us* and"},{"line_number":35,"context_line":"  the prior host is online, then do *nothing*. The requests can still"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"1a5548b6_4e5dfe30","line":32,"range":{"start_line":32,"start_character":27,"end_line":32,"end_character":48},"in_reply_to":"44e38d71_434c995c","updated":"2021-11-18 01:13:44.000000000","message":"Already managing may be wrong because a full hash ring rebalacne through an additional node may result in us not managing the instance anymore.","commit_id":"2b7a2acefe8bcc49ebaef86322c238361c24f9bc"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"ddebfbe53a34a5c6d1129cd46d98a5378cbccfc7","unresolved":true,"context_lines":[{"line_number":43,"context_line":"  the instance."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"All reconcilliation is also performed using ironic\u0027s knowledge of the"},{"line_number":46,"context_line":"instance as well. Obviously, if there is no instance, no action is"},{"line_number":47,"context_line":"required and the node cache being reported can be updated with that"},{"line_number":48,"context_line":"node. 
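A minimal, self-contained sketch of the point above (this is not nova's or
tooz's actual hash ring code): adding one more compute service to a
consistent hash ring silently moves some nodes to a different owner, which
is why "a node we already manage" is not a stable notion across rebalances.

import bisect
import hashlib


def _hash(key):
    return int(hashlib.md5(key.encode()).hexdigest(), 16)


class ToyHashRing:
    def __init__(self, hosts, replicas=64):
        # Each host gets several points on the ring to smooth distribution.
        self._points = sorted(
            (_hash('%s-%d' % (host, i)), host)
            for host in hosts for i in range(replicas))

    def owner(self, node_uuid):
        keys = [point[0] for point in self._points]
        idx = bisect.bisect(keys, _hash(node_uuid)) % len(self._points)
        return self._points[idx][1]


nodes = ['node-%d' % n for n in range(6)]
before = ToyHashRing(['compute-1', 'compute-2'])
after = ToyHashRing(['compute-1', 'compute-2', 'compute-3'])
for node in nodes:
    if before.owner(node) != after.owner(node):
        print('%s moved: %s -> %s'
              % (node, before.owner(node), after.owner(node)))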
Patch set 5, line 46 ("All reconcilliation is also performed using ironic's
knowledge of the instance as well. Obviously, if there is no instance, no
action is required and the node cache being reported can be updated with
that node."):

Julia Kreger (2021-11-25):
  Might be worth noting why this is important to do.

  Specifically, if people change the conductor group, then the node
  disappears from view completely. If it re-appears elsewhere, we should
  update the records for it based upon the view of the physical bare metal
  assets and assigned instances, as a common pattern is to delineate groups
  of nova-computes into logical blocks so that timely resource tracker
  updates occur.

Julia Kreger (2022-02-16):
  Ack

Patch set 5, lines 66-69 ("... a lack of presence in the cache can result
in the node being considered disabled and being entirely skipped from
resource tracker runs *and* ultimately causes the compute node entry to be
destroyed and removed from the resource tracker."):

Julia Kreger (2021-11-25):
  This needs to be updated, because the later patch has since merged, which
  helps prevent this by also updating the ComputeNode's host entry as well.

Julia Kreger (2022-02-16):
  Done

Patch set 5, line 72 ("In other words, since the hash ring dictates we
manage the node, Then we must be capable of doing so accordingly."):

Julia Kreger (2021-11-25):
  ... and not set ourselves up for orphaned instances or deleted compute
  nodes.

Julia Kreger (2022-02-16):
  Done
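A rough sketch (not nova's actual code) of why cache membership matters
here: the ironic driver answers the resource tracker's questions from its
node cache, so a node dropped from the cache during a rebalance looks
unavailable even though it still exists in ironic.

class ToyIronicDriver:
    def __init__(self):
        self.node_cache = {}   # node UUID -> last-seen ironic node record

    def node_is_available(self, nodename):
        # A cache miss reads as "not ours / gone", which is what eventually
        # gets the ComputeNode record deleted by the resource tracker.
        return nodename in self.node_cache

    def get_available_nodes(self, refresh=False):
        return list(self.node_cache)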
Patch set 5, lines 74-77 ("This patch does *not* update the compute node
host field record, which is queried and intertwined with cases where a
compute node be deleted. For reviewer sanity, that will be done in another
patch, which is related to launchpad bug #1825876."):

Julia Kreger (2021-11-25):
  No longer true.

Julia Kreger (2022-02-16):
  Done

Patch set 5, line 82 ("Related: #1825876"):

Julia Kreger (2021-11-25):
  No longer related; also closes.

Julia Kreger (2022-02-16):
  Done

Patch set 5, line 83 ("Change-Id: I4b6359c1154a8cecc72c16dddf66c5c4184a69f0"):

Julia Kreger (2022-02-14):
  Also related: https://bugs.launchpad.net/nova/+bug/1853009

Julia Kreger (2022-02-16):
  Done

Patchset-level comments
-----------------------

Artom Lifshitz (2021-10-13, patch set 1):
  I think I understand the overall logic, and I think it makes sense.
  Comment inline about style.

Julia Kreger (2021-11-03, patch set 2):
  Adding Arne, as mentally this is related in part to an issue he
  encountered with changing the conductor_group of a running node, and it
  seems like that might need to be an additional case reconciled when the
  node loads up in a new compute service. Maybe?!

Artom Lifshitz (2021-11-12, patch set 4):
  -1 for... discussion? Here it goes:

  You summarized the algorithm really well in the commit message, and then
  I got to the code and was lost all over again :( I wonder if we can
  simplify it down to the following (all in a single method, hopefully) -
  tell me if I got something wrong:

  -1. The hash ring is a thing that takes a list of nodes and a list of
  nova-compute services, and distributes the former into buckets of the
  latter. If called multiple times with the same inputs, it will return the
  exact same outputs. IOW, in a situation where multiple nova-compute
  services are doing their refresh and calling on the hash ring, they will
  get the same buckets of nodes.

  0a. Re-frame _refresh_cache() as a method that accepts:
    - the hash_ring
    - the list of instances that we manage (IOW, instance.host == CONF.host),
      and returns the nodes that we manage (or more precisely, a dict of
      nodes that we manage, keyed by node UUID).
    - I don't think we need the list of UP compute services... see later.

  0b. If the above is true, start with an empty node cache, and just add to
  it, using the following logic:

  1. If the node has an instance on it that we manage, add it to our node
  cache and set its compute_node.host.

  Otherwise:

  2. If the hash ring says we're the node's host: add it to our node cache,
  and set its compute_node.host to ourselves. If it has an instance on it,
  set instance.host to ourselves.

  This is where you currently have the "is the node's previous host up?"
  check, the idea being: if the previous host went away, came back, and
  started its _refresh_cache() after us with a different list of UP compute
  services that includes itself (whereas our list didn't include it), it
  will re-appropriate the node for itself. But that's... fine? If there's an
  instance on the node, once we set instance.host, step 1 will prevent any
  re-appropriation. And if there's no instance, other hosts are free to take
  this node from us in the future.

  3. If the hash ring says we're not the node's host: do nothing.

  All of this also has the advantage of squashing your patch on top into
  this one, into a single, hopefully more easily grokkable, fix?
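A sketch of the proposal above as pseudocode (helper and parameter names are
hypothetical, not nova's actual API; get_nodes() is the hash ring lookup
mentioned later in the thread):

def refresh_cache(hash_ring, nodes, my_instances_by_node_uuid, my_host):
    """Rebuild the node cache from scratch: {node_uuid: node}."""
    cache = {}
    for node in nodes:
        if node.uuid in my_instances_by_node_uuid:
            # Step 1: an instance with instance.host == CONF.host pins the
            # node to us regardless of what the ring currently says.
            cache[node.uuid] = node
        elif my_host in hash_ring.get_nodes(node.uuid.encode('utf-8')):
            # Step 2: the ring says we own it; cache it (and, per the
            # proposal, also repoint compute_node.host and instance.host).
            cache[node.uuid] = node
        # Step 3: otherwise another compute service owns it; do nothing.
    return cache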
Julia Kreger (2021-11-18, patch set 4), in reply:

  > -1 for... discussion? Here it goes:

  Okay, wow, this is a bit much. But okay!

  > -1. The hash ring is a thing that takes a list of nodes and a list of
  > nova-compute services, and distributes the former into buckets of the
  > latter. [...]

  Semi-correct, at least based on how I interpret your explanation. The way
  the hash ring works is based upon just the list of services, which are the
  responsible entities in this context. Physical machines, nodes, because
  everything is a node, unless it is an inode (yes, this is an awful joke).

  Anyway, the way it works is that the hash ring *can* contain one or more
  hosts which *are* responsible for the input node value. When
  self.hash_ring.get_nodes(physical_node_key) is called, it determines,
  based upon the state of the hash ring, which "nodes as in hash ring
  members" are responsible for the node. So as long as the list of
  responsible services does not change, then if the same data is fed in for
  each node, the assignment will always be the same. At least, that is how I
  understand it. If a "responsible entity" disappears or is no longer able
  to manage a node or take work, then it should disappear from the ring,
  and upon the next cache refresh, the nodes which were managed by that
  instance would now be managed by the others. Similarly if an entirely new
  "responsible entity" enters the picture: nodes would be completely
  re-balanced across all of the responsible entities. This is in part
  because both the node list and the running compute list are dynamic...
  ?unicorns?

  > 0a. Re-frame _refresh_cache() as a method that accepts the hash_ring and
  > the list of instances that we manage [...]

  What value do you perceive by *not* performing the query inside the cache
  refresh method?

  We've previously learned that doing it *before* getting the list of nodes
  results in out-of-sync node caches versus compute records, and compute
  nodes getting dropped. Hence saga's fix, which starts at line 851 and
  moved the query until after the list of nodes is retrieved. Previously the
  instance list was retrieved before the list of nodes was collected.

  Why is this important? Timing, unfortunately. The DB query doesn't take
  very long at all, but downloading even just a thousand node records takes
  a non-insignificant amount of time. We've made this better in Ironic, but
  all of the conversions, and pagination, can make things really painful.
  I've got some anecdotal numbers on this if you're curious and want to
  cringe at the scale a single nova-compute is attempting to support.

  (Side note: maybe we should count and actually start issuing some
  rather... urgent errors when people start running things at unreasonable
  scales...)

  Okay, so to do this, we would need to make get_available_resource,
  get_available_nodes, get_info, and node_is_available all aware of the hash
  ring and the list of instances. I guess we could, but then the question
  is, will we be able to backport this at that point?

  > - I don't think we need the list of UP compute services... see later.

  We need a list of UP compute services in order to know if a node is part
  of the ring in terms of being an active participant in the cluster's
  management. In other words, if we base this on the presence of down
  services, then we're not actively a redundant service; we're relying on
  external and likely human-driven intervention to resolve conditions.
  Anyway, I'll separately look at the comments below as I digest everything
  here.

  > 0b. If the above is true, start with an empty node cache, and just add
  > to it [...]

  So, nodes do need to disappear from the cache if they are no longer in the
  deployment. Baremetal machines are deleted, or removed from a state where
  they can no longer be provisioned. They may return, but we don't know
  that, and an attempt to deploy to that node would result in a failure.
  Plus get_available_nodes uses the cache instead of spending the 15 minutes
  in a medium-sized deployment to pull the list of nodes.

  > 1. If the node has an instance on it that we manage, add it to our node
  > cache and set its compute_node.host.

  So basically lines 867-870 w/r/t the node cache.

  My follow-up patch does the latter, disjointed in part because of the
  complexity and necessity of fixing the hash ring behavior overall. What is
  present today is... well. yeah. ;)

  > 2. If the hash ring says we're the node's host: add it to our node
  > cache, and set its compute_node.host to ourselves. If it has an instance
  > on it, set instance.host to ourselves.

  Okay, like 877-889, although the only difference there is that it happens
  if, and only if, the hash ring has changed.

  I guess the difference is that I'm trying to avoid the impact to the db,
  which would be potentially having to check the compute node and instance
  record data for every node in the list of nodes every time the cache is
  refreshed. I guess I'm trying to avoid a few thousand individual selects
  in the entire process.

  > This is where you currently have the "is the node's previous host up?"
  > check [...] But that's... fine?

  Totally fine and also expected behavior. We know from reports of race
  conditions and missing compute nodes that we lose compute node records if
  we get out of sync (in part because of the host record, again, next
  patch). The thought at play here was more of: we know operators will
  restart one or more compute services in rapid succession, so let's make
  sure we don't needlessly squash each other, and with a longer set of nodes
  to iterate through, depending on timing, we may have a more observable gap
  in the window between the original hash ring state and the next hash ring.

  > If there's an instance on the node, once we set instance.host, step 1
  > will prevent any re-appropriation. And if there's no instance, other
  > hosts are free to take this node from us in the future.

  This is where we need to make sure the nodes disappear when no longer
  present in the deployment, which is why the cache rebuild goes through the
  entire list it can see and bases the decision on the hash ring state.

  > All of this also has the advantage of squashing your patch on top into
  > this one, into a single, hopefully more easily grokkable, fix?

  I... guess, but some of what you're suggesting would re-introduce a shift
  in cache presence awareness on cache updates, because we wouldn't be
  evaluating the absolute latest instance list, but a potentially
  out-of-date instance list gathered while we retrieved nodes. Granted, if
  we've seen it before, not a big deal, but get_available_nodes and such
  would also begin to get outdated info from the cache as we're not
  refreshing the entire cache, only adding to it at that point.
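A sketch of the "only reconcile when the ring actually changed" guard
described above (illustrative, not the patch's literal code): remember a
fingerprint of the service list the last ring was built from, and skip the
per-node host-record checks when it has not changed, avoiding thousands of
needless per-node DB lookups on every refresh.

class RingChangeDetector:
    def __init__(self):
        self._last_members = None

    def ring_changed(self, up_service_hostnames):
        members = frozenset(up_service_hostnames)
        changed = members != self._last_members
        self._last_members = members
        return changed


detector = RingChangeDetector()
if detector.ring_changed(['compute-1', 'compute-2']):
    # Rebuild the hash ring and reconcile compute_node.host / instance.host.
    pass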
Julia Kreger (2021-11-18, patch set 4):
  I think this all makes sense.

Julia Kreger (2021-11-18):
  And by sense, I meant... my comments make sense. Hopefully!

Artom Lifshitz (2021-11-19, patch set 4):
  I've tried to clarify what I meant; we also chatted about this on IRC a
  bit. I'll remove my -1. I think we need operator input to have a better
  understanding of whether some things, like pulling the list of nodes every
  time, are feasible or not.

Julia Kreger (2021-11-23), in reply:
  Cool, just skimming the replies. Yeah, terminology. I feel like a small
  reference or something might be needed 😊

  I have mixed feelings on pulling the list every time, but it is
  semi-required too, and +1000 to getting operator input. Unfortunately this
  is not the time of the year for soliciting it quickly. :\

Artom Lifshitz (2021-11-19, patch set 4), in reply to Julia:

  > Okay, wow, this is a bit much. But okay!

  Sorry, I realize this was an unfriendly way to start a review. In the past
  just leaving comments on Nova patches would often result in no reaction,
  so I've developed a habit (and been advised by others) to be a bit "in
  your face" about it, and not feel bad about saying "-1 for question
  visibility". Which is what I attempted to do here, just clumsily. Sorry
  again.

  > Semi-correct, at least based on how I interpret your explanation. The
  > way the hash ring works is based upon just the list of services [...]

  I think we have the same concepts in our heads, just worded differently.
  It doesn't help that the hash ring uses "nodes" for responsible entities,
  while Ironic uses "nodes" to mean the objects being assigned to the
  responsible entities. Using the hashring vocabulary, it's telling us which
  nodes are responsible for which objects, so get_nodes(<object>) will
  return one or more nodes. Using the Ironic vocabulary, Ironic is passing
  Ironic nodes in place of <object>, and gets back a list of nova-compute
  services. Let's further hammer it in by making an ASCII table:

    Concept                   Ironic word             Hashring word

    responsible entities      nova-compute services   nodes
    objects being mapped      nodes                   objects/data
    what get_nodes() returns  nova-compute services   nodes

  > What value do you perceive by *not* performing the query inside the
  > cache refresh method?

  I meant it in a more conceptual sense of "this is the input data that the
  method's logic operates on". The data can be pulled inside the method's
  implementation in whatever order is best.

  > We've previously learned that doing it *before* getting the list of
  > nodes results in out-of-sync node caches versus compute records [...]

  I totally buy your explanation of why _get_node_list() takes forever and
  needs to be done first.

  > We need a list of UP compute services in order to know if a node is part
  > of the ring in terms of being an active participant in the cluster's
  > management. [...]

  I keep flip-flopping on this, but I'm still not convinced that using the
  hashring as the absolute source of truth is bad. IIUC, the reason we need
  to have a list of nova-compute services that are present and up is the
  following situation:

  0. nova-compute service A is managing node K, with instance X on it.
  1. nova-compute service A goes away.
  2. Rebalance is triggered.
  3. Hash ring assigns K to nova-compute service B.
  4. B saves K in its node cache, updates X's instance.host record, and
     updates K's compute_node.host record.
  5. Surprise, A comes back roughly at the same time as steps 3 and 4 are
     taking place.

  The initial idea was to make steps 3 and 4 conditional on A not having
  come back, to avoid B "stealing" its node K and instance X. But what if we
  don't do that, and just let the process continue:

  6. Rebalance is triggered.
  7. Steps 3 and 4 repeat, potentially with different values for A, K and X.

  I realize this would mean instances and nodes moving across hosts more
  often, but wouldn't this make this code less fragile and prone to races?

  > So, nodes do need to disappear from the cache if they are no longer in
  > the deployment. [...]

  So if I understand you correctly, rebuilding the node_cache from scratch
  every time is not an option, because it takes too long?

  > So basically lines 867-870 w/r/t the node cache. My follow-up patch does
  > the latter [...]

  I was hoping that if we can simplify the algorithm and make it easier to
  reason about, we can do everything in the same patch.

  > I guess I'm trying to avoid a few thousand individual selects in the
  > entire process.

  So instance.save() is nice in that it won't actually write anything to the
  database if nothing has changed on the object. But yeah, we may still need
  to do the "hash ring has changed?" check for performance reasons.
  ComputeNode.save() does not have this magic, btw...

  > Totally fine and also expected behavior. [...] We know operators will
  > restart one or more compute services in rapid succession, so let's make
  > sure we don't needlessly squash each other [...]

  But... compute services shouldn't go away if they're restarted. Nova has
  the CONF.service_down_time and CONF.report_interval configurables that can
  be tweaked so that, for example, if a nova-compute service takes 15
  minutes to restart, it doesn't show up as flapping down then back up while
  doing so (though it would mean *actual* service downs take longer to be
  noticed).

  > I... guess, but some of what you're suggesting would re-introduce a
  > shift in cache presence awareness on cache updates [...]

  I wasn't proposing getting the services first, then the nodes. Get the
  nodes if that takes longer, then get the services, then use the hash_ring
  (without adding ourselves to it, to ensure consistency across all calls to
  the hash_ring across all nova-compute services) as the absolute source of
  truth.
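A short illustration of the save semantics mentioned above (illustrative
only, not the merged change): since ComputeNode.save() does not skip no-op
writes the way Instance.save() does, it is worth only touching the records
when the host actually needs to change.

def claim_node(compute_node, instance, my_host):
    # Repoint the compute node record only when it really moved.
    if compute_node.host != my_host:
        compute_node.host = my_host
        compute_node.save()
    # Instance.save() is a no-op at the DB layer if nothing changed, but
    # skipping it entirely also avoids per-node load/lock work.
    if instance is not None and instance.host != my_host:
        instance.host = my_host
        instance.save()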
Julia Kreger (2021-11-23, patch set 4), in reply:

  > Sorry, I realize this was an unfriendly way to start a review. [...]

  No worries!

  > I think we have the same concepts in our heads, just worded differently.
  > [...] Let's further hammer it in by making an ASCII table [...]

  That puts it exactly as I was thinking, and I believe this definitely
  needs to just go ahead and be in the code. I'll add it in the next
  revision.

  > I meant it in a more conceptual sense of "this is the input data that
  > the method's logic operates on". [...]

  Ahh, okay. It is modeled slightly differently than that now, but words
  make it hard, as do the consistency problems.

  > I totally buy your explanation of why _get_node_list() takes forever and
  > needs to be done first.

  FWIW, I *have* asked to see if zer0c00l can free up some bandwidth to take
  a look at what we're trying to do here to improve this entire situation,
  since I know he has also spent a significant amount of time looking at
  this area of the code.

  > I keep flip-flopping on this, but I'm still not convinced that using the
  > hashring as the absolute source of truth is bad. [...]

  So, I agree it should likely be the absolute source of truth. That being
  said, I think that means we drop the list of instances from the
  calculation of assignments, or at least could potentially drop it.

  > The initial idea was to make steps 3 and 4 conditional on A not having
  > come back, to avoid B "stealing" its node K and instance X. But what if
  > we don't do that, and just let the process continue [...]

  Which is why I wanted to re-consult and skip nodes if we recognize the
  hash ring is going to change again, given we could be 300 seconds behind,
  still sorting through thousands of records.

  > I realize this would mean instances and nodes moving across hosts more
  > often, but wouldn't this make this code less fragile and prone to races?

  Agreed, and it is a trade-off, I guess. Operator.Input.required().

  > So if I understand you correctly, rebuilding the node_cache from scratch
  > every time is not an option, because it takes too long?

  No, I believe it is the only option. It is just that we don't want to
  remove the old cache while we're building the new cache. This is
  *slightly* different from the prior behavior, where the nova-compute virt
  driver would always try to cache a node if an instance was assigned to
  that nova-compute. Fixing the host records as we rebuild helps prevent
  that from being a bottleneck, and the only risk I can see while a
  rebalance/reconcile/update is occurring is that a request could get routed
  to a nova-compute process in flux and the entry gets added to the cache
  temporarily for an existing instance; but since it is already deployed, I
  don't think it would have an impact, because the node is in use.

  > I was hoping that if we can simplify the algorithm and make it easier to
  > reason about, we can do everything in the same patch.

  Acknowledged.

  > So instance.save() is nice in that it won't actually write anything to
  > the database if nothing has changed on the object. [...]
  > ComputeNode.save() does not have this magic, btw...

  That is good to know. Hmm...

  > But... compute services shouldn't go away if they're restarted. Nova has
  > the CONF.service_down_time and CONF.report_interval configurables that
  > can be tweaked [...]

  Good to know.

  I feel like I need to dig into the rest of the compute manager service
  code, but that also means that as long as the restart is quick, the hash
  ring doesn't completely need to be rebuilt across the board; the awareness
  just needs to be reloaded locally, which works.

  > I wasn't proposing getting the services first, then the nodes. Get the
  > nodes if that takes longer, then get the services, then use the
  > hash_ring (without adding ourselves to it, to ensure consistency across
  > all calls to the hash_ring across all nova-compute services) as the
  > absolute source of truth.

  Ack, I think that is likely the best way to go. It *would* mean we may
  have an intermediate window...

  I *suspect* we may need to at least pre-populate based on host record
  presence to avoid any potential resource tracking/placement updates on
  restart. But removing ourselves from the list may also make our initial
  start a lot faster... maybe.
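A sketch of the "build the new cache, then swap it in" idea described above
(hypothetical helper, not the patch itself): readers such as
get_available_nodes() keep seeing the old cache until the freshly built one
replaces it in a single assignment, so the node list never looks half-empty
mid-refresh.

def refresh_node_cache(driver, build_new_cache):
    # Slow part: full node listing plus host-record reconciliation.
    new_cache = build_new_cache()
    # Fast part: swap the reference; the old cache keeps serving until here.
    driver.node_cache = new_cache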
Julia Kreger (2022-02-17, patch set 4), in reply:
  Not quoting the previous discussion, because syncing back to it was
  getting difficult to follow. So: service startup adds the entry as, in
  essence, the very first step, so the compute node would show back up in
  the list, and the current revision, with the detection of mismatch, would
  at least skip actions for both computes and instances. Which means startup
  time really doesn't factor into this whole thing.

  So a single-process, or even multi-process, rapid restart wouldn't trigger
  a rebalance, but it would trigger the cache being regenerated and
  reconciliation occurring regardless for nodes assigned to that compute
  node. If by chance something is wrong, that pass would correct it too,
  which is a bonus, I guess.

  The case where a rebalance would occur is if someone stops one or more
  compute services for >90 seconds.

  I guess this leads to a question I've been pondering... Do we, instead of
  reconciling, if we identify a possibility of the hash ring changing, raise
  an internal exception to abort the cache/hash ring regeneration? I feel
  like that would delay the overall full reconcile operation if someone is
  doing invasive operations like... upgrading services, or stopping
  everything and restarting one service at a time. The other downside which
  comes to mind is that completely short-circuiting the cache
  rebuild/reconciliation process would create a situation where we would be
  far more likely to hit the "compute node host not found in service list"
  condition, which operators like Chris presently get to deal with.

  Going back to invasive restarts: in that case, there would be record
  churn, but rightfully so, because we can't make assumptions about the
  operator's intent or the infrastructure state.

  (Maybe a doc update is worthwhile here, suggesting to do quick restarts if
  found necessary.)

  And the window to identify a necessity to rebalance would be only at every
  resource tracker run.

  It occurs to me that we likely need to direct operators to be careful with
  CONF.update_resources_interval, since by default its value is 0, which
  could create tons of pain... I actually wonder if we should log a warning
  if this is the case....
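A sketch of the warning pondered above (not merged code; the exact condition
to warn on is an assumption here):

import logging

LOG = logging.getLogger(__name__)

def warn_on_aggressive_update_interval(conf):
    # conf is assumed to expose nova's update_resources_interval option.
    if conf.update_resources_interval == 0:
        LOG.warning('update_resources_interval is left at its default of 0; '
                    'with large ironic node counts, frequent resource '
                    'tracker runs and node cache refreshes can be very '
                    'expensive.')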
Chris Krelle (2021-11-29, patch set 5):
  We manage thousands of ironic BM nodes per region, and the issue being
  addressed here is a constant issue for us. Artom raised some good points,
  but I am more than excited to see this patch. Addressing these node
  behaviors will save us many hours a week troubleshooting / recovering
  nodes.

Julia Kreger (2022-02-17), in reply:
  Ack

Arun S A G (2021-11-30, patch set 5):
  We have the same problem. While inventory is rebalanced between running
  computes, the instances never get taken over by a running compute. This
  causes any actions on the instance to fail. This is a good thing to fix.

  Thanks for doing the hard work, Julia. I am happy to review this.

Arne Wiebalck (2021-11-30, patch set 5):
  This is definitely a pain point for operators and is hard to clean up when
  things go wrong, especially when there are a few thousand nodes involved.
  After several issues we eventually disabled the rebalancing in our
  deployment (trading availability/flexibility for predictability/
  stability). Getting this fixed would be great!

Belmiro Moreira (2021-12-02, patch set 5):
  Thanks, Julia, for looking into this.

  The "hash ring" is mostly there to keep the "nova logic" that an instance
  is always associated with a compute node. I find this concept very
  complex! At CERN we only run one nova-compute per conductor group.

  Now, if we lived in the perfect world: with the ironic driver,
  "nova-compute" is basically a proxy to Ironic, and if we were doing it
  from scratch, there would be no need to have the instances mapped to a
  nova compute node. In my view any compute node with the ironic driver
  should be able to handle it. Sure, most likely this means a profound
  refactor in nova-compute (also the cell conductor) and possibly some
  assumptions (maybe not mixing kvm and ironic compute nodes in the same
  cell).

Chris Krelle (2021-12-08, patch set 5):
  I believe this to be a valuable effort for those of us that encounter this
  issue.

Julia Kreger (2022-02-17, patch set 7):
  recheck

melanie witt (2022-04-25, patch set 9):
  I've only done an initial pass reviewing; I need to spend more time to
  fully understand it (though the code documentation is excellent). No need
  to respin at the moment.

Julia Kreger (2022-05-13, patch set 9):
  I'm starting a spec document based on a discussion with some nova
  contributors. There is a pre-existing one, but I think I'm going to try
  and take a clean-room approach based on my context and understanding of
  the options, and we can marry things back together from there.
I\u0027m also adding Mark for review, he might be interested in this.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"c8b027ba919a810b9a94ca2eb76711334c0e2c7a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"a6c9fe9f_f2b27cb2","updated":"2022-04-25 20:15:31.000000000","message":"Sometimes a gif is what is required: https://media4.giphy.com/media/xT1R9M8zspTuhkokOQ/giphy.gif?cid\u003decf05e47bkubi4e2chiu4lr5riw9388l4xi9fhrdggb4s6ay\u0026rid\u003dgiphy.gif\u0026ct\u003dg","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"f55e04197eba1512683373280d52eb8dad98a709","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"98f3b4fa_9a4d1f76","updated":"2022-05-11 12:12:08.000000000","message":"by the way the -2 is procedural im happy to drop it once we etiehr have an appoved spec or agree to do this as a specless blueprint instead but its certainly not a bug.\n\ni need to review the implemantion that you have proposed to also ensure it corralates to what we have previously agreed too with regrades wo when its allowable for the reblance to happen.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"83117b5519a4cc151f3e91bbef8c4d86e095890d","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"2b82b093_8e6a7cd5","updated":"2022-05-11 11:48:21.000000000","message":"the spec for this is still not repoposed for zed so -2 until its approved.\n\nwe agreed at the ptg in yoga or zed when you started this that this was not a bug it was a feature that needed a spec","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"5b717a33bacc4063db68d584b92ad858250add0d","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":9,"id":"65479a20_83dec1f7","in_reply_to":"7ec6f447_5940dbca","updated":"2022-05-11 16:58:25.000000000","message":"by the way i have not looked at the spec in a while but i dont reacall anything about it that involved a breaking chanage or anything htat was not backportable other then the procedural facat that we dont backport features upstream.\n\nill try and read over that again too after i review your change.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"2ca48e94d3ea6f852dc53caf24dc3bf7388b7e9b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"9b19f354_e872fa6d","in_reply_to":"98f3b4fa_9a4d1f76","updated":"2022-05-11 14:25:00.000000000","message":"I don\u0027t remember when this was discussed, but I would have never agreed that this is a feature.\n\nGranted, a nova contributor has stated in the change set dialog that they would prefer a breaking interface redesign over the needful which has been proposed by other contributors in the past. 
This case, just happens to be very well notated inline.\n\nA breaking, non-backportable change, which would surely result from a specification process, which would mean operators will not see a fix for potentially years down the road until they manage to get to the newest version. Meanwhile, they would be unable to manage deployed baremetal nodes depending on what they do to their compute cluster or what way it breaks, meaning they would need to perform database surgery to remedy their records and their deployment. Which is really far from ideal and I think something we can all appreciate as a defect which should be prevented by being fixed.\n\nIf we really are deciding to not fix the multiple bugs that this ties in to, and treat it as a feature, then we are truly siding with perfection as opposed to good, which I\u0027m sure operators will find really quite disappointing and disheartening.\n\nGranted, I\u0027m not opposed to merging/backporting a bug fix and then having a dialog on what nova would ultimately like to do if that is the consensus and support from the nova community on doing so, but a -2 seems like an explicit statement that it is not an option.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"7e5534ca3e25b38894d3f787245ce632e789e93f","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":9,"id":"7ec6f447_5940dbca","in_reply_to":"9b19f354_e872fa6d","updated":"2022-05-11 16:55:40.000000000","message":"the reason we wanted a spec was becauses fo the upgrade impact and ensureing that the rebalaicne only happend in very limit cases.\n\nspecificaly the reblance should only happen if the compute service is delete\nor the compute service is disabled. \n\nhttps://review.opendev.org/c/openstack/nova-specs/+/815789/1/specs/yoga/approved/ironic-instance-change-host.rst#37\u003d\n\nyour change seam to be confined to the ironic virt driver.\nso i guess there are no rpc, object or db changes that would prevent a backport.\n\n\ni know we discussed this downstream because you atteded our internal team meating \nOct 27 2021 the week after the yoga ptg based which is why we filed the redhat bz that is linkied in the commit message https://bugzilla.redhat.com/show_bug.cgi?id\u003d2017980#\n\ni tought we had coverd this at the yoga ptg but i dont see it in the eterpad.\ni also check the nova team meeting before and after and dont see it so i guess\nwe only talked about this on irc breifly upstream but i was pretty sure sylvain express concerns about the upgrade impact of the inital apptoch and i spent tiem reviewing the spec explcitly because of the downstream customer issues and in my mind this was going to be a feature not a bugfix.\n\nhttps://meetings.opendev.org/irclogs/%23openstack-nova/%23openstack-nova.2021-10-25.log.html\nhttps://meetings.opendev.org/irclogs/%23openstack-nova/%23openstack-nova.2021-11-04.log.html#t2021-11-04T15:55:03\n\n\nill try and review this fully tomrrow with a clear head but ill remove the -1 for now. 
i set it becasue i did not wnat this to acidently be merged until we actully reconsiled your work and the spec and came to an agreemnt thatthis could be a bugfix.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"9088b62fd7fcea7f847850349ae0aa14bc77eb00","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"fefed009_89edce31","in_reply_to":"da478ebb_785190e8","updated":"2022-04-27 14:56:28.000000000","message":"Feel free to reach out if you have questions. I did hunt down jroll\u0027s attempt to basically fix this which I believe he indicated they had tested/run as a downstream patch at his then employer, but that was a *long* time ago. In essence, his approach was very similar, although mine takes it a step further and does ComputeNode record fixes in addition to Instances. At least, that is if my memory is correct.\n\nP.S. Thank you for the compliment. I\u0027ve been known to sometimes be a bit verbose, but really it felt like there was a lot that had to be understood in the interactions/reasoning so the code could be maintained moving forward.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"}],"nova/virt/ironic/driver.py":[{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"3dc0e2eded5fd80ccff0975f7933986fdc0c4d71","unresolved":true,"context_lines":[{"line_number":727,"context_line":"        # table will be here so far, and we might be brand new."},{"line_number":728,"context_line":"        services.add(CONF.host.lower())"},{"line_number":729,"context_line":""},{"line_number":730,"context_line":"        # NOTE(TheJulia): We need to determine if the hash ring has changed"},{"line_number":731,"context_line":"        # as we need to take corrective action to update the nova records"},{"line_number":732,"context_line":"        # to align, otherwise we risk race conditions with resource tracking"},{"line_number":733,"context_line":"        # and placement record updates."}],"source_content_type":"text/x-python","patch_set":1,"id":"b75765fa_1db78595","line":730,"updated":"2021-10-13 23:05:24.000000000","message":"Could we simplify all of the below into:\n\n  if self.hash_ring:\n    if services \u003d\u003d self.hash_ring.nodes: # we might need some sort of list equals here\n      # services haven\u0027t changed, assume hash ring has stayed the same as well\n      return false\n    else:\n      self.hash_ring \u003d \u003cnew hash ring\u003e\n      return true\n  else:\n    self.hash_ring \u003d \u003cnew hash ring\u003e\n    return true\n\n?\n\nAnd maybe factor it out into its own method?","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"11df8138d14a1a7ae85815ca8d7a41ea5163059a","unresolved":false,"context_lines":[{"line_number":727,"context_line":"        # table will be here so far, and we might be brand new."},{"line_number":728,"context_line":"        services.add(CONF.host.lower())"},{"line_number":729,"context_line":""},{"line_number":730,"context_line":"        # NOTE(TheJulia): We need to determine if the hash ring has changed"},{"line_number":731,"context_line":"        # as we need to take corrective action to 
update the nova records"},{"line_number":732,"context_line":"        # to align, otherwise we risk race conditions with resource tracking"},{"line_number":733,"context_line":"        # and placement record updates."}],"source_content_type":"text/x-python","patch_set":1,"id":"6f67d34e_d3c21522","line":730,"in_reply_to":"8df69677_f85ae052","updated":"2022-02-16 21:24:09.000000000","message":"Okay, I figured out a way to let python do it for me which I knew was already a thing, just didn\u0027t think of it. Anyway, simplified it and got rid of the calculation of the ring.","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"1a3ad6d09df95621e0d9af2047a3b10a11d39d2d","unresolved":true,"context_lines":[{"line_number":727,"context_line":"        # table will be here so far, and we might be brand new."},{"line_number":728,"context_line":"        services.add(CONF.host.lower())"},{"line_number":729,"context_line":""},{"line_number":730,"context_line":"        # NOTE(TheJulia): We need to determine if the hash ring has changed"},{"line_number":731,"context_line":"        # as we need to take corrective action to update the nova records"},{"line_number":732,"context_line":"        # to align, otherwise we risk race conditions with resource tracking"},{"line_number":733,"context_line":"        # and placement record updates."}],"source_content_type":"text/x-python","patch_set":1,"id":"e0f89c27_1ed80855","line":730,"in_reply_to":"b75765fa_1db78595","updated":"2021-11-03 13:02:34.000000000","message":"services would have be de-duplicated along with both entries sorted, which would make it a little more complex, and then compared. That feels... a little more heavy I guess.","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"2524ffa0f3dfde3a74233e2d2416232b6580a198","unresolved":true,"context_lines":[{"line_number":727,"context_line":"        # table will be here so far, and we might be brand new."},{"line_number":728,"context_line":"        services.add(CONF.host.lower())"},{"line_number":729,"context_line":""},{"line_number":730,"context_line":"        # NOTE(TheJulia): We need to determine if the hash ring has changed"},{"line_number":731,"context_line":"        # as we need to take corrective action to update the nova records"},{"line_number":732,"context_line":"        # to align, otherwise we risk race conditions with resource tracking"},{"line_number":733,"context_line":"        # and placement record updates."}],"source_content_type":"text/x-python","patch_set":1,"id":"8df69677_f85ae052","line":730,"in_reply_to":"e0f89c27_1ed80855","updated":"2021-11-03 15:00:46.000000000","message":"Ahh, but where it makes sense to check is if it was rebalanced from one active compute node to another active compute node. 
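The simplification being discussed in this thread, detecting a hash ring change by comparing the set of live compute service hosts against the membership used to build the current ring and rebuilding only when they differ, could look roughly like the following. The names _ring_members and build_ring are hypothetical, and the ring constructor is passed in rather than assuming a particular library:

    def hash_ring_changed(driver, live_service_hosts, build_ring):
        # Normalize to a set of lower-cased host names so ordering and
        # duplicates do not matter in the comparison.
        services = {host.lower() for host in live_service_hosts}
        if getattr(driver, '_ring_members', None) == services:
            return False                  # same membership, assume same ring
        driver.hash_ring = build_ring(services)   # e.g. a consistent hash ring
        driver._ring_members = services
        return True

Set equality sidesteps the sort/de-duplication concern raised above, since sets compare by membership rather than order.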
Realistically, we\u0027ll never reconcile that until it goes down, but that *should* be okay I guess because the original compute node can still proxy/service commands.","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"b3fe0a28ee4af2b0747567d69dcf5135e8e6b550","unresolved":true,"context_lines":[{"line_number":758,"context_line":"        context \u003d nova_context.get_admin_context()"},{"line_number":759,"context_line":"        try:"},{"line_number":760,"context_line":"            instance \u003d objects.Instance.get_by_uuid(context, instance_uuid)"},{"line_number":761,"context_line":"            instance.host \u003d CONF.host.lower()"},{"line_number":762,"context_line":"            # Explicitly save the instance object to the DB to allow"},{"line_number":763,"context_line":"            # new queries to pickup the host field."},{"line_number":764,"context_line":"            instance.save()"}],"source_content_type":"text/x-python","patch_set":1,"id":"780e05e9_6ea30482","line":761,"updated":"2021-10-22 15:36:45.000000000","message":"I think we need to worry about the instance uuid lock being violated here.\n\nMaybe: check the compute host is disabled AND down... OR deleted.\n\n... side note: awesome to see us starting to fix this :)","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"1a3ad6d09df95621e0d9af2047a3b10a11d39d2d","unresolved":true,"context_lines":[{"line_number":758,"context_line":"        context \u003d nova_context.get_admin_context()"},{"line_number":759,"context_line":"        try:"},{"line_number":760,"context_line":"            instance \u003d objects.Instance.get_by_uuid(context, instance_uuid)"},{"line_number":761,"context_line":"            instance.host \u003d CONF.host.lower()"},{"line_number":762,"context_line":"            # Explicitly save the instance object to the DB to allow"},{"line_number":763,"context_line":"            # new queries to pickup the host field."},{"line_number":764,"context_line":"            instance.save()"}],"source_content_type":"text/x-python","patch_set":1,"id":"f3320608_77ab6f16","line":761,"in_reply_to":"780e05e9_6ea30482","updated":"2021-11-03 13:02:34.000000000","message":"I chatted with some of RH\u0027s nova folks recently and that was the consensus as well. 
It might delay rebalance a little, but in the grand scheme of things fixing this overall issue is more important.","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"11df8138d14a1a7ae85815ca8d7a41ea5163059a","unresolved":false,"context_lines":[{"line_number":758,"context_line":"        context \u003d nova_context.get_admin_context()"},{"line_number":759,"context_line":"        try:"},{"line_number":760,"context_line":"            instance \u003d objects.Instance.get_by_uuid(context, instance_uuid)"},{"line_number":761,"context_line":"            instance.host \u003d CONF.host.lower()"},{"line_number":762,"context_line":"            # Explicitly save the instance object to the DB to allow"},{"line_number":763,"context_line":"            # new queries to pickup the host field."},{"line_number":764,"context_line":"            instance.save()"}],"source_content_type":"text/x-python","patch_set":1,"id":"16101d6d_dc394a15","line":761,"in_reply_to":"9a62edba_08431e76","updated":"2022-02-16 21:24:09.000000000","message":"So current state of the code, *does* pull a compute node list and validates against that as well. If the node is online, then it skips doing anything on an instance level. Deleted compute nodes in the resource tracker are re-created, and actually we\u0027ll now in my latest version actually undo deleted state which can occur... say from changing a conductor group setting on a known deployed node.\n\nWhich leaves disabled, and upon looking at the use of disabled, it basically, from what I can tell mainly only guards the logic around scheduling. This led me to consider \"what if we impacted the hash ring with disabled\" and I concluded that was likely a mistake because virt drivers seem to largely not halt internal operations or even ability to do things on a compute node as a result.\n\nAnyway, I\u0027m going to mark this resolved as I think this has been guarded reasonably while trying not to set ourselves up for excesive queries on clusters with thousands, tens of thousands, or even hundreds of thousands of compute nodes.\n\nThe plus side with all of this is that any hash ring change will actually remedy the issues we tend to run into, which I guess is a good thing in the long run.","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"af8d58485985f6d4bf99cb85301423442adabab2","unresolved":true,"context_lines":[{"line_number":758,"context_line":"        context \u003d nova_context.get_admin_context()"},{"line_number":759,"context_line":"        try:"},{"line_number":760,"context_line":"            instance \u003d objects.Instance.get_by_uuid(context, instance_uuid)"},{"line_number":761,"context_line":"            instance.host \u003d CONF.host.lower()"},{"line_number":762,"context_line":"            # Explicitly save the instance object to the DB to allow"},{"line_number":763,"context_line":"            # new queries to pickup the host field."},{"line_number":764,"context_line":"            instance.save()"}],"source_content_type":"text/x-python","patch_set":1,"id":"9a62edba_08431e76","line":761,"in_reply_to":"f3320608_77ab6f16","updated":"2021-11-03 13:12:38.000000000","message":"So, the service has to be up in order 
to make it into the hash ring calculation. Would the mismatch in that case imply the compute host is down or deleted?","commit_id":"b36298de9e40b9af1c4ab5f122e812b9c5b3ee83"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"054a0df78d51e2509dad477afabf61c9625e48b3","unresolved":true,"context_lines":[{"line_number":1050,"context_line":"                #    to be able to support operations against the instance"},{"line_number":1051,"context_line":"                #    as this driver is a proxy to Ironic."},{"line_number":1052,"context_line":"                # Regardless of how we got here, things should drop out of"},{"line_number":1053,"context_line":"                # disappear from reaching here unless there is some"},{"line_number":1054,"context_line":"                # extenuating circumstance like database surgery occured."},{"line_number":1055,"context_line":"                node_cache[node.uuid] \u003d node"},{"line_number":1056,"context_line":""}],"source_content_type":"text/x-python","patch_set":7,"id":"52dc7e14_be8ecae7","line":1053,"range":{"start_line":1053,"start_character":18,"end_line":1053,"end_character":27},"updated":"2022-02-17 18:35:13.000000000","message":"err, drop out of the cache and disappear","commit_id":"57c988ed31d8da687011d080fb8dfa2ebd7fd305"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"9e0567456e7c0b0ade2f9122d2508b907b761898","unresolved":false,"context_lines":[{"line_number":1050,"context_line":"                #    to be able to support operations against the instance"},{"line_number":1051,"context_line":"                #    as this driver is a proxy to Ironic."},{"line_number":1052,"context_line":"                # Regardless of how we got here, things should drop out of"},{"line_number":1053,"context_line":"                # disappear from reaching here unless there is some"},{"line_number":1054,"context_line":"                # extenuating circumstance like database surgery occured."},{"line_number":1055,"context_line":"                node_cache[node.uuid] \u003d node"},{"line_number":1056,"context_line":""}],"source_content_type":"text/x-python","patch_set":7,"id":"5e54a31e_0c7833b6","line":1053,"range":{"start_line":1053,"start_character":18,"end_line":1053,"end_character":27},"in_reply_to":"52dc7e14_be8ecae7","updated":"2022-02-21 22:37:05.000000000","message":"Done","commit_id":"57c988ed31d8da687011d080fb8dfa2ebd7fd305"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"cf758577de7d0d379b862061dad4b205851bdbaf","unresolved":true,"context_lines":[{"line_number":724,"context_line":"            # conductor group, so we check all compute services for liveness."},{"line_number":725,"context_line":"            # if we have a peer_list, don\u0027t check liveness for compute"},{"line_number":726,"context_line":"            # services that aren\u0027t in the list."},{"line_number":727,"context_line":"            # NOTE(TheJulia): We explicitly do not check *if* the compute is"},{"line_number":728,"context_line":"            # disabled or not. 
A compute whose service is disabled cannot"},{"line_number":729,"context_line":"            # be schedueld upon, but those services can still take commands,"},{"line_number":730,"context_line":"            # and are still the \"responsible entity\"."},{"line_number":731,"context_line":"            if peer_list is None or svc.host in peer_list:"},{"line_number":732,"context_line":"                is_up \u003d self.servicegroup_api.service_is_up(svc)"},{"line_number":733,"context_line":"                if is_up:"}],"source_content_type":"text/x-python","patch_set":9,"id":"924d5256_b9d3d671","line":730,"range":{"start_line":727,"start_character":12,"end_line":730,"end_character":53},"updated":"2022-05-11 17:13:56.000000000","message":"so this is one of the point of contention in the spec.\n\nwe wanted to ensure that the rebalace only happened if the comptue service was deleted or if the comptue service was disabeld. by the admin as a way to opt into to allowing the reblance to happen.\n\nwe did not want is_up to be the only thing that was used to determin if the rebalcne could happen.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"b2bfa57c88dd830fb50cd5c32ce0b9b7c6fc7c58","unresolved":true,"context_lines":[{"line_number":724,"context_line":"            # conductor group, so we check all compute services for liveness."},{"line_number":725,"context_line":"            # if we have a peer_list, don\u0027t check liveness for compute"},{"line_number":726,"context_line":"            # services that aren\u0027t in the list."},{"line_number":727,"context_line":"            # NOTE(TheJulia): We explicitly do not check *if* the compute is"},{"line_number":728,"context_line":"            # disabled or not. A compute whose service is disabled cannot"},{"line_number":729,"context_line":"            # be schedueld upon, but those services can still take commands,"},{"line_number":730,"context_line":"            # and are still the \"responsible entity\"."},{"line_number":731,"context_line":"            if peer_list is None or svc.host in peer_list:"},{"line_number":732,"context_line":"                is_up \u003d self.servicegroup_api.service_is_up(svc)"},{"line_number":733,"context_line":"                if is_up:"}],"source_content_type":"text/x-python","patch_set":9,"id":"fe210a44_d8df8372","line":730,"range":{"start_line":727,"start_character":12,"end_line":730,"end_character":53},"in_reply_to":"924d5256_b9d3d671","updated":"2022-05-11 20:30:37.000000000","message":"by the way the last time we talks about this was \nhttps://meetings.opendev.org/irclogs/%23openstack-nova/%23openstack-nova.2022-01-13.log.html#t2022-01-13T15:13:14\n\nwe never actully got to a conlution fo that discussion other then the fact you seamed to want to stive for an evneutally consistent point of view.\n\nand i made a suggestion about refactoring the driver in the futre so that we did not need to rebalcnce instacn.host at all but that was not what i was suggesting a sthe inital solution but rater a longer term fix.\n\ni.e. since ironic does not really care which service proxies to it\nall the ionic compute service could use instance.host\u003dironic for all we care.\n\nthat would result in all of them sharin a singel queue and loadbalanicng the rpc traffic without the need for hashing or anything else in nova. 
again that is the large change not the simple interim solution and not what the current sepc was covering.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"30e45ab66559f929acc663a3fccaa53126bb506b","unresolved":true,"context_lines":[{"line_number":755,"context_line":"            objects mapped        physical nodes         objects/data"},{"line_number":756,"context_line":"            get_nodes() returns   nova-compute services  nodes"},{"line_number":757,"context_line":""},{"line_number":758,"context_line":"        While, a nova-compute hash ring may not match ironic\u0027s distribution"},{"line_number":759,"context_line":"        of responsibility to manage physical nodes, the concept is in essence"},{"line_number":760,"context_line":"        the same. The physical nodes are mapped to a nova compute service"},{"line_number":761,"context_line":"        based upon the known operating members of the hash ring. The core"}],"source_content_type":"text/x-python","patch_set":9,"id":"f0a6e862_2fba75a0","line":758,"range":{"start_line":758,"start_character":13,"end_line":758,"end_character":14},"updated":"2022-04-25 20:39:47.000000000","message":"Nix","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"30e45ab66559f929acc663a3fccaa53126bb506b","unresolved":true,"context_lines":[{"line_number":760,"context_line":"        the same. The physical nodes are mapped to a nova compute service"},{"line_number":761,"context_line":"        based upon the known operating members of the hash ring. The core"},{"line_number":762,"context_line":"        responsibility of this method is to update that ring composition"},{"line_number":763,"context_line":"        of responsible entities is updated."},{"line_number":764,"context_line":""},{"line_number":765,"context_line":"        :params ctxt: Running context of the service."},{"line_number":766,"context_line":"        :returns: Boolean value representing if the hash ring *has* changed"}],"source_content_type":"text/x-python","patch_set":9,"id":"d20d5c7b_267b4a7a","line":763,"range":{"start_line":763,"start_character":31,"end_line":763,"end_character":42},"updated":"2022-04-25 20:39:47.000000000","message":"Should this be removed? The sentence doesn\u0027t quite make sense with it included.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"},{"author":{"_account_id":11655,"name":"Julia Kreger","email":"juliaashleykreger@gmail.com","username":"jkreger","status":"Flying to the moon with a Jetpack!"},"change_message_id":"9088b62fd7fcea7f847850349ae0aa14bc77eb00","unresolved":true,"context_lines":[{"line_number":760,"context_line":"        the same. The physical nodes are mapped to a nova compute service"},{"line_number":761,"context_line":"        based upon the known operating members of the hash ring. 
The core"},{"line_number":762,"context_line":"        responsibility of this method is to update that ring composition"},{"line_number":763,"context_line":"        of responsible entities is updated."},{"line_number":764,"context_line":""},{"line_number":765,"context_line":"        :params ctxt: Running context of the service."},{"line_number":766,"context_line":"        :returns: Boolean value representing if the hash ring *has* changed"}],"source_content_type":"text/x-python","patch_set":9,"id":"43fa3bc4_867c3c79","line":763,"range":{"start_line":763,"start_character":31,"end_line":763,"end_character":42},"in_reply_to":"d20d5c7b_267b4a7a","updated":"2022-04-27 14:56:28.000000000","message":"It kind of made sense to me, but I can see it also maybe not making sense since the update is implied already.","commit_id":"30cd3d8df6fdb7b560e0420c2d14cd3d57cb0124"}]}
