{"/COMMIT_MSG":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"37815b361bbbb7183cf6b45bab778ddfcab9c6a6","unresolved":true,"context_lines":[{"line_number":10,"context_line":"instances during init in order to online their dedicated CPUs. This"},{"line_number":11,"context_line":"was a problem because only instances with a pending reboot were"},{"line_number":12,"context_line":"actually rebooted, meaning that all other instances\u0027s CPUs were left"},{"line_number":13,"context_line":"powered down."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Fix this by having the resource tracker\u0027s _update() method call back"},{"line_number":16,"context_line":"to the virt driver to let it know that the compute node has been"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"ebe5b062_6d5cdb95","line":13,"updated":"2024-10-30 01:07:35.000000000","message":"this is not true. we have never relied on that.","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"1548f99679ac672c4145db97fcf3c3d246b62dfa","unresolved":true,"context_lines":[{"line_number":10,"context_line":"instances during init in order to online their dedicated CPUs. This"},{"line_number":11,"context_line":"was a problem because only instances with a pending reboot were"},{"line_number":12,"context_line":"actually rebooted, meaning that all other instances\u0027s CPUs were left"},{"line_number":13,"context_line":"powered down."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Fix this by having the resource tracker\u0027s _update() method call back"},{"line_number":16,"context_line":"to the virt driver to let it know that the compute node has been"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"bd625766_c00e4cce","line":13,"in_reply_to":"5a65bbdb_fa598c40","updated":"2024-11-17 18:23:09.000000000","message":"I\u0027m not debating original intent, I\u0027m stating facts as they currently are in the code:\n\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/virt/libvirt/driver.py#L854-L859","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"bb461f67351caada4baee5b8c705e84aecff19b1","unresolved":true,"context_lines":[{"line_number":10,"context_line":"instances during init in order to online their dedicated CPUs. 
This"},{"line_number":11,"context_line":"was a problem because only instances with a pending reboot were"},{"line_number":12,"context_line":"actually rebooted, meaning that all other instances\u0027s CPUs were left"},{"line_number":13,"context_line":"powered down."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"Fix this by having the resource tracker\u0027s _update() method call back"},{"line_number":16,"context_line":"to the virt driver to let it know that the compute node has been"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"5a65bbdb_fa598c40","line":13,"in_reply_to":"ebe5b062_6d5cdb95","updated":"2024-10-30 01:14:01.000000000","message":"the orginal implmentation very intentionally only looked at the cores that were not in uses to make sure we could not acidentlaly offline a core assigned to a vm\n\nhttps://github.com/SeanMooney/arbiterd/blob/12e4b977828297b2ba0eb5bf5958f7419fc4efa9/src/arbiterd/arbiters/cpu_state.py#L21-L26\n\ni was carefult to ensure that could not happen in the design.","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"10ef8c93e4cf14a96a1b138f88cf4e711edcb8ba","unresolved":true,"context_lines":[{"line_number":4,"context_line":"Commit:     melanie witt \u003cmelwittt@gmail.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2025-01-29 02:13:26 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"pwr mgmt: power down free PCPUS when updating compute node"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Previously, the libvirt relied on the compute manager rebooting"},{"line_number":10,"context_line":"instances during init in order to online their dedicated CPUs. This"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":15,"id":"a85e9bb3_8d127b6e","line":7,"updated":"2025-01-29 05:39:31.000000000","message":"Just a friendly reminder to rewrite pretty much all of this (I guess except maybe the first paragraph that\u0027s stating the problem?) before merging.","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"eb0e841d3908bb2a6906e34a9efdfa89b3f20e67","unresolved":true,"context_lines":[{"line_number":4,"context_line":"Commit:     melanie witt \u003cmelwittt@gmail.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2025-01-29 02:13:26 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"pwr mgmt: power down free PCPUS when updating compute node"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Previously, the libvirt relied on the compute manager rebooting"},{"line_number":10,"context_line":"instances during init in order to online their dedicated CPUs. This"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":15,"id":"dfe576a2_675d4a71","line":7,"in_reply_to":"a85e9bb3_8d127b6e","updated":"2025-01-29 06:23:49.000000000","message":"Thanks for reminding! 
I didn\u0027t feel confident about what reviewers would think of this version and am ready to revert or make other changes based on feedback.\n\nIf we find something we agree on, I\u0027ll adjust the commit message to reflect whatever that ends up being 🙂","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d06995bddaa582f051b60b8b262a3d5fb6fd3bb0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"d544e925_66f62b1d","updated":"2024-10-22 08:43:08.000000000","message":"I discovered a bunch of sharp edges around this logic with potential bugs.","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"5d11ccf7786d66b2374b6eb896e7c52a9a5593fd","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"36118957_e92cc578","updated":"2024-10-22 14:00:58.000000000","message":"I wonder if we should not do this in init_host() at all, and instead use update_available_resources() in the resource tracker, since at that point presumably it\u0027ll be easier to be certain that we\u0027re operating with the correct CPU config and the correct instance(s)... Lemme explore what\u0027s possible.","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"dc06623dc00a029fea4b3d9f76fd9d5d1577c15e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"ddd5e776_7fbf43d0","updated":"2024-10-22 17:33:06.000000000","message":"This doesn\u0027t do anything for instances that are powered off - their PCPUs will still be powered up...","commit_id":"b9dadcffed7319d0a388565139ee95415ee47ae5"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6266485ea0b048e0c2f4ecb4dcbbdde734a10412","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"3b2c0eac_56e7e9e7","updated":"2025-01-20 15:20:06.000000000","message":"4","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9c3cf0bf71f87410f8acb2976b265f119af7e7b8","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"6a2ebe39_9e50c272","updated":"2025-01-16 12:59:57.000000000","message":"Thanks @melwittt@gmail.com for picking this up. 
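Sean's design constraint can be restated as a small invariant: the only cores eligible for power-down are those in cpu_dedicated_set that no instance is pinned to. A minimal runnable sketch with illustrative names (the real logic lives in arbiterd and the nova libvirt driver, not in a helper like this):

```python
def cores_safe_to_offline(cpu_dedicated_set: set[int],
                          pinned_by_instance: dict[str, set[int]]) -> set[int]:
    """Return the dedicated cores that no instance is pinned to."""
    used = set().union(*pinned_by_instance.values()) if pinned_by_instance else set()
    # The invariant: a core assigned to a VM must never be offlined, even briefly.
    return cpu_dedicated_set - used

# Cores 4-7 are dedicated, one VM is pinned to {4, 5}: only {6, 7} may power down.
assert cores_safe_to_offline({4, 5, 6, 7}, {'vm1': {4, 5}}) == {6, 7}
```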
Patchset-level comments

gibi (patch set 4, 2024-10-22): I discovered a bunch of sharp edges around this logic, with potential bugs.

Artom Lifshitz (patch set 4, 2024-10-22): I wonder if we should not do this in init_host() at all, and instead use update_available_resources() in the resource tracker, since at that point it will presumably be easier to be certain that we're operating with the correct CPU config and the correct instance(s)... Let me explore what's possible.

Artom Lifshitz (patch set 5, 2024-10-22): This doesn't do anything for instances that are powered off - their PCPUs will still be powered up...

gibi (patch set 10, 2025-01-16): Thanks @melwittt@gmail.com for picking this up. I have some questions inline.

gibi (patch set 10, 2025-01-21): recheck experimental - we have a whitebox run in experimental now :)

sean mooney (patch set 10, 2025-01-24): Again, I have not really reviewed this properly, but I'm still seeing virt driver API changes, so I don't think this is the right approach.

melwitt (patch set 11, 2025-01-25): I kind of went out on a limb here to try something and see if it works.

melwitt (patch set 13, 2025-01-28): Working on adding some more functional test coverage.

gibi (patch set 15, 2025-02-10): I think the general direction looks good. The offlining now happens at the end of the first update_available_resources, which runs in pre_start_hook, after init_host. So we can expect that the manager, the resource tracker, and the virt driver are all initialized by the time we go looking for unused PCPUs to offline.
File: nova/compute/manager.py

Line 1640 (self.driver.init_host(host=self.host, instances=instances)), patch set 4:

gibi (2024-10-22): I'm wondering what happens with instances that have allocations on two nodes, i.e. are being migrated while the compute is restarted. I suggest testing at least the following non-racy scenario:
* create an instance on compute A
* cold migrate it to compute B; the instance is in VERIFY_RESIZE state
* restart nova-compute on compute B, and see that the instance is not unpinned (I feel this will work)
* restart nova-compute on compute A, then revert the migration, and see whether the started-up VM is trying to use an offlined PCPU.
I guess any other racy scenario where a restart overlaps with an ongoing cold / live migration is (a) less important, and (b) if broken, then probably not just because of offlining PCPUs. Still, if you have time, it is worth doing some exploratory testing.
Also, it is worth checking how this offlining interferes with resume_guests_state_on_host_boot scenarios. That logic happens in _init_instance() below and involves calling reboot on the instance, so it will probably online the necessary PCPUs.

melwitt (2025-01-28, reply): I have added a couple of tests for these scenarios. The migration revert does not seem to work as expected, but resume_guests_state_on_host_boot appears to be OK.

gibi (2025-02-10, reply): Thank you!

Line 1654 (self._validate_pinning_configuration(instances)), patch set 4:

gibi (2024-10-22): I just noticed this, and it feels in the wrong order now. This call validates that the current cpu_shared_set/cpu_dedicated_set config is valid given the instances we have on the host [1]. Having the online/offline PCPU logic before this check means that nova starts using the new values of the shared/dedicated set config options *before* validating them.
This issue is independent of the current bug fix, so let's take it separately, but I think we need to fix it.
// later: based on my comments in the libvirt driver about stopped instances, I feel we should do the offlining of the CPUs as part of the _init_instance logic below, as there nova has more information about the power state of the instance. That move would also help solve the ordering issue with the config validation.
[1] https://github.com/openstack/nova/blob/ed2bf3699ddc360fb646b895560fa9dbcfd6beba/nova/compute/manager.py#L1000-L1018
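The ordering fix gibi suggests, as a minimal sketch: validation gates any use of the new pinning config. The names are simplified stand-ins for the manager methods quoted above:

```python
class ManagerSketch:
    def _validate_pinning_configuration(self, instances):
        # Stand-in for the real check: fail fast if cpu_dedicated_set /
        # cpu_shared_set contradict the instances already on this host.
        print('validating pinning config against %d instances' % len(instances))

    def _power_mgmt_init(self, instances):
        # Stand-in for the driver-side online/offline logic.
        print('power mgmt may now act on the validated CPU sets')

    def init_host(self, instances):
        self._validate_pinning_configuration(instances)  # validate first ...
        self._power_mgmt_init(instances)                 # ... then consume

ManagerSketch().init_host(instances=[])
```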
Line 1767 (self._shutting_down = True), patch set 15:

gibi (2025-02-10): OK, so this is done because the event listener interface changed to handle multiple registrations instead of overriding existing ones. Since you are already changing that interface a bit, I suggest adding a remove_event_listener(callback) to it, so that the client (the compute manager) does not have to handle a new state (_shutting_down).

File: nova/compute/resource_tracker.py

Line 677 (self._update(ctxt, self.compute_nodes[nodename])), patch set 5:

Artom Lifshitz (2024-10-22): We call _update() in a bunch of different places (like here), not only from the periodic, but maybe that's not a bad thing?

gibi (2025-01-16, reply): As no instances are passed in those cases, nothing new will happen. It does not help the readability of _update, and it also does not create a nice contract between the resource tracker and the driver about when exactly compute_node_updated is called. But at the moment I don't have a different solution to propose. I'm interested to see the outcome of the live migration vs. periodic offlining discussion (see my other comments), as maybe that will lead back to this and similar places.

Line 1067 (self.driver.compute_node_updated(cn, instances)), patch set 8:

gibi (2024-11-18): If the intention is to call compute_node_updated whenever the ComputeNode is updated, then I would move this call into the _update() method.

melwitt (2025-01-15, reply): Done
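A sketch of the refactor that was applied: folding the driver notification into _update() itself, so every ComputeNode update reaches the driver through one choke point. The class below is a stub, not the real ResourceTracker:

```python
class ResourceTrackerSketch:
    def __init__(self, driver):
        self.driver = driver

    def _update(self, context, compute_node, startup=False, instances=None):
        self._sync_compute_node(context, compute_node, startup)
        # One choke point: every ComputeNode update notifies the driver,
        # instead of each call site pairing _update() with a second call.
        self.driver.compute_node_updated(compute_node, instances or [])

    def _sync_compute_node(self, context, compute_node, startup):
        pass  # stand-in for the existing save/placement-sync logic

class DriverStub:
    def compute_node_updated(self, cn, instances):
        print('driver notified for', cn)

ResourceTrackerSketch(DriverStub())._update(None, 'node1')
```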
Line 1066 (self._update(context, cn, startup=startup, instances=instances)), patch set 10:

gibi (2025-01-16): These instances are the ones whose instance.host and instance.node point to this host. I'm wondering what happens to the VM's CPUs during a migration.
For cold migration (and resize) we only need to keep the VM running on one host at a time, and hopefully we first set instance.host and node to the dest during the migration (probably after the claim) and then start the VM on the dest. Hopefully revert works similarly: we set instance.host back to the source and then restart the VM.
But for live migration we probably need to keep the VM's CPUs online on both nodes while libvirt copies the memory and eventually activates the dest VM.
Does this periodic keep both sides of a live migration online?

melwitt (2025-01-16, reply): This is a good point. Honestly, I don't know off the top of my head. I'll dig into it and try to make some functional tests to capture the behavior.

Line 1075 (self.driver.emit_event(virtevent.PowerMgmtEvent(cn, instances))), patch set 15:

gibi (2025-02-10): I'm wondering whether doing this with an event, instead of a direct call to the driver to offline unused PCPUs, is just to avoid a virt driver interface change, or whether I'm missing some other aspect that requires this to be an event.
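The two notification styles gibi contrasts, as a runnable toy. PowerMgmtEvent mirrors the class name in the diff context; offline_unused_pcpus is a hypothetical direct-call spelling, not an existing driver method:

```python
class PowerMgmtEvent:
    def __init__(self, compute_node, instances):
        self.compute_node = compute_node
        self.instances = instances

class DriverSketch:
    def emit_event(self, event):
        # Event style (as in this patch set): no virt driver interface
        # change, but who handles it, and when, is implicit in the plumbing.
        self._offline_unused(event.compute_node, event.instances)

    def offline_unused_pcpus(self, compute_node, instances):
        # Direct-call style (hypothetical name): an explicit virt driver
        # API, at the cost of a driver interface change.
        self._offline_unused(compute_node, instances)

    def _offline_unused(self, compute_node, instances):
        print('offline unused dedicated CPUs on', compute_node)

driver = DriverSketch()
driver.emit_event(PowerMgmtEvent('compute1', []))  # event style
driver.offline_unused_pcpus('compute1', [])        # direct style
```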
File: nova/tests/functional/libvirt/test_power_manage.py

Line 272 (self._assert_cpu_set_state(expected_offline_cpus, 'offline')), patch set 4:

gibi (2024-10-22): To cover the offlined-and-then-onlined case we need to wrap the function that offlines a PCPU [1] and check that it is never called with a used PCPU.
[1] https://github.com/openstack/nova/blob/ed2bf3699ddc360fb646b895560fa9dbcfd6beba/nova/virt/libvirt/cpu/core.py#L66

Artom Lifshitz (2024-11-17, reply): Do we really want to do that in a functional test? We talked about having whitebox as a periodic for Nova, and that would be totally within whitebox's scope, and much easier to understand...

gibi (2025-01-20, reply): I'm OK with this being covered in whitebox. Does whitebox check the VM's taskset to assert that the VM is not unpinned during a nova-compute restart? If not, we need to add such asserts.
// later: I don't see any taskset check in https://opendev.org/openstack/whitebox-tempest-plugin/src/branch/master/whitebox_tempest_plugin/api/compute/test_cpu_state_mgmt.py - we need those, as the current asserts only observe that the PCPU is online; they do not assert that it wasn't offlined and then onlined. Also, I think we need to add a nova-compute restart test case to test_cpu_state_mgmt.py to cover this.

sean mooney (2025-01-29, replying to melwitt's "no in-use CPU is offlined" comment below): So that is still wrong, because the CPU is only meant to be offlined when the VM is deleted. In the original proposal we intentionally did not offline the CPUs when you power off a VM.
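The wrap-the-helper coverage gibi asks for might look roughly like this inside the functional test. It assumes the helper behind the link above is nova.virt.libvirt.cpu.core.set_offline and that the usual functional-test context (self, a running instance, restart_compute_service) is available; treat those names as assumptions of the sketch:

```python
from unittest import mock

# Record every core the offline helper is asked to power down while the
# compute service restarts.
offlined = []
with mock.patch('nova.virt.libvirt.cpu.core.set_offline',
                side_effect=offlined.append):
    self.restart_compute_service(hostname)

used_pcpus = set(instance.numa_topology.cpu_pinning)
# The invariant under test: the offline helper is never called with a PCPU
# that a running instance is pinned to.
self.assertFalse(used_pcpus.intersection(offlined),
                 'in-use PCPUs were offlined during restart: %s' % offlined)
```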
melwitt (2025-01-30, reply to sean): OK, that is very different from what is currently implemented 😛
I guess I'm wondering: how many issue fixes do we want to include in this one patch? The reboot_instance problem is not related to the nova-compute restart problem and seems like a separate bug or undesired behavior. So far it seems we have:
* a bug where online, used CPUs are offlined after a nova-compute restart
* a bug where a cold migration revert does not return the CPUs' online/offline state to what it was before the migration
* a bug where CPUs are offlined and then onlined in rapid succession in the reboot_instance code path
* and maybe more??

melwitt (2025-01-28, reply to gibi): I wondered about that also: what the impact on the workload/performance is of offlining and then onlining an in-use CPU. I have added an assert to the functional test verifying that used CPUs are not offlined during a compute restart.

melwitt (2025-01-15, reply to gibi): Sorry if I'm just being dumb, but... I don't see how this test is not covering the offlined-and-then-onlined case described in the bug report. At the beginning of the test, all CPUs are offline (asserted in setUp). Then `server` is created and running with an online PCPU before the compute host restart. Then, after the restart, its PCPU is not included in the list of offline CPUs, which verifies that it is online and was not offlined, no?

sean mooney (2025-01-24): Not really a review, just responding to this thread. I agree we should not try to do that in whitebox. My understanding is that part of the kernel's process for offlining a CPU is to move all processes off it, so I'm not surprised that the cgroup's taskset is lost. It also makes sense that, if the cgroup is still defined, the kernel or libvirt may re-evaluate and apply it when cores come back online.
I'm a little confused about why you would want to test this, however. The approach we are aiming for is to never offline a core associated with a nova instance. We only manage cores in cpu_dedicated_set. We are free to online all cores in cpu_dedicated_set at any point (that is safe, just less power efficient). What we need to do on restarting the compute agent is: online all cores in cpu_dedicated_set, record the relevant NUMA/socket info, then offline the cores not used by any VM. At no point in the workflow should the VMs ever be unpinned by the offlining of a core.
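Sean's three-step restart workflow as a runnable toy. sysfs is replaced by a plain dict so the sketch stays self-contained; the function name is illustrative:

```python
def restart_power_mgmt(cpu_dedicated_set, used_cores, online_state):
    # Step 1: online every dedicated core. Always safe, merely less power
    # efficient; a write of 1 cannot disturb an already-running core.
    for core in cpu_dedicated_set:
        online_state[core] = True
    # Step 2: this is where the relevant NUMA/socket info would be recorded,
    # while all cores are visible.
    # Step 3: offline only the dedicated cores no VM uses; used cores never
    # see an offline write, so pinned VMs are never disturbed.
    for core in cpu_dedicated_set - used_cores:
        online_state[core] = False
    return online_state

state = restart_power_mgmt({4, 5, 6, 7}, used_cores={4, 5},
                           online_state={c: False for c in range(8)})
assert state[4] and state[5] and not state[6] and not state[7]
```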
melwitt (2025-01-29, reply to sean): Yes, I mentioned in a comment above that I added an assert to the functional test verifying that no in-use CPU is offlined during a nova-compute restart.
Note that "rebooting" instances will be rebooted (by calling reboot_instance()), so for those instances the CPUs assigned to them will be offlined and then onlined as part of the reboot.

melwitt (2025-01-24, reply to gibi): I might be doing this wrong, but so far I am unable to reproduce the behavior of a running VM staying unpinned in the offline/online case. If I offline the CPU while the VM is running, it becomes unpinned, but when I online the CPU, it becomes pinned to the original CPU again. To me it seems like we would not be able to assert or detect possible offline/online behavior even in a whitebox test, because the VM always gets pinned again as soon as the CPU is powered on?
For example:

```
$ virsh list
 Id   Name                State
-----------------------------------
 1    instance-000000ca   running
 5    instance-000000d0   running

$ virsh vcpuinfo 5 --pretty
VCPU:           0
CPU:            6
State:          running
CPU time:       60.5s
CPU Affinity:   6 (out of 12)

$ echo 0 | sudo tee /sys/devices/system/cpu/cpu6/online
0

$ virsh vcpuinfo 5 --pretty
VCPU:           0
CPU:            11
State:          running
CPU time:       60.6s
CPU Affinity:   0-3,8-11 (out of 12)

$ echo 1 | sudo tee /sys/devices/system/cpu/cpu6/online
1

$ virsh vcpuinfo 5 --pretty
VCPU:           0
CPU:            6
State:          running
CPU time:       60.7s
CPU Affinity:   6 (out of 12)
```

gibi (2025-01-24, reply): That is strange, but I see the same behavior in my env too. So libvirt pins the VM back to its dedicated core after the core is onlined later; probably I did something wrong in my original trial.
Anyhow, I agree this means we have no real way to catch this in whitebox. That means I'm now advocating more strongly for a functional test that asserts this, probably by extending the sysfs fixture with such a capability.
gibi (2025-02-10, quoting melwitt's issue summary above): Thanks @melwittt@gmail.com for summarizing the issues at hand. I suggest we only try to fix the first issue in this patch and file separate bugs for the rest.
For me, the third issue is not a bug, or at least not an important one, until somebody can show a problem caused by the rapid offlining/onlining of CPUs. However, the second issue (cold migrate revert) sounds like an important bug to fix *after* we have fixed the nova-compute restart case. Sorry for mixing cold migration into the scope of this fix.

melwitt (2025-01-30): Also, FWIW, the spec says: "When an instance is stopped (or powered off or suspended or shelved offload or in confirm-resize state on the source host), then we would either offline the core or use the powersaving governor."
https://specs.openstack.org/openstack/nova-specs/specs/2023.1/implemented/libvirt-cpu-state-mgmt.html#instance-lifecycle

gibi (2025-01-16, reply to melwitt): I lost most of the context, so below is what I guess I meant. 😊
The current test case ensures that after the compute is restarted, only the unused PCPUs (not allocated, or allocated to a stopped VM) are offline. It does not, however, ensure that nova-compute did not actually offline all the PCPUs during startup and then online those that are used (I remember this was the case at some point in the implementation). If such a transient happens during nova-compute startup, the running VMs are still unpinned by the kernel and never moved back to their originally allocated PCPUs.
// later: I think this version of the change is what triggered this thread: https://review.opendev.org/c/openstack/nova/+/932926/2..10/nova/virt/libvirt/driver.py#b878 - while the current change no longer does the offline-all-then-online-used logic, I would still rather have the test cover the case, as it is a potential regression.
sean mooney (2025-01-28, reply to melwitt): To me, offlining a core assigned to an instance is a performance SLA violation and a pseudo security bug, in the sense that it can affect the available capacity of the VM, and perhaps the VMs of other tenants, when it starts to float. So I don't think we should accept any solution that offlines cores assigned to VMs at all. I was very careful in the PoC not to do that; I don't know how this was missed in the nova version, but it is very regrettable that the behavior was introduced.
Can we not assert that the cores associated with VMs are never offlined on an agent restart, by asserting that the relevant cores never receive a write of 0 to online? That asserts the expected behavior and should be testable by mocking the correct function and asserting that the relevant calls are absent from its call list. We don't actually care about order, just that a write of 0 to /sys/devices/system/cpu/cpu6/online does not happen. We don't need to try to assert that the VM's CPU pinning is still enforced, because we should not be touching the relevant cores. A write of 1 is acceptable, because it should not change the state.

sean mooney (2025-01-29, following up on his "not on power off" comment above): We had two reasons for that, by the way. For the reboot case specifically, we wanted to avoid the offlining and onlining in rapid succession, but the main one was the added complexity in the codebase.
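Sean's no-zero-write assertion, combined with gibi's earlier idea of extending the sysfs fixture, might look roughly like this. The recorder shape and the simulated restart are assumptions of the sketch, not the actual fixture API:

```python
# Toy recorder standing in for a sysfs fixture: capture every value written
# to /sys/devices/system/cpu/cpuN/online during a compute restart.
writes = []  # (core_id, value) pairs, in write order

def record_online_write(core_id, value):
    writes.append((core_id, value))

# Simulate a restart that onlines everything, then offlines only unused cores.
for core in (4, 5, 6, 7):
    record_online_write(core, '1')   # always safe
for core in (6, 7):                  # unused cores only
    record_online_write(core, '0')

used = {4, 5}
# The assertion sean proposes: order is irrelevant; a used core must never
# receive a '0'. Writes of '1' are fine, as they cannot change running state.
assert not any(core in used and value == '0' for core, value in writes)
```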
(I remember this was a case at some point in the implementation).\n\nThis is indeed something that we need to check, but I feel like doing it in functional tests is going to be fragile. I can\u0027t think of a way of doing it other than remembering the sequence of offline, online, offline etc. for every CPU, and asserting stuff on that.\n\nWhitebox feels like a much better place to test that, because it will _actually_ break if restarting nova-compute unpins running VMs\u0027 CPUs. I know we don\u0027t run whitebox against Nova at all for now, but there\u0027s a periodic proposed, and we talked about it at PTG.\n\nhttps://review.opendev.org/c/openstack/nova/+/833453","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"a24eeea69bd1768473a91a637935ea9f0c4ff7c5","unresolved":true,"context_lines":[{"line_number":269,"context_line":"        pcpus \u003d instance.numa_topology.cpu_pinning"},{"line_number":270,"context_line":"        rebooting_pcpus \u003d rebooting_instance.numa_topology.cpu_pinning"},{"line_number":271,"context_line":"        expected_offline_cpus \u003d cpu_dedicated_set - pcpus - rebooting_pcpus"},{"line_number":272,"context_line":"        self._assert_cpu_set_state(expected_offline_cpus, \u0027offline\u0027)"},{"line_number":273,"context_line":""},{"line_number":274,"context_line":"    def test_compute_service_starts_with_power_management_and_zero_pcpu(self):"},{"line_number":275,"context_line":"        self.flags(cpu_dedicated_set\u003dNone, cpu_shared_set\u003dNone,"}],"source_content_type":"text/x-python","patch_set":4,"id":"2583093f_f467277f","line":272,"in_reply_to":"e6c14a9d_25c19d1a","updated":"2025-01-27 11:15:18.000000000","message":"\u003e im a little confused why you would want to test this however.\n\nI would like to test this as one of the earlier proposed solutions was actually buggy because it implemented the fix in a way that it re-onlined used CPU cores. On the outside it seemed to solve the issue as the necessary CPUs were online after the compute service restart, but in reality telco workloads would suffer a strange but temporary loss of performance due to the kernel moving around their pinned CPUs twice during the restart. 
\n\nSo I want to avoid later introducing similar regressions.","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"de6dafb93f6775d50f11a697f5f4038439a09e6c","unresolved":true,"context_lines":[{"line_number":269,"context_line":"        pcpus \u003d instance.numa_topology.cpu_pinning"},{"line_number":270,"context_line":"        rebooting_pcpus \u003d rebooting_instance.numa_topology.cpu_pinning"},{"line_number":271,"context_line":"        expected_offline_cpus \u003d cpu_dedicated_set - pcpus - rebooting_pcpus"},{"line_number":272,"context_line":"        self._assert_cpu_set_state(expected_offline_cpus, \u0027offline\u0027)"},{"line_number":273,"context_line":""},{"line_number":274,"context_line":"    def test_compute_service_starts_with_power_management_and_zero_pcpu(self):"},{"line_number":275,"context_line":"        self.flags(cpu_dedicated_set\u003dNone, cpu_shared_set\u003dNone,"}],"source_content_type":"text/x-python","patch_set":4,"id":"264b25aa_9a743eb2","line":272,"in_reply_to":"f9f9a560_7b75cae0","updated":"2024-11-18 15:32:09.000000000","message":"I ask for some coverage for the offline then online case as, as far as I remember, it was the actual scenario that led to the bug. If the test is not in the functional tests then I\u0027m OK with a unit test too.","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9c3cf0bf71f87410f8acb2976b265f119af7e7b8","unresolved":true,"context_lines":[{"line_number":121,"context_line":"        return self.cores[i]"},{"line_number":122,"context_line":""},{"line_number":123,"context_line":""},{"line_number":124,"context_line":"class PowerManagementLiveMigrationTestsBase(base.LibvirtMigrationMixin,"},{"line_number":125,"context_line":"                                            PowerManagementTestsBase):"},{"line_number":126,"context_line":""},{"line_number":127,"context_line":"    def setUp(self):"}],"source_content_type":"text/x-python","patch_set":10,"id":"bd77ed96_a5b641b1","line":124,"updated":"2025-01-16 12:59:57.000000000","message":"The test cases under this base class are now probably sensitive to when the periodic update_available_resource runs and observes the instance.host and node while the instance is being live migrated. See my comment in the resource_tracker.","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":false,"context_lines":[{"line_number":121,"context_line":"        return self.cores[i]"},{"line_number":122,"context_line":""},{"line_number":123,"context_line":""},{"line_number":124,"context_line":"class PowerManagementLiveMigrationTestsBase(base.LibvirtMigrationMixin,"},{"line_number":125,"context_line":"                                            PowerManagementTestsBase):"},{"line_number":126,"context_line":""},{"line_number":127,"context_line":"    def setUp(self):"}],"source_content_type":"text/x-python","patch_set":10,"id":"b80a336a_bee7e9d5","line":124,"in_reply_to":"2ec07e07_ca0bf6a1","updated":"2025-02-10 13:13:58.000000000","message":"OK. 
Then my comment is not applicable any more.","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"78c6f047b2c3ce9e983eeb5266d712bc4294f2c3","unresolved":true,"context_lines":[{"line_number":121,"context_line":"        return self.cores[i]"},{"line_number":122,"context_line":""},{"line_number":123,"context_line":""},{"line_number":124,"context_line":"class PowerManagementLiveMigrationTestsBase(base.LibvirtMigrationMixin,"},{"line_number":125,"context_line":"                                            PowerManagementTestsBase):"},{"line_number":126,"context_line":""},{"line_number":127,"context_line":"    def setUp(self):"}],"source_content_type":"text/x-python","patch_set":10,"id":"2ec07e07_ca0bf6a1","line":124,"in_reply_to":"b49821ed_91a8fe84","updated":"2025-01-28 07:26:25.000000000","message":"Update: I have reverted back to only adjusting CPUs during startup based on the more recent review comments.","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"0b9bcda6e32529c8ede75b3173f0bcfd56a00713","unresolved":true,"context_lines":[{"line_number":121,"context_line":"        return self.cores[i]"},{"line_number":122,"context_line":""},{"line_number":123,"context_line":""},{"line_number":124,"context_line":"class PowerManagementLiveMigrationTestsBase(base.LibvirtMigrationMixin,"},{"line_number":125,"context_line":"                                            PowerManagementTestsBase):"},{"line_number":126,"context_line":""},{"line_number":127,"context_line":"    def setUp(self):"}],"source_content_type":"text/x-python","patch_set":10,"id":"b49821ed_91a8fe84","line":124,"in_reply_to":"bd77ed96_a5b641b1","updated":"2025-01-16 23:25:49.000000000","message":"Hm, yeah. It might not be the right thing to adjust CPUs every time a compute node is updated ... I\u0027ll see if I can figure it out.","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9c3cf0bf71f87410f8acb2976b265f119af7e7b8","unresolved":true,"context_lines":[{"line_number":147,"context_line":"            hostname\u003d\u0027src\u0027)"},{"line_number":148,"context_line":"        self.src \u003d self.computes[\u0027src\u0027]"},{"line_number":149,"context_line":"        self.src.driver.cpu_api.core \u003d CoresStub()"},{"line_number":150,"context_line":"        # NOTE(artom) In init_host() the libvirt driver calls"},{"line_number":151,"context_line":"        # power_down_all_dedicated_cpus(). 
Call it again now after swapping to"},{"line_number":152,"context_line":"        # our stub to fake reality."},{"line_number":153,"context_line":"        self.src.driver.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"        self.start_compute("},{"line_number":156,"context_line":"            host_info\u003dfakelibvirt.HostInfo(cpu_nodes\u003d1, cpu_sockets\u003d1,"}],"source_content_type":"text/x-python","patch_set":10,"id":"ee51fe8c_3684357d","line":153,"range":{"start_line":150,"start_character":0,"end_line":153,"end_character":63},"updated":"2025-01-16 12:59:57.000000000","message":"this is not true any more","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"78c6f047b2c3ce9e983eeb5266d712bc4294f2c3","unresolved":false,"context_lines":[{"line_number":147,"context_line":"            hostname\u003d\u0027src\u0027)"},{"line_number":148,"context_line":"        self.src \u003d self.computes[\u0027src\u0027]"},{"line_number":149,"context_line":"        self.src.driver.cpu_api.core \u003d CoresStub()"},{"line_number":150,"context_line":"        # NOTE(artom) In init_host() the libvirt driver calls"},{"line_number":151,"context_line":"        # power_down_all_dedicated_cpus(). Call it again now after swapping to"},{"line_number":152,"context_line":"        # our stub to fake reality."},{"line_number":153,"context_line":"        self.src.driver.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"        self.start_compute("},{"line_number":156,"context_line":"            host_info\u003dfakelibvirt.HostInfo(cpu_nodes\u003d1, cpu_sockets\u003d1,"}],"source_content_type":"text/x-python","patch_set":10,"id":"2dd98412_b6f33a2b","line":153,"range":{"start_line":150,"start_character":0,"end_line":153,"end_character":63},"in_reply_to":"ee51fe8c_3684357d","updated":"2025-01-28 07:26:25.000000000","message":"Done","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9c3cf0bf71f87410f8acb2976b265f119af7e7b8","unresolved":true,"context_lines":[{"line_number":157,"context_line":"                                           cpu_cores\u003d5, cpu_threads\u003d2),"},{"line_number":158,"context_line":"            hostname\u003d\u0027dest\u0027)"},{"line_number":159,"context_line":"        self.dest \u003d self.computes[\u0027dest\u0027]"},{"line_number":160,"context_line":"        self.dest.driver.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":161,"context_line":""},{"line_number":162,"context_line":"    def assert_cores(self, host, cores, online\u003dTrue):"},{"line_number":163,"context_line":"        for i in cores:"}],"source_content_type":"text/x-python","patch_set":10,"id":"669acdfe_3a5cbdb5","line":160,"updated":"2025-01-16 12:59:57.000000000","message":"ditto","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"78c6f047b2c3ce9e983eeb5266d712bc4294f2c3","unresolved":false,"context_lines":[{"line_number":157,"context_line":"                                           cpu_cores\u003d5, 
cpu_threads\u003d2),"},{"line_number":158,"context_line":"            hostname\u003d\u0027dest\u0027)"},{"line_number":159,"context_line":"        self.dest \u003d self.computes[\u0027dest\u0027]"},{"line_number":160,"context_line":"        self.dest.driver.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":161,"context_line":""},{"line_number":162,"context_line":"    def assert_cores(self, host, cores, online\u003dTrue):"},{"line_number":163,"context_line":"        for i in cores:"}],"source_content_type":"text/x-python","patch_set":10,"id":"a5c8ae12_f10ca119","line":160,"in_reply_to":"669acdfe_3a5cbdb5","updated":"2025-01-28 07:26:25.000000000","message":"Done","commit_id":"c0e5e3efaa0e022ed731d90de166e5fb7642053b"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":157,"context_line":"                                           cpu_cores\u003d5, cpu_threads\u003d2),"},{"line_number":158,"context_line":"            hostname\u003d\u0027dest\u0027)"},{"line_number":159,"context_line":"        self.dest \u003d self.computes[\u0027dest\u0027]"},{"line_number":160,"context_line":"        self.dest.driver.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":161,"context_line":""},{"line_number":162,"context_line":"    def assert_cores(self, host, cores, online\u003dTrue):"},{"line_number":163,"context_line":"        for i in cores:"}],"source_content_type":"text/x-python","patch_set":15,"id":"bcc68832_5bc764ec","side":"PARENT","line":160,"updated":"2025-02-10 13:13:58.000000000","message":"don\u0027t we need to stub the core api on the dest as well?","commit_id":"4b17758daabe283d034762bddfaa2cf7d7d9685c"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":151,"context_line":"        # stub for the remainder of the test, so copy the current state of the"},{"line_number":152,"context_line":"        # cores into the stub before replacing it."},{"line_number":153,"context_line":"        core_stub \u003d CoresStub()"},{"line_number":154,"context_line":"        for i in range(1, 10):"},{"line_number":155,"context_line":"            core \u003d core_stub(i)"},{"line_number":156,"context_line":"            core.online \u003d self.src.driver.cpu_api.core(i).online"},{"line_number":157,"context_line":"        self.src.driver.cpu_api.core \u003d core_stub"},{"line_number":158,"context_line":""},{"line_number":159,"context_line":"        self.start_compute("}],"source_content_type":"text/x-python","patch_set":15,"id":"d7b62ea5_8e8311d0","line":156,"range":{"start_line":154,"start_character":0,"end_line":156,"end_character":64},"updated":"2025-02-10 13:13:58.000000000","message":"I think this could be moved to CoreStub so that we only need something like:\n```\nself.src.driver.cpu_api.core \u003d CoreStub(self.src.driver.cpu_api)\n```","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":248,"context_line":"        self.assert_cores(self.src, range(1, 10), 
online\u003dFalse)"},{"line_number":249,"context_line":"        self.assert_cores(self.dest, range(1, 10), online\u003dTrue)"},{"line_number":250,"context_line":""},{"line_number":251,"context_line":"        # Restart nova-compute on the destination."},{"line_number":252,"context_line":"        self.restart_compute_service(hostname\u003d\u0027dest\u0027)"},{"line_number":253,"context_line":""},{"line_number":254,"context_line":"        # Cores on the source should still be offline and dest still online."},{"line_number":255,"context_line":"        self.assert_cores(self.src, range(1, 10), online\u003dFalse)"},{"line_number":256,"context_line":"        self.assert_cores(self.dest, range(1, 10), online\u003dTrue)"},{"line_number":257,"context_line":""}],"source_content_type":"text/x-python","patch_set":15,"id":"7e53cc34_918f32ba","line":254,"range":{"start_line":251,"start_character":0,"end_line":254,"end_character":76},"updated":"2025-02-10 13:13:58.000000000","message":"... and we need to assert that the dest cores are not just online after the restart but also did not went through a offline -\u003e online transition during the restart. I\u0027m OK to add this coverage as a follow up.","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":254,"context_line":"        # Cores on the source should still be offline and dest still online."},{"line_number":255,"context_line":"        self.assert_cores(self.src, range(1, 10), online\u003dFalse)"},{"line_number":256,"context_line":"        self.assert_cores(self.dest, range(1, 10), online\u003dTrue)"},{"line_number":257,"context_line":""},{"line_number":258,"context_line":"        # Restart nova-compute on the source."},{"line_number":259,"context_line":"        self.restart_compute_service(hostname\u003d\u0027src\u0027)"},{"line_number":260,"context_line":""},{"line_number":261,"context_line":"        # Revert the migration."},{"line_number":262,"context_line":"        self._revert_resize(server)"},{"line_number":263,"context_line":""}],"source_content_type":"text/x-python","patch_set":15,"id":"69a5b31e_b2600520","line":260,"range":{"start_line":257,"start_character":0,"end_line":260,"end_character":0},"updated":"2025-02-10 13:13:58.000000000","message":"nit: I would assert that cores remained offline on the src.","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":261,"context_line":"        # Revert the migration."},{"line_number":262,"context_line":"        self._revert_resize(server)"},{"line_number":263,"context_line":""},{"line_number":264,"context_line":"        # Now cores on the source should be online and dest offline."},{"line_number":265,"context_line":"        # FIXME(melwitt): This is currently not working as expected."},{"line_number":266,"context_line":"        # self.assert_cores(self.src, range(1, 10), online\u003dTrue)"},{"line_number":267,"context_line":"        # self.assert_cores(self.dest, range(1, 10), online\u003dFalse)"},{"line_number":268,"context_line":"        self.assert_cores(self.src, range(1, 10), 
online\u003dFalse)"},{"line_number":269,"context_line":"        self.assert_cores(self.dest, range(1, 10), online\u003dTrue)"},{"line_number":270,"context_line":""},{"line_number":271,"context_line":""},{"line_number":272,"context_line":"class PowerManagementTests(PowerManagementTestsBase):"},{"line_number":273,"context_line":"    \"\"\"Test suite for a single host with 9 dedicated cores and 1 used for OS\"\"\""}],"source_content_type":"text/x-python","patch_set":15,"id":"9b1fca09_c7231911","line":270,"range":{"start_line":264,"start_character":0,"end_line":270,"end_character":0},"updated":"2025-02-10 13:13:58.000000000","message":"Thanks! As commented on elsewhere, lets file a separate bug about cold migration revert and move this testcase there.","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":false,"context_lines":[{"line_number":335,"context_line":"            # Note that the PCPU for the rebooting server is expected to have"},{"line_number":336,"context_line":"            # been offlined then onlined as part of reboot_instance(), so we do"},{"line_number":337,"context_line":"            # not include it in used_pcpus."},{"line_number":338,"context_line":"            self.assertNotIn(core, used_pcpus)"},{"line_number":339,"context_line":"            return real_set_offline(core)"},{"line_number":340,"context_line":""},{"line_number":341,"context_line":"        if power_off_before_restart:"}],"source_content_type":"text/x-python","patch_set":15,"id":"38db4ce5_92af020a","line":338,"updated":"2025-02-10 13:13:58.000000000","message":"Thanks!","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"67569829438119744f2bf05c53a95e89667194b3","unresolved":true,"context_lines":[{"line_number":339,"context_line":"            return real_set_offline(core)"},{"line_number":340,"context_line":""},{"line_number":341,"context_line":"        if power_off_before_restart:"},{"line_number":342,"context_line":"            # This will make nova-compute reboot an instance that is expected"},{"line_number":343,"context_line":"            # to be running but is powered off if"},{"line_number":344,"context_line":"            # resume_guests_state_on_host_boot\u003dTrue."},{"line_number":345,"context_line":"            instance \u003d objects.Instance.get_by_uuid(self.ctxt, server[\u0027id\u0027])"},{"line_number":346,"context_line":"            self.computes[\u0027compute1\u0027].driver.power_off(instance)"},{"line_number":347,"context_line":""},{"line_number":348,"context_line":"        with mock.patch("},{"line_number":349,"context_line":"                \u0027nova.virt.libvirt.cpu.core.set_offline\u0027,"}],"source_content_type":"text/x-python","patch_set":15,"id":"cc226c05_1060ea03","line":346,"range":{"start_line":342,"start_character":0,"end_line":346,"end_character":64},"updated":"2025-02-10 13:13:58.000000000","message":"hm that is strange. In real env nova only reboots the instance during nova-compute startup if the DB state of the instance is different from the hypervisor state of the instance. But this test stops the instance via the driver so I assume the driver updates the DB state during the power_off. 
So I would not expect nova-compute to reboot this instance.","commit_id":"01583e6f677dcc01e7d8c7b1860289add0d29362"}],"nova/virt/driver.py":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d06995bddaa582f051b60b8b262a3d5fb6fd3bb0","unresolved":true,"context_lines":[{"line_number":323,"context_line":""},{"line_number":324,"context_line":"    def init_host(self, host, instances):"},{"line_number":325,"context_line":"        \"\"\"Initialize anything that is necessary for the driver to function,"},{"line_number":326,"context_line":"        including catching up with currently running VM\u0027s on the given host."},{"line_number":327,"context_line":"        \"\"\""},{"line_number":328,"context_line":"        # TODO(Vek): Need to pass context in for access to auth_token"},{"line_number":329,"context_line":"        raise NotImplementedError()"}],"source_content_type":"text/x-python","patch_set":4,"id":"360483c2_13fc6500","line":326,"updated":"2024-10-22 08:43:08.000000000","message":"nit: would be nice to have a doc for the new param","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"894d82cebe70e0bc7d90fc0e6215966b122944a7","unresolved":false,"context_lines":[{"line_number":323,"context_line":""},{"line_number":324,"context_line":"    def init_host(self, host, instances):"},{"line_number":325,"context_line":"        \"\"\"Initialize anything that is necessary for the driver to function,"},{"line_number":326,"context_line":"        including catching up with currently running VM\u0027s on the given host."},{"line_number":327,"context_line":"        \"\"\""},{"line_number":328,"context_line":"        # TODO(Vek): Need to pass context in for access to auth_token"},{"line_number":329,"context_line":"        raise NotImplementedError()"}],"source_content_type":"text/x-python","patch_set":4,"id":"3b12f7b4_8f462ee5","line":326,"in_reply_to":"360483c2_13fc6500","updated":"2024-11-04 10:13:33.000000000","message":"not applicable any more","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ed55fa1aa43967e915282ce8528e3e2c876154b7","unresolved":true,"context_lines":[{"line_number":1967,"context_line":"        \"\"\""},{"line_number":1968,"context_line":"        return True"},{"line_number":1969,"context_line":""},{"line_number":1970,"context_line":"    def compute_node_updated(self, compute_node):"},{"line_number":1971,"context_line":"        \"\"\"Called by the resource tracker whenthe compute node record has been"},{"line_number":1972,"context_line":"        updated. 
This can be used by virt drivers that manager hypervisor"},{"line_number":1973,"context_line":"        physical resources to align those resources to their representation in"}],"source_content_type":"text/x-python","patch_set":6,"id":"5e5d450d_eebe097b","line":1970,"updated":"2024-10-30 01:51:01.000000000","message":"this api change is not required.\n\nin the pre_start hook we initialise the resource tracker and call update_available_resource\n\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/manager.py#L1779-L1787\n\nthat calls update_available_resource_for_node\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/manager.py#L10706\n\nwhich calls update_available_resource on the resource tracker.\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/manager.py#L10581C21-L10581C27\n\nthat calls into the driver to get a generic resource dict\n\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/resource_tracker.py#L911\n\nwhich gets passed to the rt _update_available_resource method.\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/resource_tracker.py#L974\n\nthat loops over all the instances on the host and calls _update_usage_from_instances, which eventually calls _get_usage_dict, which extracts the numa_topology from the instance object.\n\n\nso before we ever get to the init_host call the pre_start hook should have already loaded all instances on the host and updated the usage in the resource tracker.\n\nso we don\u0027t need a new callback method in the virt driver api.\n\nthe libvirt driver init_host can just use the data in the resource tracker to determine which cores to offline.\n\ni may have missed something but i tried to explain this 2 weeks ago before the ptg.","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"1548f99679ac672c4145db97fcf3c3d246b62dfa","unresolved":true,"context_lines":[{"line_number":1967,"context_line":"        \"\"\""},{"line_number":1968,"context_line":"        return True"},{"line_number":1969,"context_line":""},{"line_number":1970,"context_line":"    def compute_node_updated(self, compute_node):"},{"line_number":1971,"context_line":"        \"\"\"Called by the resource tracker whenthe compute node record has been"},{"line_number":1972,"context_line":"        updated. 
This can be used by virt drivers that manager hypervisor"},{"line_number":1973,"context_line":"        physical resources to align those resources to their representation in"}],"source_content_type":"text/x-python","patch_set":6,"id":"cf256d14_3514519d","line":1970,"in_reply_to":"5e5d450d_eebe097b","updated":"2024-11-17 18:23:09.000000000","message":"I don\u0027t follow at what step in all of that you expect the libvirt driver to get called to power down unused dedicated CPUs.\n\nAlso, in nova/service.py, pre_start_hook() is called _after_ init_host()","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"fb239602681e6700d830aa06a760ad7ebf769e28","unresolved":true,"context_lines":[{"line_number":1967,"context_line":"        \"\"\""},{"line_number":1968,"context_line":"        return True"},{"line_number":1969,"context_line":""},{"line_number":1970,"context_line":"    def compute_node_updated(self, compute_node):"},{"line_number":1971,"context_line":"        \"\"\"Called by the resource tracker whenthe compute node record has been"},{"line_number":1972,"context_line":"        updated. This can be used by virt drivers that manager hypervisor"},{"line_number":1973,"context_line":"        physical resources to align those resources to their representation in"}],"source_content_type":"text/x-python","patch_set":6,"id":"2b5c1454_8ebc2b28","line":1970,"in_reply_to":"cf256d14_3514519d","updated":"2025-01-24 16:58:37.000000000","message":"yep i realised that after the comment\n\ni\u0027m still not particularly happy with modifying the driver interface for this\nso i would prefer to reorder those hooks or find another way to do this if we can.","commit_id":"6e973126d9c2507c23da55072d071c76f86aea80"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"de6dafb93f6775d50f11a697f5f4038439a09e6c","unresolved":true,"context_lines":[{"line_number":1968,"context_line":"        return True"},{"line_number":1969,"context_line":""},{"line_number":1970,"context_line":"    def compute_node_updated(self, compute_node, instances):"},{"line_number":1971,"context_line":"        \"\"\"Called by the resource tracker whenthe compute node record has been"},{"line_number":1972,"context_line":"        updated. This can be used by virt drivers that manager hypervisor"},{"line_number":1973,"context_line":"        physical resources to align those resources to their representation in"},{"line_number":1974,"context_line":"        Nova."}],"source_content_type":"text/x-python","patch_set":8,"id":"3c48deb0_d00d882a","line":1971,"updated":"2024-11-18 15:32:09.000000000","message":"nit: when the","commit_id":"f45fb9b1bce0fcc6e5ab6d7cc6187852384f435e"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"d44bbfa63e7ff76c119028ff9e44a4c7d27c1172","unresolved":false,"context_lines":[{"line_number":1968,"context_line":"        return True"},{"line_number":1969,"context_line":""},{"line_number":1970,"context_line":"    def compute_node_updated(self, compute_node, instances):"},{"line_number":1971,"context_line":"        \"\"\"Called by the resource tracker whenthe compute node record has been"},{"line_number":1972,"context_line":"        updated. 
This can be used by virt drivers that manager hypervisor"},{"line_number":1973,"context_line":"        physical resources to align those resources to their representation in"},{"line_number":1974,"context_line":"        Nova."}],"source_content_type":"text/x-python","patch_set":8,"id":"a9c9dc24_8223a86e","line":1971,"in_reply_to":"3c48deb0_d00d882a","updated":"2025-01-15 23:40:29.000000000","message":"Done","commit_id":"f45fb9b1bce0fcc6e5ab6d7cc6187852384f435e"}],"nova/virt/libvirt/driver.py":[{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d19f75a4bb70994a30046d10ae199ec60acf49ed","unresolved":true,"context_lines":[{"line_number":873,"context_line":"        # either the governor strategies are different between the cores or if"},{"line_number":874,"context_line":"        # the cores are offline."},{"line_number":875,"context_line":"        self.cpu_api.validate_all_dedicated_cpus()"},{"line_number":876,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":877,"context_line":"        for instance in instances:"},{"line_number":878,"context_line":"            self.cpu_api.power_up_for_instance(instance)"},{"line_number":879,"context_line":""},{"line_number":880,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":881,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"}],"source_content_type":"text/x-python","patch_set":2,"id":"dcec38d7_7f1ca421","line":878,"range":{"start_line":876,"start_character":0,"end_line":878,"end_character":56},"updated":"2024-10-21 16:48:58.000000000","message":"power_down_all_dedicated_cpus already forces the kernel to unpin the instance from its pcpu as the pcpu is going down. 
So even if power_up_for_instance re-enables the cpu, that is already too late: the instance is already unpinned.\n\nSo instead nova should only offline unused pcpus, not all pcpus in the first place.","commit_id":"1d1d67a0d34425f8829c0c569444979a1c2ae453"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"894d82cebe70e0bc7d90fc0e6215966b122944a7","unresolved":false,"context_lines":[{"line_number":873,"context_line":"        # either the governor strategies are different between the cores or if"},{"line_number":874,"context_line":"        # the cores are offline."},{"line_number":875,"context_line":"        self.cpu_api.validate_all_dedicated_cpus()"},{"line_number":876,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":877,"context_line":"        for instance in instances:"},{"line_number":878,"context_line":"            self.cpu_api.power_up_for_instance(instance)"},{"line_number":879,"context_line":""},{"line_number":880,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":881,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"}],"source_content_type":"text/x-python","patch_set":2,"id":"ef15cbed_56ed1b91","line":878,"range":{"start_line":876,"start_character":0,"end_line":878,"end_character":56},"in_reply_to":"c9372bc2_5e0c3d84","updated":"2024-11-04 10:13:33.000000000","message":"Acknowledged","commit_id":"1d1d67a0d34425f8829c0c569444979a1c2ae453"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"f6e42344b2b5bea8a4e5a6da57c7ff946da100c7","unresolved":true,"context_lines":[{"line_number":873,"context_line":"        # either the governor strategies are different between the cores or if"},{"line_number":874,"context_line":"        # the cores are offline."},{"line_number":875,"context_line":"        self.cpu_api.validate_all_dedicated_cpus()"},{"line_number":876,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":877,"context_line":"        for instance in instances:"},{"line_number":878,"context_line":"            self.cpu_api.power_up_for_instance(instance)"},{"line_number":879,"context_line":""},{"line_number":880,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":881,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"}],"source_content_type":"text/x-python","patch_set":2,"id":"c9372bc2_5e0c3d84","line":878,"range":{"start_line":876,"start_character":0,"end_line":878,"end_character":56},"in_reply_to":"dcec38d7_7f1ca421","updated":"2024-10-21 17:42:36.000000000","message":"Yeeppp, excellent point. 
Fixed.","commit_id":"1d1d67a0d34425f8829c0c569444979a1c2ae453"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d06995bddaa582f051b60b8b262a3d5fb6fd3bb0","unresolved":true,"context_lines":[{"line_number":879,"context_line":"        # crucial to only power down free CPUs - as opposed to powering"},{"line_number":880,"context_line":"        # everything down, and then instances\u0027s CPUs back up - as powering down"},{"line_number":881,"context_line":"        # CPUs pinned to instances causes them to unpin."},{"line_number":882,"context_line":"        for instance in instances:"},{"line_number":883,"context_line":"            if instance.numa_topology:"},{"line_number":884,"context_line":"                cpus \u003d cpus - instance.numa_topology.cpu_pinning.union("},{"line_number":885,"context_line":"                    instance.numa_topology.cpuset_reserved)"}],"source_content_type":"text/x-python","patch_set":4,"id":"57f05dd7_7b4edbe1","line":882,"updated":"2024-10-22 08:43:08.000000000","message":"This is all the instances on the host including those that are stopped. I think normally when an instance is stopped we expect that the pcpus of the instance is offlined.\n\nPlease note that at this point nova haven not yet synced the DB power state of the instance with the libvirt power state of the instance as that happens in init_instance after init_host. So if the nova-compute is restarted then there is likely to have discrepancy between the instance\u0027s DB power state and the instance\u0027s hypervisor power state, depending on the fact if this was just a nova-compute service restart or a full hypervisor power cycle. Nova in this case updates the DB value based on the hypervisor value (except for the case of resume_guest_state_on_host_reboot), so relying on the DB state here is not a good idea.\n\nSo there are multiple cases to consider:\n* a) the instance was already stopped before the restart\n* b) the instance was running, there was a hypervisor restart, the instance is not running on the hypervisor but nova is not synced that power state yet\n* c) the instance was running, there was a hypervisor restart, the instance is not running on the hypervisor, but nova is not synced that power state yet, but nova will restart the instance in init_instance as resume_guests_state_on_host_boot is configured to True.\n* d) the instance was running, there was only a nova-compute restart, the instance is still running on the hypervisor","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"d06995bddaa582f051b60b8b262a3d5fb6fd3bb0","unresolved":true,"context_lines":[{"line_number":881,"context_line":"        # CPUs pinned to instances causes them to unpin."},{"line_number":882,"context_line":"        for instance in instances:"},{"line_number":883,"context_line":"            if instance.numa_topology:"},{"line_number":884,"context_line":"                cpus \u003d cpus - instance.numa_topology.cpu_pinning.union("},{"line_number":885,"context_line":"                    instance.numa_topology.cpuset_reserved)"},{"line_number":886,"context_line":"        self.cpu_api.power_down(cpus)"},{"line_number":887,"context_line":""},{"line_number":888,"context_line":"        if not 
self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"}],"source_content_type":"text/x-python","patch_set":4,"id":"3bee42cb_3a2506d7","line":885,"range":{"start_line":884,"start_character":30,"end_line":885,"end_character":59},"updated":"2024-10-22 08:43:08.000000000","message":"Don\u0027t we have a helper for it?\n// after a bit of grepping for it\nNo we didn\u0027t have one, but we should, as this logic is now shared across multiple places[1][2] with the same meaning: give me all the pcpus allocated to the instance. So it\u0027s worth having a helper for it on the Instance object.\n\n\n[1]https://github.com/openstack/nova/blob/ed2bf3699ddc360fb646b895560fa9dbcfd6beba/nova/virt/libvirt/cpu/api.py#L112-L113\n[2]https://github.com/openstack/nova/blob/ed2bf3699ddc360fb646b895560fa9dbcfd6beba/nova/virt/libvirt/cpu/api.py#L154-L155","commit_id":"494ac71ac1e48040e6cec2b39312ba6591f7b168"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"b97023a67a7c36b662626e0426de151aab79ba5f","unresolved":true,"context_lines":[{"line_number":1648,"context_line":"            if CONF.libvirt.virt_type \u003d\u003d \u0027lxc\u0027:"},{"line_number":1649,"context_line":"                self._teardown_container(instance)"},{"line_number":1650,"context_line":"            # We\u0027re sure the instance is gone, we can shutdown the core if so"},{"line_number":1651,"context_line":"            self.cpu_api.power_down_for_instance(instance)"},{"line_number":1652,"context_line":""},{"line_number":1653,"context_line":"    def destroy(self, context, instance, network_info, block_device_info\u003dNone,"},{"line_number":1654,"context_line":"                destroy_disks\u003dTrue, destroy_secrets\u003dTrue):"}],"source_content_type":"text/x-python","patch_set":5,"id":"35432096_92ca8d5e","line":1651,"updated":"2024-10-22 20:01:03.000000000","message":"We power down the cores upon instance stop (which calls down to here), so this patch should take into account instances that are STOPPED.","commit_id":"b9dadcffed7319d0a388565139ee95415ee47ae5"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"11ce69e61ce190b25689f1d2754cfbeb8b587a1a","unresolved":true,"context_lines":[{"line_number":1648,"context_line":"            if CONF.libvirt.virt_type \u003d\u003d \u0027lxc\u0027:"},{"line_number":1649,"context_line":"                self._teardown_container(instance)"},{"line_number":1650,"context_line":"            # We\u0027re sure the instance is gone, we can shutdown the core if so"},{"line_number":1651,"context_line":"            self.cpu_api.power_down_for_instance(instance)"},{"line_number":1652,"context_line":""},{"line_number":1653,"context_line":"    def destroy(self, context, instance, network_info, block_device_info\u003dNone,"},{"line_number":1654,"context_line":"                destroy_disks\u003dTrue, destroy_secrets\u003dTrue):"}],"source_content_type":"text/x-python","patch_set":5,"id":"bbf3bb2a_a3acb720","line":1651,"in_reply_to":"35432096_92ca8d5e","updated":"2024-10-24 09:55:33.000000000","message":"Thanks for confirming this open question from the PTG discussion.","commit_id":"b9dadcffed7319d0a388565139ee95415ee47ae5"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"28fe7d70499e8e8b91985e04b05f50eccdf502d6","unresolved":true,"context_lines":[{"line_number":1648,"context_line":"  
          if CONF.libvirt.virt_type \u003d\u003d \u0027lxc\u0027:"},{"line_number":1649,"context_line":"                self._teardown_container(instance)"},{"line_number":1650,"context_line":"            # We\u0027re sure the instance is gone, we can shutdown the core if so"},{"line_number":1651,"context_line":"            self.cpu_api.power_down_for_instance(instance)"},{"line_number":1652,"context_line":""},{"line_number":1653,"context_line":"    def destroy(self, context, instance, network_info, block_device_info\u003dNone,"},{"line_number":1654,"context_line":"                destroy_disks\u003dTrue, destroy_secrets\u003dTrue):"}],"source_content_type":"text/x-python","patch_set":5,"id":"96da46ec_c40191bc","line":1651,"in_reply_to":"bbf3bb2a_a3acb720","updated":"2024-10-30 01:26:17.000000000","message":"i guess we decided to make it more aggressive\n\nthe original proposal was to only power off the core when we delete the vm or move it to another host or shelve it.","commit_id":"b9dadcffed7319d0a388565139ee95415ee47ae5"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"313f3316fafb8140b2720335cc5d9b73825a5b26","unresolved":true,"context_lines":[{"line_number":879,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":880,"context_line":"        # wrongly modified."},{"line_number":881,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":882,"context_line":""},{"line_number":883,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":884,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"},{"line_number":885,"context_line":"            # persisted across reboots."}],"source_content_type":"text/x-python","patch_set":6,"id":"47e896d7_3c122ebc","side":"PARENT","line":882,"updated":"2024-10-30 01:10:00.000000000","message":"this was not part of the design of the feature in the spec.\n that would not be logically correct.\nthe feature as designed is intended to only power off unused cores.","commit_id":"c652c67f90169045b100daa7f98fbcb66815af1f"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"28fe7d70499e8e8b91985e04b05f50eccdf502d6","unresolved":true,"context_lines":[{"line_number":879,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":880,"context_line":"        # wrongly modified."},{"line_number":881,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":882,"context_line":""},{"line_number":883,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":884,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"},{"line_number":885,"context_line":"            # persisted across reboots."}],"source_content_type":"text/x-python","patch_set":6,"id":"9959dbc0_a1ed1189","side":"PARENT","line":882,"in_reply_to":"47e896d7_3c122ebc","updated":"2024-10-30 01:26:17.000000000","message":"this call should be powering down unused cores only.\nwhen we call this we should have already run update_available_resource once and the resource tracker should be populated with the used cpus in memory.\nthe bug here is that we do not use that to determine which cpus to offline.\n\nif we have not initialised the resource tracker at 
this point then we need to defer the offlining of the cores until after that is done.","commit_id":"c652c67f90169045b100daa7f98fbcb66815af1f"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"baa789ce4362eeebc90f40542f2dcf58d7eb7dd5","unresolved":true,"context_lines":[{"line_number":879,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":880,"context_line":"        # wrongly modified."},{"line_number":881,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":882,"context_line":""},{"line_number":883,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":884,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"},{"line_number":885,"context_line":"            # persisted across reboots."}],"source_content_type":"text/x-python","patch_set":6,"id":"bd7a4d56_6b31b9ef","side":"PARENT","line":882,"in_reply_to":"9959dbc0_a1ed1189","updated":"2024-10-30 02:14:59.000000000","message":"in case it\u0027s not clear, the host numa topology object has a free_pcpus property\nhttps://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/objects/numa.py#L65\n\nthose are the cores to offline.\n\nthe driver has access to the compute via the virtapi object https://github.com/openstack/nova/blob/01b207e50d307a7e7650b5839535fbd3ad40bc1b/nova/compute/manager.py#L386\n\nso i think you can just call self.virtapi._compute.numa_topology.free_pcpus()\nor failing that self.virtapi._compute.rt.compute_nodes[CONF.host].numa_topology.free_cpus()\ndepending on whether the compute object in the virt api is the compute manager or the compute node.\n\ni think it might be the manager, so you need to look up the node object via the resource tracker, but that\u0027s how i would fix this issue without any virt driver API changes.","commit_id":"c652c67f90169045b100daa7f98fbcb66815af1f"},{"author":{"_account_id":8864,"name":"Artom Lifshitz","email":"notartom@gmail.com","username":"artom"},"change_message_id":"1548f99679ac672c4145db97fcf3c3d246b62dfa","unresolved":true,"context_lines":[{"line_number":879,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":880,"context_line":"        # wrongly modified."},{"line_number":881,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":882,"context_line":""},{"line_number":883,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"},{"line_number":884,"context_line":"            # TODO(sbauza): Remove this code once mediated devices are"},{"line_number":885,"context_line":"            # persisted across reboots."}],"source_content_type":"text/x-python","patch_set":6,"id":"38db92fc_1cca2d34","side":"PARENT","line":882,"in_reply_to":"bd7a4d56_6b31b9ef","updated":"2024-11-17 18:23:09.000000000","message":"I\u0027m not a fan of the interface change either, but I think it\u0027s mitigated by having a default `pass` implementation, so the impact is minimal, even for backports. 
And IMO, the new callback method is far more elegant than chaining 7 object attributes, with a dict lookup in the middle ;)","commit_id":"c652c67f90169045b100daa7f98fbcb66815af1f"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"de6dafb93f6775d50f11a697f5f4038439a09e6c","unresolved":false,"context_lines":[{"line_number":13235,"context_line":"                instance.numa_topology and"},{"line_number":13236,"context_line":"                instance.vm_state \u003d\u003d vm_states.STOPPED"},{"line_number":13237,"context_line":"            ):"},{"line_number":13238,"context_line":"                cpus \u003d cpus.union(instance.numa_topology.cpu_pinning)"},{"line_number":13239,"context_line":"        self.cpu_api.power_down(cpus)"}],"source_content_type":"text/x-python","patch_set":8,"id":"a279329c_2d4bf050","line":13238,"updated":"2024-11-18 15:32:09.000000000","message":"OK. This is correct because we can assume that VMs are exclusively using the pcpus. So if there is a VM that allocated the pcpu but the VM is stopped, then there could not be another VM on the host that allocated and is using the same pcpu.","commit_id":"f45fb9b1bce0fcc6e5ab6d7cc6187852384f435e"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9c3cf0bf71f87410f8acb2976b265f119af7e7b8","unresolved":true,"context_lines":[{"line_number":880,"context_line":"        # NOTE(sbauza): We powerdown all dedicated CPUs but if some instances"},{"line_number":881,"context_line":"        # exist that are pinned for some CPUs, then we\u0027ll later powerup those"},{"line_number":882,"context_line":"        # CPUs when rebooting the instance in _init_instance()"},{"line_number":883,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":884,"context_line":"        # wrongly modified."},{"line_number":885,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":886,"context_line":""},{"line_number":887,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"}],"source_content_type":"text/x-python","patch_set":10,"id":"f3920d62_91b38cf4","side":"PARENT","line":884,"range":{"start_line":883,"start_character":0,"end_line":884,"end_character":27},"updated":"2025-01-16 12:59:57.000000000","message":"If this was an intentional way to verify the config options\u0027 validity, then by removing this we might change the behavior. I.e. 
a config error that caused a nova-compute startup failure will now only pop up during the periodic task, which probably does not cause a nova-compute startup failure.","commit_id":"4b17758daabe283d034762bddfaa2cf7d7d9685c"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"78c6f047b2c3ce9e983eeb5266d712bc4294f2c3","unresolved":true,"context_lines":[{"line_number":880,"context_line":"        # NOTE(sbauza): We powerdown all dedicated CPUs but if some instances"},{"line_number":881,"context_line":"        # exist that are pinned for some CPUs, then we\u0027ll later powerup those"},{"line_number":882,"context_line":"        # CPUs when rebooting the instance in _init_instance()"},{"line_number":883,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":884,"context_line":"        # wrongly modified."},{"line_number":885,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":886,"context_line":""},{"line_number":887,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"}],"source_content_type":"text/x-python","patch_set":10,"id":"f4113cd5_fd95cfc6","side":"PARENT","line":884,"range":{"start_line":883,"start_character":0,"end_line":884,"end_character":27},"in_reply_to":"e3c68f3b_3027e2bb","updated":"2025-01-28 07:26:25.000000000","message":"I have added a test for invalid configuration that asserts nova-compute fails to start in that case.","commit_id":"4b17758daabe283d034762bddfaa2cf7d7d9685c"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"0b9bcda6e32529c8ede75b3173f0bcfd56a00713","unresolved":true,"context_lines":[{"line_number":880,"context_line":"        # NOTE(sbauza): We powerdown all dedicated CPUs but if some instances"},{"line_number":881,"context_line":"        # exist that are pinned for some CPUs, then we\u0027ll later powerup those"},{"line_number":882,"context_line":"        # CPUs when rebooting the instance in _init_instance()"},{"line_number":883,"context_line":"        # Note that it can provide an exception if the config options are"},{"line_number":884,"context_line":"        # wrongly modified."},{"line_number":885,"context_line":"        self.cpu_api.power_down_all_dedicated_cpus()"},{"line_number":886,"context_line":""},{"line_number":887,"context_line":"        if not self._host.has_min_version(MIN_LIBVIRT_PERSISTENT_MDEV):"}],"source_content_type":"text/x-python","patch_set":10,"id":"e3c68f3b_3027e2bb","side":"PARENT","line":884,"range":{"start_line":883,"start_character":0,"end_line":884,"end_character":27},"in_reply_to":"f3920d62_91b38cf4","updated":"2025-01-16 23:25:49.000000000","message":"Good point, I will look into this too.","commit_id":"4b17758daabe283d034762bddfaa2cf7d7d9685c"}]}
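
To summarize the fix the thread converges on, here is a minimal sketch of the "only power down free PCPUs" rule. It assumes instance objects shaped like the context lines quoted above (a numa_topology carrying cpu_pinning and cpuset_reserved sets, plus a vm_state); it illustrates the set arithmetic only and is not the merged nova patch.

```python
# Sketch of the power-down set computation discussed in the review thread.
# Assumed shapes: cpu_dedicated_set is a set of core ids; each instance has
# numa_topology.cpu_pinning / .cpuset_reserved (sets of core ids) and a
# vm_state string standing in for nova's vm_states constants.

def pcpus_safe_to_power_down(cpu_dedicated_set, instances):
    cpus = set(cpu_dedicated_set)
    # Subtract every core pinned or reserved by an instance, so a core
    # assigned to a VM never receives a write of 0 to its online file.
    for instance in instances:
        if instance.numa_topology:
            cpus -= set(instance.numa_topology.cpu_pinning)
            cpus -= set(instance.numa_topology.cpuset_reserved)
    # Add back the pinned cores of STOPPED instances: pinning is exclusive,
    # so a stopped instance's cores cannot be in use by another VM.
    for instance in instances:
        if instance.numa_topology and instance.vm_state == 'stopped':
            cpus |= set(instance.numa_topology.cpu_pinning)
    return cpus
```

Computing the set this way, instead of offlining everything and re-onlining used cores, avoids the kernel transiently unpinning running VMs during a nova-compute restart, which is the regression the thread is about.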

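Likewise, the regression guard sean-k-mooney asks for (no write of 0 to /sys/devices/system/cpu/cpuN/online for a core assigned to a running VM) can be expressed by recording calls to nova.virt.libvirt.cpu.core.set_offline, the same target the later patch sets mock in their functional tests. The restart_compute and used_pcpus names here are hypothetical placeholders for the test fixture.

```python
# Sketch of the "used cores are never offlined" guard: replace set_offline
# with a recorder, run the restart, then assert no used core was offlined.
# Order of calls is irrelevant; only the absence of the 0-write matters,
# and writes of 1 (set_online) are left alone since they change no state.
from unittest import mock

def assert_used_cores_never_offlined(used_pcpus, restart_compute):
    offlined = []
    with mock.patch('nova.virt.libvirt.cpu.core.set_offline',
                    side_effect=offlined.append):
        restart_compute()  # hypothetical fixture restarting nova-compute
    overlap = set(offlined) & set(used_pcpus)
    assert not overlap, (
        'cores pinned to running VMs were offlined: %s' % overlap)
```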