)]}'
{"/PATCHSET_LEVEL":[{"author":{"_account_id":30609,"name":"Stanislav Dmitriev","email":"sdmitriev1@gmail.com","username":"sdmitriev"},"change_message_id":"27b5c61220b27210d6a5506f688b09cefe15a829","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"91cbb684_4fe2e0b9","updated":"2021-12-12 18:22:09.000000000","message":"recheck","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"2658f1462c6fb273edcfd99bedde79124d1bda25","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"42d73b81_a72f5196","updated":"2023-05-18 15:02:16.000000000","message":"Nice, I think the functional test nicely answers all our questions about what is happening here.","commit_id":"b1cd349b03cacccb75923d4ccbe405baf2b5f6d1"},{"author":{"_account_id":26832,"name":"Stefan Hoffmann","email":"stefan.hoffmann@cloudandheat.com","username":"shoffmann"},"change_message_id":"58448dd563acfe7d39d472aa57704336bc337deb","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":11,"id":"ad21abe8_9f44f238","updated":"2024-06-05 10:37:31.000000000","message":"recheck\n\nnova-grenade-multinode failed","commit_id":"7024764a93992d359173e8e84ff6a9c949d6cd3d"},{"author":{"_account_id":14826,"name":"Mark Goddard","email":"markgoddard86@gmail.com","username":"mgoddard"},"change_message_id":"ad45440c5925f0e336bbbdf95a98d7c44cfe64e1","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":11,"id":"ec42dee6_33e08dd9","updated":"2023-11-20 16:49:27.000000000","message":"recheck\n\ntempest-integrated-compute-rbac-old-defaults failed","commit_id":"7024764a93992d359173e8e84ff6a9c949d6cd3d"},{"author":{"_account_id":7730,"name":"Sahid Orentino Ferdjaoui","email":"sahid.ferdjaoui@industrialdiscipline.com","username":"sahid"},"change_message_id":"3cdae444e823fc30f51d027459dea99145128f74","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":12,"id":"a920ab46_8db7efee","updated":"2024-06-11 08:31:03.000000000","message":"Sounds reasonable","commit_id":"a19ff60c4e0df73d689cfe45215f6c610aa37c69"},{"author":{"_account_id":26832,"name":"Stefan Hoffmann","email":"stefan.hoffmann@cloudandheat.com","username":"shoffmann"},"change_message_id":"fe9e0ba757b5a69aa4be2326a97023061e6e8a52","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":12,"id":"362b2123_ea066aa8","updated":"2024-06-05 16:16:11.000000000","message":"We tested this patch at our environment successfully.","commit_id":"a19ff60c4e0df73d689cfe45215f6c610aa37c69"},{"author":{"_account_id":26832,"name":"Stefan Hoffmann","email":"stefan.hoffmann@cloudandheat.com","username":"shoffmann"},"change_message_id":"736e7120ec296240cc95ee99d461660d80629bfe","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":13,"id":"210bee83_0d86d29a","updated":"2024-08-12 07:42:58.000000000","message":"LGTM","commit_id":"37403b2cad8600ea85c227cba22c25d6b9904ea2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"4e40d17a0828215ab087bac3fde3ea4f5b58b090","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":13,"id":"14b65d72_54bb4f40","updated":"2024-08-13 12:54:49.000000000","message":"other then the simplification of the test this is still effectively the same as the previous version.\n\nit would be nice if this had a release not but that is not not enough to hold this in my opinion\n\nso lets see if gibi agrees with proceeding with the current iterations.\n\nthanks artom for picking this up.","commit_id":"37403b2cad8600ea85c227cba22c25d6b9904ea2"}],"nova/compute/manager.py":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"cedf29881b4c630a2e16d5bfb8486f2b88844936","unresolved":false,"context_lines":[{"line_number":2236,"context_line":""},{"line_number":2237,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2238,"context_line":"            # have been deallocated."},{"line_number":2239,"context_line":"            instance.refresh()"},{"line_number":2240,"context_line":""},{"line_number":2241,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2242,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":1,"id":"9f560f44_554a7726","line":2239,"range":{"start_line":2239,"start_character":11,"end_line":2239,"end_character":30},"updated":"2020-09-07 13:09:21.000000000","message":"i would kind of perfer if this was doen as part of delete_allocation_for_instance\nbut i guess doing it here is ok.","commit_id":"fabd9f76f7f00f12715cebccf1af5f14b93416e2"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"f19c0597413b1d96ddf19ebc882113843e24c10d","unresolved":false,"context_lines":[{"line_number":2236,"context_line":""},{"line_number":2237,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2238,"context_line":"            # have been deallocated."},{"line_number":2239,"context_line":"            instance.refresh()"},{"line_number":2240,"context_line":""},{"line_number":2241,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2242,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":1,"id":"6d1816ef_f56f7535","line":2239,"range":{"start_line":2239,"start_character":11,"end_line":2239,"end_character":30},"in_reply_to":"9f560f44_42a39fff","updated":"2022-11-17 10:47:31.000000000","message":"I would argue this has more to do about build_instances call getting a \"fresh\" copy of the instance, free of tempory changes that are made above, than it is about delete_allocation_for_instance meaning you need to refresh the instance.","commit_id":"fabd9f76f7f00f12715cebccf1af5f14b93416e2"},{"author":{"_account_id":14826,"name":"Mark Goddard","email":"markgoddard86@gmail.com","username":"mgoddard"},"change_message_id":"6d9709b239b71bceb520846804fc8e3b9977a6a2","unresolved":false,"context_lines":[{"line_number":2236,"context_line":""},{"line_number":2237,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2238,"context_line":"            # have been deallocated."},{"line_number":2239,"context_line":"            instance.refresh()"},{"line_number":2240,"context_line":""},{"line_number":2241,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2242,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":1,"id":"9f560f44_42a39fff","line":2239,"range":{"start_line":2239,"start_character":11,"end_line":2239,"end_character":30},"in_reply_to":"9f560f44_554a7726","updated":"2020-10-12 15:58:07.000000000","message":"That module seems to be more about placement, and doesn\u0027t do much manipulation of instances.","commit_id":"fabd9f76f7f00f12715cebccf1af5f14b93416e2"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"70de34634e3fd26c61b4f80bba729a0fbe1466e9","unresolved":true,"context_lines":[{"line_number":2191,"context_line":"                \u0027support certificate validation.\u0027)"},{"line_number":2192,"context_line":""},{"line_number":2193,"context_line":"    @wrap_exception()"},{"line_number":2194,"context_line":"    @reverts_task_state"},{"line_number":2195,"context_line":"    @wrap_instance_event(prefix\u003d\u0027compute\u0027)"},{"line_number":2196,"context_line":"    @wrap_instance_fault"},{"line_number":2197,"context_line":"    def _do_build_and_run_instance(self, context, instance, image,"}],"source_content_type":"text/x-python","patch_set":6,"id":"6003f455_25788de7","line":2194,"updated":"2024-08-14 16:33:12.000000000","message":"i belive its because reverts_task_state https://github.com/openstack/nova/blob/a2d77845ab247f1b09e2ae4f32f9421c3f50b98d/nova/compute/manager.py#L151\n\nif we exit with an exeption update the instnace here\n\nhttps://github.com/openstack/nova/blob/a2d77845ab247f1b09e2ae4f32f9421c3f50b98d/nova/compute/manager.py#L176\n\n\nwhich does an instance.save() here\nhttps://github.com/openstack/nova/blob/a2d77845ab247f1b09e2ae4f32f9421c3f50b98d/nova/compute/manager.py#L700\n\nand \n\nself.compute_task_api.build_instances\nis what raise the exepction","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1349af07cdf6ea71f34fa83c989e5bd5a14a949f","unresolved":true,"context_lines":[{"line_number":2267,"context_line":"                context, instance.uuid, force\u003dTrue)"},{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"}],"source_content_type":"text/x-python","patch_set":6,"id":"f2f883ea_9f4cc53a","line":2270,"updated":"2022-02-01 12:08:42.000000000","message":"Do you mean deallocated by the aborted claim? As the simple refresh here does not deallocate pci devs, it just re-reads the instance object from the DB.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c64a54e77f9efc295f9fa65cf42ba282eac2b67","unresolved":true,"context_lines":[{"line_number":2267,"context_line":"                context, instance.uuid, force\u003dTrue)"},{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"}],"source_content_type":"text/x-python","patch_set":6,"id":"5afc5f61_2e49c6ab","line":2270,"in_reply_to":"27729933_98f9a49a","updated":"2022-11-17 12:59:58.000000000","message":"I agree that this will reset things from the object but \n1) I\u0027m not sure that only resetting the object is enough. We probably need to deallocate the device properly\n2) we might reset thing we don\u0027t want. See my comments in the test changes on the next patch","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"dc19c02cbf33dacd88e8a88bbdaec34058916422","unresolved":true,"context_lines":[{"line_number":2267,"context_line":"                context, instance.uuid, force\u003dTrue)"},{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"}],"source_content_type":"text/x-python","patch_set":6,"id":"61d7fdaf_cef6e38c","line":2270,"in_reply_to":"5afc5f61_2e49c6ab","updated":"2022-11-17 16:26:36.000000000","message":"I have attempted to prove there is nothing to deallocate in the functional tests. Its possible that is only the case because something is being mocked out I guess, but I am not certain. Its possible we fixed something since the first set of patches I guess.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"f19c0597413b1d96ddf19ebc882113843e24c10d","unresolved":true,"context_lines":[{"line_number":2267,"context_line":"                context, instance.uuid, force\u003dTrue)"},{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"}],"source_content_type":"text/x-python","patch_set":6,"id":"27729933_98f9a49a","line":2270,"in_reply_to":"f2f883ea_9f4cc53a","updated":"2022-11-17 10:47:31.000000000","message":"Yeah, I am not sure if the tracker would clean up, but this seems to stop the wrong device being used on the second host at least. i.e. it stops us persisting to the database the bad casae.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1349af07cdf6ea71f34fa83c989e5bd5a14a949f","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"a43b9e62_95815b44","line":2271,"updated":"2022-02-01 12:08:42.000000000","message":"how this refresh relate to the previous patch in the series? Do we need both patch to fix the issue?","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"412a76b5f51eaf4c3a041a94f6d0936e368e2dd1","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"6d41a645_c21f2a3f","line":2271,"in_reply_to":"01bc7f73_6a301305","updated":"2024-08-14 16:59:39.000000000","message":"i feel like we shoudl be able to test this in a unit/functional test\n\neither the pci requests are incorrect in the db when we call\n\n\n            self.compute_task_api.build_instances\nand fixed by this call to save \n\nhttps://github.com/openstack/nova/blob/a2d77845ab247f1b09e2ae4f32f9421c3f50b98d/nova/conductor/manager.py#L820\n\nor we are getting an exption raised here that causes reverts_task_state to trigger","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"5c64a54e77f9efc295f9fa65cf42ba282eac2b67","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"66b44b91_8ac60e01","line":2271,"in_reply_to":"075fee50_438ae971","updated":"2022-11-17 12:59:58.000000000","message":"yepp we need to check if the device that was allocated to this instance but removed from the instance by the refresh","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"56c940e97a888236b74b915b7d7d4d83550cc52d","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"01bc7f73_6a301305","line":2271,"in_reply_to":"571ef485_6e5852d4","updated":"2024-08-14 16:52:00.000000000","message":"ill admit I also have not found out exactly where the save happens\n\nit coupld be the expction path i noted above  or \n\ncalling build_instances on the conductor passing the now reset instnace could be doing the save as part of the on the new compute node.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"dc19c02cbf33dacd88e8a88bbdaec34058916422","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"f17c6dc9_e7fd7b7a","line":2271,"in_reply_to":"66b44b91_8ac60e01","updated":"2022-11-17 16:26:36.000000000","message":"So I think the functional test does a half decent job of proving all is well now?","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6dac4828d97b3949f2471ec6a3f46fef7a64e39f","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"cf672300_43d5fe80","line":2271,"in_reply_to":"6d41a645_c21f2a3f","updated":"2024-08-15 07:53:24.000000000","message":"I\u0027m one step closer\n\n\u003e See we save the instance at L2491. At that point the instance still has a pci device allocated on the host where the late check failed. Then we refresh the instance at L2502 that for some strange reasons removes the pci device from the instance even though we just saved that state into the DB not long ago. So what is changing the DB state between L2491 and L2502???\n\nWhen we save the instance at L2491 that save ignores the instance.pci_devices: https://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/objects/instance.py#L713-L717\nand also the reload of the instance within the instance.save() does not re-load and therefore does not override instance.pci_devices https://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/objects/instance.py#L869-L871\n\nThis explains why: the save at L2491 keeps instance.pci_devices having the old allocation without actually persisting it in the DB and therefore instance.refresh() can read the empty pci_devices from the DB.\n\nAs the pci tracker seems to be not affected by the bug (the functional test shows it) I suspect that the pci tracker claim was reverted properly during the re-schedule. We should move this instance.refresh to that place to keep the logic together.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":782,"name":"John Garbutt","email":"john@johngarbutt.com","username":"johngarbutt"},"change_message_id":"f19c0597413b1d96ddf19ebc882113843e24c10d","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"075fee50_438ae971","line":2271,"in_reply_to":"a43b9e62_95815b44","updated":"2022-11-17 10:47:31.000000000","message":"I wasn\u0027t sure, but the functional test suggests this is all we need, oddly.\n\nThat probably means we need something else to test we don\u0027t leak PCI devices on the first host I guess?","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"4504ab24b01967595868e5121b01b84664ea8859","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"571ef485_6e5852d4","line":2271,"in_reply_to":"bf214f32_2572f87f","updated":"2024-08-14 16:23:05.000000000","message":"I\u0027m still puzzled how this can actually fix the issue.\n\nSee we save the instance at L2491. At that point the instance still has a pci device allocated on the host where the late check failed. Then we refresh the instance at L2502 that for some strange reasons removes the pci device from the instance even though we just saved that state into the DB not long ago. So what is changing the DB state between L2491 and L2502???","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"354946be57b39eb88b67872230c226361d8a3559","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"ed97b2a5_fe9bbe14","line":2271,"in_reply_to":"cf672300_43d5fe80","updated":"2024-08-15 09:43:01.000000000","message":"OK I see more.\n\nWhen the build raises the late check exception the compute manager calls abort_instance_claim on the resource tracker https://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/compute/resource_tracker.py#L578-L580 the _update_usage_from_instance call there updates the pci_tracker and frees the devices allocated to the instance. At this point the instance.pci_devices list is also empty as PciDevice.allocate and .free handles that list properly.\n\nThen the abort_instance_claim returns and the claim context manager exits. We are back to the compute manager _build_and_run_instance however if I print the instance.pci_devices there then the device re-appeares.\n\nIt seems that the claim works on a different instance object that the compute manager.\n\nThese extra prints\n\n```\n\ndiff --git a/nova/compute/manager.py b/nova/compute/manager.py\nindex db307ac5cc..19b99e88bd 100644\n--- a/nova/compute/manager.py\n+++ b/nova/compute/manager.py\n@@ -2616,8 +2616,16 @@ class ComputeManager(manager.Manager):\n         try:\n             scheduler_hints \u003d self._get_scheduler_hints(filter_properties,\n                                                         request_spec)\n+            print(f\"instance.pci_devices before claim. \"\n+                  f\"id(instance): {id(instance)}, \"\n+                  f\"len(instance.pci_devices): {len(instance.pci_devices)}\")\n+\n             with self.rt.instance_claim(context, instance, node, allocs,\n                                         limits):\n+                print(f\"instance.pci_devices after claim. \"\n+                      f\"id(instance): {id(instance)}, \"\n+                      f\"len(instance.pci_devices): {len(instance.pci_devices)}\")\n+\n                 # NOTE(russellb) It\u0027s important that this validation be done\n                 # *after* the resource tracker instance claim, as that is where\n                 # the host is set on the instance.\n@@ -2741,6 +2749,10 @@ class ComputeManager(manager.Manager):\n             raise exception.RescheduledByPolicyException(\n                     instance_uuid\u003dinstance.uuid, reason\u003dstr(e))\n         except Exception as e:\n+            print(f\"instance.pci_devices after claim is aborted and exception caught\"\n+                  f\"id(instance): {id(instance)}, \"\n+                  f\"len(instance.pci_devices): {len(instance.pci_devices)}\")\n+\n             LOG.exception(\u0027Failed to build and run instance\u0027,\n                           instance\u003dinstance)\n             self._notify_about_instance_usage(context, instance,\ndiff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py\nindex 9f92b98ba1..47c4b0138b 100644\n--- a/nova/compute/resource_tracker.py\n+++ b/nova/compute/resource_tracker.py\n@@ -585,6 +585,10 @@ class ResourceTracker(object):\n \n         self._update(context.elevated(), self.compute_nodes[nodename])\n \n+        print(f\"instance.pci_devices at the end of rt.abort_instance_claim. \"\n+              f\"id(instance): {id(instance)}, \"\n+              f\"len(instance.pci_devices): {len(instance.pci_devices)}\")\n+\n     def _drop_pci_devices(self, instance, nodename, prefix):\n         if self.pci_tracker:\n             # free old/new allocated pci devices\ndiff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py\nindex 2108151e71..5104c16c32 100644\n--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py\n+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py\n@@ -3041,6 +3041,7 @@ class PCIResourceRequestReschedulingTest(_PCIServersTestBase):\n         validate_group_policy_called \u003d False\n \n         def validate_group_policy(manager, instance, *args, **kwargs):\n+            print(\u0027test mocked validate_group_policy called\u0027)\n             nonlocal validate_group_policy_called\n             self.assertEqual(1, len(instance.pci_devices))\n             if not validate_group_policy_called:\n```\n\nproduces the following running the reproducer:\n```\n{0} nova.tests.functional.libvirt.test_pci_sriov_servers.PCIResourceRequestReschedulingTest.test_boot_reschedule_with_proper_pci_device_count [1.891829s] ... ok\n\nCaptured stdout:\n~~~~~~~~~~~~~~~~\n    instance.pci_devices before claim. id(instance): 139668447013136, len(instance.pci_devices): 0\ninstance.pci_devices after claim. id(instance): 139668447013136, len(instance.pci_devices): 1\ntest mocked validate_group_policy called\ninstance.pci_devices at the end of rt.abort_instance_claim. id(instance): 139668436285008, len(instance.pci_devices): 0\ninstance.pci_devices after claim is aborted and exception caughtid(instance): 139668447013136, len(instance.pci_devices): 1\ninstance.pci_devices before claim. id(instance): 139668448191824, len(instance.pci_devices): 0\ninstance.pci_devices after claim. id(instance): 139668448191824, len(instance.pci_devices): 1\ntest mocked validate_group_policy called\n\n```\n\nNow we need an explanation why we have to separate instance objects in this code path.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"9cb640c7fe5b8d7274957a08206bfc72c8a32849","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"fbd022d7_0f0d4651","line":2271,"in_reply_to":"ed97b2a5_fe9bbe14","updated":"2024-08-15 09:58:15.000000000","message":"The claim.Claim() clones the instance object https://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/compute/claims.py#L65-L66\n\nThe claim of the pci devices is outside of the Claim object so it operates on the original instance. https://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/compute/resource_tracker.py#L190-L211\n\nBut the abort of the claim uses the instance object stashed into the Claim object to clean up the pci devices. So that cleanup is not visible to the outside word (the compute manager) \nhttps://github.com/openstack/nova/blob/a7c82399b237aff44810ac76b0c4d10416c3bdf9/nova/compute/claims.py#L85","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"2dadd517fff8c741f63f1a02310c954167c3c04b","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"bf214f32_2572f87f","line":2271,"in_reply_to":"f17c6dc9_e7fd7b7a","updated":"2024-06-11 11:16:36.000000000","message":"so fershs just restes the object in memory \nhttps://github.com/openstack/nova/blob/7dc4b1ea627d864a0ee2745cc9de4336fc0ba7b5/nova/objects/instance.py#L892-L919\nbut i think that is the main part of the bug\n\nbased on the functional tests\n\nhttps://review.opendev.org/c/openstack/nova/+/710848/12/nova/tests/functional/test_servers_resource_request.py\nwe are asserting that the instance only has one device.\n\nim also not aware of any bugs related to \"leaking\" pci devices on incorrect host so i belive we were already handeling the local claims properly.\n\nso i do think this is safe to proceed with.\n\nat the very least this reset would not make any leakign of claimed resouce worse then it was before this patch and we dont seam to have an indication that that was actully happening so im inclide to merge this as is but i would like to have gibi weigh in on that.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"ada7db239734a39b94ceeec7f43f322e9a8b341f","unresolved":true,"context_lines":[{"line_number":2268,"context_line":""},{"line_number":2269,"context_line":"            # NOTE(mgoddard): If the instance has any PCI devices, these will"},{"line_number":2270,"context_line":"            # have been deallocated."},{"line_number":2271,"context_line":"            instance.refresh()"},{"line_number":2272,"context_line":""},{"line_number":2273,"context_line":"            self.compute_task_api.build_instances(context, [instance],"},{"line_number":2274,"context_line":"                    image, filter_properties, admin_password,"}],"source_content_type":"text/x-python","patch_set":6,"id":"b5440d8b_d905c5da","line":2271,"in_reply_to":"fbd022d7_0f0d4651","updated":"2024-08-15 11:22:53.000000000","message":"This would be he more logical fix to me based on what I found above. https://review.opendev.org/c/openstack/nova/+/926407 However this could have wider implications.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"56c940e97a888236b74b915b7d7d4d83550cc52d","unresolved":true,"context_lines":[{"line_number":2368,"context_line":"                        instance.pci_requests.requests, provider_mapping)"},{"line_number":2369,"context_line":"            except (exception.AmbiguousResourceProviderForPCIRequest,"},{"line_number":2370,"context_line":"                    exception.UnexpectedResourceProviderNameForPCIRequest"},{"line_number":2371,"context_line":"                    ) as e:"},{"line_number":2372,"context_line":"                raise exception.BuildAbortException("},{"line_number":2373,"context_line":"                    reason\u003dstr(e), instance_uuid\u003dinstance.uuid)"},{"line_number":2374,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"9b380b27_6298253d","line":2371,"updated":"2024-08-14 16:52:00.000000000","message":"if the rpc is actully where the save is happening now then the instance.refersh shoudl actully be here instead.","commit_id":"fd4af836a2ffa0148cc7e08cd1bc09bf0b0805ab"}]}
