{"/COMMIT_MSG":[{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d477a1f20a80450adc8c47e6fec3f722904210d0","unresolved":false,"context_lines":[{"line_number":24,"context_line":""},{"line_number":25,"context_line":"This change makes SchedulerReportClient.delete_resource_provider"},{"line_number":26,"context_line":"no longer hide the ResourceProviderInUse/ResourceProviderDeletionFailed"},{"line_number":27,"context_line":"errors and let\u0027s them raise up to the caller."},{"line_number":28,"context_line":""},{"line_number":29,"context_line":"There are two callers of this method, one in the API to delete a compute"},{"line_number":30,"context_line":"service and one in the compute manager to cleanup orphan compute nodes"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":1,"id":"7faddb67_12d64955","line":27,"range":{"start_line":27,"start_character":22,"end_line":27,"end_character":27},"updated":"2019-08-22 20:27:29.000000000","message":"rise?","commit_id":"95660afdbb1a7019d73375271b38046985c84de1"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"70481259e4592893426398584ef451ac4a3ff3aa","unresolved":false,"context_lines":[{"line_number":11,"context_line":"However, the delete_resource_provider cascade\u003dTrue logic only looks"},{"line_number":12,"context_line":"for instances on the given compute service host being deleted which"},{"line_number":13,"context_line":"will miss (1) allocations remaining from evacuated servers and"},{"line_number":14,"context_line":"(2) unconfirmed migrations."},{"line_number":15,"context_line":""},{"line_number":16,"context_line":"Attempting to delete the resource provider results in an"},{"line_number":17,"context_line":"ResourceProviderInUse error which delete_resource_provider ignores"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"3fa7e38b_f6da1a1b","line":14,"range":{"start_line":14,"start_character":0,"end_line":14,"end_character":27},"updated":"2019-11-14 19:45:49.000000000","message":"This is blocked earlier now:\n\nhttps://review.opendev.org/#/c/694389/","commit_id":"1b8aa2e3eca00cbb0df02a59b304164ad0ba29d2"}],"nova/api/openstack/compute/services.py":[{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d477a1f20a80450adc8c47e6fec3f722904210d0","unresolved":false,"context_lines":[{"line_number":312,"context_line":"                        \u0027operations will need to be removed manually \u0027"},{"line_number":313,"context_line":"                        \u0027or by restarting the service to clean up \u0027"},{"line_number":314,"context_line":"                        \u0027residual resources left on the node. 
Pending \u0027"},{"line_number":315,"context_line":"                        \u0027migrations will need to confirmed before the \u0027"},{"line_number":316,"context_line":"                        \u0027service can be deleted.\u0027) % {"},{"line_number":317,"context_line":"                    \u0027uuid\u0027: compute_node.uuid,"},{"line_number":318,"context_line":"                    \u0027hypervisor_hostname\u0027: compute_node.hypervisor_hostname}"}],"source_content_type":"text/x-python","patch_set":1,"id":"7faddb67_32efc51e","line":315,"range":{"start_line":315,"start_character":46,"end_line":315,"end_character":48},"updated":"2019-08-22 20:27:29.000000000","message":"to be","commit_id":"95660afdbb1a7019d73375271b38046985c84de1"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d08a84991ff0b34f354ca550d4ef36175dde4823","unresolved":false,"context_lines":[{"line_number":312,"context_line":"                        \u0027operations will need to be removed manually \u0027"},{"line_number":313,"context_line":"                        \u0027or by restarting the service to clean up \u0027"},{"line_number":314,"context_line":"                        \u0027residual resources left on the node. Pending \u0027"},{"line_number":315,"context_line":"                        \u0027migrations will need to confirmed before the \u0027"},{"line_number":316,"context_line":"                        \u0027service can be deleted.\u0027) % {"},{"line_number":317,"context_line":"                    \u0027uuid\u0027: compute_node.uuid,"},{"line_number":318,"context_line":"                    \u0027hypervisor_hostname\u0027: compute_node.hypervisor_hostname}"}],"source_content_type":"text/x-python","patch_set":1,"id":"3fa7e38b_9a097d85","line":315,"range":{"start_line":315,"start_character":46,"end_line":315,"end_character":48},"in_reply_to":"7faddb67_32efc51e","updated":"2019-10-25 15:42:25.000000000","message":"Done","commit_id":"95660afdbb1a7019d73375271b38046985c84de1"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"70481259e4592893426398584ef451ac4a3ff3aa","unresolved":false,"context_lines":[{"line_number":355,"context_line":"                    \"for compute node %s: %s\","},{"line_number":356,"context_line":"                    compute_node.uuid, six.text_type(e))"},{"line_number":357,"context_line":"            except exception.ResourceProviderInUse:"},{"line_number":358,"context_line":"                # TODO(mriedem): Write up a detailed troubleshooting"},{"line_number":359,"context_line":"                # doc about this situation and how to manually recover"},{"line_number":360,"context_line":"                # from it by identifying allocations from an evacuate"},{"line_number":361,"context_line":"                # operation and how to clean them up."}],"source_content_type":"text/x-python","patch_set":3,"id":"3fa7e38b_96c96652","line":358,"updated":"2019-11-14 19:45:49.000000000","message":"Done: https://docs.openstack.org/nova/latest/admin/troubleshooting/orphaned-allocations.html","commit_id":"1b8aa2e3eca00cbb0df02a59b304164ad0ba29d2"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"70481259e4592893426398584ef451ac4a3ff3aa","unresolved":false,"context_lines":[{"line_number":363,"context_line":"                        \u0027%(uuid)s for compute node 
%(hypervisor_hostname)s \u0027"},{"line_number":364,"context_line":"                        \u0027since there are allocations against the provider. \u0027"},{"line_number":365,"context_line":"                        \u0027These could be for server evacuate actions \u0027"},{"line_number":366,"context_line":"                        \u0027off of this compute service or pending \u0027"},{"line_number":367,"context_line":"                        \u0027migrations. Allocations for evacuate \u0027"},{"line_number":368,"context_line":"                        \u0027operations will need to be removed manually \u0027"},{"line_number":369,"context_line":"                        \u0027or by restarting the service to clean up \u0027"},{"line_number":370,"context_line":"                        \u0027residual resources left on the node. Pending \u0027"}],"source_content_type":"text/x-python","patch_set":3,"id":"3fa7e38b_f603fa74","line":367,"range":{"start_line":366,"start_character":53,"end_line":367,"end_character":35},"updated":"2019-11-14 19:45:49.000000000","message":"We\u0027ll block earlier for this now:\n\nhttps://review.opendev.org/#/c/694389/\n\nSo we can probably just remove this wording. The pending thing to handle is the auto-cascade-delete of allocations for evacuated servers.","commit_id":"1b8aa2e3eca00cbb0df02a59b304164ad0ba29d2"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"70481259e4592893426398584ef451ac4a3ff3aa","unresolved":false,"context_lines":[{"line_number":367,"context_line":"                        \u0027migrations. Allocations for evacuate \u0027"},{"line_number":368,"context_line":"                        \u0027operations will need to be removed manually \u0027"},{"line_number":369,"context_line":"                        \u0027or by restarting the service to clean up \u0027"},{"line_number":370,"context_line":"                        \u0027residual resources left on the node. 
Pending \u0027"},{"line_number":371,"context_line":"                        \u0027migrations will need to be confirmed before the \u0027"},{"line_number":372,"context_line":"                        \u0027service can be deleted.\u0027) % {"},{"line_number":373,"context_line":"                    \u0027uuid\u0027: compute_node.uuid,"},{"line_number":374,"context_line":"                    \u0027hypervisor_hostname\u0027: compute_node.hypervisor_hostname}"},{"line_number":375,"context_line":"                raise webob.exc.HTTPConflict(explanation\u003dmsg)"}],"source_content_type":"text/x-python","patch_set":3,"id":"3fa7e38b_1607b683","line":372,"range":{"start_line":370,"start_character":62,"end_line":372,"end_character":48},"updated":"2019-11-14 19:45:49.000000000","message":"ditto","commit_id":"1b8aa2e3eca00cbb0df02a59b304164ad0ba29d2"}],"nova/compute/manager.py":[{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d477a1f20a80450adc8c47e6fec3f722904210d0","unresolved":false,"context_lines":[{"line_number":8313,"context_line":"                # Delete the corresponding resource provider in placement,"},{"line_number":8314,"context_line":"                # along with any associated allocations and inventory."},{"line_number":8315,"context_line":"                try:"},{"line_number":8316,"context_line":"                    # TODO(mriedem): If this is ironic doing a re-balance"},{"line_number":8317,"context_line":"                    # should we really be doing cascading deletes of the"},{"line_number":8318,"context_line":"                    # allocations on the node? This was added back when the"},{"line_number":8319,"context_line":"                    # ResourceTracker would heal allocations in the"}],"source_content_type":"text/x-python","patch_set":1,"id":"7faddb67_b2e2d52f","line":8316,"updated":"2019-08-22 20:27:29.000000000","message":"Note that this is a separate bug, but something that came up this week so it was fresh in my mind.","commit_id":"95660afdbb1a7019d73375271b38046985c84de1"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d477a1f20a80450adc8c47e6fec3f722904210d0","unresolved":false,"context_lines":[{"line_number":8328,"context_line":"                    # TODO(mriedem): What should we do with this? Just log an"},{"line_number":8329,"context_line":"                    # error and hope that on the next pass it will work?"},{"line_number":8330,"context_line":"                    # Actually on the next pass we wouldn\u0027t even get here"},{"line_number":8331,"context_line":"                    # because we deleted the compute node above? So should we"},{"line_number":8332,"context_line":"                    # do this before deleting the compute node, like in the"},{"line_number":8333,"context_line":"                    # DELETE /os-services/{service_id} API? 
File: nova/compute/manager.py

* Patch Set 1, line 8316 (Matt Riedemann <mriedem>, 2019-08-22)
  Quoted code (orphan compute node cleanup):
      # Delete the corresponding resource provider in placement,
      # along with any associated allocations and inventory.
      try:
          # TODO(mriedem): If this is ironic doing a re-balance
          # should we really be doing cascading deletes of the
          # allocations on the node?
  Comment: Note that this is a separate bug, but something that came up this
      week so it was fresh in my mind.

* Patch Set 1, lines 8331-8333 (Matt Riedemann <mriedem>, 2019-08-22)
  Quoted code:
      # TODO(mriedem): What should we do with this? Just log an
      # error and hope that on the next pass it will work?
      # Actually on the next pass we wouldn't even get here
      # because we deleted the compute node above? So should we
      # do this before deleting the compute node, like in the
      # DELETE /os-services/{service_id} API? And/or make this
      # conditional on the startup value and fail to start if
      # we can't cleanup the provider?
  Comment: I think this is a definite *yes* regardless of what we do here, log
      or fail I mean. Otherwise we'll orphan and not try again.

* Patch Set 3, lines 9225-9227 (melanie witt <melwitt>, 2019-11-22)
  Quoted code: the same TODO, now reading "... So should we do this before
      deleting the compute node, like in the DELETE /os-services/{service_id}
      API (likely yes)? And/or make this conditional on the startup value and
      fail to start if we can't cleanup the provider?"
  Comment: I think this is a definite yes too. And something I need to backport
      to queens *ducks*.

      We've got a lot of customer deployments hitting a situation where
      nova-compute is started and, for whatever reason, keystone is down or
      placement is down; then this orphan removal code fires and destroys the
      compute node but fails to delete the RP. Then nova-compute can *never
      start up properly again*, because during init_host we'll create new
      compute node DB records and then try to create an RP for the same
      hypervisor_hostname that got removed as an orphan earlier, and fail with
      ResourceProviderCreationFailed. The customer cannot get past this point
      once it's in this state.

      The workaround is deleting the allocations and RPs, and this works OK in
      the absence of nova-manage heal_allocations (in queens) because we happen
      to be automatically refreshing allocations for ironic (in queens).

      And even if we backport nova-manage heal_allocations, it's still a
      cumbersome manual cleanup step for customers.

      What's your thought on a small separate change to swap the ordering of
      the compute node destroy and RP delete that is backportable to queens?
      I will try proposing one.
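To make that failure mode concrete, here is a toy, self-contained illustration.
It assumes, as the comment describes, that the leftover provider keeps the
hypervisor_hostname as its name and that creating a second provider with the
same name is rejected; FakePlacement and the node names are purely
illustrative, not placement's real API.

    class ResourceProviderCreationFailed(Exception):
        pass


    class FakePlacement:
        def __init__(self):
            self._provider_uuid_by_name = {}

        def create_provider(self, uuid, name):
            if name in self._provider_uuid_by_name:
                # The duplicate name is rejected; the report client surfaces
                # this as ResourceProviderCreationFailed.
                raise ResourceProviderCreationFailed(name)
            self._provider_uuid_by_name[name] = uuid


    placement = FakePlacement()
    # Orphan cleanup destroyed the compute node record but the provider delete
    # failed, so the old provider is still registered under the node name.
    placement.create_provider('uuid-old', 'ironic-node-1')
    # On restart, init_host creates a new compute node (new uuid in queens) and
    # tries to create a provider with the same name: startup fails every time.
    try:
        placement.create_provider('uuid-new', 'ironic-node-1')
    except ResourceProviderCreationFailed as exc:
        print('nova-compute startup would fail here: %s' % exc)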
* Patch Set 3, lines 9225-9227 (Matt Riedemann <mriedem>, 2019-12-17), in reply
  to the above:

      > We've got a lot of customer deployments hitting a situation where
      > nova-compute is started and [...] nova-compute can *never start up
      > properly again* [...] and fails with ResourceProviderCreationFailed.
      > The customer cannot get past this point once it's in this state.

  That's a problem in queens because the compute node uuid is generated
  uniquely each time a new one is created, but starting in Rocky, for ironic
  nodes the node uuid is used for the compute node uuid, which is used for the
  resource provider uuid, so I think that part of your problem is at least
  resolved in Rocky.

      > What's your thought on a small separate change to swap the ordering of
      > the compute node destroy and RP delete that is backportable to queens?
      > I will try proposing one.

  So if we swap the RP delete to happen before the compute node destroy
  *without* the changes to the delete_resource_provider method to actually
  (re)raise on provider delete failure, you will have the same problem because
  we'd fail to destroy the provider but continue to destroy the compute node.
  Right? So that's not very helpful.

  I think the swap is likely only useful if delete_resource_provider is also
  raising exceptions on failure and then you *do not* destroy the compute node
  if the provider deletion fails, right?

  You asked about breaking this large change apart into smaller, backportable
  pieces for reviewability, and I've been trying to think about how to do that.
  Naturally I'd change delete_resource_provider to raise on error first, but
  then we're leaking those failures in the API and in this code that is calling
  delete_resource_provider. So one thought is to change the call sites (the API
  and here) to handle the exceptions even though in those patches
  delete_resource_provider would not yet be changed - they'd be prep patches
  for when delete_resource_provider is changed to raise exceptions on failure.
  Then maybe you do the order-of-operations swap in here where the provider
  delete happens before the compute node destroy, and if the former fails the
  latter doesn't happen.
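A minimal sketch of the ordering being proposed in this thread (illustrative
only, not the actual ComputeManager code): delete the resource provider first
and only destroy the compute node record if that succeeded, so a failed
provider delete can be retried on a later pass instead of leaving an orphaned
provider. delete_resource_provider is assumed to raise on failure, per the
change under review; report_client and compute_node are placeholders.

    import logging

    LOG = logging.getLogger(__name__)


    def cleanup_orphan_compute_node(context, report_client, compute_node):
        try:
            # Provider (and, with cascade=True, its allocations) goes first.
            report_client.delete_resource_provider(context, compute_node,
                                                   cascade=True)
        except Exception:
            # Keystone/placement down, or allocations remain: keep the compute
            # node record so the next pass retries the provider delete.
            LOG.exception('Failed to delete resource provider for compute '
                          'node %s; not deleting the compute node so cleanup '
                          'can be retried.', compute_node.uuid)
            return
        compute_node.destroy()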
File: nova/scheduler/client/report.py

* Patch Set 1, line 2162 (Matt Riedemann <mriedem>, 2019-10-25)
  Quoted code (TODO about @safe_connect hiding connection failures):
      # we failed to connect to placement we won't have deleted the provider.
      # We should either return a True/False from _delete_provider to
      # distinguish it from the None that @safe_connect would return on error
      # or remove the @safe_connect decorator. The former is backportable,
      # the latter is likely not.
  Comment (on "or remove the @safe_connect decorator"): This already happened:
      https://review.opendev.org/#/c/671866/

* Patch Set 2, lines 2166-2167 (Matt Riedemann <mriedem>, 2019-10-25)
  Quoted code:
      self._delete_provider(rp_uuid, global_request_id=context.global_id)
      # TODO(mriedem): If _delete_provider raises ResourceProviderInUse and
      # cascade=True, we should look for allocations for the evacuated
      # instance and clean those up and retry the delete.
  Comment: We'd do this by querying the migrations table for
      migration_type='evacuate' where the source_compute/source_node is this
      host/node and then delete allocations for the instances in those
      migrations.
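A sketch of the retry-after-cleanup idea from that comment. The migration
lookup is passed in as a callable because the real change would query nova's
migrations table (evacuation-type migrations whose source compute/node is this
host); the callable name and the local ResourceProviderInUse class are
placeholders for illustration.

    class ResourceProviderInUse(Exception):
        pass


    def delete_provider_cleaning_up_evacuations(report_client, context,
                                                compute_node,
                                                get_evacuation_migrations):
        try:
            report_client.delete_resource_provider(context, compute_node,
                                                   cascade=True)
        except ResourceProviderInUse:
            # Allocations held by servers evacuated off this host are what
            # keep the provider "in use"; remove them and retry once.
            for migration in get_evacuation_migrations(context, compute_node):
                report_client.delete_allocation_for_instance(
                    context, migration.instance_uuid)
            report_client.delete_resource_provider(context, compute_node,
                                                   cascade=True)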
File: nova/tests/functional/wsgi/test_services.py

* Patch Set 1, line 180 (Matt Riedemann <mriedem>, 2019-08-22)
  Quoted test code:
      self.assertNotIn('Error updating resources for node host1.', log_out)
      self.assertNotIn('Failed to create resource provider host1', log_out)
      # Now that the source host1 was restarted, it should have cleaned up
      # the allocations for the evacuated server which means we should be
      # able to delete the service now.
      self.admin_api.api_delete('/os-services/%s' % service['id'])
  Comment: We could also assert we see this in the logs:
      https://github.com/openstack/nova/blob/a55ae413eaead358ff43225a4caf53650f99dd09/nova/compute/manager.py#L707
  Reply (Matt Riedemann <mriedem>, 2019-10-25): Done
