)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":10135,"name":"Lee Yarwood","display_name":"Lee Yarwood","email":"lyarwood@redhat.com","username":"lyarwood"},"change_message_id":"b8ca9b5f7580a26212a18fdaac26ddbf9c073269","unresolved":false,"context_lines":[{"line_number":10,"context_line":"host via reserve_block_device_name could result in stale BDM entries"},{"line_number":11,"context_line":"being left in the database."},{"line_number":12,"context_line":""},{"line_number":13,"context_line":"This change adds some simple cleanup logic to reserve_block_device_name"},{"line_number":14,"context_line":"to lookup and remove any such BDMs when an exception is encountered"},{"line_number":15,"context_line":"before reraising."},{"line_number":16,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":1,"id":"3fa7e38b_76ebeeaf","line":13,"range":{"start_line":13,"start_character":46,"end_line":13,"end_character":71},"updated":"2019-09-17 09:48:55.000000000","message":"_create_volume_bdm","commit_id":"52457837075d0e985fdc4b0468d0b44be0c6fb00"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"65c9475f73274449edc5300eb72fd133d636e616","unresolved":false,"context_lines":[{"line_number":14,"context_line":"to lookup and remove any such BDMs when an exception is encountered"},{"line_number":15,"context_line":"before reraising."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"Closes-Bug: #1844296"},{"line_number":18,"context_line":"Change-Id: I3c45b3e9c8a5a9ce1e0e47abb8f95089f8c212c3"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"3fa7e38b_8de1349e","line":17,"updated":"2019-10-10 14:03:35.000000000","message":"This looks like a duplicate of bug 1425352 though I was thinking we had another one that ndipanov had opened at one point for the same issue.\n\nI also remember talking about this at the PTG in Dublin and there were some TODOs for you (Lee) to add some online data migration or something to cleanup duplicate records and then we\u0027d add a unique constraint for something to help prevent this?\n\nhttps://etherpad.openstack.org/p/nova-ptg-rocky\n\n~L578?\n\nBug 1427060 is maybe the one I was thinking of.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":14070,"name":"Eric Fried","email":"openstack@fried.cc","username":"efried"},"change_message_id":"ccf554efa6df16693caba12b45133573ae09dc20","unresolved":false,"context_lines":[{"line_number":14,"context_line":"to lookup and remove any such BDMs when an exception is encountered"},{"line_number":15,"context_line":"before reraising."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"Related-Bug: #1844296"},{"line_number":18,"context_line":"Change-Id: I3c45b3e9c8a5a9ce1e0e47abb8f95089f8c212c3"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"3fa7e38b_f3f9f74d","line":17,"range":{"start_line":17,"start_character":0,"end_line":17,"end_character":7},"updated":"2019-12-02 14:49:16.000000000","message":"since this is going to be the backported fix, and it does actually fix the problem (albeit perhaps not optimally) I think this should be Closes-Bug.","commit_id":"7a49b3ffda955fef506e7ca590bee075706fd4ce"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"8fde4995e5cd78d293667d8169adc885f6556607","unresolved":false,"context_lines":[{"line_number":14,"context_line":"to lookup and remove any such BDMs when an exception is encountered"},{"line_number":15,"context_line":"before reraising."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"Related-Bug: #1844296"},{"line_number":18,"context_line":"Change-Id: I3c45b3e9c8a5a9ce1e0e47abb8f95089f8c212c3"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"3fa7e38b_56579928","line":17,"range":{"start_line":17,"start_character":0,"end_line":17,"end_character":7},"in_reply_to":"3fa7e38b_f3f9f74d","updated":"2019-12-02 15:25:58.000000000","message":"This is definitely a duplicate of bug 1425352 so let\u0027s duplicate lee\u0027s bug and mark this as partial-bug since if we timeout before the compute method gets the lock and creates the bdm we still orphan the bdm so this isn\u0027t a fail safe fix to close the bug.","commit_id":"7a49b3ffda955fef506e7ca590bee075706fd4ce"}],"nova/compute/api.py":[{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"1a111b30516e7fbf775bde8f1b7c102535d1ab30","unresolved":false,"context_lines":[{"line_number":4064,"context_line":"                        bdm.destroy()"},{"line_number":4065,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4066,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"},{"line_number":4067,"context_line":""},{"line_number":4068,"context_line":"            volume_bdm.delete_on_termination \u003d delete_on_termination"},{"line_number":4069,"context_line":"            volume_bdm.save()"},{"line_number":4070,"context_line":"        return volume_bdm"}],"source_content_type":"text/x-python","patch_set":1,"id":"3fa7e38b_dd93da4e","line":4067,"updated":"2019-09-17 16:17:41.000000000","message":"This all looks good, just not sure about the log levels. I had thought we usually use \"warning\" to mean the operator should take action and there is no action the operator needs to take here. I would tend to think all should be at debug level, but maybe others would have a different opinion.","commit_id":"52457837075d0e985fdc4b0468d0b44be0c6fb00"},{"author":{"_account_id":10135,"name":"Lee Yarwood","display_name":"Lee Yarwood","email":"lyarwood@redhat.com","username":"lyarwood"},"change_message_id":"331b623d000b064b6ed9d7513a1d5f1c1205ff2b","unresolved":false,"context_lines":[{"line_number":4064,"context_line":"                        bdm.destroy()"},{"line_number":4065,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4066,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"},{"line_number":4067,"context_line":""},{"line_number":4068,"context_line":"            volume_bdm.delete_on_termination \u003d delete_on_termination"},{"line_number":4069,"context_line":"            volume_bdm.save()"},{"line_number":4070,"context_line":"        return volume_bdm"}],"source_content_type":"text/x-python","patch_set":1,"id":"3fa7e38b_a20a6e23","line":4067,"in_reply_to":"3fa7e38b_dd93da4e","updated":"2019-09-18 10:23:38.000000000","message":"Yeah that\u0027s a fair point, would INFO also be too high?\n\nI\u0027m happy to drop these down to DEBUG either way tbh but I\u0027ll wait to see what others think.","commit_id":"52457837075d0e985fdc4b0468d0b44be0c6fb00"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"1a96f1ad5532bbcd99416f6ff471aed0e8eb2031","unresolved":false,"context_lines":[{"line_number":4043,"context_line":"            try:"},{"line_number":4044,"context_line":"                # NOTE(vish): This is done on the compute host because we want"},{"line_number":4045,"context_line":"                # to avoid a race where two devices are requested at the same"},{"line_number":4046,"context_line":"                # time. When db access is removed from compute, the bdm will be"},{"line_number":4047,"context_line":"                # created here and we will have to make sure that they are"},{"line_number":4048,"context_line":"                # assigned atomically."},{"line_number":4049,"context_line":"                volume_bdm \u003d self.compute_rpcapi.reserve_block_device_name("},{"line_number":4050,"context_line":"                    context, instance, device, volume_id, disk_bus\u003ddisk_bus,"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_81ed7bf1","line":4047,"range":{"start_line":4046,"start_character":24,"end_line":4047,"end_character":30},"updated":"2019-10-10 16:03:04.000000000","message":"This happened back in like grizzly or havana. I\u0027ve always thought it\u0027s kind of dumb that the bdm is created in compute during the blocking call. Why not just add an optional bdm parameter to the reserve_block_device_name rpc method, change the behavior on the server side if it gets the parameter, meaning the compute is (1) new enough and (2) compute rpc api is not pinned for a rolling upgrade, and then the API can determine if it can create the bdm in the API and send it down to the compute and if there is an error we destroy it up here. Note that would not be a backportable change since it would involve a new RPC API version.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"d71c445ce860001f69c2d00aaf93a03bc48b5869","unresolved":false,"context_lines":[{"line_number":4043,"context_line":"            try:"},{"line_number":4044,"context_line":"                # NOTE(vish): This is done on the compute host because we want"},{"line_number":4045,"context_line":"                # to avoid a race where two devices are requested at the same"},{"line_number":4046,"context_line":"                # time. When db access is removed from compute, the bdm will be"},{"line_number":4047,"context_line":"                # created here and we will have to make sure that they are"},{"line_number":4048,"context_line":"                # assigned atomically."},{"line_number":4049,"context_line":"                volume_bdm \u003d self.compute_rpcapi.reserve_block_device_name("},{"line_number":4050,"context_line":"                    context, instance, device, volume_id, disk_bus\u003ddisk_bus,"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_3c425069","line":4047,"range":{"start_line":4046,"start_character":24,"end_line":4047,"end_character":30},"in_reply_to":"3fa7e38b_3c638c44","updated":"2019-11-05 02:30:33.000000000","message":"Here is a WIP for this https://review.opendev.org/#/c/692940/.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":9555,"name":"Matthew Booth","email":"mbooth@redhat.com","username":"MatthewBooth"},"change_message_id":"d113b9fef35d90ee3549b59b552a126577c59dc4","unresolved":false,"context_lines":[{"line_number":4043,"context_line":"            try:"},{"line_number":4044,"context_line":"                # NOTE(vish): This is done on the compute host because we want"},{"line_number":4045,"context_line":"                # to avoid a race where two devices are requested at the same"},{"line_number":4046,"context_line":"                # time. When db access is removed from compute, the bdm will be"},{"line_number":4047,"context_line":"                # created here and we will have to make sure that they are"},{"line_number":4048,"context_line":"                # assigned atomically."},{"line_number":4049,"context_line":"                volume_bdm \u003d self.compute_rpcapi.reserve_block_device_name("},{"line_number":4050,"context_line":"                    context, instance, device, volume_id, disk_bus\u003ddisk_bus,"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_3c638c44","line":4047,"range":{"start_line":4046,"start_character":24,"end_line":4047,"end_character":30},"in_reply_to":"3fa7e38b_81ed7bf1","updated":"2019-10-10 17:03:10.000000000","message":"Agree that this is best resolved by a different approach, e.g. this one.\n\nAgree that a backportable approach is also required.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":9555,"name":"Matthew Booth","email":"mbooth@redhat.com","username":"MatthewBooth"},"change_message_id":"010f46741f84613883169da1a5fe7d904c17bcfe","unresolved":false,"context_lines":[{"line_number":4053,"context_line":"            # NOTE(lyarwood): Try to lookup and destroy any BDMs that were"},{"line_number":4054,"context_line":"            # created on the remote compute when exceptions are encountered."},{"line_number":4055,"context_line":"            except Exception:"},{"line_number":4056,"context_line":"                with excutils.save_and_reraise_exception():"},{"line_number":4057,"context_line":"                    try:"},{"line_number":4058,"context_line":"                        LOG.debug(\"reserve_block_device_name has failed, \""},{"line_number":4059,"context_line":"                                    \"attempting to lookup and remove any \""},{"line_number":4060,"context_line":"                                    \"BDMs created by the compute.\")"},{"line_number":4061,"context_line":"                        bdm_obj \u003d block_device_obj.BlockDeviceMapping"},{"line_number":4062,"context_line":"                        bdm \u003d bdm_obj.get_by_volume_and_instance(context,"},{"line_number":4063,"context_line":"                            volume_id, instance.uuid)"},{"line_number":4064,"context_line":"                        LOG.debug(\"BDM %s found and will now be destroyed.\","},{"line_number":4065,"context_line":"                                  bdm.uuid)"},{"line_number":4066,"context_line":"                        bdm.destroy()"},{"line_number":4067,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4068,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"},{"line_number":4069,"context_line":""},{"line_number":4070,"context_line":"            volume_bdm.delete_on_termination \u003d delete_on_termination"},{"line_number":4071,"context_line":"            volume_bdm.save()"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_d0e7bf11","line":4068,"range":{"start_line":4056,"start_character":0,"end_line":4068,"end_character":79},"updated":"2019-10-10 14:22:06.000000000","message":"This is a race, and one which I suspect we will normally hit in practise, meaning this won\u0027t fix the issue. Lets assume that the reason this occurred is because of bug 1840912. We don\u0027t *have* to assume this, btw. This race could occur any other way, but I wouldn\u0027t be surprised to discover that in practise this was the underlying issue, and it is certainly an actual bug which would cause this issue and trigger this race.\n\nIf that happens, the sequence of events is:\n\nCOMPUTE:\n\nblocking_libvirt_call()\n\nAPI:\n\nreserve_block_device_name()\n\nRABBIT:\n\nqueue reserve_block_device_name() for compute, which doesn\u0027t pick it up because it\u0027s frozen\n\nAPI:\n\ntimeout\nLook for bdm to clean up\nDon\u0027t find any, do nothing\n\nCOMPUTE:\n\nblocking_libvirt_call() completes\nReceive reserve_block_device_name from message queue\nCreate BDM\n\n... and the BDM has still been left lying around ...\n\nThe same race would also occur if something on the compute host was holding the instance lock for a long time. In that case rabbit would deliver the message immediately, but it would still timeout before the compute host created the BDM, which it eventually would.\n\nThe same race could also occur if rabbit was being slow delivering messages, or if the compute was slow to respond for any other reason, e.g. slow libvirt calls, storage issues, excessive load, etc.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":9555,"name":"Matthew Booth","email":"mbooth@redhat.com","username":"MatthewBooth"},"change_message_id":"d113b9fef35d90ee3549b59b552a126577c59dc4","unresolved":false,"context_lines":[{"line_number":4053,"context_line":"            # NOTE(lyarwood): Try to lookup and destroy any BDMs that were"},{"line_number":4054,"context_line":"            # created on the remote compute when exceptions are encountered."},{"line_number":4055,"context_line":"            except Exception:"},{"line_number":4056,"context_line":"                with excutils.save_and_reraise_exception():"},{"line_number":4057,"context_line":"                    try:"},{"line_number":4058,"context_line":"                        LOG.debug(\"reserve_block_device_name has failed, \""},{"line_number":4059,"context_line":"                                    \"attempting to lookup and remove any \""},{"line_number":4060,"context_line":"                                    \"BDMs created by the compute.\")"},{"line_number":4061,"context_line":"                        bdm_obj \u003d block_device_obj.BlockDeviceMapping"},{"line_number":4062,"context_line":"                        bdm \u003d bdm_obj.get_by_volume_and_instance(context,"},{"line_number":4063,"context_line":"                            volume_id, instance.uuid)"},{"line_number":4064,"context_line":"                        LOG.debug(\"BDM %s found and will now be destroyed.\","},{"line_number":4065,"context_line":"                                  bdm.uuid)"},{"line_number":4066,"context_line":"                        bdm.destroy()"},{"line_number":4067,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4068,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"},{"line_number":4069,"context_line":""},{"line_number":4070,"context_line":"            volume_bdm.delete_on_termination \u003d delete_on_termination"},{"line_number":4071,"context_line":"            volume_bdm.save()"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_dccd9885","line":4068,"range":{"start_line":4056,"start_character":0,"end_line":4068,"end_character":79},"in_reply_to":"3fa7e38b_21ba876d","updated":"2019-10-10 17:03:10.000000000","message":"Well I\u0027m speculating as to the actual cause, but that very specific sequence is pretty much deterministic if the compute is frozen. My point is that in the presence of the libvirt proxy calls bug, this is exactly what we would expect to see. Given that we\u0027ve seen several of those downstream now, and it was a timeout, it\u0027s not at all unlikely. Certainly that bug would result in this one.\n\nAs noted there are also several other potential practical triggers of this race. In fact, I suggest that most things that would trigger this bug would also trigger this race.\n\nMy -1 is because I think it\u0027s confusing to add this code which I suspect will rarely fix the bug in question if the compute call timed out, which it did in this case.","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":6873,"name":"Matt Riedemann","email":"mriedem.os@gmail.com","username":"mriedem"},"change_message_id":"13d516273657dca2fbac0fa38680c59cf7be56da","unresolved":false,"context_lines":[{"line_number":4053,"context_line":"            # NOTE(lyarwood): Try to lookup and destroy any BDMs that were"},{"line_number":4054,"context_line":"            # created on the remote compute when exceptions are encountered."},{"line_number":4055,"context_line":"            except Exception:"},{"line_number":4056,"context_line":"                with excutils.save_and_reraise_exception():"},{"line_number":4057,"context_line":"                    try:"},{"line_number":4058,"context_line":"                        LOG.debug(\"reserve_block_device_name has failed, \""},{"line_number":4059,"context_line":"                                    \"attempting to lookup and remove any \""},{"line_number":4060,"context_line":"                                    \"BDMs created by the compute.\")"},{"line_number":4061,"context_line":"                        bdm_obj \u003d block_device_obj.BlockDeviceMapping"},{"line_number":4062,"context_line":"                        bdm \u003d bdm_obj.get_by_volume_and_instance(context,"},{"line_number":4063,"context_line":"                            volume_id, instance.uuid)"},{"line_number":4064,"context_line":"                        LOG.debug(\"BDM %s found and will now be destroyed.\","},{"line_number":4065,"context_line":"                                  bdm.uuid)"},{"line_number":4066,"context_line":"                        bdm.destroy()"},{"line_number":4067,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4068,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"},{"line_number":4069,"context_line":""},{"line_number":4070,"context_line":"            volume_bdm.delete_on_termination \u003d delete_on_termination"},{"line_number":4071,"context_line":"            volume_bdm.save()"}],"source_content_type":"text/x-python","patch_set":4,"id":"3fa7e38b_21ba876d","line":4068,"range":{"start_line":4056,"start_character":0,"end_line":4068,"end_character":79},"in_reply_to":"3fa7e38b_d0e7bf11","updated":"2019-10-10 16:11:03.000000000","message":"Even if we hit that very specific sequence we\u0027re no worse off than we are now, right? Meaning the API tries to delete a bdm which doesn\u0027t exist and the compute creates one which gets orphaned - but that\u0027s already the end state we\u0027re in today right? IOW, Lee\u0027s \"fix\" here is a best effort attempt at cleaning up not knowing what might have blown up in compute, e.g. maybe we created the bdm and then something else failed.\n\nI would buy the instance lock causing the messaging timeout - we could workaround that by making reserve_block_device_name using the long_rpc_timeout config, right?","commit_id":"5c2b55bc709bb19fa5909c215a90ca2fb6c6b332"},{"author":{"_account_id":14070,"name":"Eric Fried","email":"openstack@fried.cc","username":"efried"},"change_message_id":"ccf554efa6df16693caba12b45133573ae09dc20","unresolved":false,"context_lines":[{"line_number":4172,"context_line":"                with excutils.save_and_reraise_exception():"},{"line_number":4173,"context_line":"                    try:"},{"line_number":4174,"context_line":"                        LOG.debug(\"reserve_block_device_name has failed, \""},{"line_number":4175,"context_line":"                                    \"attempting to lookup and remove any \""},{"line_number":4176,"context_line":"                                    \"BDMs created by the compute.\")"},{"line_number":4177,"context_line":"                        bdm_obj \u003d block_device_obj.BlockDeviceMapping"},{"line_number":4178,"context_line":"                        bdm \u003d bdm_obj.get_by_volume_and_instance(context,"},{"line_number":4179,"context_line":"                            volume_id, instance.uuid)"}],"source_content_type":"text/x-python","patch_set":6,"id":"3fa7e38b_d394fbe9","line":4176,"range":{"start_line":4175,"start_character":0,"end_line":4176,"end_character":67},"updated":"2019-12-02 14:49:16.000000000","message":"nit: weird indent","commit_id":"7a49b3ffda955fef506e7ca590bee075706fd4ce"},{"author":{"_account_id":14070,"name":"Eric Fried","email":"openstack@fried.cc","username":"efried"},"change_message_id":"ccf554efa6df16693caba12b45133573ae09dc20","unresolved":false,"context_lines":[{"line_number":4178,"context_line":"                        bdm \u003d bdm_obj.get_by_volume_and_instance(context,"},{"line_number":4179,"context_line":"                            volume_id, instance.uuid)"},{"line_number":4180,"context_line":"                        LOG.debug(\"BDM %s found and will now be destroyed.\","},{"line_number":4181,"context_line":"                                  bdm.uuid)"},{"line_number":4182,"context_line":"                        bdm.destroy()"},{"line_number":4183,"context_line":"                    except exception.VolumeBDMNotFound:"},{"line_number":4184,"context_line":"                        LOG.debug(\"No BDM found while attempting to rollback.\")"}],"source_content_type":"text/x-python","patch_set":6,"id":"3fa7e38b_f38ed7cf","line":4181,"updated":"2019-12-02 14:49:16.000000000","message":"Would be nice if these debug messages had instance\u003dinstance. Is it possible that some of the error conditions that got us here could blow that up?","commit_id":"7a49b3ffda955fef506e7ca590bee075706fd4ce"}],"nova/tests/unit/compute/test_compute_api.py":[{"author":{"_account_id":14070,"name":"Eric Fried","email":"openstack@fried.cc","username":"efried"},"change_message_id":"ccf554efa6df16693caba12b45133573ae09dc20","unresolved":false,"context_lines":[{"line_number":426,"context_line":"        self.assertRaises(test.TestingException,"},{"line_number":427,"context_line":"            self.compute_api._create_volume_bdm, self.context, instance,"},{"line_number":428,"context_line":"            mock.sentinel.device, volume, mock.sentinel.disk_bus,"},{"line_number":429,"context_line":"            mock.sentinel.device_type)"},{"line_number":430,"context_line":""},{"line_number":431,"context_line":"    @mock.patch.object(objects.BlockDeviceMapping, \u0027create\u0027)"},{"line_number":432,"context_line":"    def test_create_volume_bdm_local_creation(self, bdm_create):"}],"source_content_type":"text/x-python","patch_set":6,"id":"3fa7e38b_f3a7b74f","line":429,"updated":"2019-12-02 14:49:16.000000000","message":"Be nice to assert that destroy() wasn\u0027t called. (You would have to mock the module.Class.method rather than the object to make that work.)","commit_id":"7a49b3ffda955fef506e7ca590bee075706fd4ce"}]}
