)]}'
{"/PATCHSET_LEVEL":[{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"fd069ac383591cf851f8185edc226adac4033c37","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":4,"id":"6a943769_e155bcfa","updated":"2026-02-05 07:54:10.000000000","message":"Thank you for looking at my patch. I made some changes to address your comments, please have a look.","commit_id":"11f5a2d08f91b044084d1f3ceafd9beada9e68d9"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"5e9f078a9676cbc1d45555bc42d36954bb05368e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"5d8ab8a2_a1f5ade2","updated":"2026-03-13 13:08:13.000000000","message":"I think it doesn\u0027t make sense to continue working on this patch as the fail back of the highest priority chassis, is only a side effect of some other issue. Please check my LP bug comment for details [1]. Thank you All for sharing your comments so far.\n\n[1] https://bugs.launchpad.net/neutron/+bug/2136733/comments/6","commit_id":"f3269deb622fa46a34f21c5a42bfdc523d663989"},{"author":{"_account_id":1131,"name":"Brian Haley","email":"haleyb.dev@gmail.com","username":"brian-haley"},"change_message_id":"4b57a06254eb77e4ee1c5aceb81ea0953c5a9bba","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"ef796b2d_3d5d41bf","updated":"2026-02-25 17:51:05.000000000","message":"recheck neutron-functional should be fixed","commit_id":"f3269deb622fa46a34f21c5a42bfdc523d663989"},{"author":{"_account_id":1131,"name":"Brian Haley","email":"haleyb.dev@gmail.com","username":"brian-haley"},"change_message_id":"3a6478df937426fe96f059819c3032e9e53f0ed1","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"35e94835_5c2e806f","updated":"2026-03-02 21:00:43.000000000","message":"recheck testtools.try_import fixed in n-t-p","commit_id":"f3269deb622fa46a34f21c5a42bfdc523d663989"}],"neutron/plugins/ml2/drivers/ovn/agent/neutron_agent.py":[{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"fd1ec52a707021673671c04d2f38508b5a5aee8a","unresolved":true,"context_lines":[{"line_number":100,"context_line":"        if self.set_down:"},{"line_number":101,"context_line":"            return False"},{"line_number":102,"context_line":"        _nb_cfg \u003d self.nb_cfg"},{"line_number":103,"context_line":"        _updated_at \u003d self.updated_at"},{"line_number":104,"context_line":"        if not skip_cache:"},{"line_number":105,"context_line":"            _nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":106,"context_line":"                    \u0027Chassis_Private\u0027,"}],"source_content_type":"text/x-python","patch_set":2,"id":"a1539cb9_da569a01","line":103,"updated":"2026-01-05 15:25:01.000000000","message":"I think the idea is good but I wouldn\u0027t check the DB always.\n\nIf the local cache declares the agent as dead, I would then check the SB database and update it with a fresh value. That will skip most of the SB lookups when the data is fresh.","commit_id":"f7088fd05362bb22eee36ba98a036b1c94963d5d"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"3a7d7ce16d8e3f85d888856ad95e9424205e21a1","unresolved":true,"context_lines":[{"line_number":100,"context_line":"        if self.set_down:"},{"line_number":101,"context_line":"            return False"},{"line_number":102,"context_line":"        _nb_cfg \u003d self.nb_cfg"},{"line_number":103,"context_line":"        _updated_at \u003d self.updated_at"},{"line_number":104,"context_line":"        if not skip_cache:"},{"line_number":105,"context_line":"            _nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":106,"context_line":"                    \u0027Chassis_Private\u0027,"}],"source_content_type":"text/x-python","patch_set":2,"id":"e3190382_3b9705db","line":103,"in_reply_to":"a1539cb9_da569a01","updated":"2026-01-06 14:49:46.000000000","message":"@ralonsoh@redhat.com thank you very much for looking into my patch and for your comment. Updating the cached agent with the fresh data from the SB database makes sense to me. I am not quite sure how to address your suggestion about limiting the number of the SB db calls. In order to say that the cached agent data is out of synch with the db, we have to compare the agent\u0027s properties (alive, updated_at, etc) with their counterparts in the SB db. So we have to make these calls to read the data. Even if the data is in sync at some point, but the agent hasn\u0027t come up yet, we still need to check its state in the db during the subsequent binding attempts (just in case the agent got out of sync in between the attempts). I added the \u0027skip_cache\u003dTrue\u0027 param to the is_alive method to ensure that we pull the data from the SB database only during the port binding in the OVNMechanismDriver, as this is currently the only situation which we found where the agent is out of sync with the db and this is causing some issues. \nI think, that polling the db and updating the agent\u0027s state periodically is not a solution either.\nBecause I might be missing some context here, I\u0027d appreciate if you could provide me with some more details on how you have thought we could further limit the SB database calls. Thank you in advance, Marcin","commit_id":"f7088fd05362bb22eee36ba98a036b1c94963d5d"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"fd069ac383591cf851f8185edc226adac4033c37","unresolved":false,"context_lines":[{"line_number":100,"context_line":"        if self.set_down:"},{"line_number":101,"context_line":"            return False"},{"line_number":102,"context_line":"        _nb_cfg \u003d self.nb_cfg"},{"line_number":103,"context_line":"        _updated_at \u003d self.updated_at"},{"line_number":104,"context_line":"        if not skip_cache:"},{"line_number":105,"context_line":"            _nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":106,"context_line":"                    \u0027Chassis_Private\u0027,"}],"source_content_type":"text/x-python","patch_set":2,"id":"1c50627d_b6cb161b","line":103,"in_reply_to":"cef6d0d5_96dc125a","updated":"2026-02-05 07:54:10.000000000","message":"Thank you for looking into this. I addressed your latest comment. Please have look.","commit_id":"f7088fd05362bb22eee36ba98a036b1c94963d5d"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"5b0ba1447ce71b98a0119e9524480b893fef8ec1","unresolved":true,"context_lines":[{"line_number":100,"context_line":"        if self.set_down:"},{"line_number":101,"context_line":"            return False"},{"line_number":102,"context_line":"        _nb_cfg \u003d self.nb_cfg"},{"line_number":103,"context_line":"        _updated_at \u003d self.updated_at"},{"line_number":104,"context_line":"        if not skip_cache:"},{"line_number":105,"context_line":"            _nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":106,"context_line":"                    \u0027Chassis_Private\u0027,"}],"source_content_type":"text/x-python","patch_set":2,"id":"cef6d0d5_96dc125a","line":103,"in_reply_to":"e3190382_3b9705db","updated":"2026-01-26 08:09:03.000000000","message":"So my point is not to make a difference between a port binding call or another call. This method should be the same. It should rely on the cache:\n* If the cache determines the agent is alive, return it.\n* Only if the agent cache says that the agent is dead, we can check directly on the DB if that value is correct","commit_id":"f7088fd05362bb22eee36ba98a036b1c94963d5d"},{"author":{"_account_id":5756,"name":"Terry Wilson","email":"twilson@redhat.com","username":"otherwiseguy"},"change_message_id":"5df22ca0579ab3d30707cf300a9afa476e2c7e47","unresolved":true,"context_lines":[{"line_number":109,"context_line":"        # The agent seems to be down based on the cache state."},{"line_number":110,"context_line":"        # Checking SB DB as a last resort, just in case"},{"line_number":111,"context_line":"        # the cache is out of sync."},{"line_number":112,"context_line":"        nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":113,"context_line":"                \u0027Chassis_Private\u0027,"},{"line_number":114,"context_line":"                self.chassis_private.name,"},{"line_number":115,"context_line":"                \u0027nb_cfg\u0027).execute(check_error\u003dTrue)"}],"source_content_type":"text/x-python","patch_set":4,"id":"d0c49a86_25ea1771","line":112,"updated":"2026-02-09 17:39:09.000000000","message":"The values in the SBDB are locally stored in memory just like AgentCache is. AgentCache should be updated on the Chassis_Private.nb_cfg is updated by ChassisAgentWriteEvent. If it isn\u0027t, then there is likely something wrong with the logical processing the events. Because if we can access nb_cfg/timestamp here, then we *have* gotten the update event from the DB locally.\n\nAlso, the reason we use AgentCache instead of accessing values directly is because ovn-controller, if shut down gracefully, will delete Chassis/Chassis_Private rows. In this case, we have to show the agent as *down*, not just delete it (agent delete doesn\u0027t happen until the `openstack network agent delete` command is called). So we cannot guarantee Chassis_Private even exists here.","commit_id":"11f5a2d08f91b044084d1f3ceafd9beada9e68d9"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"a812e2bfbbc44ba8232fc1ad73731dd4fc1e25db","unresolved":true,"context_lines":[{"line_number":109,"context_line":"        # The agent seems to be down based on the cache state."},{"line_number":110,"context_line":"        # Checking SB DB as a last resort, just in case"},{"line_number":111,"context_line":"        # the cache is out of sync."},{"line_number":112,"context_line":"        nb_cfg \u003d self.driver.sb_ovn.db_get("},{"line_number":113,"context_line":"                \u0027Chassis_Private\u0027,"},{"line_number":114,"context_line":"                self.chassis_private.name,"},{"line_number":115,"context_line":"                \u0027nb_cfg\u0027).execute(check_error\u003dTrue)"}],"source_content_type":"text/x-python","patch_set":4,"id":"b6d31c9a_5a856c78","line":112,"in_reply_to":"d0c49a86_25ea1771","updated":"2026-02-11 17:05:58.000000000","message":"Hey Terry,\nThank you for your comments, I was not aware of the fact that the \u0027ovs\u0027 library caches the state of the OVN DBs.\n\nI did a bit more testing and if I swap the order of the conditions in the NeutronAgent.alive() to test the difference between the update_at vs agent_down_time first, folowed by the nb_cfg difference test (see patch 5) it also solves this problem (sort of because during the failback I see the port bounced between the highest priority and the second highest priority chassis [1]). The bouncing was triggered by the PortBindingChassisEvent. Just sharing the observation.\n\n[1] https://pastebin.ubuntu.com/p/BS6ww9VKBc/","commit_id":"11f5a2d08f91b044084d1f3ceafd9beada9e68d9"},{"author":{"_account_id":5756,"name":"Terry Wilson","email":"twilson@redhat.com","username":"otherwiseguy"},"change_message_id":"6a5f0349bfd81eeca45c08f9626477d520d2d2a3","unresolved":true,"context_lines":[{"line_number":96,"context_line":"    def alive(self):"},{"line_number":97,"context_line":"        if self.set_down:"},{"line_number":98,"context_line":"            return False"},{"line_number":99,"context_line":"        now \u003d timeutils.utcnow(with_timezone\u003dTrue)"},{"line_number":100,"context_line":"        time_diff \u003d (now - self.updated_at).total_seconds()"},{"line_number":101,"context_line":""},{"line_number":102,"context_line":"        # Check timestamp FIRST - handles restart scenarios"}],"source_content_type":"text/x-python","patch_set":5,"id":"10c43448_9025e22c","line":99,"updated":"2026-02-12 14:55:07.000000000","message":"Unless I\u0027m missing something, it seems like changing the order of two different conditions that both just return True if they match shouldn\u0027t lead to a different return value unless `self.driver.nb_ovn.nb_global.nb_cfg - self.nb_cfg \u003c\u003d 1` was throwing an exception or blocking execution long enough to surpass agent_down_time? I\u0027d expect an exception to show up in the logs.\n\n`self.driver.nb_ovn.nb_global` uses `list_rows()` which is a ReadOnlyCommand, which does grab a lock and run a \"read-only\" transaction. If this is really changing behavior, you could try doing the original order and\n\n`self.driver.nb_ovn._tables[\u0027NB_Global\u0027].rows.values()[0].nb_cfg - self.nb_cfg \u003c\u003d 1` to access it directly (not generally something we recommend, but it could be instructive). It seems weird, but I\u0027m having trouble coming up with another way that swapping the order would make a difference.","commit_id":"6e571d6a6365b19f9d5709004e19514db5896e6b"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"fd1b8cfddb5abe697c77fa2e7ddf37af6342b6bf","unresolved":true,"context_lines":[{"line_number":96,"context_line":"    def alive(self):"},{"line_number":97,"context_line":"        if self.set_down:"},{"line_number":98,"context_line":"            return False"},{"line_number":99,"context_line":"        now \u003d timeutils.utcnow(with_timezone\u003dTrue)"},{"line_number":100,"context_line":"        time_diff \u003d (now - self.updated_at).total_seconds()"},{"line_number":101,"context_line":""},{"line_number":102,"context_line":"        # Check timestamp FIRST - handles restart scenarios"}],"source_content_type":"text/x-python","patch_set":5,"id":"60b34e98_031f36bb","line":99,"in_reply_to":"10c43448_9025e22c","updated":"2026-02-23 09:05:06.000000000","message":"Hey Terry, sorry for the delay, I needed some time for extra testing. I realized that my reproducer method with a graceful shutdown of the ovn-controller (sudo ovn-appctl -t ovn-controller exit) it not consistent. Sometimes it works, sometimes not. As a result, it led me to the premature conclusion that changing the order of the tests in the NeutronAgent.alive makes a difference. It does not. Sorry for that noise. The only reliable method to reproduce the problem is to shut down a host with the chassis and restart it.\n\nHaving said that, I just submitted another patch which shows two areas that need to be addressed to solve this problem:\n1. NeutronAgent.alive - unless I read the data from the SB db, the agent is never considered alive during the bind attempts. I also made some tests with a graceful shutdown of the ovn-controller (with that change applied) and on Yoga, the agent never disappeard from the list. It just shows up as Alive\u003dXXX. I managed to test just his behavior on Flamingo, most of the time the behavior is the same, but once or twice, after the graceful shutdown, the agent disappeard from the agent list (as you mentioned in your previous comment). So the results are inconsistent - again.\n\n2. Even when the agent alive status is checked based on the SB db state (ovs cache), it\u0027s still not sufficient as after Neutron is notified about the PortBindingChassisEvent it needs to wait a bit for agent to become active. Hence the added logic to check agent\u0027s status in the \u0027neutron/services/ovn_l3/plugin.py\u0027.\n\nPlease let me know your thoughts.","commit_id":"6e571d6a6365b19f9d5709004e19514db5896e6b"}],"neutron/plugins/ml2/plugin.py":[{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"5b0ba1447ce71b98a0119e9524480b893fef8ec1","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"5141f9e8_4ac9505f","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"updated":"2026-01-26 08:09:03.000000000","message":"No, I would not recommend that. We already re-try the port provisioning 10 times in `_port_provisioned`. That would add an extra timeout not necessary.","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"fd069ac383591cf851f8185edc226adac4033c37","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"7af7ac06_f5bb056f","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"in_reply_to":"5141f9e8_4ac9505f","updated":"2026-02-05 07:54:10.000000000","message":"Thank you for pointing this out. I took a closer look on the \u0027_port_provisioned\u0027 and did more testing. The following are my observations.\nThe problem I am trying to solve is a corner case, as the GW port is already provisioned but during the fail-back (initially highest priority chassis comes up after a failure/shutdown) it is rebound back to the highest priority chassis. In this scenario, during the fail-back, the \u0027_port_provisioned\u0027 is not called. Logs confirm this (I should see [1], but I don\u0027t): https://pastebin.ubuntu.com/p/HTx94mk4v5/\n\nThe logs confirm that the \u0027_port_provisioned\u0027 wasn\u0027t called in this scenario. And even if it was, the loop in the \u0027_port_provisioned\u0027 only checks the binding status. Without extra delay in the for loop in \u0027_bind_port_if_needed\u0027, all 10 binding attempts are exhausted within milliseconds which is not enugh for th agent to become active. I highlighted this in my comments to the second bind attempt in the logs above. The following are logs from the same scenario when the \u0027BIND_RETRY_WAIT \u003d 0\u0027 - no extra wait: https://pastebin.ubuntu.com/p/H5knCc6zn6/\n\nAfter 10 attempts, which took a little bit more than 100 milliseconds, the port failed to bind.\n\nWhen I changed the \u0027BIND_RETRY_WAIT\u0027 to 0.5s, it took 9 attempts (almost 5 seconds) for the port to bind successfully: https://pastebin.ubuntu.com/p/cvSqXHQsRF/\n\nBased on the above I think that adding extra dealy to the \u0027_bind_port_if_needed\u0027 only affects situations where there are some problems with binding a port. In most situations I expect a port to bind in the first attempt. Retrying it 10 times without any delay in beteen may noit increase the success rate a lot. Maybe 2 second wait (instead of 5) would be a better choice?\n\nPlease let me know what you think.\nThank you, Marcin.\n\n[1] https://opendev.org/openstack/neutron/src/commit/845dc9756d3b8c91c12b6eb39fbe407a096c428f/neutron/plugins/ml2/plugin.py#L368\n[2] https://opendev.org/openstack/neutron/src/commit/845dc9756d3b8c91c12b6eb39fbe407a096c428f/neutron/plugins/ml2/plugin.py#L653","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"a04ac8697cc8c1fcde15a55eaa63f8f88d4dc206","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"883aa962_783e30bd","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"in_reply_to":"767d1279_0b11413b","updated":"2026-02-09 14:27:47.000000000","message":"Thank you Rodolfo,\n  I will join the weekly meeting tomorrow as well. The following is a problem summary just in case someone else will be looking at it:\n\n  Upon node (hypervisor + ovn chassis) restart, the ovn-controller registers itself in the OVN Southbound DB in the Chassis and Chassis_Private tables. This trigges a number of actions on the OVN side. If the chassis is assigned a router gateway port, the ovn-northd would create a record in the Southbound DB Port_Binding table with the binding type \u0027chassisredirect\u0027 [1]. This triggers GW port binding sequence on the Neutron side [2][3]. Eventually, down the call stack, it will call the neutron.plugins.ml2.drivers.ovn.mech_driver.mech_driver.bind_port(), which will try to bind the port only if the OVN agent is alive [4]. The OVN controller agent is considered to be alive if the \u0027nb_cfg\u0027 in the corresponding Chassis_Private record differs by 1 at the most with the \u0027nb_cfg\u0027 value in the Northbound DB NB_Global table. Essentially it means the ovn-controller is done processing all the updates from the Southbound DB, such as OVS flow updates. This processing typically, on the ovn-controller side, takes some time (longer on the busy clouds) and usually longer than the time needed to exhaust 10 binding attempts in the neutron.plugins.ml2.plugin._bind_port_if_needed(). As a result, the GW router port fails to bind.\n\n  Today I tested an alternative approach with adding wait-agent-alive-retry-loop (10 attempts with a 3 second non-blocking delay in between) to the neutron.services.ovn_l3.plugin.update_router_gateway_port_bindings(). Which also solves the problem. At least this approach does not require making changes to the generic ml2 plugin.\n\n\n  [1] https://www.ovn.org/support/dist-docs/ovn-architecture.7.html\n  [2] https://opendev.org/openstack/neutron/src/commit/f8216b8cb16a1cac4920ca3a24b4ea0a726dd78b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py#L471\n  [3] https://opendev.org/openstack/neutron/src/commit/f8216b8cb16a1cac4920ca3a24b4ea0a726dd78b/neutron/services/ovn_l3/plugin.py#L265\n  [4] https://opendev.org/openstack/neutron/src/commit/f8216b8cb16a1cac4920ca3a24b4ea0a726dd78b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py#L1174","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"c39fa204a64194a540c7f8b5371b520284751438","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"dabdf238_02992495","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"in_reply_to":"7af7ac06_f5bb056f","updated":"2026-02-06 12:24:51.000000000","message":"| The problem I am trying to solve is a corner case, as the GW port is already provisioned but during the fail-back (initially highest priority chassis comes up after a failure/shutdown) it is rebound back to the highest priority chassis.\n\nNo, we cannot handle this specific situation by adding a delay. If the node is actually down when the port tries to bind, we cannot wait for it. This is a very specific situation that could happen, of course. But in this case, the mech driver should decline binding the port.\n\nI\u0027m in favor of using the SB to re-verify the agent status, but this change is out of scope.","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"b5799d880e1aef2f9506916c48d0ddb68fde0f34","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"767d1279_0b11413b","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"in_reply_to":"9abaa379_960e67d2","updated":"2026-02-09 08:29:16.000000000","message":"We can discuss it here. The issue you are facing is legit but cannot be handled by Neutron. If a node has been restarted, we should have the NB/SB databases updated at the same time. However, it seems not to happen in this case.\n\nI\u0027m not a fan of changing the generic ML2 plugin code `_bind_port_if_needed` for a specific mech driver issue. I\u0027ll raise this issue in the Neutron meeting tomorrow (https://meetings.opendev.org/#Neutron_Team_Meeting). I\u0027ll also ping other Neutron folks in order to find a better solution.","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"},{"author":{"_account_id":30534,"name":"Marcin Wilk","email":"marcin.wilk@canonical.com","username":"wilkmar"},"change_message_id":"e7a058d7efa337c6e31696bdbbe6338fdeb1cead","unresolved":true,"context_lines":[{"line_number":650,"context_line":"                # chance to do their work. Ensure we wait some time between"},{"line_number":651,"context_line":"                # the binding attempts, so an agent can recover or come up,"},{"line_number":652,"context_line":"                # if it was down."},{"line_number":653,"context_line":"                time.sleep(BIND_RETRY_WAIT)"},{"line_number":654,"context_line":""},{"line_number":655,"context_line":"                # multiple attempts shouldn\u0027t happen very often so we log each"},{"line_number":656,"context_line":"                # attempt after the 1st."}],"source_content_type":"text/x-python","patch_set":3,"id":"9abaa379_960e67d2","line":653,"range":{"start_line":653,"start_character":16,"end_line":653,"end_character":43},"in_reply_to":"dabdf238_02992495","updated":"2026-02-06 15:24:31.000000000","message":"Thank you for your comment. I will evaluate other options as this patch without adding extra delay to the \u0027_bind_port_if_needed\u0027 loop, doesn\u0027t solve the problem, for the reasons I stated earlier. \nPlease let me know if you prefer to continue the discussion here or in the Launchpad bug LP#2136733.\nMarcin","commit_id":"db81a7ccfb2738612bba0db22a73587d3f2020a3"}]}
