)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"240ca4637f3d2abfcffa3bb1145921325cfe4ae9","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit:     Rodolfo Alonso Hernandez \u003cralonsoh@redhat.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2019-08-02 14:36:40 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Delay HA router transition from \"backup\" to \"master\""},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"As described in the bug, when a HA router transitions from \"master\" to"},{"line_number":10,"context_line":"\"backup\", \"keepalived\" processes will set the virtual IP in all other"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"7faddb67_02f2a970","line":7,"range":{"start_line":7,"start_character":0,"end_line":7,"end_character":5},"updated":"2019-08-05 15:58:07.000000000","message":"Please try to use Rally to run some scenario testing like: \"create_and_delete_router\" to see what we will get after such delay.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"122b2fa90575486be558bfcf819333c54ec02fa2","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit:     Rodolfo Alonso Hernandez \u003cralonsoh@redhat.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2019-08-02 14:36:40 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Delay HA router transition from \"backup\" to \"master\""},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"As described in the bug, when a HA router transitions from \"master\" to"},{"line_number":10,"context_line":"\"backup\", \"keepalived\" processes will set the virtual IP in all other"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"7faddb67_1b165ef3","line":7,"range":{"start_line":7,"start_character":0,"end_line":7,"end_character":5},"in_reply_to":"7faddb67_02f2a970","updated":"2019-08-08 09:21:03.000000000","message":"I\u0027ve executed manually this process. The behavior is the same. The delay introduced does not add any significant time excess. The number of HA routers does not affect to this delay.\n\n\"create_and_delete_routers\" rally test does not use HA.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"7db7191ae5987ed604dfbd824d6ffea14fbde644","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit:     Rodolfo Alonso Hernandez \u003cralonsoh@redhat.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2019-08-02 14:36:40 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Delay HA router transition from \"backup\" to \"master\""},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"As described in the bug, when a HA router transitions from \"master\" to"},{"line_number":10,"context_line":"\"backup\", \"keepalived\" processes will set the virtual IP in all other"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"7faddb67_6de217a3","line":7,"range":{"start_line":7,"start_character":0,"end_line":7,"end_character":5},"in_reply_to":"7faddb67_1b165ef3","updated":"2019-08-09 16:21:48.000000000","message":"Set \u0027l3_ha \u003d True\u0027 for neutron-server to create HA router by default?","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"dbfbb3a821cd40fdfb014642115a3c8d4878f9fd","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit:     Rodolfo Alonso Hernandez \u003cralonsoh@redhat.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2019-08-02 14:36:40 +0000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Delay HA router transition from \"backup\" to \"master\""},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"As described in the bug, when a HA router transitions from \"master\" to"},{"line_number":10,"context_line":"\"backup\", \"keepalived\" processes will set the virtual IP in all other"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"7faddb67_1e61c5a9","line":7,"range":{"start_line":7,"start_character":0,"end_line":7,"end_character":5},"in_reply_to":"7faddb67_6de217a3","updated":"2019-08-19 15:58:15.000000000","message":"As I commented before, this patch will add a deterministic delay when transitioning to master. In this case, when a router is created and becomes master, this delay will be present.\n\nBut this delay will not scale with the number of routers, if this is the question here. It doesn\u0027t matter the number of HA routers present. This delay will be always constant.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"}],"neutron/agent/l3/agent.py":[{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"240ca4637f3d2abfcffa3bb1145921325cfe4ae9","unresolved":false,"context_lines":[{"line_number":543,"context_line":"                             self.context, states\u003d(ri,),"},{"line_number":544,"context_line":"                             resource_id\u003drouter_id))"},{"line_number":545,"context_line":""},{"line_number":546,"context_line":"        del self.router_info[router_id]"},{"line_number":547,"context_line":"        try:"},{"line_number":548,"context_line":"            ri.delete()"},{"line_number":549,"context_line":"        except Exception:"},{"line_number":550,"context_line":"            with excutils.save_and_reraise_exception():"},{"line_number":551,"context_line":"                self.router_info[router_id] \u003d ri"},{"line_number":552,"context_line":""},{"line_number":553,"context_line":"        registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router\u003dri)"},{"line_number":554,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_424661c2","line":551,"range":{"start_line":546,"start_character":8,"end_line":551,"end_character":48},"updated":"2019-08-05 15:58:07.000000000","message":"Actually, if we do not use the ResourceProcessingQueue, such call order reversing does not solve the main problem of the race condition. Other thread (greenlet) may get router info before line 546 running, and then here l3 agent still run router deleting and other run ha_state_change functions.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"122b2fa90575486be558bfcf819333c54ec02fa2","unresolved":false,"context_lines":[{"line_number":543,"context_line":"                             self.context, states\u003d(ri,),"},{"line_number":544,"context_line":"                             resource_id\u003drouter_id))"},{"line_number":545,"context_line":""},{"line_number":546,"context_line":"        del self.router_info[router_id]"},{"line_number":547,"context_line":"        try:"},{"line_number":548,"context_line":"            ri.delete()"},{"line_number":549,"context_line":"        except Exception:"},{"line_number":550,"context_line":"            with excutils.save_and_reraise_exception():"},{"line_number":551,"context_line":"                self.router_info[router_id] \u003d ri"},{"line_number":552,"context_line":""},{"line_number":553,"context_line":"        registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router\u003dri)"},{"line_number":554,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_5b67f652","line":551,"range":{"start_line":546,"start_character":8,"end_line":551,"end_character":48},"in_reply_to":"7faddb67_424661c2","updated":"2019-08-08 09:21:03.000000000","message":"The L3NATAgent processes the routers updates in the main loop function \"_process_router_update\". This function retrieves the messages only from ResourceProcessingQueue.\n\nThe behavior you are describing is more possible with the current implementation. The delete execution, called in \"ri.delete()\", is the most time consuming process. With the base implementation, what you are describing can happen but not with the solution I\u0027m proposing. During the deletion process, the agent cache does not have the router information anymore and this router can\u0027t be updated during this process; this is an improvement from the base code.\n\nRegardless of the rest of the code introduced by this patch, this modification should be included in the code.\n\nNOTE: because many workers can update at the same time the same router, a blocking mechanism should be implemented to avoid this, using the same agent router cache used to retrieve the router information.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"}],"neutron/agent/l3/ha.py":[{"author":{"_account_id":11975,"name":"Slawek Kaplonski","email":"skaplons@redhat.com","username":"slaweq"},"change_message_id":"aeca51da7eff49a7e76f25d3d22e8df89888e62f","unresolved":false,"context_lines":[{"line_number":84,"context_line":"        self.state_change_notifier \u003d batch_notifier.BatchNotifier("},{"line_number":85,"context_line":"            self._calculate_batch_duration(), self.notify_server)"},{"line_number":86,"context_line":"        eventlet.spawn(self._start_keepalived_notifications_server)"},{"line_number":87,"context_line":"        self._transition_state \u003d {}"},{"line_number":88,"context_line":"        self._transition_state_mutex \u003d threading.Lock()"},{"line_number":89,"context_line":""},{"line_number":90,"context_line":"    def _get_router_info(self, router_id):"}],"source_content_type":"text/x-python","patch_set":4,"id":"7faddb67_9374f869","line":87,"range":{"start_line":87,"start_character":13,"end_line":87,"end_character":30},"updated":"2019-07-31 08:07:50.000000000","message":"nit: this could be IMO named \u0027_transition_states\u0027 or something like that","commit_id":"f04d2febb5968f6bc8dbad45ea9c00ddba3cabc7"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"e1e310ce5d139e5b267f95e4f75bd193d9e64e82","unresolved":false,"context_lines":[{"line_number":84,"context_line":"        self.state_change_notifier \u003d batch_notifier.BatchNotifier("},{"line_number":85,"context_line":"            self._calculate_batch_duration(), self.notify_server)"},{"line_number":86,"context_line":"        eventlet.spawn(self._start_keepalived_notifications_server)"},{"line_number":87,"context_line":"        self._transition_state \u003d {}"},{"line_number":88,"context_line":"        self._transition_state_mutex \u003d threading.Lock()"},{"line_number":89,"context_line":""},{"line_number":90,"context_line":"    def _get_router_info(self, router_id):"}],"source_content_type":"text/x-python","patch_set":4,"id":"7faddb67_2bfc9270","line":87,"range":{"start_line":87,"start_character":13,"end_line":87,"end_character":30},"in_reply_to":"7faddb67_9374f869","updated":"2019-08-01 13:37:09.000000000","message":"Done","commit_id":"f04d2febb5968f6bc8dbad45ea9c00ddba3cabc7"},{"author":{"_account_id":11975,"name":"Slawek Kaplonski","email":"skaplons@redhat.com","username":"slaweq"},"change_message_id":"aeca51da7eff49a7e76f25d3d22e8df89888e62f","unresolved":false,"context_lines":[{"line_number":144,"context_line":"            eventlet.sleep(0)"},{"line_number":145,"context_line":""},{"line_number":146,"context_line":"    def _enqueue_state_change(self, router_id, state):"},{"line_number":147,"context_line":"        # NOTE(ralonsoh): move \u0027master\u0027 and \u0027backup\u0027 constants to n-lib"},{"line_number":148,"context_line":"        if state \u003d\u003d \u0027master\u0027:"},{"line_number":149,"context_line":"            eventlet.sleep(self.conf.ha_vrrp_advert_int)"},{"line_number":150,"context_line":"        if self._update_transition_state(router_id) !\u003d state:"}],"source_content_type":"text/x-python","patch_set":4,"id":"7faddb67_536a0081","line":147,"updated":"2019-07-31 08:07:50.000000000","message":"good idea :)","commit_id":"f04d2febb5968f6bc8dbad45ea9c00ddba3cabc7"},{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"43892adf63e3cf4a20ba2281e063f78fed114f99","unresolved":false,"context_lines":[{"line_number":140,"context_line":"        :param state: [\u0027master\u0027, \u0027backup\u0027]"},{"line_number":141,"context_line":"        \"\"\""},{"line_number":142,"context_line":"        if not self._update_transition_state(router_id, state):"},{"line_number":143,"context_line":"            eventlet.spawn_n(self._enqueue_state_change, router_id, state)"},{"line_number":144,"context_line":"            eventlet.sleep(0)"},{"line_number":145,"context_line":""},{"line_number":146,"context_line":"    def _enqueue_state_change(self, router_id, state):"}],"source_content_type":"text/x-python","patch_set":5,"id":"7faddb67_6b3927d5","line":143,"range":{"start_line":143,"start_character":34,"end_line":143,"end_character":55},"updated":"2019-08-01 15:42:42.000000000","message":"Allow me to recall my former change here again (I comment this once in your another patch): https://review.opendev.org/#/c/275614/\n\nA queue introduced here, which means there will be a delay for state change. And this queue is not that L3 agent RouterProcessQueue (ResourceProcessingQueue). So the race condition mentioned in 275614 will still can be encontered: when a HA router is doing the state change, the _transition_states delay some time (maybe the lock wait time), and L3 agent is concurrently deleting the router. Line 171 to 177 can be run after the router is deleted, and some stale resource may be left forever.","commit_id":"2a98859cc2318497dadcc3740651ffcc7622da7a"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"d36a476777a4af1072924d7acebc7b76e0191fb0","unresolved":false,"context_lines":[{"line_number":140,"context_line":"        :param state: [\u0027master\u0027, \u0027backup\u0027]"},{"line_number":141,"context_line":"        \"\"\""},{"line_number":142,"context_line":"        if not self._update_transition_state(router_id, state):"},{"line_number":143,"context_line":"            eventlet.spawn_n(self._enqueue_state_change, router_id, state)"},{"line_number":144,"context_line":"            eventlet.sleep(0)"},{"line_number":145,"context_line":""},{"line_number":146,"context_line":"    def _enqueue_state_change(self, router_id, state):"}],"source_content_type":"text/x-python","patch_set":5,"id":"7faddb67_6b5da745","line":143,"range":{"start_line":143,"start_character":34,"end_line":143,"end_character":55},"in_reply_to":"7faddb67_6b3927d5","updated":"2019-08-01 17:09:59.000000000","message":"This patch introduces a delay in the transition from \"backup\" to \"master\".\n\nYou are right, during this waiting period, the router can be deleted or could be in the process of being deleted.\n\nThe router deletion process is called from the L3 agent, L3NATAgent._safe_router_removed() --\u003e  L3NATAgent._router_removed(). In this last function:\n- The ns is deleted.\n- The router DB entry is deleted\n- The router cache in the agent (self.router_info) is deleted\n\nThis last step should be done first in order to clean the agent cache and, in case the router is deleted during the waiting period, hit L158 (exit because ri is None).\n\nI\u0027ll propose a new PS.","commit_id":"2a98859cc2318497dadcc3740651ffcc7622da7a"},{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"240ca4637f3d2abfcffa3bb1145921325cfe4ae9","unresolved":false,"context_lines":[{"line_number":115,"context_line":"        # default 2 seconds."},{"line_number":116,"context_line":"        return self.conf.ha_vrrp_advert_int"},{"line_number":117,"context_line":""},{"line_number":118,"context_line":"    def _update_transition_state(self, router_id, new_state\u003dNone):"},{"line_number":119,"context_line":"        with self._transition_state_mutex:"},{"line_number":120,"context_line":"            transition_state \u003d self._transition_states.get(router_id)"},{"line_number":121,"context_line":"            if new_state:"}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_5d5c0618","line":118,"range":{"start_line":118,"start_character":50,"end_line":118,"end_character":64},"updated":"2019-08-05 15:58:07.000000000","message":"Maybe I\u0027m missing something, I do not see the new_state param was used in the caller? Then how this _transition_states is filled up?","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"122b2fa90575486be558bfcf819333c54ec02fa2","unresolved":false,"context_lines":[{"line_number":115,"context_line":"        # default 2 seconds."},{"line_number":116,"context_line":"        return self.conf.ha_vrrp_advert_int"},{"line_number":117,"context_line":""},{"line_number":118,"context_line":"    def _update_transition_state(self, router_id, new_state\u003dNone):"},{"line_number":119,"context_line":"        with self._transition_state_mutex:"},{"line_number":120,"context_line":"            transition_state \u003d self._transition_states.get(router_id)"},{"line_number":121,"context_line":"            if new_state:"}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_db64464c","line":118,"range":{"start_line":118,"start_character":50,"end_line":118,"end_character":64},"in_reply_to":"7faddb67_5d5c0618","updated":"2019-08-08 09:21:03.000000000","message":"This is used in L142","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":9531,"name":"liuyulong","display_name":"LIU Yulong","email":"i@liuyulong.me","username":"LIU-Yulong"},"change_message_id":"240ca4637f3d2abfcffa3bb1145921325cfe4ae9","unresolved":false,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"        This function will also update the metadata proxy, the radvd daemon,"},{"line_number":131,"context_line":"        process the prefix delegation and inform to the L3 extensions. If the"},{"line_number":132,"context_line":"        HA router changes to \"master\", this transition will be delayed for at"},{"line_number":133,"context_line":"        least \"ha_vrrp_advert_int\" seconds. When the \"master\" router"},{"line_number":134,"context_line":"        transitions to \"backup\", \"keepalived\" will set the rest of HA routers"},{"line_number":135,"context_line":"        to \"master\" until it decides which one should be the only \"master\"."}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_1d3c4e91","line":132,"range":{"start_line":132,"start_character":63,"end_line":132,"end_character":70},"updated":"2019-08-05 15:58:07.000000000","message":"Another race condition is happening when a HA router is deleting. A router notification is processed first in the \u0027master\u0027 node, and the \u0027VRRP\u0027 process will be dropped first, but in \u0027backup\u0027 node the \u0027VRRP\u0027 process may still run. Then a VRRP re-election causes such state change here. IMO, this delay may reduce the probability of such racing. But again, we need some test to prove that.","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"122b2fa90575486be558bfcf819333c54ec02fa2","unresolved":false,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"        This function will also update the metadata proxy, the radvd daemon,"},{"line_number":131,"context_line":"        process the prefix delegation and inform to the L3 extensions. If the"},{"line_number":132,"context_line":"        HA router changes to \"master\", this transition will be delayed for at"},{"line_number":133,"context_line":"        least \"ha_vrrp_advert_int\" seconds. When the \"master\" router"},{"line_number":134,"context_line":"        transitions to \"backup\", \"keepalived\" will set the rest of HA routers"},{"line_number":135,"context_line":"        to \"master\" until it decides which one should be the only \"master\"."}],"source_content_type":"text/x-python","patch_set":6,"id":"7faddb67_1e688c85","line":132,"range":{"start_line":132,"start_character":63,"end_line":132,"end_character":70},"in_reply_to":"7faddb67_1d3c4e91","updated":"2019-08-08 09:21:03.000000000","message":"There are two scenarios here:\n- State change from master to backup. This method does not change the current behavior. The state change will be applied immediately.\n\n- State change from backup to master. In this case a delay is introduced between the state change reception and the state change execution. If during this time the router is deleted (or is being deleted), as commented in [1], the agent cache will delete the router entry during the deletion process. If succeeded, the cache will keep this register deleted; in case of exception, the register will be written again. \n\n\nThis code is improving the current behavior and preventing a collision between commands in the same router.\n\n[1] https://review.opendev.org/#/c/672568/6/neutron/agent/l3/agent.py","commit_id":"e60738be6f5bac6c313a1b46522ced9ef22eb36f"}]}
