)]}'
{"/PATCHSET_LEVEL":[{"author":{"_account_id":5948,"name":"Oleg Bondarev","email":"obondarev@mirantis.com","username":"obondarev"},"change_message_id":"d5bedd3c461504767fa6d35c287aeaf55fa2f837","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"c50f3973_c0504ccc","updated":"2021-10-25 15:04:46.000000000","message":"Copying here from the bug, Tobias suggested another alternative, my feedback is:\n\nActually I think setting state report rpc timeout to report_interval makes sense: it\u0027s not heavy for the server and if there is no response during report_interval likely there is something wrong with the server and there is not much sense to wait more.\"","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"43af75de5b0b3019b0f2447fd9b8c529f7a4df83","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"63073fbe_2c1daf59","updated":"2021-10-25 14:41:21.000000000","message":"I\u0027ve answered with more details in launchpad bug.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"b31a20dadfab674134f13d4dfab107748c17a98a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"1375a517_9f0c93ff","updated":"2021-10-25 15:48:10.000000000","message":"Instead of this I would use PluginReportStateAPI.report_state() with \"use_call\u003dFalse\".\n\nWhy the agent should wait for the RPC response? If the agent reports is missed by the server, another report will be sent in the next \"AGENT.report_interval\".\n\nI would like other people to check this idea before creating a new config var.\n","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":5948,"name":"Oleg Bondarev","email":"obondarev@mirantis.com","username":"obondarev"},"change_message_id":"8ee407d14254e0afefcff18d2e79fef8a4c9049c","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"b15e67d8_b4d9bff3","updated":"2021-10-25 14:21:30.000000000","message":"To me it looks like a workaround of rpc messaging timeout during rabbit restart.\nNeed to investigate if this timeout could be avoided (like error raised on rabbit restart) from messaging side before adding yet another rpc timeout config knob in neutron.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"5016ce9af02fd906abc3447e4898cbef1afbdf0d","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"06e364fa_b1ddec7f","updated":"2021-10-25 14:01:34.000000000","message":"Trying to get some of our internal fixes upstream in a more cleaner way with this.\n\nPerhaps there is a better way than modifying PluginReportStateAPI so that this would be more backport-friendly, if that\u0027s the case please let me know 😊","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":5948,"name":"Oleg Bondarev","email":"obondarev@mirantis.com","username":"obondarev"},"change_message_id":"307665ede3dfd5d6b8a6319f01154bca7e89f24e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"92bec421_61047004","in_reply_to":"1375a517_9f0c93ff","updated":"2021-10-25 16:10:42.000000000","message":"reason for using \"call\": https://review.opendev.org/c/openstack/neutron/+/232661","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":5948,"name":"Oleg Bondarev","email":"obondarev@mirantis.com","username":"obondarev"},"change_message_id":"312261a8f23ef3ff95ac991564dd2ed40b5ee5df","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"f13f60db_bcbb582a","in_reply_to":"349382e6_dfb8c847","updated":"2021-10-26 14:05:19.000000000","message":"Actually that was Tobias proposal :)","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"715219bf556f0dbe15f6fa2438cf510bbafb3aec","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"c0f9eacf_4621e45a","in_reply_to":"3d34cee3_45b1722b","updated":"2021-10-26 14:50:05.000000000","message":"s/can be backported/cannot be backported/g","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"206103dbdaa4772884eaa7f7c15130322aa7d1bf","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"97caf6cd_d787188a","in_reply_to":"4fe3abea_4302aa42","updated":"2021-10-26 11:16:37.000000000","message":"Then my proposal of using cast makes more sense. \"cast\" won\u0027t block the agent. If the server receives this message (if it is not discarded by the MQ), the server can send, if needed, a \"agent_revived\" status. Of course, that implies a new API method between server and agent.\n\nI prefer that approach rather than adding a new variable just for this specific timeout.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16688,"name":"Rodolfo Alonso","email":"ralonsoh@redhat.com","username":"rodolfo-alonso-hernandez"},"change_message_id":"ef0039a57ed7c2dd0004b46fbc509bee0ef21d17","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"e4f1a135_659d9e5f","in_reply_to":"92bec421_61047004","updated":"2021-10-26 07:26:56.000000000","message":"Yes, and despite this we have a problem with this initial sync when rebooting too many agents or in loaded systems.\n\nThis feature will delay the problem. Increasing timeouts is not the solution.\n\nI still think we should:\n- Move calls to cast.\n- RPC (client and server) should implement a method to inform to the agent this status (agent revived).\n\nThis communication should be async, the agent should not wait for a confirmation. And ultimately, if that happens, admin should manually disable some agents allowing the others to properly restart.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"61edf378330e8ace770b9c422e0efbfc5779354c","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"b80d3374_ac590cee","in_reply_to":"97caf6cd_d787188a","updated":"2021-10-26 11:20:58.000000000","message":"I agree, that\u0027s another option.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"43af75de5b0b3019b0f2447fd9b8c529f7a4df83","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"ec5f5354_b0699c34","in_reply_to":"b15e67d8_b4d9bff3","updated":"2021-10-25 14:41:21.000000000","message":"It\u0027s not only for rabbit, it\u0027s for neutron-server restarted as well.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":11975,"name":"Slawek Kaplonski","email":"skaplons@redhat.com","username":"slaweq"},"change_message_id":"c14413769b8e4327c7a1c0d5b93ca061e1a80aa9","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"349382e6_dfb8c847","in_reply_to":"b80d3374_ac590cee","updated":"2021-10-26 13:23:01.000000000","message":"I personally agree with opinion that using e.g. 5 minutes timeout for the heartbeats don\u0027t makes sense really. And I also agree with Oleg\u0027s proposal of using report_interval as timeout in that case to avoid new config option.\nIMO that could be an option for now and I think we can backport it even to stable branches. This would actually just make things to work \"properly\" as even if someone already set rpc_timeout to e.g. 300 seconds, that\u0027s not correct for heartbeat messages, right?\n\n@Rodolfo - I think we can explore switching those heartbeats to cast again but this probably can be bigger change and will require exploring what we can break by doing that 😊\n\nSo from me +1 for Oleg\u0027s proposal.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"0fa72157e2f9bda97e9b7663b7421c13a04494e0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"4fe3abea_4302aa42","in_reply_to":"e4f1a135_659d9e5f","updated":"2021-10-26 07:58:56.000000000","message":"It\u0027s the other way around, this decreases the RPC timeout (given that rpc_response_timeout\u003d60 and report_interval\u003d30 by default, but the important part here is that the interval is decoupled from rpc_response_timeout).\n\nWe have a very large amount of routers (taking L3 agent as example now) and need to increase rpc_response_timeout from default 60 seconds to 5 minutes because RPC messages will timeout because neutron-server is to slow to compose the answer to the RPC call getting all routers.\n\nThe rpc_response_timeout is also used for the report_state RPC call() which doesn\u0027t make sense. If a neutron-server is restarted at the same time an agent is mid RPC call() the message will timeout because n-server will not reply.\n\nThe agent will wait for 5 minutes (due to rpc_response_timeout) and during that time n-server will set the agent as down because agent_down_time has expired. Note that this only applies to the thread running the report_state RPC call()\u0027s, the agent is happily looping and applying state without any issues.\n\nWhen those 5 minutes has passed, the agent will try report_state RPC call() again and successfully update it\u0027s state with n-server, agent will be considered revived (even though it has synced it\u0027s state all the time, only report_state thread was \"hung\" due to the blocking timeout) and *all* resources on the L3 agent will be resynced.\n\nFrom what I understand based on Oleg\u0027s link there is that the use of cast in report_state is intentional.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"11e8ec6ef15f0073355775d6585a9e2c38d066cd","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"3d34cee3_45b1722b","in_reply_to":"f13f60db_bcbb582a","updated":"2021-10-26 14:48:09.000000000","message":"Thanks for your feedback, I think using report_interval is the solution for now also considering (ops hat on) that it would also be up for consideration being backported to stable branches.\n\nI think Rodolfo has the best solution moving forward but would require more work regarding the RPC layer which would be something that can be backported.\n\nI will work more on this proposal and let you know once it\u0027s in a working/reviewable state. Thanks!","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"e4f6d28a84143563d2b94ca110c2fa4c021fa40f","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"579a12e1_732eacc2","updated":"2021-10-25 15:32:23.000000000","message":"PoC, probably not functional as if AGENT is not set it will raise.","commit_id":"0cc9c8adca69193f03532716a425505eb0d15b23"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"61d768c8c9d2928934c0e5e734c84aee76e6e5bf","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"24d1a162_c6d3ea24","updated":"2021-10-25 15:33:15.000000000","message":"The reason I wanted a configuration option the first time is because I wanted the possibility of backporting the behavior since we are carrying custom patches.","commit_id":"0cc9c8adca69193f03532716a425505eb0d15b23"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"d18cd14862dcfee08bbc9fc46531466bccebd3d7","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"f32fb4ac_ae8d6d9d","updated":"2021-10-26 11:01:00.000000000","message":"Except for the unit test failure, I\u0027m assuming some testing (functionals?) might have a very low report_interval causing failures.","commit_id":"d28a5d0ace667961c49ce8f0641d76e47abcdfb5"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"61edf378330e8ace770b9c422e0efbfc5779354c","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"b8a5188e_45a1d724","updated":"2021-10-26 11:20:58.000000000","message":"Let\u0027s see if we can get some more feedback.","commit_id":"d28a5d0ace667961c49ce8f0641d76e47abcdfb5"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"6baae1e985d9167d270878705d6248b6673a3712","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"aa86e62f_58e07dc7","updated":"2021-10-27 11:09:05.000000000","message":"ready for reviews","commit_id":"37b22a40a239c3a189fab0ab16d684288bc0a2ef"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"870c9c23b30ff2d6857dbf2d55907b6c16c2cc39","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"fa4fe116_885cd3a5","updated":"2021-10-27 13:44:57.000000000","message":"recheck","commit_id":"7d552848c272b4fbfdafdc552e54cefd25b6d46a"}],"neutron/agent/dhcp/agent.py":[{"author":{"_account_id":5948,"name":"Oleg Bondarev","email":"obondarev@mirantis.com","username":"obondarev"},"change_message_id":"8ee407d14254e0afefcff18d2e79fef8a4c9049c","unresolved":true,"context_lines":[{"line_number":1035,"context_line":"    def __init__(self, host\u003dNone, conf\u003dNone):"},{"line_number":1036,"context_line":"        super(DhcpAgentWithStateReport, self).__init__(host\u003dhost, conf\u003dconf)"},{"line_number":1037,"context_line":"        report_timeout \u003d cfg.CONF.AGENT.report_rpc_response_timeout"},{"line_number":1038,"context_line":"        self.state_rpc \u003d agent_rpc.PluginReportStateAPI(topics.REPORTS,"},{"line_number":1039,"context_line":"                                                        report_timeout)"},{"line_number":1040,"context_line":"        self.failed_report_state \u003d False"},{"line_number":1041,"context_line":"        self.agent_state \u003d {"}],"source_content_type":"text/x-python","patch_set":2,"id":"ad36fad1_faeff2f8","line":1038,"range":{"start_line":1038,"start_character":35,"end_line":1038,"end_character":55},"updated":"2021-10-25 14:21:30.000000000","message":"If new config option is eventually agreed, it should be handled inside PluginReportStateAPI, not updating all agents","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"},{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"43af75de5b0b3019b0f2447fd9b8c529f7a4df83","unresolved":true,"context_lines":[{"line_number":1035,"context_line":"    def __init__(self, host\u003dNone, conf\u003dNone):"},{"line_number":1036,"context_line":"        super(DhcpAgentWithStateReport, self).__init__(host\u003dhost, conf\u003dconf)"},{"line_number":1037,"context_line":"        report_timeout \u003d cfg.CONF.AGENT.report_rpc_response_timeout"},{"line_number":1038,"context_line":"        self.state_rpc \u003d agent_rpc.PluginReportStateAPI(topics.REPORTS,"},{"line_number":1039,"context_line":"                                                        report_timeout)"},{"line_number":1040,"context_line":"        self.failed_report_state \u003d False"},{"line_number":1041,"context_line":"        self.agent_state \u003d {"}],"source_content_type":"text/x-python","patch_set":2,"id":"25f4cd01_3ffd1dc0","line":1038,"range":{"start_line":1038,"start_character":35,"end_line":1038,"end_character":55},"in_reply_to":"ad36fad1_faeff2f8","updated":"2021-10-25 14:41:21.000000000","message":"I agree.","commit_id":"01ccec6a43e67200d68ec9be8d91b4e6ea537886"}],"neutron/agent/rpc.py":[{"author":{"_account_id":11975,"name":"Slawek Kaplonski","email":"skaplons@redhat.com","username":"slaweq"},"change_message_id":"741f39c7b4fabcf22f3e63f4f7ca9befa9be94c5","unresolved":true,"context_lines":[{"line_number":84,"context_line":"                                       namespace\u003dconstants.RPC_NAMESPACE_STATE)"},{"line_number":85,"context_line":"        self.client \u003d lib_rpc.get_client(target)"},{"line_number":86,"context_line":"        self.timeout \u003d (cfg.CONF.AGENT.report_interval or"},{"line_number":87,"context_line":"                        lib_rpc.TRANSPORT.conf.rpc_response_timeout)"},{"line_number":88,"context_line":""},{"line_number":89,"context_line":"    def has_alive_neutron_server(self, context, **kwargs):"},{"line_number":90,"context_line":"        cctxt \u003d self.client.prepare()"}],"source_content_type":"text/x-python","patch_set":6,"id":"f6f13503_be2c1374","line":87,"updated":"2021-10-27 11:26:15.000000000","message":"actually can report_interval be not set at all, or set to 0? I think it can\u0027t so such \"or\" here doesn\u0027t makes sense probably","commit_id":"37b22a40a239c3a189fab0ab16d684288bc0a2ef"}],"neutron/tests/unit/agent/test_rpc.py":[{"author":{"_account_id":16137,"name":"Tobias Urdin","email":"tobias.urdin@binero.com","username":"tobasco"},"change_message_id":"0ad564f8b21910a3ff37fe8f336b3f573598b2ee","unresolved":true,"context_lines":[{"line_number":81,"context_line":"        self.assertEqual(reportStateAPI.timeout, 32)"},{"line_number":82,"context_line":""},{"line_number":83,"context_line":"    def test_plugin_report_state_interval(self):"},{"line_number":84,"context_line":"        conf.CONF.set_override(\u0027report_interval\u0027, 15, group\u003d\u0027AGENT\u0027)"},{"line_number":85,"context_line":"        reportStateAPI \u003d rpc.PluginReportStateAPI(\u0027test\u0027)"},{"line_number":86,"context_line":"        self.assertEqual(reportStateAPI.timeout, 15)"},{"line_number":87,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"bee21623_ee53830f","line":84,"range":{"start_line":84,"start_character":61,"end_line":84,"end_character":66},"updated":"2021-10-25 17:08:53.000000000","message":"think this should be lowercase agent","commit_id":"0cc9c8adca69193f03532716a425505eb0d15b23"},{"author":{"_account_id":11975,"name":"Slawek Kaplonski","email":"skaplons@redhat.com","username":"slaweq"},"change_message_id":"741f39c7b4fabcf22f3e63f4f7ca9befa9be94c5","unresolved":true,"context_lines":[{"line_number":76,"context_line":""},{"line_number":77,"context_line":"class AgentPluginReportState(base.BaseTestCase):"},{"line_number":78,"context_line":"    def test_plugin_report_state_timeout(self):"},{"line_number":79,"context_line":"        cfg.CONF.set_override(\u0027report_interval\u0027, 0, \u0027AGENT\u0027)"},{"line_number":80,"context_line":"        n_rpc.TRANSPORT.conf.set_override(\u0027rpc_response_timeout\u0027, 32)"},{"line_number":81,"context_line":"        reportStateAPI \u003d rpc.PluginReportStateAPI(\u0027test\u0027)"},{"line_number":82,"context_line":"        self.assertEqual(reportStateAPI.timeout, 32)"}],"source_content_type":"text/x-python","patch_set":6,"id":"73be004e_91c903b8","line":79,"updated":"2021-10-27 11:26:15.000000000","message":"is that real use case that report_interval will be 0?","commit_id":"37b22a40a239c3a189fab0ab16d684288bc0a2ef"}]}
