{"/COMMIT_MSG":[{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"ada926e9cd2742ca636b69e9a64e645d5e437dab","unresolved":true,"context_lines":[{"line_number":19,"context_line":"incremented or not: the error counter was only incremented if a useful"},{"line_number":20,"context_line":"replacement node was found."},{"line_number":21,"context_line":""},{"line_number":22,"context_line":"For example, if a read from the last useful primary node failed, and"},{"line_number":23,"context_line":"all handoff nodes then returned 404, the last primary node\u0027s would not"},{"line_number":24,"context_line":"have its error counter incremented (for no good reason). However, if a"},{"line_number":25,"context_line":"useful replacement handoff node was found, then the last primary"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":5,"id":"c9268de7_7da20b75","line":22,"updated":"2023-08-31 05:08:25.000000000","message":"For this example, will this patch change the proxy error code returned to users when the last useful primary node gets error_limited with enough read failures? I don\u0027t see user error code related test cases. \nAlso I feel this case is very unique, if the last useful primary node is the only one which has the data, even though it returns 503 on each read, maybe we should have user continue to try reading it instead of error limiting it?","commit_id":"01c7ade798b612eca440fd389942abe131574cc2"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":597,"name":"Pete Zaitcev","email":"zaitcev@kotori.zaitcev.us","username":"zaitcev"},"change_message_id":"6461c09814ed88f6cc8759b5c7e5167144b0a43d","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"6fb248d5_20510e49","updated":"2023-08-04 15:53:18.000000000","message":"I probably should explain. I\u0027ve read through the commit message, and it explains that the error counters are now incremented where they haven\u0027t been before. But why is this useful? What happened before that was bad when the last node wasn\u0027t limited? Looks like it\u0027s implied that it\u0027s better to offload to handoffs fast than timeout on the not-limited primary, but it\u0027s left to the guess of the reader.","commit_id":"27c840d5d4bb932cb93e3c3e3b0e3d07e53882fe"},{"author":{"_account_id":597,"name":"Pete Zaitcev","email":"zaitcev@kotori.zaitcev.us","username":"zaitcev"},"change_message_id":"4fe390136663cd67e8d19d6a3c9c9cb964f6d379","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"468bf444_991c4ef6","updated":"2023-08-04 15:50:06.000000000","message":"What is the benefit of this change?","commit_id":"27c840d5d4bb932cb93e3c3e3b0e3d07e53882fe"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"40df303474e39838ca27e626385eb363c571d4f3","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"117b38f1_715b837d","updated":"2023-08-04 16:59:39.000000000","message":"@Pete - it\u0027s a minor bug. 
Patchset-level comments

Pete Zaitcev <zaitcev@kotori.zaitcev.us>, 2023-08-04, patch set 1:

What is the benefit of this change?

Pete Zaitcev, 2023-08-04, patch set 1:

I probably should explain. I've read through the commit message, and it explains that the error counters are now incremented where they haven't been before. But why is this useful? What happened before, when the last node wasn't limited, that was bad? It seems to be implied that it's better to offload to handoffs quickly than to time out on the not-limited primary, but that is left for the reader to guess.

Alistair Coles <alistairncoles@gmail.com>, 2023-08-04, patch set 2:

@Pete - it's a minor bug. I've updated the commit message to hopefully explain it better.

Alistair Coles, 2023-08-16, patch set 5:

recheck

POST_FAILURE

Clay Gerrard <clay.gerrard@gmail.com>, 2023-09-05, patch set 5:

We think this is what we want to do - let's try it out in prod and provide some additional feedback.

Alistair Coles, 2023-09-06, patch set 5:

@Jianjian You raise an interesting point: if this is the last node available to serve a request for resource X, should we error limit the node or leave it in service?

1. The error limiting here is just incrementing a counter, so the code would need to get much smarter to detect whether *this* error is the one that will actually tip the node into being out of service.

2. The error limiting is per node (host:port/device), not per resource X, so even if we did skip the error limit increment when the node is the final available node for resource X, a request for resource Y could still increment the error count for the same node, because it may not be the final available node for Y.

Ideally, rather than error limiting being binary (in service / out of service), we'd have a weighted node selection, so that the least-errored node would be chosen, but a node could always be chosen.

Jianjian Huo, 2023-09-20, patch set 5:

LGTM, it's a correct fix to an old bug.

Jianjian Huo, 2023-09-20, patch set 5 (in reply to Alistair Coles, 2023-09-06):

A weighted node selection sounds great; yes, it's something that should be done on top of the error incrementing and counting functions.

Clay Gerrard, 2023-09-27, patch set 5:

We've been carrying this for a while; it seems to work well.
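As an illustration of the weighted node selection idea raised above (not something this patch implements, and not existing Swift behaviour), selection could be biased towards less-errored nodes without ever excluding a node outright:

    # Illustrative sketch only: prefer the least-errored nodes, but keep every
    # node selectable so the last copy of an object can still be read.
    import random

    def pick_node(nodes, error_counts):
        # Weight each node inversely to its accumulated error count.
        weights = [1.0 / (1 + error_counts.get((n['ip'], n['port'], n['device']), 0))
                   for n in nodes]
        return random.choices(nodes, weights=weights, k=1)[0]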
Comments on test/unit/proxy/controllers/test_obj.py (patch set 5)

Quoted test context (lines 4539-4545):

    self.assertEqual(len(log), self.policy.ec_n_unique_fragments * 2)
    log_lines = self.app.logger.get_lines_for_level('error')
    self.assertEqual(3, len(log_lines), log_lines)
    self.assertIn('Trying to read next part of EC multi-part GET',
                  log_lines[0])
    self.assertIn('Trying to read during GET: ChunkReadTimeout',
                  log_lines[1])

Jianjian Huo, 2023-09-20, on line 4542:

This new log line is printed out by "self.app.error_occurred(self.source.node, err_msg)" - the fix works!
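For context on that assertion, error_occurred() is the point where a node error is both recorded and logged. The sketch below is a simplified illustration of that dual role; the real method lives in the Swift proxy server code and differs in its details and log format:

    # Simplified illustration only -- not the real method body.
    def error_occurred(node, msg, error_counts, logger):
        # Count the error against the node (ip, port, device) ...
        key = (node['ip'], node['port'], node['device'])
        error_counts[key] = error_counts.get(key, 0) + 1
        # ... and emit an error-level log line containing the caller-supplied
        # message, which is what get_lines_for_level('error') asserts on, e.g.
        # 'Trying to read during GET: ChunkReadTimeout'.
        logger.error('%s (node %s:%s/%s)', msg,
                     node['ip'], node['port'], node['device'])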
