)]}' {"/PATCHSET_LEVEL":[{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"5b7c89ec4cc896f66892222d80059900e4dc3ddf","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"24321d8c_564d1595","updated":"2022-05-26 11:31:33.000000000","message":"covers this:\n\n diff --git a/swift/proxy/server.py b/swift/proxy/server.py\nindex 26480582c..77e9f4290 100644\n--- a/swift/proxy/server.py\n+++ b/swift/proxy/server.py\n@@ -656,7 +656,7 @@ class Application(object):\n return False\n if \u0027last_error\u0027 in error_stats and error_stats[\u0027last_error\u0027] \u003c \\\n now - self.error_suppression_interval:\n- self._error_limiting.pop(node_key, None)\n+ #self._error_limiting.pop(node_key, None)\n return False\n limited \u003d error_stats[\u0027errors\u0027] \u003e self.error_suppression_limit\n if limited:","commit_id":"744ae585315b4566e4cdc3f87f6adab2e4ce6554"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"553f4b76f1ac2d3dc2c30f2a65dc7567af954bad","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"e63ea13b_e8e9bca7","updated":"2022-05-26 14:08:00.000000000","message":"I’m glad you’re reading this code carefully and sharing us the existing behaviors. I apparently have no idea how proxy worker error counting works. ","commit_id":"cc89b8e3f32d1fa03acdf75b7499b547c747fa9e"}],"test/unit/proxy/test_server.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"553f4b76f1ac2d3dc2c30f2a65dc7567af954bad","unresolved":true,"context_lines":[{"line_number":1308,"context_line":" self.assertFalse(app.error_limited(node))"},{"line_number":1309,"context_line":" app.error_occurred(node, \u0027test msg\u0027)"},{"line_number":1310,"context_line":" # the node is error limited for 1 minute"},{"line_number":1311,"context_line":" self.assertTrue(app.error_limited(node))"},{"line_number":1312,"context_line":" incr_time(60)"},{"line_number":1313,"context_line":" self.assertTrue(app.error_limited(node))"},{"line_number":1314,"context_line":" # then error limiting is reset"}],"source_content_type":"text/x-python","patch_set":2,"id":"bd8056d5_312bd0d0","line":1311,"updated":"2022-05-26 14:08:00.000000000","message":"oh gosh, so the error is like a keep alive for the error counter.\n\nI wonder if the goal was “error limit errors in suppression interval suppresses requests for suppression interval” and it’s just a bug? that’s always how I thought it worked and don’t understand why it doesn’t.","commit_id":"cc89b8e3f32d1fa03acdf75b7499b547c747fa9e"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"cd36c5422b0b9a3159d09b7fc6b352764e644112","unresolved":true,"context_lines":[{"line_number":1308,"context_line":" self.assertFalse(app.error_limited(node))"},{"line_number":1309,"context_line":" app.error_occurred(node, \u0027test msg\u0027)"},{"line_number":1310,"context_line":" # the node is error limited for 1 minute"},{"line_number":1311,"context_line":" self.assertTrue(app.error_limited(node))"},{"line_number":1312,"context_line":" incr_time(60)"},{"line_number":1313,"context_line":" self.assertTrue(app.error_limited(node))"},{"line_number":1314,"context_line":" # then error limiting is reset"}],"source_content_type":"text/x-python","patch_set":2,"id":"9586dcd5_b7a4e504","line":1311,"in_reply_to":"bd8056d5_312bd0d0","updated":"2022-05-27 23:52:53.000000000","message":"I\u0027ve got this gut feeling like we need to move away from this notion that a node is either error-limited or it\u0027s not -- what I kind of want is something where we try to estimate the probability that a request to a node is going to error and use that to inform whether we make a particular request or not.\n\nSomething like https://paste.opendev.org/show/bj5m3lXImDLiX5tp4Bvu/ but\n\n* have those 0.5 and 0.001 constants be configurable and\n* consider whether we want the simple P(make request) :\u003d 1 - P(request will error) that I implemented or something more complicated\n\nIt also occurs to me that we may want to distinguish between a congestion-related error (timeout, 529) and other server errors. I\u0027m not convinced that a 500 on one request means the next request to that node is particularly likely to 500 -- it was more likely due to the particular request that was made.\n\nWe probably also ought to distinguish between 507-unmounted and 507-disk full -- it 100% makes sense to immediately stop sending any requests to a currently-unmounted drive, or to stop sending *writes* to a full drive. *Reads*, on the other hand, are a more complicated story...","commit_id":"cc89b8e3f32d1fa03acdf75b7499b547c747fa9e"}]}