)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"2af21ea891e5288ccdf356493f9ef1dd8a1f8492","unresolved":true,"context_lines":[{"line_number":5,"context_line":"CommitDate: 2026-03-03 21:24:05 -0800"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"obj-reconstructor: check fragment bucket timestamp before rebuilding"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Change-Id: Ife734a35da0015236c2249e6b27aad3e0afade89"},{"line_number":10,"context_line":"Co-Authored-By: Alistair Coles \u003calistairncoles@gmail.com\u003e"},{"line_number":11,"context_line":"Signed-off-by: Jianjian Huo \u003cjhuo@nvidia.com\u003e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":1,"id":"37544181_d206a89a","line":8,"updated":"2026-03-04 10:20:48.000000000","message":"``Closes-Bug: #2143202``","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"6a2a069e893c7a3106b1b2715e2aa6f84b439ab5","unresolved":false,"context_lines":[{"line_number":5,"context_line":"CommitDate: 2026-03-03 21:24:05 -0800"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"obj-reconstructor: check fragment bucket timestamp before rebuilding"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Change-Id: Ife734a35da0015236c2249e6b27aad3e0afade89"},{"line_number":10,"context_line":"Co-Authored-By: Alistair Coles \u003calistairncoles@gmail.com\u003e"},{"line_number":11,"context_line":"Signed-off-by: Jianjian Huo \u003cjhuo@nvidia.com\u003e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":1,"id":"6b469cb1_086dddee","line":8,"in_reply_to":"37544181_d206a89a","updated":"2026-03-11 18:51:32.000000000","message":"Done","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"d259ffc78cc88a237fae5d024dac4f0a75ad6962","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"477cf3ae_7b42ea7f","updated":"2026-03-05 21:39:23.000000000","message":"Part of me wonders if we ought to be using `ECGetResponseBucket` / `ECGetResponseCollection` from `swift/proxy/controllers/obj.py` in the reconstructor...\n\nTactically, though, this makes sense enough. Bailing if the reconstructable timestamp doesn\u0027t match our timestamp seems pretty defensible; let one of the guys that has data at that timestamp be responsible for pushing it.\n\nShould we maybe also compare `bucket.etag` against our local etag? Might help limit corrupt rebuilds until we get the timestamp collisions resolved.","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"3ca0ba7c3b707a54598f01d4c6a1af0f34f5910b","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":1,"id":"675586e3_2467198f","in_reply_to":"477cf3ae_7b42ea7f","updated":"2026-03-06 04:04:42.000000000","message":"\u003e Should we maybe also compare bucket.etag against our local etag?\n\ngood idea! I was thinking to add the etag check when sending out fragment to the missing node, but here is better. After I have that added, more broken test cases to fix: https://review.opendev.org/c/openstack/swift/+/979098/1?usp\u003drelated-change","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"6a2a069e893c7a3106b1b2715e2aa6f84b439ab5","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"319fd0a6_1c6e4a2d","in_reply_to":"675586e3_2467198f","updated":"2026-03-11 18:51:32.000000000","message":"Done","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"cb7d13ee16420a7dc9fc0a4b5741dc7e84505e15","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"1722b77c_464d2d71","updated":"2026-03-12 16:18:39.000000000","message":"I\u0027ve not yet worked through the tests but wanted to leave early feedback.\n\nI think this is an expedient fix to avoid corruption. We could build on this by choosing to use a mismatching bucket with the correct metadata, but I\u0027d be happy to see this fix land soon and a better one land later.\n\nI don\u0027t think this needs to be based on the parent, there\u0027s no dependency IIUC.","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"6a2a069e893c7a3106b1b2715e2aa6f84b439ab5","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"eade4504_cf373720","updated":"2026-03-11 18:51:32.000000000","message":"Thanks for the reviews!","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"dd7a96f653adea21cf967ea1e85c7148e66ed896","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"35afa54e_52d43a5f","updated":"2026-03-17 22:58:02.000000000","message":"recheck\ngrenade and devstack failures","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"b37df984150470f79986a1965aebd39bcdb92c26","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"66a2bed9_00930b45","updated":"2026-03-16 03:57:43.000000000","message":"recheck\ngrenade and devstack failures","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6049a43639b37daa942a402c64d5881b031a1771","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"6c0c1321_2e9a2530","updated":"2026-03-25 14:42:58.000000000","message":"The one change I\u0027d prefer to see before shipping this patch is to make the log message similar enough to find with the same search e.g. greping for \"enough responses\", see https://review.opendev.org/c/openstack/swift/+/982099","commit_id":"41bf539dce5fbeecc1888b76f2819297f941a61c"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"4f4a14de95b073a133813a6a2e521f43aba48306","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"0da2de49_9811c448","updated":"2026-03-25 17:31:12.000000000","message":"looking forward to seeing the new patch","commit_id":"41bf539dce5fbeecc1888b76f2819297f941a61c"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"2f3d4c67f3962086b1b810438c0ecfab1e1eb5d0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"a378e64f_fd7f11dc","updated":"2026-03-25 14:41:05.000000000","message":"rebased\n\nI pushed some suggested changes in https://review.opendev.org/c/openstack/swift/+/982099","commit_id":"41bf539dce5fbeecc1888b76f2819297f941a61c"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"aef6b81d_7fca4ca2","updated":"2026-03-31 00:06:41.000000000","message":"I don\u0027t really like this change, but it\u0027s better than corrupt data!  😜\n\nI like the previous change quite a lot, I just wish it went that last step further:\n\n982735: raise DiskfileError on \"bad\" useful_bucket | https://review.opendev.org/c/openstack/swift/+/982735","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9892fa10c5bedcd024f6db876bdde021041e338a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"635c198a_056728c5","updated":"2026-03-31 16:42:42.000000000","message":"still prevents data corruption - huge win.\n\nFrustrated that \"waiting on useless responses so we can write more log lines\" isn\u0027t received as obvious.  I guess either one has to hope the responses aren\u0027t useless (e.g. 2+3 schema) OR that they\u0027ll be super helpful to debug, but ultimately just a temporary measure (i.e. once we SEE the log messages in prod we\u0027ll know EXACTLY what code we need to write and in the VERY next release instead of logging a bunch of \"everything is terrible; we waited on all the responses and can\u0027t do anything useful\" we\u0027ll just debug log \"we made the best of a bad situation; no worries; you\u0027re welcome\"\n\nI was thinking more about the \"just fix this in reconstruct_fa\" option:\n\n982735: raise DiskfileError on \"bad\" useful_bucket | https://review.opendev.org/c/openstack/swift/+/982735\n\n... and suspected and have now confirmed; that on THIS option, with this diff applied:\n\n```\ndiff --git a/swift/obj/reconstructor.py b/swift/obj/reconstructor.py\nindex 6af722061d..ec14cef8be 100644\n--- a/swift/obj/reconstructor.py\n+++ b/swift/obj/reconstructor.py\n@@ -623,9 +623,7 @@ class ObjectReconstructor(Daemon):\n                 bucket \u003d self._handle_fragment_response(\n                     node, policy, partition, fi_to_rebuild, path, buckets,\n                     error_responses, resp)\n-                if (bucket\n-                        and bucket.is_useful(policy)\n-                        and bucket.matches(local_timestamp, local_etag)):\n+                if bucket and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata:\n                     useful_bucket \u003d bucket\n                     self.logger.debug(\n                         \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027\n```\n\nno tests in `swift/test/unit/obj/test_reconstructor.py` fail\n\n... it\u0027s THIS situation where: \n\nall primaries 404 but we still find a \"useful\" (but not matching) bucket\n\n... scenario that we think might actually be related to timestamp collision data corruption we\u0027ve hypothesized may exists.  So it\u0027d be nice to have test coverage to ensure that conditional really works the way we expect.\n\nFWIW if we \"just fix it in reconstruct_fa\" it doesn\u0027t really matter how we come out of make_frag_reqeusts with a \"useful\" bucket - if it doesn\u0027t match we won\u0027t rebuild it; and that diff hunk is well tested.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"b41a02d75e4f0077e52e598223ed0c32ec9a3384","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"8f103da3_a980faf5","updated":"2026-03-31 23:30:11.000000000","message":"thanks all for the thoughts and reviews!","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"b41a02d75e4f0077e52e598223ed0c32ec9a3384","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"51c99efe_527aad51","in_reply_to":"635c198a_056728c5","updated":"2026-03-31 23:30:11.000000000","message":"added a new test ``test_reconstruct_fa_handoffs_mismatched_timestamp`` for the case you mentioned above.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e9526853c46da517a741384df191e6896855654b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":12,"id":"bca1b815_a89e34c6","updated":"2026-04-01 23:33:19.000000000","message":"FWIW I don\u0027t LOVE tests who are mostly trying to maintain some specific logging behavior - we generally want to be able to adapt/improve/annotate log messages with more/better info as we learn what sort of descriptive information is helpful when debugging.\n\n*generally* I\u0027d like code to log a message \"when an important event resolves\" - then we can add as much useful information as we want to THAT log message - to the extent that we think some of that information is *really important* (like a db file path) we might have a test like:\n\n```\nself.assertIn(tmp_db_file_path, error_log_msg)\n```\n\n^ that test won\u0027t fail if we later decide we also want to include the db\u0027s storage-namespace-path, or the number of shard-ranges that failed and what states they were in.\n\nI think we should use a similar strategy with the reconstructor logging; the current pattern of emitting NEW log messages for every error/warning/mis-matched-timestamp-etag as we go is annoying; and mostly un-necessary given that we gather everything up in these buckets and error_resp dicts anyway.\n\nI\u0027d prefer to have one debug message when we rebuild (along with relevant information about the responses we used/ignored) and one error message if we fail to rebuild (along with relevant information about the responses we didn\u0027t think we could use):\n\n983143: not MORE logging; BETTER logging | https://review.opendev.org/c/openstack/swift/+/983143\n\n^ again none of this requires a LOGIC CHANGE in how many requests we make - improved logging should be orthogonal to if we need/want to add the behavior of \"keep looking for a second useful bucket\".  If we want to improve logging (and I think we could) - let\u0027s do that.  Un-related, if we want to add the new behavior of keep looking past ndata matching responses for a second bucket full of matching responses (unlikely) - please don\u0027t mistakenly think we would want to justify adding that behavior w/ tests that validate the un-related logging additions.","commit_id":"1b36e0f122f21dbe6f5f746d0571906ca4b1a8ec"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a7e3f5c0d601b575cd16ec9f8cca98b7f5396eb0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":15,"id":"8ef8c700_e8d7c3b7","updated":"2026-05-01 20:21:05.000000000","message":"Looking at this again fresh I needed to remind myself it\u0027s ONLY increasing the time we spend waiting on primary responses to collect (or timeout) - it is NOT adding any NEW handoff requests to the `quorum !\u003d local` case (handoff requests are reserved for `_is_quarantine_candidate` - which is *very* restrictive).\n\nThe change in behavior in the \"long-tail on the last primary\" case might be interesting; but data suggests the `quorum !\u003d local` case is already very rare; so in the off-chance that last primary is interesting: we can always wait for all primaries to timeout or respond before we return an error.\n\nI don\u0027t LOVE it - but I HATE it less.","commit_id":"6cef345a24e61a6a77646968210f2ac2933c5bdb"}],"swift/obj/reconstructor.py":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"d259ffc78cc88a237fae5d024dac4f0a75ad6962","unresolved":true,"context_lines":[{"line_number":587,"context_line":"                error_responses, resp)"},{"line_number":588,"context_line":"            if (bucket"},{"line_number":589,"context_line":"                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":590,"context_line":"                    and bucket.timestamp \u003d\u003d local_timestamp):"},{"line_number":591,"context_line":"                useful_bucket \u003d bucket"},{"line_number":592,"context_line":"                break"},{"line_number":593,"context_line":""}],"source_content_type":"text/x-python","patch_set":1,"id":"b48de511_93339bd2","line":590,"updated":"2026-03-05 21:39:23.000000000","message":"I\u0027m not convinced that we *shouldn\u0027t* break here any more when `bucket.timestamp !\u003d local_timestamp` -- do we really expect to have another chance at a (different) bucket with `\u003e\u003d ndata` responses if we keep digging through primaries? I mean, I guess *maybe* -- if you\u0027ve got some policy with `nparity \u003e ndata`, or you\u0027ve got a duplication factor other than 1...","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"394119362bf5dc5d7fba3dc988f982be81278b6c","unresolved":false,"context_lines":[{"line_number":587,"context_line":"                error_responses, resp)"},{"line_number":588,"context_line":"            if (bucket"},{"line_number":589,"context_line":"                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":590,"context_line":"                    and bucket.timestamp \u003d\u003d local_timestamp):"},{"line_number":591,"context_line":"                useful_bucket \u003d bucket"},{"line_number":592,"context_line":"                break"},{"line_number":593,"context_line":""}],"source_content_type":"text/x-python","patch_set":1,"id":"8249a48a_1625f71d","line":590,"in_reply_to":"74741541_a0aac493","updated":"2026-03-25 20:05:37.000000000","message":"discussed in another thread","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"6a2a069e893c7a3106b1b2715e2aa6f84b439ab5","unresolved":true,"context_lines":[{"line_number":587,"context_line":"                error_responses, resp)"},{"line_number":588,"context_line":"            if (bucket"},{"line_number":589,"context_line":"                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":590,"context_line":"                    and bucket.timestamp \u003d\u003d local_timestamp):"},{"line_number":591,"context_line":"                useful_bucket \u003d bucket"},{"line_number":592,"context_line":"                break"},{"line_number":593,"context_line":""}],"source_content_type":"text/x-python","patch_set":1,"id":"74741541_a0aac493","line":590,"in_reply_to":"b48de511_93339bd2","updated":"2026-03-11 18:51:32.000000000","message":"I wrote two test cases for this with ``policy with nparity \u003e ndata``:\nhttps://review.opendev.org/c/openstack/swift/+/978845/5/test/unit/obj/test_reconstructor.py#5289 and https://review.opendev.org/c/openstack/swift/+/978845/5/test/unit/obj/test_reconstructor.py#5412","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"d259ffc78cc88a237fae5d024dac4f0a75ad6962","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"8a8e4f21_04a4c1b0","line":637,"updated":"2026-03-05 21:39:23.000000000","message":"I wonder a bit if this function\u0027s change should look more like\n\n```\n@@ -634,6 +632,8 @@ class ObjectReconstructor(Daemon):\n                 # optimistically wait for any remaining responses in case a\n                 # useful bucket is assembled.\n\n+        if useful_bucket.timestamp !\u003d local_timestamp:\n+            return None\n         return useful_bucket\n\n     def reconstruct_fa(self, job, node, df):\n```","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"4f4a14de95b073a133813a6a2e521f43aba48306","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"9b950b9d_5e43d933","line":637,"in_reply_to":"15130349_90f8d967","updated":"2026-03-25 17:31:12.000000000","message":"IME we should tend to value the \"in for a penny - in for a pound\" mentality - the WORST scenario is \"do a bunch of IO and get nothing for it\" - much better to \"do some IO, detect we\u0027re in a bad state; so do a bunch MORE IO and be done with it once and for all\"\n\ne.g. the `_is_quarantine_candidate` candidate will push quite far into handoffs to make sure we\u0027re dealing with a lone frag so that we can make a strong decision about discarding a \"bad local frag\" so we NEVER HAVE TO DEAL WITH IT AGAIN.\n\nI think in this scenario we should be aiming for the same resolution:\n\nIf we\u0027re getting ndata *durable* frags with `remote.timestamp \u003e local_timestamp` we should be looking for a way to push *that* fragment to our peer (and be done with it; we already have the responses in hand); and probably also deciding if maybe 1) we\u0027re just missing the commit bit on a local non-durable frag or 2) we missed a frag write and ALSO need to rebuild OUR frag\n\nI\u0027m always fine with \"doing more IO\" - but only if it moves the system towards consistency.\n\nThe worst case scenario (for otherwise correct code) is \"doing more IO and getting nothing for it\"\n\ni.e. \"un-optimal but correct\" is not as bad as writing corrupt data; but doing IO and gaining nothing useful is \"worse case scenario\" - it\u0027s just a \"BUG\"\n\nFWIW I feel like this change in it\u0027s current state is going out of it\u0027s way to DO EVER MORE IO - and even if it managed to \"get something out of it\" - it\u0027s probably not even the end state; it\u0027s just a different inconsistent state.\n\nI think we should at least be LOOKING at the order of a local_timestamp and useful_bucket.timestamp","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"ab311f96_a05e38a5","line":637,"in_reply_to":"361e0443_710813ea","updated":"2026-03-31 00:06:41.000000000","message":"\u003e the current fix refuses the timestamp mismatch\n\nI don\u0027t think \"refuses\" is the right word, it\u0027s more like \"it completely ignores a useful_bucket if it\u0027s a different timestamp than the local one and continues on in ignorance with false hope\"\n\nI\u0027d prefer a fix that \"recognizes if useful_bucket is better and use it\"\n\nIn the surprising situation where we get a bucket with \u003e ndata matching etags *and* it\u0027s a timestamp \u003c local_timestamp I guess in THAT case we could keep digging into handoffs and see if enough matching responses are out there (somewhere?  somehow?).\n\nBut I don\u0027t think this change does that either, it just keeps asking the remaining primaries for what they have even tho there\u0027s NO CHACE it can get \u003e ndata matching responses once \u003e ndata primaries have already told us they don\u0027t match.","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"cb7d13ee16420a7dc9fc0a4b5741dc7e84505e15","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"c24c88de_26f6afb1","line":637,"in_reply_to":"7f83b301_b7ed8937","updated":"2026-03-12 16:18:39.000000000","message":"@Tim IIUC with your suggestion the first bucket to reach ndata responses would become *the* useful bucket, but then be thrown away if it didn\u0027t match timestamp. Whereas with Jianjian\u0027s patch a bucket is only useful if it has ndata frags AND matches, so mismatched buckets don\u0027t prevent other buckets also reaching ndata frags.","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"6a2a069e893c7a3106b1b2715e2aa6f84b439ab5","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"7f83b301_b7ed8937","line":637,"in_reply_to":"8a8e4f21_04a4c1b0","updated":"2026-03-11 18:51:32.000000000","message":"we would need this optimization only if we don\u0027t want this loop to continue searching for next available bucket after a mismatched bucket.\n\n```\n            if (bucket\n                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata\n                    and bucket.timestamp \u003d\u003d local_timestamp\n                    and bucket.etag \u003d\u003d local_etag):\n                useful_bucket \u003d bucket\n                break\n```","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"4edc5f7794311fe8a0e49ba49a7132b25d82bdde","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"ced1fe71_65500abf","line":637,"in_reply_to":"9b950b9d_5e43d933","updated":"2026-03-25 17:33:14.000000000","message":"\u003e gaining nothing useful is \"worse case scenario\" - it\u0027s just *not* a \"BUG\"\n\ntypo","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"1a5634cdfbd42ea438888b13afef0597d4d52064","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"15130349_90f8d967","line":637,"in_reply_to":"c24c88de_26f6afb1","updated":"2026-03-24 20:05:40.000000000","message":"Yeah, I\u0027m looking for whether we can abort before looking at all `max_node_count` responses. With *most* configurations, you could only hope to have **one** bucket with ndata responses -- and if it doesn\u0027t match (and therefore we shouldn\u0027t rebuild), there\u0027s no point in continuing.\n\nMaybe we could bail early iff `ndata \u003e replica_count - ndata`? Or I\u0027m overthinking things and we should just allow the extra traffic -- it\u0027s all in a background process anyway...","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"18166d53aa60d165dc14d1509618112d4a07ab05","unresolved":true,"context_lines":[{"line_number":634,"context_line":"                # optimistically wait for any remaining responses in case a"},{"line_number":635,"context_line":"                # useful bucket is assembled."},{"line_number":636,"context_line":""},{"line_number":637,"context_line":"        return useful_bucket"},{"line_number":638,"context_line":""},{"line_number":639,"context_line":"    def reconstruct_fa(self, job, node, df):"},{"line_number":640,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":1,"id":"361e0443_710813ea","line":637,"in_reply_to":"ced1fe71_65500abf","updated":"2026-03-26 22:54:20.000000000","message":"\u003eMaybe we could bail early iff ndata \u003e replica_count - ndata? Or I\u0027m overthinking things and we should just allow the extra traffic -- it\u0027s all in a background process anyway...\n\n@tburke@nvidia.com other than the case ``nparity \u003e ndata``, there is also a benefit if we allow the loop to continue searching for next useful bucket: after all fragments being processed by ``_handle_fragment_response``, the quarantine candidate check ``_is_quarantine_candidate`` which runs immediately after this loop will have more complete picture; if it bails early, the quarantine logic would see partial data (some responses unprocessed in the pile).\n\n\n\u003e If we\u0027re getting ndata durable frags with remote.timestamp \u003e local_timestamp we should be looking for a way to push that fragment to our peer\n\n@clay.gerrard@gmail.com the current fix refuses the timestamp mismatch and let the right node in the 8+4 handle it. E.g. if one stale node got ndata durable frags with newer remote.timestamp, and just refuse to rebuild it with local timestamp. since all primary nodes run the reconstructor, normally it should be just next cycle (or in parallel already) that the right node with newer timestamp will be able to rebuild the missing fragment. So it\u0027s just 12 frags reads IO wasted. It will be an improvement if the stale node rebuilds the missing frag, but probably not that much considering this case should be rare; also, the current reconstructor has a design of \"always stamp rebuilt data with local verified metadata\", this simple design choice probably has made a lot things easier.","commit_id":"dfea4f1cb4fa7e36a26a982d71cfb232bb2f83cf"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"cb7d13ee16420a7dc9fc0a4b5741dc7e84505e15","unresolved":true,"context_lines":[{"line_number":534,"context_line":"        :param buckets: dict of per-timestamp buckets for ok responses."},{"line_number":535,"context_line":"        :param error_responses: dict of per-status lists of error responses."},{"line_number":536,"context_line":"        :return: A per-timestamp with sufficient responses, or None if"},{"line_number":537,"context_line":"            there is no such bucket."},{"line_number":538,"context_line":"        \"\"\""},{"line_number":539,"context_line":"        policy \u003d job[\u0027policy\u0027]"},{"line_number":540,"context_line":"        partition \u003d job[\u0027partition\u0027]"}],"source_content_type":"text/x-python","patch_set":5,"id":"610d411a_71d35f85","line":537,"updated":"2026-03-12 16:18:39.000000000","message":"```\n:return: A bucket of at least ec_ndata responses that \nmatch the timestamp and EC-etag of the local diskfile (``df``)\ndata file, or None if there is no such bucket.\n```","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"ee4e642b0ccdf92fb16fbcbcd138f34a9b8fef15","unresolved":false,"context_lines":[{"line_number":534,"context_line":"        :param buckets: dict of per-timestamp buckets for ok responses."},{"line_number":535,"context_line":"        :param error_responses: dict of per-status lists of error responses."},{"line_number":536,"context_line":"        :return: A per-timestamp with sufficient responses, or None if"},{"line_number":537,"context_line":"            there is no such bucket."},{"line_number":538,"context_line":"        \"\"\""},{"line_number":539,"context_line":"        policy \u003d job[\u0027policy\u0027]"},{"line_number":540,"context_line":"        partition \u003d job[\u0027partition\u0027]"}],"source_content_type":"text/x-python","patch_set":5,"id":"157786b4_5784e7aa","line":537,"in_reply_to":"610d411a_71d35f85","updated":"2026-03-15 21:49:51.000000000","message":"Done","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"cb7d13ee16420a7dc9fc0a4b5741dc7e84505e15","unresolved":true,"context_lines":[{"line_number":694,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"},{"line_number":695,"context_line":"                    \u0027for %s frag#%s\u0027 %"},{"line_number":696,"context_line":"                    (timestamp.internal,"},{"line_number":697,"context_line":"                     bucket.timestamp.internal if bucket.timestamp"},{"line_number":698,"context_line":"                     else None,"},{"line_number":699,"context_line":"                     local_timestamp.internal,"},{"line_number":700,"context_line":"                     bucket.etag,"}],"source_content_type":"text/x-python","patch_set":5,"id":"2906104c_7aa9ce00","line":697,"range":{"start_line":697,"start_character":50,"end_line":697,"end_character":66},"updated":"2026-03-12 16:18:39.000000000","message":"A bucket must have a timestamp I think? And it\u0027s always equal to ``timestamp`` in this context","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"ee4e642b0ccdf92fb16fbcbcd138f34a9b8fef15","unresolved":false,"context_lines":[{"line_number":694,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"},{"line_number":695,"context_line":"                    \u0027for %s frag#%s\u0027 %"},{"line_number":696,"context_line":"                    (timestamp.internal,"},{"line_number":697,"context_line":"                     bucket.timestamp.internal if bucket.timestamp"},{"line_number":698,"context_line":"                     else None,"},{"line_number":699,"context_line":"                     local_timestamp.internal,"},{"line_number":700,"context_line":"                     bucket.etag,"}],"source_content_type":"text/x-python","patch_set":5,"id":"44fb8939_014fc8ab","line":697,"range":{"start_line":697,"start_character":50,"end_line":697,"end_character":66},"in_reply_to":"2906104c_7aa9ce00","updated":"2026-03-15 21:49:51.000000000","message":"Done","commit_id":"11a121868f91a9dbe41014763fece43ed2c958bf"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"be9373b7bf7de02a296ceb85910ae826722a7947","unresolved":true,"context_lines":[{"line_number":591,"context_line":"            if (bucket"},{"line_number":592,"context_line":"                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":593,"context_line":"                    and bucket.timestamp \u003d\u003d local_timestamp"},{"line_number":594,"context_line":"                    and bucket.etag \u003d\u003d local_etag):"},{"line_number":595,"context_line":"                useful_bucket \u003d bucket"},{"line_number":596,"context_line":"                break"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"c857a765_ccc98a7e","line":594,"updated":"2026-03-25 14:40:36.000000000","message":"these could be encapsulated as bucket properties/methods e.g.\n\n```\nif bucket.num_useful_responses \u003e\u003d policy.ec_ndata and bucket.matches(local_timestamp, local_etag):``\n    useful_bucket\u003dbucket\n```    \nthen later, in the warning condition, the same methods could be re-used:\n\n```\nif bucket.num_useful_responses \u003e\u003d policy.ec_ndata and not bucket.matches(local_timestamp, local_etag):``\n    # log warning\n```","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"394119362bf5dc5d7fba3dc988f982be81278b6c","unresolved":false,"context_lines":[{"line_number":591,"context_line":"            if (bucket"},{"line_number":592,"context_line":"                    and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":593,"context_line":"                    and bucket.timestamp \u003d\u003d local_timestamp"},{"line_number":594,"context_line":"                    and bucket.etag \u003d\u003d local_etag):"},{"line_number":595,"context_line":"                useful_bucket \u003d bucket"},{"line_number":596,"context_line":"                break"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"9234ffe9_62de6a99","line":594,"in_reply_to":"c857a765_ccc98a7e","updated":"2026-03-25 20:05:37.000000000","message":"Done","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"be9373b7bf7de02a296ceb85910ae826722a7947","unresolved":true,"context_lines":[{"line_number":691,"context_line":"            if (len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":692,"context_line":"                    and (bucket.timestamp !\u003d local_timestamp"},{"line_number":693,"context_line":"                         or bucket.etag !\u003d local_etag)):"},{"line_number":694,"context_line":"                self.logger.warning("},{"line_number":695,"context_line":"                    \u0027Received enough fragments at timestamp %s \u0027"},{"line_number":696,"context_line":"                    \u0027(bucket.timestamp\u003d%s) but local timestamp is %s, \u0027"},{"line_number":697,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"}],"source_content_type":"text/x-python","patch_set":6,"id":"0f92b398_139458e0","line":694,"range":{"start_line":694,"start_character":28,"end_line":694,"end_character":35},"updated":"2026-03-25 14:40:36.000000000","message":"this should probably be at same log level as the existing *error*","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"394119362bf5dc5d7fba3dc988f982be81278b6c","unresolved":false,"context_lines":[{"line_number":691,"context_line":"            if (len(bucket.useful_responses) \u003e\u003d policy.ec_ndata"},{"line_number":692,"context_line":"                    and (bucket.timestamp !\u003d local_timestamp"},{"line_number":693,"context_line":"                         or bucket.etag !\u003d local_etag)):"},{"line_number":694,"context_line":"                self.logger.warning("},{"line_number":695,"context_line":"                    \u0027Received enough fragments at timestamp %s \u0027"},{"line_number":696,"context_line":"                    \u0027(bucket.timestamp\u003d%s) but local timestamp is %s, \u0027"},{"line_number":697,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"}],"source_content_type":"text/x-python","patch_set":6,"id":"2024eccc_18bb7614","line":694,"range":{"start_line":694,"start_character":28,"end_line":694,"end_character":35},"in_reply_to":"0f92b398_139458e0","updated":"2026-03-25 20:05:37.000000000","message":"Acknowledged","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"be9373b7bf7de02a296ceb85910ae826722a7947","unresolved":true,"context_lines":[{"line_number":692,"context_line":"                    and (bucket.timestamp !\u003d local_timestamp"},{"line_number":693,"context_line":"                         or bucket.etag !\u003d local_etag)):"},{"line_number":694,"context_line":"                self.logger.warning("},{"line_number":695,"context_line":"                    \u0027Received enough fragments at timestamp %s \u0027"},{"line_number":696,"context_line":"                    \u0027(bucket.timestamp\u003d%s) but local timestamp is %s, \u0027"},{"line_number":697,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"},{"line_number":698,"context_line":"                    \u0027for %s frag#%s\u0027 %"}],"source_content_type":"text/x-python","patch_set":6,"id":"bcea5fad_f26d22ce","line":695,"updated":"2026-03-25 14:40:36.000000000","message":"could we make the message similar to the \u0027Unable to get enough responses...\u0027 message so that it\u0027s easy to search for both.\nAlso, include the response count\n\ne.g.\n ``Received enough responses (10/10 from 10 ok responses)...``\n \n cf\n\n ``Unable to get enough responses (9/10 from 9 ok responses)...``","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"394119362bf5dc5d7fba3dc988f982be81278b6c","unresolved":false,"context_lines":[{"line_number":692,"context_line":"                    and (bucket.timestamp !\u003d local_timestamp"},{"line_number":693,"context_line":"                         or bucket.etag !\u003d local_etag)):"},{"line_number":694,"context_line":"                self.logger.warning("},{"line_number":695,"context_line":"                    \u0027Received enough fragments at timestamp %s \u0027"},{"line_number":696,"context_line":"                    \u0027(bucket.timestamp\u003d%s) but local timestamp is %s, \u0027"},{"line_number":697,"context_line":"                    \u0027received etag %s but local etag is %s \u0027"},{"line_number":698,"context_line":"                    \u0027for %s frag#%s\u0027 %"}],"source_content_type":"text/x-python","patch_set":6,"id":"4e6da561_dc2e4f89","line":695,"in_reply_to":"bcea5fad_f26d22ce","updated":"2026-03-25 20:05:37.000000000","message":"Acknowledged","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"be9373b7bf7de02a296ceb85910ae826722a7947","unresolved":true,"context_lines":[{"line_number":710,"context_line":"                     bucket.num_responses,"},{"line_number":711,"context_line":"                     \u0027durable\u0027 if bucket.durable else \u0027non-durable\u0027,"},{"line_number":712,"context_line":"                     full_path, fi_to_rebuild, bucket.etag,"},{"line_number":713,"context_line":"                     bucket.timestamp.internal))"},{"line_number":714,"context_line":""},{"line_number":715,"context_line":"        if error_responses:"},{"line_number":716,"context_line":"            durable \u003d buckets[local_timestamp].durable"}],"source_content_type":"text/x-python","patch_set":6,"id":"7bc81feb_0da37b4d","line":713,"updated":"2026-03-25 14:40:36.000000000","message":"nit: it would be nice if these two logs could be unified into a single \"bucket wasn\u0027t useful\" message","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"18166d53aa60d165dc14d1509618112d4a07ab05","unresolved":false,"context_lines":[{"line_number":710,"context_line":"                     bucket.num_responses,"},{"line_number":711,"context_line":"                     \u0027durable\u0027 if bucket.durable else \u0027non-durable\u0027,"},{"line_number":712,"context_line":"                     full_path, fi_to_rebuild, bucket.etag,"},{"line_number":713,"context_line":"                     bucket.timestamp.internal))"},{"line_number":714,"context_line":""},{"line_number":715,"context_line":"        if error_responses:"},{"line_number":716,"context_line":"            durable \u003d buckets[local_timestamp].durable"}],"source_content_type":"text/x-python","patch_set":6,"id":"6fa30652_d45b564c","line":713,"in_reply_to":"7bc81feb_0da37b4d","updated":"2026-03-26 22:54:20.000000000","message":"the existing log line \"Unable to get enough responses...\" is being used by scripts to scan this kind of errors on prod, let\u0027s keep it for now.","commit_id":"46dde2a95c3813ca873ab7b63c0b087bc137f828"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":97,"context_line":"    Encapsulates fragment GET response data related to a single timestamp."},{"line_number":98,"context_line":"    \"\"\""},{"line_number":99,"context_line":""},{"line_number":100,"context_line":"    def __init__(self):"},{"line_number":101,"context_line":"        # count of all responses associated with this Bucket"},{"line_number":102,"context_line":"        self.num_responses \u003d 0"},{"line_number":103,"context_line":"        # map {frag_index: response} for subset of responses that could be used"}],"source_content_type":"text/x-python","patch_set":8,"id":"254f4eaa_6895b218","line":100,"updated":"2026-03-31 00:06:41.000000000","message":"as cute as it is to have this not take any params so we can use it with a default dict:\n\n```\nbuckets \u003d defaultdict(ResponseBucket)\n```\n\nI wonder if a `ResponseBucketCollection` would do a better job of encapsulating some of the logic around \"what EC policy is this and how big is ndata\" or \"If get a response with \u003d\u003d timestamp and !\u003d etag something is very wrong\"","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"be541568327fa862bccaaa4b28ada45a9fa5542d","unresolved":true,"context_lines":[{"line_number":97,"context_line":"    Encapsulates fragment GET response data related to a single timestamp."},{"line_number":98,"context_line":"    \"\"\""},{"line_number":99,"context_line":""},{"line_number":100,"context_line":"    def __init__(self):"},{"line_number":101,"context_line":"        # count of all responses associated with this Bucket"},{"line_number":102,"context_line":"        self.num_responses \u003d 0"},{"line_number":103,"context_line":"        # map {frag_index: response} for subset of responses that could be used"}],"source_content_type":"text/x-python","patch_set":8,"id":"524bd1ba_211744d0","line":100,"in_reply_to":"254f4eaa_6895b218","updated":"2026-03-31 06:00:43.000000000","message":"I am not sure what do you mean, would appreciate some example code.","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9892fa10c5bedcd024f6db876bdde021041e338a","unresolved":true,"context_lines":[{"line_number":97,"context_line":"    Encapsulates fragment GET response data related to a single timestamp."},{"line_number":98,"context_line":"    \"\"\""},{"line_number":99,"context_line":""},{"line_number":100,"context_line":"    def __init__(self):"},{"line_number":101,"context_line":"        # count of all responses associated with this Bucket"},{"line_number":102,"context_line":"        self.num_responses \u003d 0"},{"line_number":103,"context_line":"        # map {frag_index: response} for subset of responses that could be used"}],"source_content_type":"text/x-python","patch_set":8,"id":"a1c3a2a8_13a61a4e","line":100,"in_reply_to":"524bd1ba_211744d0","updated":"2026-03-31 16:42:42.000000000","message":"like the proxy does:\n\nhttps://github.com/NVIDIA/swift/blob/master/swift/proxy/controllers/obj.py#L2226","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":114,"context_line":"        if self.timestamp is None:"},{"line_number":115,"context_line":"            self.timestamp \u003d timestamp"},{"line_number":116,"context_line":"        if self.etag is None:"},{"line_number":117,"context_line":"            self.etag \u003d etag"},{"line_number":118,"context_line":""},{"line_number":119,"context_line":"    def matches(self, timestamp, etag):"},{"line_number":120,"context_line":"        return self.timestamp \u003d\u003d timestamp and self.etag \u003d\u003d etag"}],"source_content_type":"text/x-python","patch_set":8,"id":"939611f2_37671fb1","line":117,"updated":"2026-03-31 00:06:41.000000000","message":"I don\u0027t think it\u0027s *super* great to call this `setdefault` and be \"ok\" with\n\n```\n        bucket \u003d buckets[timestamp]\n        bucket.set_default(timestamp, etag)\n        bucket.num_responses +\u003d 1\n        if bucket.etag !\u003d etag:\n```\n\nLike if `buckets[(timestamp, etag)]` is empty we could create one before we `num_responses +\u003d 1`","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":676,"context_line":"        local_etag \u003d datafile_metadata.get(\u0027X-Object-Sysmeta-Ec-Etag\u0027)"},{"line_number":677,"context_line":"        path \u003d datafile_metadata[\u0027name\u0027]"},{"line_number":678,"context_line":""},{"line_number":679,"context_line":"        buckets \u003d defaultdict(ResponseBucket)  # map timestamp -\u003e Bucket"},{"line_number":680,"context_line":"        error_responses \u003d defaultdict(list)  # map status code -\u003e response list"},{"line_number":681,"context_line":""},{"line_number":682,"context_line":"        # don\u0027t try and fetch a fragment from the node we\u0027re rebuilding to"}],"source_content_type":"text/x-python","patch_set":8,"id":"77f25647_3c1577db","line":679,"updated":"2026-03-31 00:06:41.000000000","message":"maybe we should be mapping `(etag, timestmap) \u003d\u003e bucket`?","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":683,"context_line":"        useful_bucket \u003d self._make_fragment_requests("},{"line_number":684,"context_line":"            job, node, df, buckets, error_responses)"},{"line_number":685,"context_line":""},{"line_number":686,"context_line":"        if useful_bucket:"},{"line_number":687,"context_line":"            frag_indexes \u003d list(useful_bucket.useful_responses.keys())"},{"line_number":688,"context_line":"            self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":689,"context_line":"                              % (fi_to_rebuild, frag_indexes))"}],"source_content_type":"text/x-python","patch_set":8,"id":"052b1048_3443b34c","line":686,"updated":"2026-03-31 00:06:41.000000000","message":"here see seem to be defining \"is useful\" as \"has enough AND matches\"","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":683,"context_line":"        useful_bucket \u003d self._make_fragment_requests("},{"line_number":684,"context_line":"            job, node, df, buckets, error_responses)"},{"line_number":685,"context_line":""},{"line_number":686,"context_line":"        if useful_bucket:"},{"line_number":687,"context_line":"            frag_indexes \u003d list(useful_bucket.useful_responses.keys())"},{"line_number":688,"context_line":"            self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":689,"context_line":"                              % (fi_to_rebuild, frag_indexes))"}],"source_content_type":"text/x-python","patch_set":8,"id":"b43900f9_d16a2415","line":686,"in_reply_to":"052b1048_3443b34c","updated":"2026-04-03 23:51:27.000000000","message":"I renamed ``useful_bucket`` to ``matching_bucket`` to avoid confusion.","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":696,"context_line":"        full_path \u003d _full_path(node, partition, path, policy)"},{"line_number":697,"context_line":"        for _, bucket in sorted(buckets.items()):"},{"line_number":698,"context_line":"            if (bucket.is_useful(policy)"},{"line_number":699,"context_line":"                    and not bucket.matches(local_timestamp, local_etag)):"},{"line_number":700,"context_line":"                self.logger.error("},{"line_number":701,"context_line":"                    \u0027Received enough responses (%s/%s from %s ok responses) \u0027"},{"line_number":702,"context_line":"                    \u0027but not reconstructing %s %s frag#%s, \u0027"}],"source_content_type":"text/x-python","patch_set":8,"id":"20f72976_83a0969c","line":699,"updated":"2026-03-31 00:06:41.000000000","message":"here `bucket.is_useful` *clearly* does NOT include \"AND matches\"","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":696,"context_line":"        full_path \u003d _full_path(node, partition, path, policy)"},{"line_number":697,"context_line":"        for _, bucket in sorted(buckets.items()):"},{"line_number":698,"context_line":"            if (bucket.is_useful(policy)"},{"line_number":699,"context_line":"                    and not bucket.matches(local_timestamp, local_etag)):"},{"line_number":700,"context_line":"                self.logger.error("},{"line_number":701,"context_line":"                    \u0027Received enough responses (%s/%s from %s ok responses) \u0027"},{"line_number":702,"context_line":"                    \u0027but not reconstructing %s %s frag#%s, \u0027"}],"source_content_type":"text/x-python","patch_set":8,"id":"1d77a84e_17d46578","line":699,"in_reply_to":"20f72976_83a0969c","updated":"2026-04-03 23:51:27.000000000","message":"Acknowledged","commit_id":"818d4ed8641abb09bbbeb92338d4c76906bac115"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":681,"context_line":"                     local_timestamp.internal,"},{"line_number":682,"context_line":"                     useful_bucket.etag,"},{"line_number":683,"context_line":"                     local_etag,"},{"line_number":684,"context_line":"                     full_path, fi_to_rebuild))"},{"line_number":685,"context_line":"            frag_indexes \u003d list(useful_bucket.useful_responses.keys())"},{"line_number":686,"context_line":"            self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":687,"context_line":"                              % (fi_to_rebuild, frag_indexes))"}],"source_content_type":"text/x-python","patch_set":10,"id":"887ba75e_f4ec297d","side":"PARENT","line":684,"updated":"2026-03-31 00:06:41.000000000","message":"the situation on this patch chain is really confusing\n\nin the previous un-merged patch we added this new log message, a warning that we\u0027re about to write corrupt data\n\n... and yet here in the fix we remove it - why not add originally add the log message where we intended to apply the fix?  Why not just fix it here by logging this warning and raising a DiskFileError?","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9892fa10c5bedcd024f6db876bdde021041e338a","unresolved":true,"context_lines":[{"line_number":681,"context_line":"                     local_timestamp.internal,"},{"line_number":682,"context_line":"                     useful_bucket.etag,"},{"line_number":683,"context_line":"                     local_etag,"},{"line_number":684,"context_line":"                     full_path, fi_to_rebuild))"},{"line_number":685,"context_line":"            frag_indexes \u003d list(useful_bucket.useful_responses.keys())"},{"line_number":686,"context_line":"            self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":687,"context_line":"                              % (fi_to_rebuild, frag_indexes))"}],"source_content_type":"text/x-python","patch_set":10,"id":"6d067452_4e3cc478","side":"PARENT","line":684,"in_reply_to":"7a13f6f4_0033fd1f","updated":"2026-03-31 16:42:42.000000000","message":"\u003e we need those extra processing of received responses\n\ni\u0027m very sure that\u0027s not true; the example tests using a 2+3 policy don\u0027t convince me that it\u0027s *necessary* to ADD the behavior that we\u0027ll \"continue waiting past ndata\" *just* to start preventing data corruption.\n\nI would prefer we use the reconstruct_fa detection to raise DiskFileError directly and prevent data corruption NOW w/o any changes to response handling - and then eventually we could also maybe make 2+3 policies rebuild (if they\u0027re lucky?) - OR maybe we restrict digging around waiting on more responses (to handoffs?) only when our local timestamp is *newer*","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"be541568327fa862bccaaa4b28ada45a9fa5542d","unresolved":true,"context_lines":[{"line_number":681,"context_line":"                     local_timestamp.internal,"},{"line_number":682,"context_line":"                     useful_bucket.etag,"},{"line_number":683,"context_line":"                     local_etag,"},{"line_number":684,"context_line":"                     full_path, fi_to_rebuild))"},{"line_number":685,"context_line":"            frag_indexes \u003d list(useful_bucket.useful_responses.keys())"},{"line_number":686,"context_line":"            self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":687,"context_line":"                              % (fi_to_rebuild, frag_indexes))"}],"source_content_type":"text/x-python","patch_set":10,"id":"7a13f6f4_0033fd1f","side":"PARENT","line":684,"in_reply_to":"887ba75e_f4ec297d","updated":"2026-03-31 06:00:43.000000000","message":"yeah... we got here, because we tried to accommodate the checkpoint release of new jitter timestamp, and also we need those extra processing of received responses ``_handle_fragment_response``.","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":625,"context_line":"                    error_responses, resp)"},{"line_number":626,"context_line":"                if (bucket"},{"line_number":627,"context_line":"                        and bucket.is_useful(policy)"},{"line_number":628,"context_line":"                        and bucket.matches(local_timestamp, local_etag)):"},{"line_number":629,"context_line":"                    useful_bucket \u003d bucket"},{"line_number":630,"context_line":"                    self.logger.debug("},{"line_number":631,"context_line":"                        \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"6875c263_3bd51da4","line":628,"updated":"2026-03-31 00:06:41.000000000","message":"I guess it\u0027s a pre-existing problem that we have the same conditional repeated when evaluating primary responses and handoff responses.\n\nI just wish it was something closer to what we have in the proxy w/ `ECGetResponseCollection`\n\n```\nbest_bucket \u003d buckets.choose_best_bucket()\n```","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":629,"context_line":"                    useful_bucket \u003d bucket"},{"line_number":630,"context_line":"                    self.logger.debug("},{"line_number":631,"context_line":"                        \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":632,"context_line":"                        % node_count)"},{"line_number":633,"context_line":"                    break"},{"line_number":634,"context_line":"                elif self._is_quarantine_candidate("},{"line_number":635,"context_line":"                        policy, buckets, error_responses, df):"}],"source_content_type":"text/x-python","patch_set":10,"id":"77d30fb7_e847f08e","line":632,"updated":"2026-03-31 00:06:41.000000000","message":"it\u0027s interesting to me that the only situation in which we\u0027d rebuild from handoffs is when all the primaries were returning 404s","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":629,"context_line":"                    useful_bucket \u003d bucket"},{"line_number":630,"context_line":"                    self.logger.debug("},{"line_number":631,"context_line":"                        \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":632,"context_line":"                        % node_count)"},{"line_number":633,"context_line":"                    break"},{"line_number":634,"context_line":"                elif self._is_quarantine_candidate("},{"line_number":635,"context_line":"                        policy, buckets, error_responses, df):"}],"source_content_type":"text/x-python","patch_set":10,"id":"3738f7c8_29d6fdf4","line":632,"in_reply_to":"77d30fb7_e847f08e","updated":"2026-04-03 23:51:27.000000000","message":"Acknowledged","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"}],"test/probe/test_reconstructor_rebuild.py":[{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"e4d8f12426f450f05d93695cf82bea74d7fbe92d","unresolved":true,"context_lines":[{"line_number":38,"context_line":"from swiftclient import client, ClientException"},{"line_number":39,"context_line":""},{"line_number":40,"context_line":""},{"line_number":41,"context_line":"class Body(object):"},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"    def __init__(self, total\u003d3.5 * 2 ** 20):"},{"line_number":44,"context_line":"        self.total \u003d int(total)"}],"source_content_type":"text/x-python","patch_set":10,"id":"6f6bd5b1_99fa061c","side":"PARENT","line":41,"updated":"2026-03-28 23:38:10.000000000","message":"this is dead code. all test cases in this file use the ``Body`` in ``test.probe.common``, let\u0027s remove it to avoid confusion.","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":38,"context_line":"from swiftclient import client, ClientException"},{"line_number":39,"context_line":""},{"line_number":40,"context_line":""},{"line_number":41,"context_line":"class Body(object):"},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"    def __init__(self, total\u003d3.5 * 2 ** 20):"},{"line_number":44,"context_line":"        self.total \u003d int(total)"}],"source_content_type":"text/x-python","patch_set":10,"id":"a0ab4c77_d24d0f92","side":"PARENT","line":41,"in_reply_to":"6f6bd5b1_99fa061c","updated":"2026-03-31 00:06:41.000000000","message":"this comment is helpful, thank you.","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":38,"context_line":"from swiftclient import client, ClientException"},{"line_number":39,"context_line":""},{"line_number":40,"context_line":""},{"line_number":41,"context_line":"class Body(object):"},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"    def __init__(self, total\u003d3.5 * 2 ** 20):"},{"line_number":44,"context_line":"        self.total \u003d int(total)"}],"source_content_type":"text/x-python","patch_set":10,"id":"28d66e2f_9dc9a8ce","side":"PARENT","line":41,"in_reply_to":"a0ab4c77_d24d0f92","updated":"2026-04-03 23:51:27.000000000","message":"Acknowledged","commit_id":"d0b321a8e6a197531abca9f442fd9c5538f10d62"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"e4d8f12426f450f05d93695cf82bea74d7fbe92d","unresolved":true,"context_lines":[{"line_number":703,"context_line":"        self.break_nodes(self.onodes, self.opart, [fail_node_idx], [])"},{"line_number":704,"context_line":"        self.assert_direct_get_fails(fail_node, self.opart, 404)"},{"line_number":705,"context_line":""},{"line_number":706,"context_line":"        # Run only the stale node\u0027s device reconstructor. If we ran on all"},{"line_number":707,"context_line":"        # devices, a T2-holding node would be more likely to successfully"},{"line_number":708,"context_line":"        # rebuild fail node first, and reconstructor on stale node wouldn\u0027t be"},{"line_number":709,"context_line":"        # called and no mismatch error would be logged."}],"source_content_type":"text/x-python","patch_set":10,"id":"ab28c267_a3fd5dd6","line":706,"updated":"2026-03-28 23:38:10.000000000","message":"If I ran reconstructors on all nodes, I didn\u0027t see the mismatch error for a few runs. I think this is showing that, in production, there are more likely to have parallel rebuilds going on. So for the node who sees that the local timestamp/etag don\u0027t match the bucket timestamp/etag, it should just stop it and let other nodes to rebuild, and it probably would actually waste cpu cycles and send more unnecessary IOs if it rebuilds the missing frag based on remote timestamp. Anyway, the saved or wasted IOs/CPU should all be negligible considering this should be a very rare event in a large cluster.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"e4d8f12426f450f05d93695cf82bea74d7fbe92d","unresolved":true,"context_lines":[{"line_number":719,"context_line":"        # With the fix: bucket.matches(local_timestamp\u003dT1) fails because"},{"line_number":720,"context_line":"        # the bucket is at T2. Reconstruction is refused and the fail node"},{"line_number":721,"context_line":"        # should NOT have been rebuilt."},{"line_number":722,"context_line":"        self.assert_direct_get_fails(fail_node, self.opart, 404)"},{"line_number":723,"context_line":""},{"line_number":724,"context_line":"        # proxy GET should still return v2 from the T2 nodes"},{"line_number":725,"context_line":"        headers, etag \u003d self.proxy_get()"}],"source_content_type":"text/x-python","patch_set":10,"id":"fb85059e_26863365","line":722,"updated":"2026-03-28 23:38:10.000000000","message":"on master branch, I was able to reproduce the corrupted rebuild\n```\n        # the bucket is at T2. Reconstruction is refused and the fail node\n        # should NOT have been rebuilt.\n\u003e       self.assert_direct_get_fails(fail_node, self.opart, 404)\n\ntest/probe/test_reconstructor_rebuild.py:722:\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\ntest/probe/common.py:737: in assert_direct_get_fails\n    self.fail(\u0027Node data on %r was not fully destroyed!\u0027 % (onode,))\nE   AssertionError: Node data on {\u0027device\u0027: \u0027sdb8\u0027, \u0027id\u0027: 7, \u0027ip\u0027: \u0027127.0.0.4\u0027, \u0027meta\u0027: \u0027\u0027, \u0027port\u0027: 6040, \u0027region\u0027: 1, \u0027replication_ip\u0027: \u0027127.0.0.4\u0027, \u0027replication_port\u0027: 6040, \u0027weight\u0027: 1.0, \u0027zone\u0027: 4, \u0027index\u0027: 5} was not fully destroyed!\n----------------------------------------------------- Captured stdout call -----------------------------------------------------\n```","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"}],"test/unit/obj/test_reconstructor.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":5275,"context_line":"        # expect a warning about the timestamp mismatch"},{"line_number":5276,"context_line":"        error_lines \u003d self.logger.get_lines_for_level(\u0027error\u0027)"},{"line_number":5277,"context_line":"        self.assertEqual("},{"line_number":5278,"context_line":"            [\u0027Received enough responses (13/10 from %s ok responses) but not \u0027"},{"line_number":5279,"context_line":"             \u0027reconstructing non-durable 10.0.0.1:1001/sdb/0%s policy#0 \u0027"},{"line_number":5280,"context_line":"             \u0027frag#1, bucket timestamp: %s, local timestamp: %s, \u0027"},{"line_number":5281,"context_line":"             \u0027bucket etag: %s, local etag: %s (mismatch)\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"9d9f5208_007cddbe","line":5278,"updated":"2026-03-31 00:06:41.000000000","message":"this is a perfect example of a questionable behavior\n\nOn master we\u0027d have stopped at 10 responses - there\u0027s enough *matching* etag to rebuild - and that should be sufficient to detect we can\u0027t stick our local metadata on the frag rebuilt from this useful_bucket.\n\nBut with this change we\u0027re waiting (a little?) longer for all primary responses to finish.\n\nI\u0027m fine with \"fix it now optimize it later\" - but I would assume the fix would be \"raise `DiskFileError` instead of returning a corrupt `RebuildingECDiskFileStream`\" and the \"optimize later\" would be:\n\na) just use the metadata from the useful_bucket - everything we need is in the response headers, pulling the metadata from the local diskfile is just lazy/wrong\nb) leave the \"fix\" of `if useful_bucket.metadata !\u003d local_metadata: raise DiskfileError` ... but just notice that if \"nparity + 1 resp agree; the primaries are a loss\" (i.e. break after only 5 \"remote quorum but not local matching\" resp instead of 10 - but certainly not \"wait on an extra 3\" !???)\n\nI don\u0027t think the additional log information it\u0027s worth waiting on the additional primary responses.  We already know we\u0027re boned coming out of `_make_fragment_requests` - let\u0027s just leave that code alone for now and `raise DiskFileError` in `reconstruct_fa` (right where the parent patch adds the warning)","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"b41a02d75e4f0077e52e598223ed0c32ec9a3384","unresolved":true,"context_lines":[{"line_number":5275,"context_line":"        # expect a warning about the timestamp mismatch"},{"line_number":5276,"context_line":"        error_lines \u003d self.logger.get_lines_for_level(\u0027error\u0027)"},{"line_number":5277,"context_line":"        self.assertEqual("},{"line_number":5278,"context_line":"            [\u0027Received enough responses (13/10 from %s ok responses) but not \u0027"},{"line_number":5279,"context_line":"             \u0027reconstructing non-durable 10.0.0.1:1001/sdb/0%s policy#0 \u0027"},{"line_number":5280,"context_line":"             \u0027frag#1, bucket timestamp: %s, local timestamp: %s, \u0027"},{"line_number":5281,"context_line":"             \u0027bucket etag: %s, local etag: %s (mismatch)\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"a0a840d8_c7438c6f","line":5278,"in_reply_to":"3519ad13_2b8b32c9","updated":"2026-03-31 23:30:11.000000000","message":"added a new test ``test_reconstruct_fa_two_buckets_both_error_messages`` which will show two error messages if reconstructor sees two buckets with different timestamps: one at the wrong timestamp with enough responses and one at the local timestamp with too few, then we get both error messages. I think they are important if we want to find out if any other part of system went wrong potentially.\n\n```\n\u0027Unable to get enough responses (3/10 from 3 ok responses) to reconstruct non-durable 10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1 with ETag f0510d5479985e9523dc1688b0bc7d63 and timestamp 1774998972.00000\u0027,\n\u0027Received enough responses (10/10 from 10 ok responses) but not reconstructing non-durable 10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1, bucket timestamp: 1774998973.00000, local timestamp: 1774998972.00000, bucket etag: eca04c02876e71327ae3710a178067c6, local etag: f0510d5479985e9523dc1688b0bc7d63 (mismatch)\u0027\n```\n\nit would be also very interesting to see how those numbers change over time in the logs if mismatch happens. For example, if we have some frags on the handoffs, and revert jobs are moving them back to primaries. then we will see:\n```\nt1: Unable to get enough responses (1/10 from 1 ok responses)... Received enough responses (10/10 from 10 ok responses) but not reconstructing...\nt2: Unable to get enough responses (2/10 from 2 ok responses)... Received enough responses (10/10 from 10 ok responses) but not reconstructing...\nt3: Unable to get enough responses (3/10 from 3 ok responses)... Received enough responses (10/10 from 10 ok responses) but not reconstructing...\n```","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"be541568327fa862bccaaa4b28ada45a9fa5542d","unresolved":true,"context_lines":[{"line_number":5275,"context_line":"        # expect a warning about the timestamp mismatch"},{"line_number":5276,"context_line":"        error_lines \u003d self.logger.get_lines_for_level(\u0027error\u0027)"},{"line_number":5277,"context_line":"        self.assertEqual("},{"line_number":5278,"context_line":"            [\u0027Received enough responses (13/10 from %s ok responses) but not \u0027"},{"line_number":5279,"context_line":"             \u0027reconstructing non-durable 10.0.0.1:1001/sdb/0%s policy#0 \u0027"},{"line_number":5280,"context_line":"             \u0027frag#1, bucket timestamp: %s, local timestamp: %s, \u0027"},{"line_number":5281,"context_line":"             \u0027bucket etag: %s, local etag: %s (mismatch)\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"becebdc8_0a6b7d96","line":5278,"in_reply_to":"9d9f5208_007cddbe","updated":"2026-03-31 06:00:43.000000000","message":"\u003e but I would assume the fix would be \"raise DiskFileError instead of returning a corrupt RebuildingECDiskFileStream\"\n\n\nthis is what this patch is actually doing, as demonstrated by this test case, at line 5270: ``self.assertRaises(DiskFileError``.\n\n\n\u003e I don\u0027t think the additional log information it\u0027s worth waiting on the additional primary responses.\n\nwhy not? it doesn\u0027t hurt to process a few more received responses (as a matter of fact, it should finish processing all responses, since ``_is_quarantine_candidate`` needs them) and and it\u0027ll be great to include more complete info in order to dig out what\u0027s going on, otherwise we would regret \"oh no, we should have that info to help find out why this mismatch happened on prod\". For example, @tburke@nvidia.com suspected handoff revert job could be in the play for one case of corrupted EC object, if that\u0027s the case we would see the number ``13/10`` changing over time in those log messages emitted by the same reconstructor, because revert jobs copy the handoff frag back to the primaries. \n\nIn this test, ``Received enough responses (13/10 from %s ok responses)``, the number ``13/10`` was just chosen for this test. In production, it could be different cases. when those mismatch happens, it would be great to know 1. how many responses received. 2 how many buckets. 3. num of responses in each bucket (can be indirectly inferred from current log msg, but we can add more details).","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"9b294ef8fd114672db291d3160cf1a5fa93cf4e0","unresolved":true,"context_lines":[{"line_number":5275,"context_line":"        # expect a warning about the timestamp mismatch"},{"line_number":5276,"context_line":"        error_lines \u003d self.logger.get_lines_for_level(\u0027error\u0027)"},{"line_number":5277,"context_line":"        self.assertEqual("},{"line_number":5278,"context_line":"            [\u0027Received enough responses (13/10 from %s ok responses) but not \u0027"},{"line_number":5279,"context_line":"             \u0027reconstructing non-durable 10.0.0.1:1001/sdb/0%s policy#0 \u0027"},{"line_number":5280,"context_line":"             \u0027frag#1, bucket timestamp: %s, local timestamp: %s, \u0027"},{"line_number":5281,"context_line":"             \u0027bucket etag: %s, local etag: %s (mismatch)\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"3e6afa6a_302cdc81","line":5278,"in_reply_to":"a0a840d8_c7438c6f","updated":"2026-04-05 18:22:18.000000000","message":"Other than this observability improvement which it\u0027s necessary for debugging root cause IMHO, the continued loop is also needed for the case of ``Duplication factor \u003e 1``. \n\nWith ``ec_duplication_factor\u003d2``, each fragment index is stored on 2 nodes. For an EC 8+4 policy that means 24 primary nodes and 24 responses. If the first ec_ndata responses happen to be at the wrong timestamp (e.g., some nodes missed an overwrite), breaking early would miss the remaining responses that could assemble a matching bucket at the correct timestamp. The new test ``TestReconstructFragmentArchiveECDuplicationFactor::test_reconstruct_fa_wrong_bucket_first_finds_match_in_duplicates`` test demonstrates that the continued loop could find a matching bucket instead and allow reconstruction to happen, see: https://review.opendev.org/c/openstack/swift/+/978845/15/test/unit/obj/test_reconstructor.py#7222","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9892fa10c5bedcd024f6db876bdde021041e338a","unresolved":true,"context_lines":[{"line_number":5275,"context_line":"        # expect a warning about the timestamp mismatch"},{"line_number":5276,"context_line":"        error_lines \u003d self.logger.get_lines_for_level(\u0027error\u0027)"},{"line_number":5277,"context_line":"        self.assertEqual("},{"line_number":5278,"context_line":"            [\u0027Received enough responses (13/10 from %s ok responses) but not \u0027"},{"line_number":5279,"context_line":"             \u0027reconstructing non-durable 10.0.0.1:1001/sdb/0%s policy#0 \u0027"},{"line_number":5280,"context_line":"             \u0027frag#1, bucket timestamp: %s, local timestamp: %s, \u0027"},{"line_number":5281,"context_line":"             \u0027bucket etag: %s, local etag: %s (mismatch)\u0027"}],"source_content_type":"text/x-python","patch_set":10,"id":"3519ad13_2b8b32c9","line":5278,"in_reply_to":"becebdc8_0a6b7d96","updated":"2026-03-31 16:42:42.000000000","message":"\u003e this is what this patch is actually doing\n\nbut it could do it (more simply in one place, w/o changing `_make_frag_requests` response processing) in `reconstruct_fa`\n\n982735: raise DiskfileError on \"bad\" useful_bucket | https://review.opendev.org/c/openstack/swift/+/982735\n\n\u003e it doesn\u0027t hurt to process a few more received responses\n\nI agree it\u0027s not the cost of *processing* them - it\u0027s the cost of *waiting* on them - because the consistency engine needs to operate at scale small optimizations have a large impact\n\n\u003e _is_quarantine_candidate needs them\n\n`_is_quarnatine` prevents requests to handoffs unless all primary 404\n\nhttps://github.com/NVIDIA/swift/blob/master/swift/obj/reconstructor.py#L495\n\n... so that\u0027s why we see 10/13 instead of 10/26\n\n\u003e it\u0027ll be great to include more complete info in order to dig out what\u0027s going on\n\nit\u0027ll be GREAT to not write corrupt data, a log message when we don\u0027t do that happens will be really nice to have as we expect we\u0027ll need to dig in more\n\n\u003e can be indirectly inferred from current log msg, but we can add more details\n\nI think the most important part of the current log msg is `bucket timestamp: %s, local timestamp: %s,` - because we\u0027ll discover either that it\u0027s a timestamp collision or just an out-of-date primary.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":5290,"context_line":"        # Instead, it should keep processing remaining responses so that"},{"line_number":5291,"context_line":"        # the right-timestamp bucket also gets a chance to fill up."},{"line_number":5292,"context_line":"        #"},{"line_number":5293,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5294,"context_line":"        # (2 wrong-ts + 2 right-ts), the wrong-ts bucket fills to ec_ndata"},{"line_number":5295,"context_line":"        # first but the right-ts bucket also has enough to reconstruct."},{"line_number":5296,"context_line":"        custom_policy \u003d ECStoragePolicy("}],"source_content_type":"text/x-python","patch_set":10,"id":"d776c327_4ded4b96","line":5293,"updated":"2026-03-31 00:06:41.000000000","message":"I don\u0027t think testing a 2+3 schema is helpful; IMHO this test is bending over backwards trying to justify complexity that is not necessary in any realistic deployment and actively NOT HELPFUL in any deployment where npairty \u003c ndata.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":5290,"context_line":"        # Instead, it should keep processing remaining responses so that"},{"line_number":5291,"context_line":"        # the right-timestamp bucket also gets a chance to fill up."},{"line_number":5292,"context_line":"        #"},{"line_number":5293,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5294,"context_line":"        # (2 wrong-ts + 2 right-ts), the wrong-ts bucket fills to ec_ndata"},{"line_number":5295,"context_line":"        # first but the right-ts bucket also has enough to reconstruct."},{"line_number":5296,"context_line":"        custom_policy \u003d ECStoragePolicy("}],"source_content_type":"text/x-python","patch_set":10,"id":"c2ab90ca_7f2d6404","line":5293,"in_reply_to":"d776c327_4ded4b96","updated":"2026-04-03 23:51:27.000000000","message":"repeated points in other comments","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9c7b9a4a8567a44754277ff011dc0ca1dcd4e3d1","unresolved":true,"context_lines":[{"line_number":5413,"context_line":"        # responses so that the right-etag bucket (at the local timestamp)"},{"line_number":5414,"context_line":"        # also gets a chance to fill up."},{"line_number":5415,"context_line":"        #"},{"line_number":5416,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5417,"context_line":"        # (2 wrong-etag at wrong-ts + 2 right-etag at right-ts), the"},{"line_number":5418,"context_line":"        # wrong bucket fills to ec_ndata first but the right bucket also"},{"line_number":5419,"context_line":"        # has enough to reconstruct."}],"source_content_type":"text/x-python","patch_set":10,"id":"1badef98_53202f70","line":5416,"updated":"2026-03-31 00:06:41.000000000","message":"what?  I don\u0027t think a 2+3 policy is realistic.  Our docs specifically call out a 10+18 schema as inefficient:\n\nhttps://docs.openstack.org/swift/latest/overview_erasure_code.html#ec-duplication\n\nthis is not a case we should be optimizing for","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"be541568327fa862bccaaa4b28ada45a9fa5542d","unresolved":true,"context_lines":[{"line_number":5413,"context_line":"        # responses so that the right-etag bucket (at the local timestamp)"},{"line_number":5414,"context_line":"        # also gets a chance to fill up."},{"line_number":5415,"context_line":"        #"},{"line_number":5416,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5417,"context_line":"        # (2 wrong-etag at wrong-ts + 2 right-etag at right-ts), the"},{"line_number":5418,"context_line":"        # wrong bucket fills to ec_ndata first but the right bucket also"},{"line_number":5419,"context_line":"        # has enough to reconstruct."}],"source_content_type":"text/x-python","patch_set":10,"id":"763f35cf_31b98531","line":5416,"in_reply_to":"1badef98_53202f70","updated":"2026-03-31 06:00:43.000000000","message":"Initially, I was thinking of those possible upstream scenarios, and have the ``_handle_fragment_response`` loop to search for possible second useful bucket; since our prod won\u0027t have this, the extra processing of ``_handle_fragment_response`` only benefits the upstream user, so it\u0027s optional for us. But since last iteration, I realized we also need those extra processing of received responses because:\n\n1. ``_is_quarantine_candidate`` needs the complete picture. after all fragments being processed by ``_handle_fragment_response``, the quarantine candidate check ``_is_quarantine_candidate`` which runs immediately after this loop will have more complete information; if it bails early, the quarantine logic would see partial data (some responses unprocessed in the pile). Also, we may need to improve ``_is_quarantine_candidate`` in future, based on continuous learnings from those extra responses and log messages on prod.\n2. It\u0027ll be great/necessary to include more complete info in log messages in order to dig out what\u0027s going on if we see an occurrence of reconstructor prevention from building corrupted object.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"9892fa10c5bedcd024f6db876bdde021041e338a","unresolved":true,"context_lines":[{"line_number":5413,"context_line":"        # responses so that the right-etag bucket (at the local timestamp)"},{"line_number":5414,"context_line":"        # also gets a chance to fill up."},{"line_number":5415,"context_line":"        #"},{"line_number":5416,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5417,"context_line":"        # (2 wrong-etag at wrong-ts + 2 right-etag at right-ts), the"},{"line_number":5418,"context_line":"        # wrong bucket fills to ec_ndata first but the right bucket also"},{"line_number":5419,"context_line":"        # has enough to reconstruct."}],"source_content_type":"text/x-python","patch_set":10,"id":"898b085a_33aba4fb","line":5416,"in_reply_to":"763f35cf_31b98531","updated":"2026-03-31 16:42:42.000000000","message":"\u003e search for possible second useful bucket\n\nI think the main thing the loop was trying to accomplish is \"ask all primaries and see if we get a rebuild-able set of responses\"\n\nWe originally defined the stopping condition is \"ndata match\" - we\u0027ve now discovered an *earlier* stopping condition \"nparity+1 !\u003d local\" ... which is why I find it awkward that given this discovery this change decides to wait beyond the original ndata stopping condition all the way up to \"replica_count\"","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"9b294ef8fd114672db291d3160cf1a5fa93cf4e0","unresolved":true,"context_lines":[{"line_number":5413,"context_line":"        # responses so that the right-etag bucket (at the local timestamp)"},{"line_number":5414,"context_line":"        # also gets a chance to fill up."},{"line_number":5415,"context_line":"        #"},{"line_number":5416,"context_line":"        # Use a custom ec 2+3 policy so that with 4 primaries responses"},{"line_number":5417,"context_line":"        # (2 wrong-etag at wrong-ts + 2 right-etag at right-ts), the"},{"line_number":5418,"context_line":"        # wrong bucket fills to ec_ndata first but the right bucket also"},{"line_number":5419,"context_line":"        # has enough to reconstruct."}],"source_content_type":"text/x-python","patch_set":10,"id":"680ef78d_f8537b98","line":5416,"in_reply_to":"898b085a_33aba4fb","updated":"2026-04-05 18:22:18.000000000","message":"Without draining remaining responses, the error-reporting loop at lines 699-722 would only see one bucket (the wrong-timestamp one). Reconstructor can go around a few cycles already overnight, by the time when operator sees that single bucket error message and go to look at the object, the situation of those frags could already changed a lot. \n\nBy letting all primary responses complete, we get the full picture, e.g., \"bucket at T2 had enough frags but mismatched\" AND \"bucket at T1 only got 2/10 responses.\" An operator seeing both messages can immediately understand the object state, rather than having to guess why reconstruction failed. The ``test_reconstruct_fa_two_buckets_both_error_messages`` test validates exactly this, see: https://review.opendev.org/c/openstack/swift/+/978845/15/test/unit/obj/test_reconstructor.py#5410\n\nAnd the loop is still cheap (it just processes the already-spawned responses sitting in the GreenAsyncPile) and yields the observability benefit. It doesn\u0027t spawn any new requests.","commit_id":"57d6a7c7713218ed2a2bf5fdb0c5cf2b8ce9305f"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"fc119385f35e078559df0459f219a8f695b9f8b6","unresolved":true,"context_lines":[{"line_number":6859,"context_line":"        self.assertIn(\u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":6860,"context_line":"                      % (self.policy.object_ring.replicas * 2), debug_lines)"},{"line_number":6861,"context_line":""},{"line_number":6862,"context_line":"    def test_reconstruct_fa_handoffs_mismatched_timestamp(self):"},{"line_number":6863,"context_line":"        # Verify that when all primaries 404 except the local frag (quarantine"},{"line_number":6864,"context_line":"        # candidate), and handoffs return enough fragments at a WRONG"},{"line_number":6865,"context_line":"        # timestamp, the reconstructor refuses to rebuild. set reclaim_age to 0"}],"source_content_type":"text/x-python","patch_set":11,"id":"7ad03737_0ed58399","line":6862,"updated":"2026-04-01 04:54:25.000000000","message":"wait, those two new test cases failed with ``TestReconstructFragmentArchiveECDuplicationFactor``, oh, it\u0027s the ``ec_duplication_factor\u003d2``, so numbers won\u0027t match. will fix it tomorrow.\n\n```\nE       AssertionError: \u0027Received enough responses (26/10 from 26 ok responses) but not \nreconstructing non-durable 10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1, bucket timestamp: \n1775001194.00000, local timestamp: 1775001193.00000, bucket etag: \neca04c02876e71327ae3710a178067c6, local etag: f0510d5479985e9523dc1688b0bc7d63 (mismatch)\u0027 not \nfound in [\u0027Unable to get enough responses (1/10 from 1 ok responses) to reconstruct non-durable\n 10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1 with ETag f0510d5479985e9523dc1688b0bc7d63 and \n timestamp 1775001193.00000\u0027, \u0027Received enough responses (13/10 from 26 ok responses) but not\n  reconstructing non-durable 10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1, bucket timestamp: \n  1775001194.00000, local timestamp: 1775001193.00000, bucket etag: \n  eca04c02876e71327ae3710a178067c6, local etag: f0510d5479985e9523dc1688b0bc7d63 (mismatch)\u0027,\n   \u0027Unable to get enough responses (28 x 404 error responses) to reconstruct non-durable \n   10.0.0.1:1001/sdb/0/a/c/o policy#0 frag#1\u0027]\n   \n```","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":6859,"context_line":"        self.assertIn(\u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":6860,"context_line":"                      % (self.policy.object_ring.replicas * 2), debug_lines)"},{"line_number":6861,"context_line":""},{"line_number":6862,"context_line":"    def test_reconstruct_fa_handoffs_mismatched_timestamp(self):"},{"line_number":6863,"context_line":"        # Verify that when all primaries 404 except the local frag (quarantine"},{"line_number":6864,"context_line":"        # candidate), and handoffs return enough fragments at a WRONG"},{"line_number":6865,"context_line":"        # timestamp, the reconstructor refuses to rebuild. set reclaim_age to 0"}],"source_content_type":"text/x-python","patch_set":11,"id":"b11d2557_c023f687","line":6862,"in_reply_to":"7ad03737_0ed58399","updated":"2026-04-03 23:51:27.000000000","message":"Done","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e9526853c46da517a741384df191e6896855654b","unresolved":true,"context_lines":[{"line_number":6897,"context_line":"                responses.append((200, body, headers))"},{"line_number":6898,"context_line":"                # frag#1 is being rebuilt; insert 404s for remaining primaries"},{"line_number":6899,"context_line":"                responses.extend("},{"line_number":6900,"context_line":"                    ((404, None, None),) * self.policy.object_ring.replicas)"},{"line_number":6901,"context_line":"            else:"},{"line_number":6902,"context_line":"                # handoffs return frags at a different timestamp/etag"},{"line_number":6903,"context_line":"                headers.update({"}],"source_content_type":"text/x-python","patch_set":11,"id":"fdaef9c5_d678657b","line":6900,"updated":"2026-04-01 23:33:19.000000000","message":"ok, so we append the frag[0] valid response with correct etag; then we *extend* another replica count 404s\n\n```\n(Pdb) len(responses)\n27\n(Pdb) len(ec_archive_bodies)\n13\n```\n\n^ isn\u0027t that going to make one of the handoffs 404?  shouldn\u0027t the maximum request count be 2 * replica?  how is this test not hitting a \"un-used response\" error?","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":6897,"context_line":"                responses.append((200, body, headers))"},{"line_number":6898,"context_line":"                # frag#1 is being rebuilt; insert 404s for remaining primaries"},{"line_number":6899,"context_line":"                responses.extend("},{"line_number":6900,"context_line":"                    ((404, None, None),) * self.policy.object_ring.replicas)"},{"line_number":6901,"context_line":"            else:"},{"line_number":6902,"context_line":"                # handoffs return frags at a different timestamp/etag"},{"line_number":6903,"context_line":"                headers.update({"}],"source_content_type":"text/x-python","patch_set":11,"id":"043dbd1d_a4576de8","line":6900,"in_reply_to":"fdaef9c5_d678657b","updated":"2026-04-03 23:51:27.000000000","message":"yes, there were extra two 404 requests leaked into handoff responses. And 27 responses were all consumed, so no leftovers for mocked_http_conn to complain about. The 2 leaked 404s just happen to be absorbed by the handoff request processing logic. I got this fixed.","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e9526853c46da517a741384df191e6896855654b","unresolved":true,"context_lines":[{"line_number":6904,"context_line":"                    \u0027X-Object-Sysmeta-Ec-Etag\u0027: wrong_etag,"},{"line_number":6905,"context_line":"                    \u0027X-Backend-Data-Timestamp\u0027: wrong_timestamp.internal,"},{"line_number":6906,"context_line":"                })"},{"line_number":6907,"context_line":"                responses.append((200, body, headers))"},{"line_number":6908,"context_line":""},{"line_number":6909,"context_line":"        codes, body_iter, headers \u003d zip(*responses)"},{"line_number":6910,"context_line":"        with mocked_http_conn("}],"source_content_type":"text/x-python","patch_set":11,"id":"d2582628_99feab33","line":6907,"updated":"2026-04-01 23:33:19.000000000","message":"is this associating the \"wrong_etag\" with a frag body that was constructed with test_data?  Like this is already a invalid/corrupt/inconsistent/unrealistic/non-nonsensical fake response?","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"2ed82563742524ac2115dd97a4f8375c44b871e9","unresolved":false,"context_lines":[{"line_number":6904,"context_line":"                    \u0027X-Object-Sysmeta-Ec-Etag\u0027: wrong_etag,"},{"line_number":6905,"context_line":"                    \u0027X-Backend-Data-Timestamp\u0027: wrong_timestamp.internal,"},{"line_number":6906,"context_line":"                })"},{"line_number":6907,"context_line":"                responses.append((200, body, headers))"},{"line_number":6908,"context_line":""},{"line_number":6909,"context_line":"        codes, body_iter, headers \u003d zip(*responses)"},{"line_number":6910,"context_line":"        with mocked_http_conn("}],"source_content_type":"text/x-python","patch_set":11,"id":"adcdcc87_25edca2a","line":6907,"in_reply_to":"d2582628_99feab33","updated":"2026-04-03 23:51:27.000000000","message":"I reused the test_data since this test won\u0027t check them due to mismatch. To avoid the confusion, I got this fixed.","commit_id":"a2e1b5dc37653d7639f90b0e8b403a5f7a556655"}]}
