)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"immediately bomb out."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"1e005a79_a00d7e4a","line":15,"updated":"2022-06-27 21:55:22.000000000","message":"is that what happens?  do you demonstrate this behavior with tests?","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e655bfd048635910aa089ff8cfc0bb6a33eea507","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"immediately bomb out."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"55a0bb46_035a60f9","line":15,"in_reply_to":"1e005a79_a00d7e4a","updated":"2022-06-28 22:27:30.000000000","message":"Huh. I think I must\u0027ve thought the sync would raise an exception or something for some reason -- not sure why though.\n\nThe actual behavior (carry on and try to sync the next batch) seems pretty reasonable, though -- may as well try the next batch, as long as we\u0027re not deleting the whole partition at the end.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"immediately bomb out."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"8b84c083_4720278c","line":15,"in_reply_to":"55a0bb46_035a60f9","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":10,"context_line":"change as we gain more operational experience."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"6b0edd86_83379c48","line":13,"updated":"2024-01-26 00:49:21.000000000","message":"\"willing\" here means \"optionally configurable\"","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":10,"context_line":"change as we gain more operational experience."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"8c1faea9_0e3bb6be","line":13,"in_reply_to":"6b0edd86_83379c48","updated":"2024-01-27 00:24:21.000000000","message":"I mean, I wouldn\u0027t mind moving toward a default of 10 or so...","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":10,"context_line":"change as we gain more operational experience."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"998397e7_a1b20bd9","line":13,"in_reply_to":"8c1faea9_0e3bb6be","updated":"2024-02-06 15:38:48.000000000","message":"can you maybe talk to SRE about maybe turning this ON in prod before we go start changing upstream defaults?","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":10,"context_line":"change as we gain more operational experience."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"90e173d3_262cdb77","line":13,"in_reply_to":"998397e7_a1b20bd9","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"631fa1d3_33bef853","line":15,"updated":"2024-01-26 00:49:21.000000000","message":"ok, so it\u0027s revert specific, probably mainly useful in rebalance when this handoff partition is big and dense, i.e. \"old primary\".\n\nI\u0027m guessing SSYNC will clean up suffixes (objects?) as it goes; while rsync will leave everything on disk until the end - but maybe this change improves rsync to work more like SSYNC?\n\ndoes the replicator have a thing it calls \"revert\" - I was thinking reconstructor/ec as soon as I saw \"revert\" - I feel like we\u0027re borrowing language from the reconstructor/ssync","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"98306813_5ca27eca","line":15,"in_reply_to":"0e1d1876_67155402","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"a6fe24bd_c1a5a292","line":15,"in_reply_to":"330489c0_7daf596d","updated":"2024-02-06 15:38:48.000000000","message":"yes, I\u0027ll buy \"revert\" as a better concept/term than \"update_delted\" - that\u0027s probably why the reconstructor introduced the concept.\n\nMaybe as a prefactor (or follow-on; or as part of this change) we could update the language in the repliator to consistently use \"revert\" instead of \"update_deleted\" - so that we\u0027re not mixing metaphores and making this code schizophrinic.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"330489c0_7daf596d","line":15,"in_reply_to":"631fa1d3_33bef853","updated":"2024-01-27 00:24:21.000000000","message":"\u003e probably mainly useful in rebalance when this handoff partition is big and dense, i.e. \"old primary\".\n\n100%\n\n\u003e I\u0027m guessing SSYNC will clean up suffixes (objects?) as it goes\n\nNope! It\u0027s got https://github.com/openstack/swift/blob/2.32.0/swift/obj/replicator.py#L564-L569 but that\u0027s only after the whole partition is handled. The good news is that it will do partial deletes for partial successes, but that\u0027s not the same thing.\n\n\u003e maybe this change improves rsync to work more like SSYNC?\n\nKinda sorta -- rsync gets the partial delete for partial success like ssync, but the multiple delete opportunities per part per cycle benefits both.\n\n\u003e I feel like we\u0027re borrowing language from the reconstructor/ssync\n\nYeah, I am... `revert` seems way more descriptive than `update_deleted`. Maybe that\u0027s just me, though.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":12,"context_line":"Still try to revert all handoff data in the first pass -- just be"},{"line_number":13,"context_line":"willing to break it up into (randomized) batches so we can show progress"},{"line_number":14,"context_line":"(and delete data!) more frequently. If any of the batches fail,"},{"line_number":15,"context_line":"continue to the next batch but don\u0027t delete the partition."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"0e1d1876_67155402","line":15,"in_reply_to":"a6fe24bd_c1a5a292","updated":"2024-02-07 07:33:47.000000000","message":"https://review.opendev.org/c/openstack/swift/+/908252","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"},{"line_number":19,"context_line":"per-batch."},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":22,"context_line":"sneak in and grab an rsync connection slot."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"5d78d747_0e86946f","line":19,"updated":"2024-01-26 00:49:21.000000000","message":"this sounds like either a sneaky hack or the heart of the bug","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"},{"line_number":19,"context_line":"per-batch."},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":22,"context_line":"sneak in and grab an rsync connection slot."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"6e9d9164_0f87d020","line":19,"in_reply_to":"4a857ff7_7827c702","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":16,"context_line":""},{"line_number":17,"context_line":"This reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":18,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"},{"line_number":19,"context_line":"per-batch."},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":22,"context_line":"sneak in and grab an rsync connection slot."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"4a857ff7_7827c702","line":19,"in_reply_to":"5d78d747_0e86946f","updated":"2024-01-27 00:24:21.000000000","message":"Nah -- increasing timeouts won\u0027t help with FS errors that lead to a full-weight partition that hangs around indefinitely.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":19,"context_line":"per-batch."},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":22,"context_line":"sneak in and grab an rsync connection slot."},{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"1e14fec3_747c48f4","line":22,"updated":"2024-01-26 00:49:21.000000000","message":"i guess that\u0027s good","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":19,"context_line":"per-batch."},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":22,"context_line":"sneak in and grab an rsync connection slot."},{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"d9888747_eee0d8a0","line":22,"in_reply_to":"1e14fec3_747c48f4","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"},{"line_number":26,"context_line":"to None."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":29,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"74c83e52_485463dd","line":26,"updated":"2024-01-26 00:49:21.000000000","message":"WTF, this sounds like a drive-by","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"},{"line_number":26,"context_line":"to None."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":29,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"435ebc50_fad55d40","line":26,"in_reply_to":"345ec955_2a48f22b","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"},{"line_number":26,"context_line":"to None."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":29,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"fd5cc836_fd86b0d6","line":26,"in_reply_to":"74c83e52_485463dd","updated":"2024-01-27 00:24:21.000000000","message":"Otherwise there\u0027s this almost unbounded window of state on disk not matching state in hashes.pkl -- seems risky to me.\n\n1. If you\u0027re monitoring handoff suffixes (rather than just parts), it seems way cheaper to peek in hashes.pkl than call `os.listdir`\n\n   And you definitely want to be monitoring suffixes for rebalance completion -- it better captures the level of work still required.\n\n2. If you\u0027ve got some nodes with stale rings, you can get a handoff REPLICATEing with a node that it thinks is a primary but is actually a handoff. Having the replicator trust that hash seems bad when we know the data\u0027s been deleted.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":23,"context_line":""},{"line_number":24,"context_line":"Add a new interface to the hashes.invalid API -- if the line starts with"},{"line_number":25,"context_line":"\u0027!\u0027, remove the suffix during consolidation rather than just setting it"},{"line_number":26,"context_line":"to None."},{"line_number":27,"context_line":""},{"line_number":28,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":29,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":12,"id":"345ec955_2a48f22b","line":26,"in_reply_to":"fd5cc836_fd86b0d6","updated":"2024-02-06 15:38:48.000000000","message":"\u003e Otherwise there\u0027s this almost unbounded window of state on disk not matching state in hashes.pkl\n\nthat\u0027s mostly an effect of turning on this option; otherwise all the suffixes stay on disk until you finish the partition rsync - which is by far the common case, happening successfully in swift clusters everywhere since the begining of swift.\n\n\u003e And you definitely want to be monitoring suffixes for rebalance completion\n\nI think especially so if you start breaking rsync into batches so that 10% of part moves here, then gets rejected for concurrency so that another machine can move 10% of *it\u0027s* part.\n\n\u003e you can get a handoff REPLICATEing with a node that it thinks is a primary\n\nI don\u0027t think there\u0027s any reasonable or useful suffix has to return from  REPLICATE call here; MAYBE you could make a case for \"the last hash you had before you started trying to evict this part\" is helpful if there\u0027s a \"quick jitter\" as new rings are still getting reloaded around the cluster.\n\n\u003e Having the replicator trust that hash seems bad when we know the data\u0027s been deleted.\n\nagain, if the stale ring is going to push data to us regardless I\u0027m not sure there\u0027s much we can do.  We\u0027ll create the suffix and presumably pick it up on the post sync REPLICATE rehash.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":25,"context_line":"  to prevent quarantining during auditing. In such a case, the entire"},{"line_number":26,"context_line":"  partition would previously hang around indefinitely; with batching,"},{"line_number":27,"context_line":"  the partition would gradually drain, leaving behind only the bad"},{"line_number":28,"context_line":"  suffixes."},{"line_number":29,"context_line":""},{"line_number":30,"context_line":"It also reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":31,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"10d02b9b_933cac34","line":28,"updated":"2024-02-07 16:56:50.000000000","message":"super important and helpful WHY here in the commit message - KUDOS!  this \"whole part gets stuck\" problem actually came up recently in an ops call!","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":29,"context_line":""},{"line_number":30,"context_line":"It also reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":31,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"},{"line_number":32,"context_line":"per-batch."},{"line_number":33,"context_line":""},{"line_number":34,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":35,"context_line":"sneak in and grab an rsync connection slot."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"3747baaa_a6b317e3","line":32,"updated":"2024-02-07 16:56:50.000000000","message":"I think this is accurate and the semantic update of the timeout is important to to call out - but if ops sees an rsync timeout should they increase the rsync timeout or start using more batches - they seem kind of orthogonal.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"675a41717bf856e3ebfbf9ba858ccf4dfbdcf01b","unresolved":true,"context_lines":[{"line_number":29,"context_line":""},{"line_number":30,"context_line":"It also reduces the likelihood of timeouts (either rsync_timeout or"},{"line_number":31,"context_line":"http_timeout during REPLICATE calls). These timeouts now apply"},{"line_number":32,"context_line":"per-batch."},{"line_number":33,"context_line":""},{"line_number":34,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":35,"context_line":"sneak in and grab an rsync connection slot."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"2b732654_f93735fd","line":32,"in_reply_to":"3747baaa_a6b317e3","updated":"2026-03-05 00:44:13.000000000","message":"I mean, what should they do when they see an rsync timeout today? Increase the timeout, or work on doing a part-power increase?\n\nI think in all cases, it\u0027s going to depend upon the state of the cluster and ops are going to need to use their knowledge of that (and general experience) to decide what the appropriate action to take is.\n\nAnd really, they\u0027ll probably want to fiddle with *both*. Which there\u0027s ample precedent for: you almost certainly don\u0027t want to go tuning `rsync_bwlimit` without touching `rsync_timeout`, for example.\n\nFWIW, I feel like this plus some future change to set the default number of batches to 10 or so will make our default `rsync_timeout` a lot more reasonable for modern drives. A 15min timeout per partition with ~100 parts per disk, assuming they\u0027re say 70% full? About right for 20T drives -- if you let rsync take the full 150MB/s you can get out of that spinner.\n\nBut let\u0027s assume you still want to allow *some* client-driven traffic -- so you cap rsync bandwidth to 50MB/s, and now that timeout would only be appropriate for a 7T drive. 10 batches per part makes it good for a 50T drive, even when it\u0027s 90% full.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":32,"context_line":"per-batch."},{"line_number":33,"context_line":""},{"line_number":34,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":35,"context_line":"sneak in and grab an rsync connection slot."},{"line_number":36,"context_line":""},{"line_number":37,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":38,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"1ef7017c_f88596c2","line":35,"updated":"2024-02-07 16:56:50.000000000","message":"I\u0027m not really sure I see this as an obviously good thing - but maybe it\u0027s important to call out as a concern and this statement doesn\u0027t obvisn\u0027t intended to make any judgement on the possible benifit or inefficiency of switching the horse mid-stream or multiplexing nodes.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"675a41717bf856e3ebfbf9ba858ccf4dfbdcf01b","unresolved":true,"context_lines":[{"line_number":32,"context_line":"per-batch."},{"line_number":33,"context_line":""},{"line_number":34,"context_line":"Additionally, it gives periodic opportunities for other replicators to"},{"line_number":35,"context_line":"sneak in and grab an rsync connection slot."},{"line_number":36,"context_line":""},{"line_number":37,"context_line":"See also: https://review.opendev.org/c/openstack/swift/+/818017"},{"line_number":38,"context_line":"Change-Id: Ie12dcb4e524ee3af3ddc2b70c027fb62f68720fa"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"29997aa6_b22a096f","line":35,"in_reply_to":"1ef7017c_f88596c2","updated":"2026-03-05 00:44:13.000000000","message":"I think mainly, I tend to assume any long-held lock is a bad thing. That\u0027s part of why we set rsync timeouts, right?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"04b44b3265aad3ee583e4fa62cc19d15243e39e6","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"5369a563_88adcf3b","updated":"2022-05-23 16:23:44.000000000","message":"Thanks for the review, Charles! A clear sign that I should write some tests 😄","commit_id":"c006f2120d4cf7a2f057af4b7d1e1825063bcee9"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"b6402ab7dc35cc10b0d18fe95d81bf950236a062","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":5,"id":"41997eba_a4722998","updated":"2022-06-24 20:21:01.000000000","message":"Switched to batches-per-revert for the config option instead of suffixes-per-batch -- this way, ops shouldn\u0027t need to touch the config as much. For example, suppose you\u0027ve got a handful of suffix dirs that raise ENODATA when you try to stat or list them, in a partition that\u0027s got a few thousand suffixes. You take it as a sign that the disk is failing, and try to drain the disk before removal to save some effort.\n\nOn master, we\u0027d hand all the suffixes over to rsync, which would eventually fail. Some data may have moved, but the failure meant that we wouldn\u0027t delete any of it, so the disk would never fully drain.\n\nWith the original config option, you could set rsync_max_suffixes_per_revert to something like 60 and expect to need 30-50 batches to move everything. If you hit an ENODATA, say, 10 batches in, the next cycle again tries to move 60 suffixes per batch, but now only needs to move 20-40 batches\u0027 worth since we delete what we can after each batch. Replication will continue to choke on that ENODATA, but the suffix shuffling should mean you\u0027ll continue to make progress -- up to a point. Once all remaining suffixes fit in a single batch, you never get a clean rsync. You have to adjust the suffixes-per-revert down if you want to continue deleting the remaining good data.\n\nNow, you could set rsync_batches_per_revert to something like 50, so on the first pass batches are around 40-60 suffixes each. Let\u0027s say you hit that ENODATA around halfway through -- the next pass, you\u0027re down to 20-30 suffixes per batch since we don\u0027t have as many total suffixes. Replication will continue failing, but the batch size will keep decreasing as long as we\u0027re deleting some data. As long as you\u0027ve got less than 50 bad suffixes, all the good data should get replicated off eventually.","commit_id":"ad0774e5b5c48ee4813a608958ce123766d0374b"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"92a44bf2_cadbd30f","updated":"2022-06-27 21:55:22.000000000","message":"This seems like a reaonably well justified idea - i can imagine it helping.\n\nI\u0027ll have to try to get probetests passing with a non-default value!","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"d105576acd796a4999c46b8c5eebad3ab8a3037f","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":10,"id":"5ad74237_0afd8232","updated":"2022-10-24 22:24:08.000000000","message":"Just a rebase -- outstanding issues/questions are still valid:\n\n- Add some \"N of M batches\" indicator to per-sync logging\n- Should the `rsync` stat indicate \"peers rsynced to\" or \"number of rsync invocations\"?\n- Do we really need to log every successful batch that actually sends data?","commit_id":"5bb0417186e26b578f95efbf053818d673883c84"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":12,"id":"fdb8312c_0fb33f07","updated":"2024-01-26 00:49:21.000000000","message":"I don\u0027t understand what\u0027s going on with hashes.pkl; my ignorace aside - the new invalid file format doesn\u0027t seem to be explicitly tested.\n\nFWIW, I don\u0027t think we\u0027re actually using this (I know ssync can be slow and get stuck, but does good \u0027ol replicator rsync really have the timeout problem that can\u0027t be fixed with a bigger rsync_timeout!?)\n\nI think before we add new behaviors to this method we should try to break it up some; in particular if bifrication of rsync/ssync was more explicit the weird multi region ssync candidates stuff might keep it bugs out of perfectly reliable rsync code AND we could consider if it would be better to use max_objects_per_revert with ssync like reconstructor already does.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":13,"id":"7e5cc309_b9c9e094","updated":"2024-02-06 15:38:48.000000000","message":"since our prod tooling uses os.listdir to monitor suffixes and you\u0027ve confirmed the only requirement for the \"remove suffix from hashes.pkl\" is out-of-tree AND you expect adding the capability will extra suffix rehash on upgrade - I would still like you to pull it out to a follow on change (there\u0027s pleanty enough going on here as it is)\n\nI think \"break partitions into suffix batches\" is a reasonable idea in principle; if we had some operatoional experience with the feature and could demonstrate it\u0027s value in practice I\u0027d be more inclined to help invest in improving the telemetry/logging (which I DO think this patch makes \"worse\" by changing it\u0027s semantic meaning).\n\nFudementally I think \"progressive delete\" is a justifiable improvement, perhaps mainly for ENODATA - but at the end of the day we want to delete the whole part anyway.  If the primary motivation is \"improve rebalance disk-freeing in a crunch\" - I don\u0027t understand why you\u0027d START with the replicator tho - it\u0027s been working fine for +10 years (ops can make rsync go FAST; fast enough to tank disk-io); so why experiment with optimizations to rebalance HERE instead of the reconstructor (where we routinely have to disable primary partition syncing during a rebalance so there\u0027s obvious value to making it faster or more robust in the face of contention)?\n\nI\u0027m still really bothered by the interaction of suffix batching and ssync partial success; short of a more significant refactor can you at least provide feedback on my follow-up (which I think is a reasonable improvement):\n\n906770: confused about ssync delete_objs | https://review.opendev.org/c/openstack/swift/+/906770","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":13,"id":"8757c2cb_6c7adf87","in_reply_to":"7e5cc309_b9c9e094","updated":"2024-02-07 07:33:47.000000000","message":"\u003e  If the primary motivation is \"improve rebalance disk-freeing in a crunch\" - I don\u0027t understand why you\u0027d START with the replicator tho\n\nWe didn\u0027t -- this was in no small part inspired by https://review.opendev.org/c/openstack/swift/+/818017 and seeing that we added an option to the reconstructor that had no equivalent in the replicator.\n\n\u003e it\u0027s been working fine for +10 years (ops can make rsync go FAST; fast enough to tank disk-io)\n\nBut\n\n1. Disk size hasn\u0027t stood still for those years -- given our heuristic target of ~100 parts/disk, we\u0027ve gone from a replicanth potentially weighing 20G to 200G! Maybe that\u0027s a sign that we should be targeting ~500 or ~1000 parts per disk now, but part power increases are operationally intensive and require that you shut down replication *entirely* (so ops may well need to deal with the bigger cluster-full fire *first*).\n2. If we\u0027re saturating disk IO *anyway*, I want to make damn sure we\u0027re actually achieving something useful for it (like deleting successfully replicated data from a full-ish disk).\n\n\u003e Fudementally I think \"progressive delete\" is a justifiable improvement\n\n... and something that the replicator (even when using ssync!) does not currently have! You *have* to wait for the entire sync process to complete, for all remotes, before you can even get to multi-region-ssync\u0027s partial-success deletes.","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":14,"id":"ffa26705_f632d164","updated":"2024-02-07 07:33:47.000000000","message":"Congratulations, Clay, instead of one patch you don\u0027t like, you now have four patches, at least two of which you won\u0027t like ;-)\n\nLogging and stats surely still deserve some more discussion, and there\u0027s a question of how much of ssync we try to fix now that we\u0027ve got it loaded in our heads and seen all the warts.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":14,"id":"c7d05f52_660152b4","updated":"2024-02-07 16:56:50.000000000","message":"I tried to ack the comments that tim addressed in other patches; thanks for helping get the parts that were already done out of this patch and merged.\n\nI think we should try to keep the \"revert_partition\" method to have *one* error log message if it doesn\u0027t revert the partition.  If we reverted *some* suffixes because batches were configured we could log it as info \"removed x/y suffixes in part (leaving part)\"\n\nThe ssync stuff will continue to be a rabbit hole - please consider pulling out what you\u0027ve bumped into already into a prefactor that moves it out of the way of the code you want to change; we could easily merge some \"TODO: who is even using this?! WHY is it like this!?\" and maybe make a few cheap fixes along the way - and then get back to work on progressive delete telemetry!","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":17,"id":"61de20e9_0b3c00bf","updated":"2024-08-28 19:52:39.000000000","message":"Oh hey, I have some old drafts here...","commit_id":"0555d3eba874ad35324d230c3878e19936a11e53"},{"author":{"_account_id":34892,"name":"ASHWIN A NAIR","display_name":"indianwhocodes","email":"nairashwin952013@gmail.com","username":"indianwhocodes","status":"Nvidia"},"change_message_id":"622a3a78777ca7b2d75820598cde9554b5cefc21","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":22,"id":"48448383_05391fb9","updated":"2025-05-20 03:24:55.000000000","message":"recheck","commit_id":"4d6063219e703162ed99cdf8769ba98ced041275"}],"etc/object-server.conf-sample":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"8e0d85f5fd6d574ac1b9744d1d3f4c7628d2cec0","unresolved":false,"context_lines":[{"line_number":280,"context_line":"# etc/rsyncd.conf-sample for some usage examples."},{"line_number":281,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":282,"context_line":"#"},{"line_number":283,"context_line":"# rsync_batches_per_revert \u003d 0"},{"line_number":284,"context_line":"#"},{"line_number":285,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":286,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":6,"id":"d28926aa_e95022b2","line":283,"updated":"2022-06-28 06:58:47.000000000","message":"This should be a 1","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e655bfd048635910aa089ff8cfc0bb6a33eea507","unresolved":false,"context_lines":[{"line_number":280,"context_line":"# etc/rsyncd.conf-sample for some usage examples."},{"line_number":281,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":282,"context_line":"#"},{"line_number":283,"context_line":"# rsync_batches_per_revert \u003d 0"},{"line_number":284,"context_line":"#"},{"line_number":285,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":286,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":6,"id":"88fa787c_5c7f61dd","line":283,"in_reply_to":"d28926aa_e95022b2","updated":"2022-06-28 22:27:30.000000000","message":"It\u0027s also badly named, since it\u0027ll affect ssync as well. #willfix","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"12b004b4e6c6cb518827d1d278f436ba2223e8a6","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"cf084d3e_6c63422a","line":195,"updated":"2022-10-23 04:40:41.000000000","message":"So this default makes ssync logging kinda sucky -- you do an expansion that adds an extra 50% more disks, and it\u0027s basically a coin flip for whether any given old disk gets the replication slot or pops a bunch of timeouts that all log (roughly) the same thing:\n\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999344825744629 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.999475002288818 seconds: /srv/node/d5/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999219179153442 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.999271631240845 seconds: /srv/node/d5/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.999446630477905 seconds: /srv/node/d5/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999050378799438 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.999439477920532 seconds: /srv/node/d5/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999490976333618 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.99946117401123 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999249696731567 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.99949598312378 seconds: /srv/node/d5/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999480247497559 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6203/d2/217 Unexpected response: \":ERROR: 0 \u002714.999489784240723 seconds: /srv/node/d2/.lock\u0027\"\n [worker 2/2 pid\u003d1597474] 192.168.50.19:6204/d5/217 Unexpected response: \":ERROR: 0 \u002714.999445676803589 seconds: /srv/node/d5/.lock\u0027\"\n\nHowever, I\u0027m not convinced that this patch is the problem: rather, we need to either\n\n- Increase this default -- 4 seems like it might be a good middle ground? Still provide *some* protection from overload when there\u0027s a disk replacement, say, while not dropping all the way to one-on-one transfers.\n- Reduce the log level for replication-lock-timeout-related failures to debug -- error is way too noisy.\n\n...or both.\n\nWhat maybe *should* change in this patch: ensure the logging offers some indication of batch-ed-ness. Something like\n\n 192.168.50.19:6204/d2/217 (12/20) Unexpected response...\n 192.168.50.19:6204/d2/217 (13/20) Unexpected response...\n\nwould go a long way toward making ops feel better about the sea of red.","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"b25234d8_944663ce","line":195,"in_reply_to":"63f2de5e_5522db11","updated":"2024-02-06 15:38:48.000000000","message":"and this only effects replicator ssync?","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"63f2de5e_5522db11","line":195,"in_reply_to":"6c96ada6_ab2c9c7b","updated":"2024-01-27 00:24:21.000000000","message":"\u003e previously you\u0027d only get the one error per partition\n\nWhich could still be ~100 per disk, and if you\u0027re prioritizing handoffs because of a rebalance, you\u0027re going to be looping back to the same work real quick _anyway_. Like I said, I\u0027m not convinced this patch is the problem, though maybe it\u0027s making things worse enough that I ought to address it here. Maybe a pre-req that changes the status at https://github.com/openstack/swift/blob/2.32.0/swift/obj/ssync_receiver.py#L201 to be more like 529 or something would be good? And look for that in the sender as a \"abort this job, and backoff this node for a while\" signal.","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"2f82e2bc_1d13bf48","line":195,"in_reply_to":"b25234d8_944663ce","updated":"2024-02-07 16:56:50.000000000","message":"it effects any replication mode when you\u0027re using batches and under contention during rebalance (i.e. during a large expansion there\u0027s a 4000x increase in error log messages)","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"fec9dd3a_890da9e2","line":195,"in_reply_to":"b25234d8_944663ce","updated":"2024-02-07 16:56:50.000000000","message":"so, maybe this IS only an ssync problem?  I don\u0027t see where the rsync method (that we call each batch) will log per-batch-errors?","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":192,"context_line":"# I/O to each device. This does not override replication_concurrency described"},{"line_number":193,"context_line":"# above, so you may need to adjust both parameters depending on your hardware"},{"line_number":194,"context_line":"# or network capacity."},{"line_number":195,"context_line":"# replication_concurrency_per_device \u003d 1"},{"line_number":196,"context_line":"#"},{"line_number":197,"context_line":"# Number of seconds to wait for an existing replication device lock before"},{"line_number":198,"context_line":"# giving up."}],"source_content_type":"application/octet-stream","patch_set":9,"id":"6c96ada6_ab2c9c7b","line":195,"in_reply_to":"cf084d3e_6c63422a","updated":"2024-01-26 00:49:21.000000000","message":"oic, where as previously you\u0027d only get the one error per partition... i wonder if we can leave better logging for a follow-on and say \"if you\u0027re at the point you\u0027re tuning max_suffixes_per_revert you\u0027ve probably already tried tuning replication_concurrency_per_device\" or even \"why are you using ssync for replicated!?\"","commit_id":"ce55760a18940d67f580770602069af3776d5ca5"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":300,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":301,"context_line":"#"},{"line_number":302,"context_line":"# Maximum number of batches to use when syncing a handoff back to primaries."},{"line_number":303,"context_line":"# sync_batches_per_revert \u003d 1"},{"line_number":304,"context_line":"#"},{"line_number":305,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":306,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":12,"id":"95ee88c5_3d8cdbb6","line":303,"updated":"2024-01-26 00:49:21.000000000","message":"oh ok, so this is an *replicator* option; not to be confused with SSYNC/reconstructor\u0027s max_objects_per_revert (althought they seem similar in spirit?)\n\nFWIW I don\u0027t think we\u0027re using either this option in prod\n\n    $ grep revert /etc/swift/object-server/2.conf \n    sync_batches_per_revert \u003d 1\n    max_objects_per_revert \u003d 0\n\nmaybe it\u0027s more of an \"emergency rebalance\" kind of option","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":300,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":301,"context_line":"#"},{"line_number":302,"context_line":"# Maximum number of batches to use when syncing a handoff back to primaries."},{"line_number":303,"context_line":"# sync_batches_per_revert \u003d 1"},{"line_number":304,"context_line":"#"},{"line_number":305,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":306,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":12,"id":"a2d80bcb_20b07b39","line":303,"in_reply_to":"01fd51bf_29391177","updated":"2024-02-06 15:38:48.000000000","message":"seems like reasonable information to move to the commit message; \"add a new option - here\u0027s when to use it\"","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":300,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":301,"context_line":"#"},{"line_number":302,"context_line":"# Maximum number of batches to use when syncing a handoff back to primaries."},{"line_number":303,"context_line":"# sync_batches_per_revert \u003d 1"},{"line_number":304,"context_line":"#"},{"line_number":305,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":306,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":12,"id":"01fd51bf_29391177","line":303,"in_reply_to":"95ee88c5_3d8cdbb6","updated":"2024-01-27 00:24:21.000000000","message":"Yeah -- so there are two main use-cases:\n\n1. Emergency rebalance, in the classic \"cluster is nearly full and we just now got a bunch of hardware to help\" case.\n\n   Here, the fact that the whole partition hangs around until everything has transferred can be a problem, as your disks continue to fill up from the assignments that haven\u0027t changed.\n\n2. Decommissioning old hardware, and discovering data that can\u0027t be moved off disk.\n\n   We\u0027ve seen cases where trying to list or even stat a suffix raises ENODATA and attempting to quarantine it **fails** -- the only solution at that point seems to be nuking the whole partition. Arguably, we might want to take it as signal that the disk is bad and shouldn\u0027t be trusted, but short of us getting away from rsync and into the data path...\n\nBoth of these are aided by the progressive delete.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":300,"context_line":"# rsync_module \u003d {replication_ip}::object"},{"line_number":301,"context_line":"#"},{"line_number":302,"context_line":"# Maximum number of batches to use when syncing a handoff back to primaries."},{"line_number":303,"context_line":"# sync_batches_per_revert \u003d 1"},{"line_number":304,"context_line":"#"},{"line_number":305,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"},{"line_number":306,"context_line":"# max duration of an http request; this is for REPLICATE finalization calls and"}],"source_content_type":"application/octet-stream","patch_set":12,"id":"674b4c9f_bd9d2f23","line":303,"in_reply_to":"a2d80bcb_20b07b39","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"643263023f4df352229befb7dfb35fedbd995db0","unresolved":true,"context_lines":[{"line_number":328,"context_line":"# Maximum number of batches to use when syncing a handoff back to primaries."},{"line_number":329,"context_line":"# This progressively deletes handoff data as batches complete; it may be"},{"line_number":330,"context_line":"# useful to increase this when rebalancing a fairly full cluster."},{"line_number":331,"context_line":"# Note that this will inflate some progress stats emitted to logs and recon."},{"line_number":332,"context_line":"# sync_batches_per_revert \u003d 1"},{"line_number":333,"context_line":"#"},{"line_number":334,"context_line":"# node_timeout \u003d \u003cwhatever\u0027s in the DEFAULT section or 10\u003e"}],"source_content_type":"application/octet-stream","patch_set":25,"id":"6ea5a96f_e68b931f","line":331,"updated":"2025-05-21 15:57:32.000000000","message":"it\u0027d be helpful I think (even if only in a comment on the review) to call out which these are; I think it\u0027s at least the rysnc stat and log line - but maybe others?","commit_id":"26cc99b57a94db52c5422d3d8269468353309c23"}],"swift/obj/diskfile.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"877cbdb9_faa1559f","side":"PARENT","line":437,"updated":"2024-01-26 00:49:21.000000000","message":"there\u0027s an upgrade concern here\n\nIf you update packages and reload services your replicators could immediately die and come back writing !suff into your hashes.pkl while your object-servers are still shutting down (or maybe you just don\u0027t want to restart them right now); regardless if they\u0027re allowed to service REPLICATE requests (old code still running; new format on disk) they\u0027ll create bogus keys when eating the new invalidation file.\n\nWhy are you changing this?  Everytime we modify this code it\u0027s rife with tricky bugs.  Can we not do this?","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"b0273f9c_2c4b9678","side":"PARENT","line":437,"in_reply_to":"0fa517c6_4d5bee70","updated":"2024-02-06 15:38:48.000000000","message":"that makes it sound like \"the plan\" is to *break* suffix hash acceleration until your object-servers restart.\n\nI suppose we have an existance proof of \"our cluster survived this upgrade\" - but I\u0027d bet ops noticed the uptick in rehash (or would have if we had better stats on it).\n\nIt still seems orthogonal; exactly the kind of thing we\u0027d want in a follow-up change that defaults to \"read but not write\" and we can turn on after upgrade.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"f1a7ad347737435f37fb03a9b94c3f313aef252c","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"0fa517c6_4d5bee70","side":"PARENT","line":437,"in_reply_to":"31fa5477_fcf8473b","updated":"2024-01-29 20:37:15.000000000","message":"FWIW, that validation [was added in 2.22.0](https://github.com/openstack/swift/commit/c9e78d15e1e05f23facf7c28758b442bb25bde68), and later [tightened in 2.28.0](https://github.com/openstack/swift/commit/ade3b28636dd3be79152af4dedfcc7a039dff025) (though the `!` is enough to trip the older validation).","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"31fa5477_fcf8473b","side":"PARENT","line":437,"in_reply_to":"877cbdb9_faa1559f","updated":"2024-01-27 00:24:21.000000000","message":"\u003e they\u0027ll create bogus keys when eating the new invalidation file.\n\n...which will be [interpreted as corruption](https://github.com/openstack/swift/blob/2.32.0/swift/obj/diskfile.py#L390-L392) on read, and [the whole partition will be rehashed](https://github.com/openstack/swift/blob/2.32.0/swift/obj/diskfile.py#L1320-L1336)","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"bf20b54b_77e5d289","side":"PARENT","line":437,"in_reply_to":"b0273f9c_2c4b9678","updated":"2024-02-07 07:33:47.000000000","message":"*If* they had the batching turned on during the upgrade.\n\nIn general, though, I don\u0027t like that we\u0027re apparently afraid to touch this code -- I totally want to change up suffix hashing more! At the very least, I want us tracking object counts and byte counts alongside those hashes.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":434,"context_line":"                for line in inv_fh:"},{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    hashes[suffix] \u003d None"},{"line_number":438,"context_line":"        except (IOError, OSError) as e:"},{"line_number":439,"context_line":"            if e.errno !\u003d errno.ENOENT:"},{"line_number":440,"context_line":"                raise"}],"source_content_type":"text/x-python","patch_set":12,"id":"f83bb890_29051feb","side":"PARENT","line":437,"in_reply_to":"bf20b54b_77e5d289","updated":"2024-02-07 16:56:50.000000000","message":"\u003e If they had the batching turned on during the upgrade.\n\nbig if true!  I assumed even with the default you\u0027d hit some of the remove_suffix path; maybe that only kicks in if you have more than one batch.\n\n\u003e I don\u0027t like that we\u0027re apparently afraid to touch this code\n\nI\u0027m not afraid; I\u0027m thoughtful ;)\n\n\u003e At the very least, I want us tracking object counts and byte counts alongside those hashes.\n\nbecause this file is deserialized in the fast path for replication I think we should try to show restraint bloating it up.  There\u0027s \"data the replicator needs to do it\u0027s job\" and there\u0027s \"data we could potentially find useful with out-of-tree-tools\" - in the middle is probably something like a compromise.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    if line.startswith(\u0027!\u0027):"},{"line_number":438,"context_line":"                        hashes.pop(suffix[1:], None)"},{"line_number":439,"context_line":"                    else:"},{"line_number":440,"context_line":"                        hashes[suffix] \u003d None"},{"line_number":441,"context_line":"        except (IOError, OSError) as e:"}],"source_content_type":"text/x-python","patch_set":12,"id":"ef1736ee_79cee598","line":438,"updated":"2024-01-26 00:49:21.000000000","message":"why is this needed... it does\u0027t look like we ever previously \"reaped\" suffixes from this datastructure.  There\u0027s kind of always/only 4096 of them; it might be more expensive to trigger the dictionary resize than to just leave the value empty!","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    if line.startswith(\u0027!\u0027):"},{"line_number":438,"context_line":"                        hashes.pop(suffix[1:], None)"},{"line_number":439,"context_line":"                    else:"},{"line_number":440,"context_line":"                        hashes[suffix] \u003d None"},{"line_number":441,"context_line":"        except (IOError, OSError) as e:"}],"source_content_type":"text/x-python","patch_set":12,"id":"50958e9f_a6b855fa","line":438,"in_reply_to":"01d9661a_38359f3c","updated":"2024-02-06 15:38:48.000000000","message":"I think we\u0027re \"mainly\" relying on the fact that no one calls REPLICATE on a handoff.  FWIW I checked SRE\u0027s classify_handoff_parts script and it uses os.listdir:\n\n    suffixes \u003d len([f for f in os.listdir(suffixes_dir) if len(f) \u003d\u003d 3])\n\n... so it\u0027s possible the only cluster that benifits from this is your home lab.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    if line.startswith(\u0027!\u0027):"},{"line_number":438,"context_line":"                        hashes.pop(suffix[1:], None)"},{"line_number":439,"context_line":"                    else:"},{"line_number":440,"context_line":"                        hashes[suffix] \u003d None"},{"line_number":441,"context_line":"        except (IOError, OSError) as e:"}],"source_content_type":"text/x-python","patch_set":12,"id":"50da7c69_401c0f3f","line":438,"in_reply_to":"50958e9f_a6b855fa","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":435,"context_line":"                    found_invalidation_entry \u003d True"},{"line_number":436,"context_line":"                    suffix \u003d line.strip()"},{"line_number":437,"context_line":"                    if line.startswith(\u0027!\u0027):"},{"line_number":438,"context_line":"                        hashes.pop(suffix[1:], None)"},{"line_number":439,"context_line":"                    else:"},{"line_number":440,"context_line":"                        hashes[suffix] \u003d None"},{"line_number":441,"context_line":"        except (IOError, OSError) as e:"}],"source_content_type":"text/x-python","patch_set":12,"id":"01d9661a_38359f3c","line":438,"in_reply_to":"ef1736ee_79cee598","updated":"2024-01-27 00:24:21.000000000","message":"\u003e it does\u0027t look like we ever previously \"reaped\" suffixes from this datastructure.\n\nWell, certainly not as part of the replicator process -- it was all-or-nothing. Either we\u0027d unlink the whole part (and the hashes.pkl) or everything would hang out on disk. Ugh, even ssync wouldn\u0027t invalidate the suffix when we handle the partial success...\n\nThe only other way we would need to reap suffixes would be because everything got deleted, and then all tombstones got reaped.\n\nHmm... I\u0027m a little surprised we don\u0027t invalidate the suffix when we unlink the hash dir: https://github.com/openstack/swift/blob/2.32.0/swift/obj/diskfile.py#L1137-L1145\n\nI found this comment: https://github.com/openstack/swift/blob/2.32.0/swift/obj/diskfile.py#L1732-L1734\n\nBut it sucks that we\u0027d have to wait until everything gets deleted. So I guess we\u0027re mainly relying on the [10% chance of forcing a rehash](https://github.com/openstack/swift/blob/2.32.0/swift/obj/replicator.py#L51-L52) each cycle...","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":459,"context_line":"    :param suffix_dir: absolute path to suffix dir whose hash needs"},{"line_number":460,"context_line":"                       invalidating"},{"line_number":461,"context_line":"    :param removed: if true, mark the suffix as not just invalidated, but"},{"line_number":462,"context_line":"                    removed"},{"line_number":463,"context_line":"    \"\"\""},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"    suffix \u003d basename(suffix_dir)"}],"source_content_type":"text/x-python","patch_set":12,"id":"ff6634d0_272f21cd","line":462,"updated":"2024-01-26 00:49:21.000000000","message":"but WHY?","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":459,"context_line":"    :param suffix_dir: absolute path to suffix dir whose hash needs"},{"line_number":460,"context_line":"                       invalidating"},{"line_number":461,"context_line":"    :param removed: if true, mark the suffix as not just invalidated, but"},{"line_number":462,"context_line":"                    removed"},{"line_number":463,"context_line":"    \"\"\""},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"    suffix \u003d basename(suffix_dir)"}],"source_content_type":"text/x-python","patch_set":12,"id":"30c4ab55_32f9c590","line":462,"in_reply_to":"096100dd_d7bb59af","updated":"2024-02-07 07:33:47.000000000","message":"Fine; https://review.opendev.org/c/openstack/swift/+/908254 -- but I still don\u0027t like our tendency to have known-inaccurate state in hashes.pkl -- this multi-region-ssync business had me see a REPLICATE response actively lying to me, claiming to have a valid suffix hash for a part with no suffixes!","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":459,"context_line":"    :param suffix_dir: absolute path to suffix dir whose hash needs"},{"line_number":460,"context_line":"                       invalidating"},{"line_number":461,"context_line":"    :param removed: if true, mark the suffix as not just invalidated, but"},{"line_number":462,"context_line":"                    removed"},{"line_number":463,"context_line":"    \"\"\""},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"    suffix \u003d basename(suffix_dir)"}],"source_content_type":"text/x-python","patch_set":12,"id":"015cf594_9026c59d","line":462,"in_reply_to":"30c4ab55_32f9c590","updated":"2024-02-07 16:56:50.000000000","message":"\u003e I still don\u0027t like our tendency to have known-inaccurate state in hashes.pkl\n\nI feel that, and it\u0027s only an issue when you\u0027re using progressive delete - I appreicate your default was to keep things tidy; I think this was an exception - but maybe I\u0027ll come around if this all gets done or it really matters.\n\nWhat is the impact to our prod clusters of removing the code that knows how to read the new invalid format from the patch we\u0027re already carrying?  More rehashing?\n\nupdate: I clicked +A on the prefactor valid_suffix in invalidate (quite nice!) - so the impact should be minimal/non-existant","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":459,"context_line":"    :param suffix_dir: absolute path to suffix dir whose hash needs"},{"line_number":460,"context_line":"                       invalidating"},{"line_number":461,"context_line":"    :param removed: if true, mark the suffix as not just invalidated, but"},{"line_number":462,"context_line":"                    removed"},{"line_number":463,"context_line":"    \"\"\""},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"    suffix \u003d basename(suffix_dir)"}],"source_content_type":"text/x-python","patch_set":12,"id":"096100dd_d7bb59af","line":462,"in_reply_to":"7e93653b_6d306400","updated":"2024-02-06 15:38:48.000000000","message":"\"why NOT\" is not a great reason to introduce a drive-by that has an upgrade impact and questionable value.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":459,"context_line":"    :param suffix_dir: absolute path to suffix dir whose hash needs"},{"line_number":460,"context_line":"                       invalidating"},{"line_number":461,"context_line":"    :param removed: if true, mark the suffix as not just invalidated, but"},{"line_number":462,"context_line":"                    removed"},{"line_number":463,"context_line":"    \"\"\""},{"line_number":464,"context_line":""},{"line_number":465,"context_line":"    suffix \u003d basename(suffix_dir)"}],"source_content_type":"text/x-python","patch_set":12,"id":"7e93653b_6d306400","line":462,"in_reply_to":"ff6634d0_272f21cd","updated":"2024-01-27 00:24:21.000000000","message":"... because we already know it\u0027s gone? Why mark it invalid and then wait for the next opportunity to rehash when we already know the current state on disk?","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":469,"context_line":"        suffix \u003d suffix.encode(\u0027utf-8\u0027)"},{"line_number":470,"context_line":"    with lock_path(partition_dir), open(invalidations_file, \u0027ab\u0027) as inv_fh:"},{"line_number":471,"context_line":"        if removed:"},{"line_number":472,"context_line":"            inv_fh.write(b\"!\" + suffix + b\"\\n\")"},{"line_number":473,"context_line":"        else:"},{"line_number":474,"context_line":"            inv_fh.write(suffix + b\"\\n\")"},{"line_number":475,"context_line":""}],"source_content_type":"text/x-python","patch_set":12,"id":"741580d6_b767ed77","line":472,"updated":"2024-01-26 00:49:21.000000000","message":"this needs explicit testing and consideration of upgrades","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":469,"context_line":"        suffix \u003d suffix.encode(\u0027utf-8\u0027)"},{"line_number":470,"context_line":"    with lock_path(partition_dir), open(invalidations_file, \u0027ab\u0027) as inv_fh:"},{"line_number":471,"context_line":"        if removed:"},{"line_number":472,"context_line":"            inv_fh.write(b\"!\" + suffix + b\"\\n\")"},{"line_number":473,"context_line":"        else:"},{"line_number":474,"context_line":"            inv_fh.write(suffix + b\"\\n\")"},{"line_number":475,"context_line":""}],"source_content_type":"text/x-python","patch_set":12,"id":"94013cb7_490f4feb","line":472,"in_reply_to":"741580d6_b767ed77","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"}],"swift/obj/replicator.py":[{"author":{"_account_id":12050,"name":"Charles Hsu","email":"charles0126@gmail.com","username":"charz"},"change_message_id":"f5ff65584d5a2e384dd9a58d0810167a3236d370","unresolved":true,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_max_suffixes_per_revert \u003d non_negative_int("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_timeout\u0027, \u00270\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":2,"id":"862d6c22_75d17c93","line":169,"updated":"2022-05-23 01:58:44.000000000","message":"just curious, why we pick the `rsync_timeout` for default if that exists?","commit_id":"c006f2120d4cf7a2f057af4b7d1e1825063bcee9"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"04b44b3265aad3ee583e4fa62cc19d15243e39e6","unresolved":true,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_max_suffixes_per_revert \u003d non_negative_int("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_timeout\u0027, \u00270\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":2,"id":"98334866_794849e6","line":169,"in_reply_to":"862d6c22_75d17c93","updated":"2022-05-23 16:23:44.000000000","message":"Good catch! Must\u0027ve been a copy/paste error. #willfix","commit_id":"c006f2120d4cf7a2f057af4b7d1e1825063bcee9"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"b6402ab7dc35cc10b0d18fe95d81bf950236a062","unresolved":false,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_max_suffixes_per_revert \u003d non_negative_int("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_timeout\u0027, \u00270\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":2,"id":"108336dc_0cd49d40","line":169,"in_reply_to":"98334866_794849e6","updated":"2022-06-24 20:21:01.000000000","message":"Done","commit_id":"c006f2120d4cf7a2f057af4b7d1e1825063bcee9"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"cf42dbdb5ae3aebad94e1e5ec0ec9d0af4bf513f","unresolved":true,"context_lines":[{"line_number":595,"context_line":"                                      failure_dev[\u0027device\u0027])"},{"line_number":596,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":597,"context_line":"                        else:"},{"line_number":598,"context_line":"                            self.delete_partition(job[\u0027path\u0027])"},{"line_number":599,"context_line":"                            handoff_partition_deleted \u003d True"},{"line_number":600,"context_line":""},{"line_number":601,"context_line":"                if not suffixes:"}],"source_content_type":"text/x-python","patch_set":4,"id":"8591b837_79110c64","line":598,"updated":"2022-06-24 16:04:44.000000000","message":"Wait wait wait -- we can\u0027t do this until we\u0027ve done *all* the batches! Or, better yet, just delete the suffixes that synced this batch.","commit_id":"2ef38df5781400407646cf4fac974ff7dc102583"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"b6402ab7dc35cc10b0d18fe95d81bf950236a062","unresolved":false,"context_lines":[{"line_number":595,"context_line":"                                      failure_dev[\u0027device\u0027])"},{"line_number":596,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":597,"context_line":"                        else:"},{"line_number":598,"context_line":"                            self.delete_partition(job[\u0027path\u0027])"},{"line_number":599,"context_line":"                            handoff_partition_deleted \u003d True"},{"line_number":600,"context_line":""},{"line_number":601,"context_line":"                if not suffixes:"}],"source_content_type":"text/x-python","patch_set":4,"id":"9f91122c_08d91a3c","line":598,"in_reply_to":"8591b837_79110c64","updated":"2022-06-24 20:21:01.000000000","message":"Done","commit_id":"2ef38df5781400407646cf4fac974ff7dc102583"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":35,"context_line":"    rsync_module_interpolation, mkdirs, config_true_value, \\"},{"line_number":36,"context_line":"    config_auto_int_value, storage_directory, \\"},{"line_number":37,"context_line":"    load_recon_cache, PrefixLoggerAdapter, parse_override_options, \\"},{"line_number":38,"context_line":"    distribute_evenly, listdir, config_positive_int_value"},{"line_number":39,"context_line":"from swift.common.bufferedhttp import http_connect"},{"line_number":40,"context_line":"from swift.common.daemon import Daemon"},{"line_number":41,"context_line":"from swift.common.http import HTTP_OK, HTTP_INSUFFICIENT_STORAGE"}],"source_content_type":"text/x-python","patch_set":6,"id":"da345245_5ed81324","line":38,"updated":"2022-06-27 21:55:22.000000000","message":"interesting tha twe already had this helper in utils\n\n\tdef distribute_evenly(items, num_buckets):\n\t    \"\"\"\n\t    Distribute items as evenly as possible into N buckets.\n\t    \"\"\"\n\t    out \u003d [[] for _ in range(num_buckets)]\n\t    for index, item in enumerate(items):\n\t\tout[index % num_buckets].append(item)\n\t    return out\n\t    \n... AND it was already used in get_worker_args","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":35,"context_line":"    rsync_module_interpolation, mkdirs, config_true_value, \\"},{"line_number":36,"context_line":"    config_auto_int_value, storage_directory, \\"},{"line_number":37,"context_line":"    load_recon_cache, PrefixLoggerAdapter, parse_override_options, \\"},{"line_number":38,"context_line":"    distribute_evenly, listdir, config_positive_int_value"},{"line_number":39,"context_line":"from swift.common.bufferedhttp import http_connect"},{"line_number":40,"context_line":"from swift.common.daemon import Daemon"},{"line_number":41,"context_line":"from swift.common.http import HTTP_OK, HTTP_INSUFFICIENT_STORAGE"}],"source_content_type":"text/x-python","patch_set":6,"id":"23248235_169791ca","line":38,"in_reply_to":"da345245_5ed81324","updated":"2024-01-26 00:49:21.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"8e0d85f5fd6d574ac1b9744d1d3f4c7628d2cec0","unresolved":false,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"c3a63556_8c1c6a39","line":169,"updated":"2022-06-28 06:58:47.000000000","message":"Wait, in the sample config the default is 0. The sample config needs to be updated","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"ff62971a_10a788a4","line":169,"in_reply_to":"4137450c_017ee4f9","updated":"2024-02-06 15:38:48.000000000","message":"I support adopting the term revert in the replicator option; but I think the replicator code should follow suit.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"4137450c_017ee4f9","line":169,"in_reply_to":"7b099e0b_0e62710d","updated":"2024-01-27 00:24:21.000000000","message":"\u003e configured the replicator to use SSYNC like a crazy person.\n\nIDK, I think there\u0027s a good chance OVH still runs ssync for everything, and for not-entirely-crazy reasons.\n\n\u003e prefer the \"revert\" term as a concept\n\nFor sure my thought. Plus we\u0027ve got ample language already in object-server.conf-sample about \"revert\" being the movement of data from handoffs to primaries.\n\n\u003e the \"update_deleted\" method is too much of an implementation detail\n\nFor sure -- and unless you\u0027ve done some swift development, it\u0027s probably not a term you\u0027d be familiar with. I suppose \"revert\" might also be a little jargon-y, but it seems (to me, anyway) to be less of a leap.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"7b099e0b_0e62710d","line":169,"in_reply_to":"c3a63556_8c1c6a39","updated":"2024-01-26 00:49:21.000000000","message":"oh goodness, so it was orginally called \"rsync_batches_per_revert\" but them someone remembered you can configured the replicator to use SSYNC like a crazy person.\n\nreconstructor option is \"max_objects_per_revert\" maybe \"max_suffix_per_update_deleted\" (???) - maybe the thinking was we prefer the \"revert\" term as a concept and the \"update_deleted\" method is too much of an implementation detail (could we rename it?)","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"5f7ba61f_f7544fb7","line":169,"in_reply_to":"ff62971a_10a788a4","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":166,"context_line":"        self.rsync_io_timeout \u003d conf.get(\u0027rsync_io_timeout\u0027, \u002730\u0027)"},{"line_number":167,"context_line":"        self.rsync_bwlimit \u003d conf.get(\u0027rsync_bwlimit\u0027, \u00270\u0027)"},{"line_number":168,"context_line":"        self.rsync_batches_per_revert \u003d config_positive_int_value("},{"line_number":169,"context_line":"            conf.get(\u0027rsync_batches_per_revert\u0027, \u00271\u0027))"},{"line_number":170,"context_line":"        self.rsync_compress \u003d config_true_value("},{"line_number":171,"context_line":"            conf.get(\u0027rsync_compress\u0027, \u0027no\u0027))"},{"line_number":172,"context_line":"        self.rsync_module \u003d conf.get(\u0027rsync_module\u0027, \u0027\u0027).rstrip(\u0027/\u0027)"}],"source_content_type":"text/x-python","patch_set":6,"id":"dc879f38_f8955e2c","line":169,"in_reply_to":"ff62971a_10a788a4","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":530,"context_line":"            with df_mgr.partition_lock(job[\u0027device\u0027], job[\u0027policy\u0027],"},{"line_number":531,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":532,"context_line":"                                       timeout\u003d0.2):"},{"line_number":533,"context_line":"                batches_deleted \u003d []"},{"line_number":534,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":535,"context_line":"                random.shuffle(suffixes)"},{"line_number":536,"context_line":"                for suffixes_to_revert in distribute_evenly("}],"source_content_type":"text/x-python","patch_set":6,"id":"6e62144c_ed93704b","line":533,"updated":"2022-06-27 21:55:22.000000000","message":"top level check is now \"all(batches_deleted)\" instead of \"all(resp is success)\"","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":530,"context_line":"            with df_mgr.partition_lock(job[\u0027device\u0027], job[\u0027policy\u0027],"},{"line_number":531,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":532,"context_line":"                                       timeout\u003d0.2):"},{"line_number":533,"context_line":"                batches_deleted \u003d []"},{"line_number":534,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":535,"context_line":"                random.shuffle(suffixes)"},{"line_number":536,"context_line":"                for suffixes_to_revert in distribute_evenly("}],"source_content_type":"text/x-python","patch_set":6,"id":"8e2300db_3e7220b8","line":533,"in_reply_to":"6e62144c_ed93704b","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"8e0d85f5fd6d574ac1b9744d1d3f4c7628d2cec0","unresolved":false,"context_lines":[{"line_number":536,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":537,"context_line":"                        suffixes, self.rsync_batches_per_revert):"},{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":6,"id":"5e76dd13_9df81aa6","line":539,"updated":"2022-06-28 06:58:47.000000000","message":"But if we use a 0 then it\u0027s ZeroDivisionError, which is why we\u0027re using PositiveIntegerValue. So Yup the sample config definitely needs to be changed to a 1.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":536,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":537,"context_line":"                        suffixes, self.rsync_batches_per_revert):"},{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":6,"id":"e3ae8913_13c3035c","line":539,"updated":"2022-06-27 21:55:22.000000000","message":"good call\n\n\t\u003e\u003e\u003e utils.distribute_evenly([1, 2, 3], 4)\n\t[[1], [2], [3], []]","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":536,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":537,"context_line":"                        suffixes, self.rsync_batches_per_revert):"},{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":6,"id":"d5e81182_17131188","line":539,"in_reply_to":"e3ae8913_13c3035c","updated":"2024-01-26 00:49:21.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":537,"context_line":"                        suffixes, self.rsync_batches_per_revert):"},{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"}],"source_content_type":"text/x-python","patch_set":6,"id":"d2b1fdc8_c94958a8","line":540,"updated":"2022-06-27 21:55:22.000000000","message":"and we count responses per batch","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":537,"context_line":"                        suffixes, self.rsync_batches_per_revert):"},{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"}],"source_content_type":"text/x-python","patch_set":6,"id":"80424b30_16c65b18","line":540,"in_reply_to":"d2b1fdc8_c94958a8","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"8e0d85f5fd6d574ac1b9744d1d3f4c7628d2cec0","unresolved":false,"context_lines":[{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"}],"source_content_type":"text/x-python","patch_set":6,"id":"51595973_87e83d6a","line":541,"updated":"2022-06-28 06:58:47.000000000","message":"So what are the ramifications of synced_remote_reasons be reset every batch? Do we end up doing more ssync work, when using ssync with object replicators? Or rather we don\u0027t send remote_sync","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e655bfd048635910aa089ff8cfc0bb6a33eea507","unresolved":false,"context_lines":[{"line_number":538,"context_line":"                    if not suffixes_to_revert:"},{"line_number":539,"context_line":"                        break"},{"line_number":540,"context_line":"                    responses \u003d []"},{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"}],"source_content_type":"text/x-python","patch_set":6,"id":"a6377abe_cb97d545","line":541,"in_reply_to":"51595973_87e83d6a","updated":"2022-06-28 22:27:30.000000000","message":"synced_remote_regions is only used to determine delete_objs, which in turn is only used per-batch. Resetting here has no impact; it just seemed cleaner. Maybe I should have broken a bunch of this out to a sync_batch() helper.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e655bfd048635910aa089ff8cfc0bb6a33eea507","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"7b424cbf_8108036a","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"updated":"2022-06-28 22:27:30.000000000","message":"Off-topic: well, *that\u0027s* misnamed...","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"b979d057_29ea4624","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"331dc9f7_39060a35","updated":"2024-02-07 16:56:50.000000000","message":"valid points.  What telemetry do we have?  What telemetry do we need?  I think this is where this patch needs to focus to get ready to merge.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"12b004b4e6c6cb518827d1d278f436ba2223e8a6","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"e7568f59_e5c04d4a","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"7b424cbf_8108036a","updated":"2022-10-23 04:40:41.000000000","message":"...but on-topic: Should we be counting rsync invocations, or partitions sync\u0027ed?","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"e740f430_5f3475f4","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"b2e0457c_9fdb5346","updated":"2024-02-06 15:38:48.000000000","message":"i think this is related to the discussion about logging - you took some code that was originally designed to be invoked once per part and indented it without fixing the related telemetry.  The telemetry *should* be updated with the code change.\n\nI can totally imagine justifying needing to carry a partial implementation to deal with an issue in prod (although I\u0027m not sure that\u0027s why this patch got carried since we\u0027re not actually using the option).  But if we\u0027re convinced batching suffixes is worthwhile to invest in - why not finish the patch and fix the telemetry to match the design.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"643263023f4df352229befb7dfb35fedbd995db0","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"198bebde_83d7fb00","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"b979d057_29ea4624","updated":"2025-05-21 15:57:32.000000000","message":"\u003e except we already get that with attempted\n\nok, so since we have the \"jobs\" counter - I can get behind the rsync counter being tied to invocations of [r|s]sync","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"331dc9f7_39060a35","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"e740f430_5f3475f4","updated":"2024-02-07 07:33:47.000000000","message":"\u003e this stat obviously previously ment \"[r|s]sync jobs\" of which there was only one per node.\n\nI think you and I must have different ideas of what constitutes a \"sync job\" -- I could see two valid interpretations:\n\n1. Something tied back to a job dict out of `build_replication_jobs` -- i.e. a call to `update_deleted`, except we already get that with `attempted`\n2. An actual invocation of rsync (or, a single ssync session)\n\nYou seem to be proposing some third notion, where you expect to get exactly `len(nodes)` of them per `update_deleted` call, but I don\u0027t really get _why_.\n\nMaybe it\u0027s just a _crappy_ metric, regardless? This happens a lot with these per-cycle metrics, especially the ones that go out to recon -- we see some counter ticking but it keeps resetting more quickly than seems useful and we have no indication of why it has the rate it does. Is it related to the rsync bw we configured? Are there only a handful of parts to move? Is the replicator interval way too high? At least it\u0027s better than _no metrics_ -- there\u0027s some sense of progress -- and in that vein, tying it to an rsync/ssync session makes some sense.\n\nPoint taken, though, that it probably deserves a call-out for ops. I\u0027ll update the sample config.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":541,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":542,"context_line":"                    delete_objs \u003d None"},{"line_number":543,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":544,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":545,"context_line":"                        kwargs \u003d {}"},{"line_number":546,"context_line":"                        if self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027 \\"},{"line_number":547,"context_line":"                                and node[\u0027region\u0027] in synced_remote_regions:"}],"source_content_type":"text/x-python","patch_set":6,"id":"b2e0457c_9fdb5346","line":544,"range":{"start_line":544,"start_character":30,"end_line":544,"end_character":35},"in_reply_to":"e7568f59_e5c04d4a","updated":"2024-01-26 00:49:21.000000000","message":"i think it\u0027s very on-topic; this stat obviously previously ment \"[r|s]sync jobs\" of which there was only one per node.  Since this patch is nesting this stat under a new loop we shouldn\u0027t leave this incr here.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":550,"context_line":"                        # candidates is a dict(hash\u003d\u003etimestamp) of objects"},{"line_number":551,"context_line":"                        # for deletion"},{"line_number":552,"context_line":"                        success, candidates \u003d self.sync("},{"line_number":553,"context_line":"                            node, job, suffixes_to_revert, **kwargs)"},{"line_number":554,"context_line":"                        if not success:"},{"line_number":555,"context_line":"                            failure_devs_info.add((node[\u0027replication_ip\u0027],"},{"line_number":556,"context_line":"                                                   node[\u0027device\u0027]))"}],"source_content_type":"text/x-python","patch_set":6,"id":"a2ec6ea8_ae477c27","line":553,"updated":"2022-06-27 21:55:22.000000000","message":"and the assumption is if a second batch gets rejected for concurrency (because another node got in) we\u0027ll get success \u003d\u003d False, append delete_handoff\u003dFalse to batches_delete and skip the partition delete","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":550,"context_line":"                        # candidates is a dict(hash\u003d\u003etimestamp) of objects"},{"line_number":551,"context_line":"                        # for deletion"},{"line_number":552,"context_line":"                        success, candidates \u003d self.sync("},{"line_number":553,"context_line":"                            node, job, suffixes_to_revert, **kwargs)"},{"line_number":554,"context_line":"                        if not success:"},{"line_number":555,"context_line":"                            failure_devs_info.add((node[\u0027replication_ip\u0027],"},{"line_number":556,"context_line":"                                                   node[\u0027device\u0027]))"}],"source_content_type":"text/x-python","patch_set":6,"id":"61bbe2b0_50eaec4f","line":553,"in_reply_to":"775a98cf_0346bf2c","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":550,"context_line":"                        # candidates is a dict(hash\u003d\u003etimestamp) of objects"},{"line_number":551,"context_line":"                        # for deletion"},{"line_number":552,"context_line":"                        success, candidates \u003d self.sync("},{"line_number":553,"context_line":"                            node, job, suffixes_to_revert, **kwargs)"},{"line_number":554,"context_line":"                        if not success:"},{"line_number":555,"context_line":"                            failure_devs_info.add((node[\u0027replication_ip\u0027],"},{"line_number":556,"context_line":"                                                   node[\u0027device\u0027]))"}],"source_content_type":"text/x-python","patch_set":6,"id":"bca223bc_c74f95f8","line":553,"in_reply_to":"a2ec6ea8_ae477c27","updated":"2024-01-26 00:49:21.000000000","message":"With more context loaded in my head I now see this option was *primarily* meant to address a \"deficiency\" with replicators (rsync) similarlly to the problem we discovered and addressed in EC/SSYNC.  Do we really *have* this problem with rsync tho?   Is configuring your replicator to asking for MORE successful rysnc connections to all three nodes a good strategy or do we just end up syncing successfully to 2/3 primaries only to get repeatly rejected by the new busy node and not delete anything?\n\nWhen your replicator is configured","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":550,"context_line":"                        # candidates is a dict(hash\u003d\u003etimestamp) of objects"},{"line_number":551,"context_line":"                        # for deletion"},{"line_number":552,"context_line":"                        success, candidates \u003d self.sync("},{"line_number":553,"context_line":"                            node, job, suffixes_to_revert, **kwargs)"},{"line_number":554,"context_line":"                        if not success:"},{"line_number":555,"context_line":"                            failure_devs_info.add((node[\u0027replication_ip\u0027],"},{"line_number":556,"context_line":"                                                   node[\u0027device\u0027]))"}],"source_content_type":"text/x-python","patch_set":6,"id":"775a98cf_0346bf2c","line":553,"in_reply_to":"bca223bc_c74f95f8","updated":"2024-02-06 15:38:48.000000000","message":"I\u0027m not sure what else I was getting at; the gist of it seems to be \"do we need this/are we sure this is helpful\"\n\nI\u0027m onboard with optionally breaking up partition revert into batched suffixes purely on principle (progress delete has some justifiable use-cases) - but ideally it would be implemented consistently across the replicator and reconstructor with consistent naming and updated telemetry.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":566,"context_line":""},{"line_number":567,"context_line":"                    if self.handoff_delete:"},{"line_number":568,"context_line":"                        # delete handoff if we had handoff_delete successes"},{"line_number":569,"context_line":"                        successes_count \u003d sum(1 for resp in responses if resp)"},{"line_number":570,"context_line":"                        delete_handoff \u003d successes_count \u003e\u003d self.handoff_delete"},{"line_number":571,"context_line":"                    else:"},{"line_number":572,"context_line":"                        # delete handoff if all syncs were successful"}],"source_content_type":"text/x-python","patch_set":6,"id":"7e69fb02_92a3888b","line":569,"updated":"2022-06-27 21:55:22.000000000","message":"how about even *crazier*\n\n\t\u003e\u003e\u003e sum([True, False, True])\n\t2","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":566,"context_line":""},{"line_number":567,"context_line":"                    if self.handoff_delete:"},{"line_number":568,"context_line":"                        # delete handoff if we had handoff_delete successes"},{"line_number":569,"context_line":"                        successes_count \u003d sum(1 for resp in responses if resp)"},{"line_number":570,"context_line":"                        delete_handoff \u003d successes_count \u003e\u003d self.handoff_delete"},{"line_number":571,"context_line":"                    else:"},{"line_number":572,"context_line":"                        # delete handoff if all syncs were successful"}],"source_content_type":"text/x-python","patch_set":6,"id":"0ec3fd3b_6b303c44","line":569,"in_reply_to":"29d81708_d24bd086","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"74d66d24c580a196546efb2193d4cb48e91fd5ae","unresolved":true,"context_lines":[{"line_number":566,"context_line":""},{"line_number":567,"context_line":"                    if self.handoff_delete:"},{"line_number":568,"context_line":"                        # delete handoff if we had handoff_delete successes"},{"line_number":569,"context_line":"                        successes_count \u003d sum(1 for resp in responses if resp)"},{"line_number":570,"context_line":"                        delete_handoff \u003d successes_count \u003e\u003d self.handoff_delete"},{"line_number":571,"context_line":"                    else:"},{"line_number":572,"context_line":"                        # delete handoff if all syncs were successful"}],"source_content_type":"text/x-python","patch_set":6,"id":"29d81708_d24bd086","line":569,"in_reply_to":"7e69fb02_92a3888b","updated":"2022-06-28 06:04:45.000000000","message":"IDK -- I\u0027m not sure I trust our return values being (staying?) sane.\n\n \u003e\u003e\u003e sum([True, False, None])\n Traceback (most recent call last):\n   File \"\u003cstdin\u003e\", line 1, in \u003cmodule\u003e\n TypeError: unsupported operand type(s) for +: \u0027int\u0027 and \u0027NoneType\u0027","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":580,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":581,"context_line":"                                             len(delete_objs))"},{"line_number":582,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("},{"line_number":583,"context_line":"                                job, delete_objs)"},{"line_number":584,"context_line":"                            # if replication works for a hand-off device and it"},{"line_number":585,"context_line":"                            # failed, the remote devices which are target of"},{"line_number":586,"context_line":"                            # the replication from the hand-off device will be"}],"source_content_type":"text/x-python","patch_set":6,"id":"6cfe5874_52178f3e","line":583,"updated":"2022-06-27 21:55:22.000000000","message":"so i guess this is why ssync was already better at progressive delete\n\nand we expect it should still work even if you use the new batch option?","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"d105576acd796a4999c46b8c5eebad3ab8a3037f","unresolved":true,"context_lines":[{"line_number":580,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":581,"context_line":"                                             len(delete_objs))"},{"line_number":582,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("},{"line_number":583,"context_line":"                                job, delete_objs)"},{"line_number":584,"context_line":"                            # if replication works for a hand-off device and it"},{"line_number":585,"context_line":"                            # failed, the remote devices which are target of"},{"line_number":586,"context_line":"                            # the replication from the hand-off device will be"}],"source_content_type":"text/x-python","patch_set":6,"id":"7eb40de4_bd15932c","line":583,"in_reply_to":"6cfe5874_52178f3e","updated":"2022-10-24 22:24:08.000000000","message":"Yes, ssync still works fine -- though it didn\u0027t seem to get the same sorts of benefits as rsync.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":580,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":581,"context_line":"                                             len(delete_objs))"},{"line_number":582,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("},{"line_number":583,"context_line":"                                job, delete_objs)"},{"line_number":584,"context_line":"                            # if replication works for a hand-off device and it"},{"line_number":585,"context_line":"                            # failed, the remote devices which are target of"},{"line_number":586,"context_line":"                            # the replication from the hand-off device will be"}],"source_content_type":"text/x-python","patch_set":6,"id":"32c5f368_f9d7cfba","line":583,"in_reply_to":"7eb40de4_bd15932c","updated":"2024-01-26 00:49:21.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":598,"context_line":""},{"line_number":599,"context_line":"                    batches_deleted.append(delete_handoff)"},{"line_number":600,"context_line":""},{"line_number":601,"context_line":"                if all(batches_deleted):"},{"line_number":602,"context_line":"                    self.delete_partition(job[\u0027path\u0027])"},{"line_number":603,"context_line":"                    handoff_partition_deleted \u003d True"},{"line_number":604,"context_line":"        except PartitionLockTimeout:"}],"source_content_type":"text/x-python","patch_set":6,"id":"d622d993_d8074d94","line":601,"updated":"2022-06-27 21:55:22.000000000","message":"the only I can see that pops when this is False has something to do with ssync","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":598,"context_line":""},{"line_number":599,"context_line":"                    batches_deleted.append(delete_handoff)"},{"line_number":600,"context_line":""},{"line_number":601,"context_line":"                if all(batches_deleted):"},{"line_number":602,"context_line":"                    self.delete_partition(job[\u0027path\u0027])"},{"line_number":603,"context_line":"                    handoff_partition_deleted \u003d True"},{"line_number":604,"context_line":"        except PartitionLockTimeout:"}],"source_content_type":"text/x-python","patch_set":6,"id":"109dc09a_0a131dda","line":601,"in_reply_to":"3537f9c5_01d99aa9","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":598,"context_line":""},{"line_number":599,"context_line":"                    batches_deleted.append(delete_handoff)"},{"line_number":600,"context_line":""},{"line_number":601,"context_line":"                if all(batches_deleted):"},{"line_number":602,"context_line":"                    self.delete_partition(job[\u0027path\u0027])"},{"line_number":603,"context_line":"                    handoff_partition_deleted \u003d True"},{"line_number":604,"context_line":"        except PartitionLockTimeout:"}],"source_content_type":"text/x-python","patch_set":6,"id":"3537f9c5_01d99aa9","line":601,"in_reply_to":"d622d993_d8074d94","updated":"2024-01-26 00:49:21.000000000","message":"OMG!  I *totally* forgot you can use ssync with the replicator!?\n\nanyway this totally confused me again.  did you know that `all([])` is True?","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":627,"context_line":"            try:"},{"line_number":628,"context_line":"                tpool.execute(shutil.rmtree, os.path.join(part_path, suffix))"},{"line_number":629,"context_line":"            except OSError as e:"},{"line_number":630,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"},{"line_number":631,"context_line":"                    # If there was a race to create or delete, don\u0027t worry"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"f6651b8d_92a261fe","line":630,"updated":"2022-06-27 21:55:22.000000000","message":"You should ask Ash for some feedback on this line ;)","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":627,"context_line":"            try:"},{"line_number":628,"context_line":"                tpool.execute(shutil.rmtree, os.path.join(part_path, suffix))"},{"line_number":629,"context_line":"            except OSError as e:"},{"line_number":630,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"},{"line_number":631,"context_line":"                    # If there was a race to create or delete, don\u0027t worry"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"d38e3b7b_7128b5d6","line":630,"in_reply_to":"c015672f_b8819956","updated":"2024-01-26 00:49:21.000000000","message":"AFAIK adding ENODATA to the list of exceptions we handle was win, and this patch now mirrors what\u0027s on master in the other delete_XXX methods in the replicator","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":34892,"name":"ASHWIN A NAIR","display_name":"indianwhocodes","email":"nairashwin952013@gmail.com","username":"indianwhocodes","status":"Nvidia"},"change_message_id":"c540228aea8d468ac8613ff1a6110e1f722e1f25","unresolved":true,"context_lines":[{"line_number":627,"context_line":"            try:"},{"line_number":628,"context_line":"                tpool.execute(shutil.rmtree, os.path.join(part_path, suffix))"},{"line_number":629,"context_line":"            except OSError as e:"},{"line_number":630,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"},{"line_number":631,"context_line":"                    # If there was a race to create or delete, don\u0027t worry"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"c015672f_b8819956","line":630,"in_reply_to":"f6651b8d_92a261fe","updated":"2023-08-17 16:15:03.000000000","message":"What we experience with ENODATA mostly was problems quarantining a whole suffix because it looked like we\u0027re getting the OSError in a specific file/unlink of a specific suffix when trying to \"rmtree\" the whole partition which rendered us incapable of listing the contents of a hash dir and we couldn\u0027t perform any directory operations on it. On my discussions with Tim, I now know we take a durability hit on some other objects in the suffix which is on the order of like 10-30.\n\nBut if a folder is corrupted in the suffix_prefix folder, quarantining a suffix dir fails, we go up and quarantine the whole partition which has possible 2-3k suffixes(source: Tim) meaning a bigger overhead and a lot more data needing to be replicated. \n\nI think Tim\u0027s patch and mine together will address both of these concerns. But nothing can be said until this release rolls over into production since we only saw ENODATA in prod.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":629,"context_line":"            except OSError as e:"},{"line_number":630,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"},{"line_number":631,"context_line":"                    # If there was a race to create or delete, don\u0027t worry"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":""},{"line_number":634,"context_line":"    def delete_partition(self, path):"},{"line_number":635,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":6,"id":"4c0dc83e_34acea40","line":632,"updated":"2022-06-27 21:55:22.000000000","message":"oic, so this is a totally new code path - and there\u0027s no lock or anything... but we only call it from updated_deleted, so it\u0027s not any worse than delete_partition!","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":629,"context_line":"            except OSError as e:"},{"line_number":630,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"},{"line_number":631,"context_line":"                    # If there was a race to create or delete, don\u0027t worry"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":""},{"line_number":634,"context_line":"    def delete_partition(self, path):"},{"line_number":635,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":6,"id":"4de8ef3a_a1fc46d9","line":632,"in_reply_to":"4c0dc83e_34acea40","updated":"2024-01-26 00:49:21.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"f60e12b383ff7f4afd6dc587344d5f5ec09f1bb7","unresolved":true,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"aee163a2_3de763d5","line":430,"updated":"2022-08-22 19:50:55.000000000","message":"This starts to feel noisy, doing it for every batch.","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"1944ce1b_900dcc29","line":430,"in_reply_to":"11433e3e_19bd6681","updated":"2024-01-26 00:49:21.000000000","message":"maybe; unrelated.","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"f92c2245_921ac19f","line":430,"in_reply_to":"1944ce1b_900dcc29","updated":"2024-01-27 00:24:21.000000000","message":"Yes and no -- it\u0027s the same sort of thing as the lock timeouts, where batching increases the amount of logging.","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"0627d229f02c8164fca7f11dfdc8e48d58ac03eb","unresolved":true,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"11433e3e_19bd6681","line":430,"in_reply_to":"38156ea9_b1afa00f","updated":"2023-06-27 18:17:50.000000000","message":"Maybe should always be at debug, though?","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"bb3ea02e0b2f9ee780036b2afad87ce6dc8559f9","unresolved":false,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"38156ea9_b1afa00f","line":430,"in_reply_to":"aee163a2_3de763d5","updated":"2022-10-26 17:01:40.000000000","message":"Not near as bad now that we\u0027ve got https://review.opendev.org/c/openstack/swift/+/839442","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":427,"context_line":"        else:"},{"line_number":428,"context_line":"            log_method \u003d self.logger.info if results else self.logger.debug"},{"line_number":429,"context_line":"            log_method("},{"line_number":430,"context_line":"                \"Successful rsync of %(src)s to %(dst)s (%(time).03f)\","},{"line_number":431,"context_line":"                {\u0027src\u0027: args[-2], \u0027dst\u0027: args[-1], \u0027time\u0027: total_time})"},{"line_number":432,"context_line":"        return ret_val"},{"line_number":433,"context_line":""}],"source_content_type":"text/x-python","patch_set":8,"id":"d43da666_caa7021b","line":430,"in_reply_to":"f92c2245_921ac19f","updated":"2024-02-06 15:38:48.000000000","message":"the move the batching seems significant enough change in the design to warrent some amount of rework in the logging - it feels like we just took some existing code that was orginally designed to track errors per part an indented it.\n\nMaybe collect the results in a list and log once at the end?","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"c66c35393a18a03576bea2e5f7c54899e99d7c38","unresolved":true,"context_lines":[{"line_number":621,"context_line":"            self.logger.timing_since(\u0027partition.delete.timing\u0027, begin)"},{"line_number":622,"context_line":""},{"line_number":623,"context_line":"    def delete_suffixes(self, part_path, suffixes):"},{"line_number":624,"context_line":"        self.logger.info(\"Removing %s suffixes from partition: %s\","},{"line_number":625,"context_line":"                         len(suffixes), part_path)"},{"line_number":626,"context_line":"        for suffix in suffixes:"},{"line_number":627,"context_line":"            try:"}],"source_content_type":"text/x-python","patch_set":8,"id":"efed6db8_33be66d2","line":624,"updated":"2022-08-20 05:09:07.000000000","message":"If we\u0027re going to delete them -- we ought to invalidate the suffixes, too. Should be reasonably cheap, and it ensures the hashes.pkl isn\u0027t *too* far out of date.","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"12b004b4e6c6cb518827d1d278f436ba2223e8a6","unresolved":false,"context_lines":[{"line_number":621,"context_line":"            self.logger.timing_since(\u0027partition.delete.timing\u0027, begin)"},{"line_number":622,"context_line":""},{"line_number":623,"context_line":"    def delete_suffixes(self, part_path, suffixes):"},{"line_number":624,"context_line":"        self.logger.info(\"Removing %s suffixes from partition: %s\","},{"line_number":625,"context_line":"                         len(suffixes), part_path)"},{"line_number":626,"context_line":"        for suffix in suffixes:"},{"line_number":627,"context_line":"            try:"}],"source_content_type":"text/x-python","patch_set":8,"id":"22994373_2d003af9","line":624,"in_reply_to":"2e71bd95_dc258010","updated":"2022-10-23 04:40:41.000000000","message":"Done","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"f60e12b383ff7f4afd6dc587344d5f5ec09f1bb7","unresolved":true,"context_lines":[{"line_number":621,"context_line":"            self.logger.timing_since(\u0027partition.delete.timing\u0027, begin)"},{"line_number":622,"context_line":""},{"line_number":623,"context_line":"    def delete_suffixes(self, part_path, suffixes):"},{"line_number":624,"context_line":"        self.logger.info(\"Removing %s suffixes from partition: %s\","},{"line_number":625,"context_line":"                         len(suffixes), part_path)"},{"line_number":626,"context_line":"        for suffix in suffixes:"},{"line_number":627,"context_line":"            try:"}],"source_content_type":"text/x-python","patch_set":8,"id":"2e71bd95_dc258010","line":624,"in_reply_to":"efed6db8_33be66d2","updated":"2022-08-22 19:50:55.000000000","message":"More than just invalidate -- find a way to flag them to be removed :-/\n\nAlso, this logging is likely too noisy for info -- maybe keep it at debug, though?","commit_id":"e4738b70f324095331068754bb1b1833661f6451"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":526,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":527,"context_line":"                synced_remote_regions \u003d {}"},{"line_number":528,"context_line":"                delete_objs \u003d None"},{"line_number":529,"context_line":"                if suffixes:"},{"line_number":530,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":531,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":532,"context_line":"                        kwargs \u003d {}"}],"source_content_type":"text/x-python","patch_set":12,"id":"b572e107_be2323e7","side":"PARENT","line":529,"updated":"2024-01-26 00:49:21.000000000","message":"apparently in the else case where suffixes \u003d [] we\u0027re not supposed to delete the partition; I don\u0027t understand that.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":526,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":527,"context_line":"                synced_remote_regions \u003d {}"},{"line_number":528,"context_line":"                delete_objs \u003d None"},{"line_number":529,"context_line":"                if suffixes:"},{"line_number":530,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":531,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":532,"context_line":"                        kwargs \u003d {}"}],"source_content_type":"text/x-python","patch_set":12,"id":"70c85da2_a2e36d38","side":"PARENT","line":529,"in_reply_to":"0f500b23_6928254d","updated":"2024-02-06 15:38:48.000000000","message":"oh, so suffixes \u003d [] DOES delete the partition - that makes more sense: we\u0027re ostensibly in the code path for a non-primary part - if it\u0027s empty may as well clean it up.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":526,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":527,"context_line":"                synced_remote_regions \u003d {}"},{"line_number":528,"context_line":"                delete_objs \u003d None"},{"line_number":529,"context_line":"                if suffixes:"},{"line_number":530,"context_line":"                    for node in job[\u0027nodes\u0027]:"},{"line_number":531,"context_line":"                        stats.rsync +\u003d 1"},{"line_number":532,"context_line":"                        kwargs \u003d {}"}],"source_content_type":"text/x-python","patch_set":12,"id":"0f500b23_6928254d","side":"PARENT","line":529,"in_reply_to":"b572e107_be2323e7","updated":"2024-01-27 00:24:21.000000000","message":"It\u0027s handled at the end of this try block, just before the `PartitionLockTimeout` handling. No suffixes \u003d\u003e no responses \u003d\u003e `delete_handoff` is false, so we fall into the `elif not suffixes:` block.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":578,"context_line":"                                [(failure_dev[\u0027replication_ip\u0027],"},{"line_number":579,"context_line":"                                  failure_dev[\u0027device\u0027])"},{"line_number":580,"context_line":"                                 for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":581,"context_line":"                    else:"},{"line_number":582,"context_line":"                        self.delete_partition(job[\u0027path\u0027])"},{"line_number":583,"context_line":"                        handoff_partition_deleted \u003d True"},{"line_number":584,"context_line":"                elif not suffixes:"}],"source_content_type":"text/x-python","patch_set":12,"id":"31e13cf3_4f64542c","side":"PARENT","line":581,"updated":"2024-01-26 00:49:21.000000000","message":"so ssync\u0027s \"delete_objs/candidates\" is always some kind of \"partial failure\" (we wouldn\u0027t remove the partition) even tho all resp are *success*?","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":578,"context_line":"                                [(failure_dev[\u0027replication_ip\u0027],"},{"line_number":579,"context_line":"                                  failure_dev[\u0027device\u0027])"},{"line_number":580,"context_line":"                                 for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":581,"context_line":"                    else:"},{"line_number":582,"context_line":"                        self.delete_partition(job[\u0027path\u0027])"},{"line_number":583,"context_line":"                        handoff_partition_deleted \u003d True"},{"line_number":584,"context_line":"                elif not suffixes:"}],"source_content_type":"text/x-python","patch_set":12,"id":"2ee46862_91e55a6e","side":"PARENT","line":581,"in_reply_to":"11f8279e_29a496f7","updated":"2024-02-07 16:56:50.000000000","message":"maybe \"partial success\" would be more glass is half full.\n\nI\u0027m relieved to know that this multi-region-ssync-optimization complexity is just collateral damage - making it more obvious these green lines are not what this patch is about is probably going to help it get merged.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":578,"context_line":"                                [(failure_dev[\u0027replication_ip\u0027],"},{"line_number":579,"context_line":"                                  failure_dev[\u0027device\u0027])"},{"line_number":580,"context_line":"                                 for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":581,"context_line":"                    else:"},{"line_number":582,"context_line":"                        self.delete_partition(job[\u0027path\u0027])"},{"line_number":583,"context_line":"                        handoff_partition_deleted \u003d True"},{"line_number":584,"context_line":"                elif not suffixes:"}],"source_content_type":"text/x-python","patch_set":12,"id":"f6bdd66c_f159de44","side":"PARENT","line":581,"in_reply_to":"11f8279e_29a496f7","updated":"2024-02-07 16:56:50.000000000","message":"maybe \"partial success\" would have been more glass-is-half-full\n\n\u003e I can assure you that applicability toward ssync was an afterthought\n\nwell, it\u0027s there tho - burden of mainteance.  Maybe it would help move things along to invest in a method-extract prefactor that moves this \"ssync multi-region mess\" out of the way (maybe tpool the object delete, or only increment removed when we actually remove the part) - then when we get to THIS patch it\u0027s more obvious that it\u0027s not about THAT.","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":578,"context_line":"                                [(failure_dev[\u0027replication_ip\u0027],"},{"line_number":579,"context_line":"                                  failure_dev[\u0027device\u0027])"},{"line_number":580,"context_line":"                                 for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":581,"context_line":"                    else:"},{"line_number":582,"context_line":"                        self.delete_partition(job[\u0027path\u0027])"},{"line_number":583,"context_line":"                        handoff_partition_deleted \u003d True"},{"line_number":584,"context_line":"                elif not suffixes:"}],"source_content_type":"text/x-python","patch_set":12,"id":"d04868ec_b97dd9b4","side":"PARENT","line":581,"in_reply_to":"31e13cf3_4f64542c","updated":"2024-02-06 15:38:48.000000000","message":"Tim do you have any insight into this?\n\nEC uses SSYNC exclusively and it\u0027s the only place we have to consistently turn on handoffs_only mode to complete a rebalance.  It seems like if you really want SSYNC to support suffix batching that would be the place to start (and afterwards maybe ops won\u0027t need max_objects_per_revert?)\n\nIf you want to focus on replicators (for whatever reason) maybe it would be helpful to break apart this ssync specific behavior first?  Or maybe somehow this ssync-partial-failure mode is significant and related to the motivation of replicator suffix batching?","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":578,"context_line":"                                [(failure_dev[\u0027replication_ip\u0027],"},{"line_number":579,"context_line":"                                  failure_dev[\u0027device\u0027])"},{"line_number":580,"context_line":"                                 for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":581,"context_line":"                    else:"},{"line_number":582,"context_line":"                        self.delete_partition(job[\u0027path\u0027])"},{"line_number":583,"context_line":"                        handoff_partition_deleted \u003d True"},{"line_number":584,"context_line":"                elif not suffixes:"}],"source_content_type":"text/x-python","patch_set":12,"id":"11f8279e_29a496f7","side":"PARENT","line":581,"in_reply_to":"d04868ec_b97dd9b4","updated":"2024-02-07 07:33:47.000000000","message":"IDK about \"partial failure\", but yeah, today a fully successful ssync of a handoff partition in a multi-region cluster will delete all suffixes but leave the partition to be cleaned up on the next cycle. It also does nothing to invalidate hashes, so ... that\u0027s fun.\n\n\u003e  Or maybe somehow this ssync-partial-failure mode is significant and related to the motivation of replicator suffix batching?\n\nWhatever else may be said of this patch, I can assure you that applicability toward ssync was an afterthought 😄","commit_id":"7b1f7ee857b0d7019b3e6e17a75d38ebbf650c15"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":528,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":529,"context_line":"                                       timeout\u003d0.2):"},{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"}],"source_content_type":"text/x-python","patch_set":12,"id":"7871b14b_173c62ca","line":531,"updated":"2024-01-26 00:49:21.000000000","message":"so this always goes to listdir in the part; there\u0027s no hashes invaldate suffix\u003dNone keys to worry about.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":528,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":529,"context_line":"                                       timeout\u003d0.2):"},{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"}],"source_content_type":"text/x-python","patch_set":12,"id":"e3a1f2d1_c9985bbf","line":531,"in_reply_to":"7871b14b_173c62ca","updated":"2024-02-06 15:38:48.000000000","message":"i.e. the empty suffix removal doesn\u0027t benifit swift itself - but MAYBE some hypothetical external monitoring that\u0027s trying to avoid listdirs (N.B. swift itself doesn\u0027t try to avoid the listdir on \"soon to be evicted\" parts)","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":528,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":529,"context_line":"                                       timeout\u003d0.2):"},{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"}],"source_content_type":"text/x-python","patch_set":12,"id":"d9bd7a9f_c07b4978","line":531,"in_reply_to":"e3a1f2d1_c9985bbf","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":528,"context_line":"                                       job[\u0027partition\u0027], name\u003d\u0027replication\u0027,"},{"line_number":529,"context_line":"                                       timeout\u003d0.2):"},{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"}],"source_content_type":"text/x-python","patch_set":12,"id":"f6c73047_70bab4f6","line":531,"in_reply_to":"e3a1f2d1_c9985bbf","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"}],"source_content_type":"text/x-python","patch_set":12,"id":"a92d3ac8_6369b40c","line":533,"updated":"2024-01-26 00:49:21.000000000","message":"ok, so THIS patch adds the \"revert\" concept to the replicator; that\u0027s fine I guess - I like the term, I just need to update my brain to not map \"revert\" \u003d\u003d \"reconstructor\"","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"}],"source_content_type":"text/x-python","patch_set":12,"id":"e1d5ff83_95a9276f","line":533,"in_reply_to":"a92d3ac8_6369b40c","updated":"2024-02-06 15:38:48.000000000","message":"and maybe also this method name","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"}],"source_content_type":"text/x-python","patch_set":12,"id":"7519d9cb_93a53380","line":533,"in_reply_to":"e1d5ff83_95a9276f","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":530,"context_line":"                batches_deleted \u003d []"},{"line_number":531,"context_line":"                suffixes \u003d tpool.execute(tpool_get_suffixes, job[\u0027path\u0027])"},{"line_number":532,"context_line":"                random.shuffle(suffixes)"},{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"}],"source_content_type":"text/x-python","patch_set":12,"id":"87f54ac1_ed3ede38","line":533,"in_reply_to":"e1d5ff83_95a9276f","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"},{"line_number":537,"context_line":"                    responses \u003d []"},{"line_number":538,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":539,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":12,"id":"d663c14b_57387382","line":536,"updated":"2024-01-26 00:49:21.000000000","message":"this \u0027not suffixes_to_revert\u0027 breaks out of the loop","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"},{"line_number":537,"context_line":"                    responses \u003d []"},{"line_number":538,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":539,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":12,"id":"a8322e14_90b9cbf1","line":536,"in_reply_to":"b109bef8_34666fc1","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"e8b774570d39d8192bb0295663034a9e307ee13a","unresolved":true,"context_lines":[{"line_number":533,"context_line":"                for suffixes_to_revert in distribute_evenly("},{"line_number":534,"context_line":"                        suffixes, self.sync_batches_per_revert):"},{"line_number":535,"context_line":"                    if not suffixes_to_revert:"},{"line_number":536,"context_line":"                        break"},{"line_number":537,"context_line":"                    responses \u003d []"},{"line_number":538,"context_line":"                    synced_remote_regions \u003d {}"},{"line_number":539,"context_line":"                    delete_objs \u003d None"}],"source_content_type":"text/x-python","patch_set":12,"id":"b109bef8_34666fc1","line":536,"in_reply_to":"d663c14b_57387382","updated":"2024-01-27 00:24:21.000000000","message":"Yup; happens when you have fewer suffixes than batches:\n```\n\u003e\u003e\u003e distribute_evenly(range(2), 5)\n[[0], [1], [], [], []]\n```","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":573,"context_line":"                        stats.remove +\u003d 1"},{"line_number":574,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":575,"context_line":"                                and delete_objs is not None):"},{"line_number":576,"context_line":"                            delete_handoff \u003d False  # not deleting *everything*"},{"line_number":577,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":578,"context_line":"                                             len(delete_objs))"},{"line_number":579,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("}],"source_content_type":"text/x-python","patch_set":12,"id":"40dc3fde_cd49c712","line":576,"updated":"2024-01-26 00:49:21.000000000","message":"so under delete_handoff \u003d True we set delete_handoff to False *if* we\u0027re running ssync\n\nthere *has* to be a better way to word this; what is ssync doing?\n\nhttps://review.opendev.org/c/openstack/swift/+/906770","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":573,"context_line":"                        stats.remove +\u003d 1"},{"line_number":574,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":575,"context_line":"                                and delete_objs is not None):"},{"line_number":576,"context_line":"                            delete_handoff \u003d False  # not deleting *everything*"},{"line_number":577,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":578,"context_line":"                                             len(delete_objs))"},{"line_number":579,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("}],"source_content_type":"text/x-python","patch_set":12,"id":"ab7ca402_684f569d","line":576,"in_reply_to":"23f980ad_fe2457b8","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":573,"context_line":"                        stats.remove +\u003d 1"},{"line_number":574,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":575,"context_line":"                                and delete_objs is not None):"},{"line_number":576,"context_line":"                            delete_handoff \u003d False  # not deleting *everything*"},{"line_number":577,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":578,"context_line":"                                             len(delete_objs))"},{"line_number":579,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("}],"source_content_type":"text/x-python","patch_set":12,"id":"597270bf_d7cceedf","line":576,"in_reply_to":"23f980ad_fe2457b8","updated":"2024-02-07 16:56:50.000000000","message":"if delete_handoff:\n        delete_handoff \u003d False\n        \n... was NOT pre-existing and look akward; granted it wasn\u0027t/isn\u0027t super obvious that ssync wouldn\u0027t always delete the partition even if delete_handoff (or why), but if you squint the pre-existing code was just:\n\n    if delete_handoff:\n        if ssync and special_condition:\n            ...\n        else:\n            delete_partition()\n            \nwhen you moved delete_partition out of the delete_handoff block you inherited that problem, and using the explicit bool to track \"all batches success\" seems pretty obvious to me and you can quickly bounce to the two conditions when that optimistic flag gets flipped.  I\u0027m glad you\u0027re ambivolent - b/c to me it\u0027s a big improvement.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":573,"context_line":"                        stats.remove +\u003d 1"},{"line_number":574,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":575,"context_line":"                                and delete_objs is not None):"},{"line_number":576,"context_line":"                            delete_handoff \u003d False  # not deleting *everything*"},{"line_number":577,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":578,"context_line":"                                             len(delete_objs))"},{"line_number":579,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("}],"source_content_type":"text/x-python","patch_set":12,"id":"47a9389e_04190ac3","line":576,"in_reply_to":"40dc3fde_cd49c712","updated":"2024-02-06 15:38:48.000000000","message":"still really bugged about\n\n    if delete_handoff:\n        if ssync:\n            delete_handoff \u003d False\n            \nit looks like my follow-up failed grenade but otherwise gets to the heart of the issue; why keep track of batches_deleted and rely on all([]) \u003d\u003d True if we can have an explicitly named boolean that *draws attention* to this weird ssync partial failure handling in the middle of the replicators rsync/revert handling.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":573,"context_line":"                        stats.remove +\u003d 1"},{"line_number":574,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":575,"context_line":"                                and delete_objs is not None):"},{"line_number":576,"context_line":"                            delete_handoff \u003d False  # not deleting *everything*"},{"line_number":577,"context_line":"                            self.logger.info(\"Removing %s objects\","},{"line_number":578,"context_line":"                                             len(delete_objs))"},{"line_number":579,"context_line":"                            _junk, error_paths \u003d self.delete_handoff_objs("}],"source_content_type":"text/x-python","patch_set":12,"id":"23f980ad_fe2457b8","line":576,"in_reply_to":"47a9389e_04190ac3","updated":"2024-02-07 07:33:47.000000000","message":"\u003e still really bugged about ...\n\nSee, that seems like a pre-existing issue to me... if we get into this block, we\u0027re **not** deleting the handoff partition, regardless of what variable manipulations I do -- at least I\u0027m trying to keep us honest about that.\n\nw/e, I\u0027ll squash it in.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":591,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":592,"context_line":"                        else:"},{"line_number":593,"context_line":"                            self.delete_suffixes("},{"line_number":594,"context_line":"                                job[\u0027path\u0027], suffixes_to_revert)"},{"line_number":595,"context_line":""},{"line_number":596,"context_line":"                    batches_deleted.append(delete_handoff)"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":12,"id":"86808801_a415a52a","line":594,"updated":"2024-01-26 00:49:21.000000000","message":"if i\u0027m reading the diff right this \"progressive delete\" is a new feature for rsync; and we only delete the suffixes if success_count (for the batch) is \u003e than handoff_delete.  \"delete_suffixes\" is a new interface in the replicator.","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":591,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":592,"context_line":"                        else:"},{"line_number":593,"context_line":"                            self.delete_suffixes("},{"line_number":594,"context_line":"                                job[\u0027path\u0027], suffixes_to_revert)"},{"line_number":595,"context_line":""},{"line_number":596,"context_line":"                    batches_deleted.append(delete_handoff)"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":12,"id":"34d741ec_9ffdd459","line":594,"in_reply_to":"86808801_a415a52a","updated":"2024-02-06 15:38:48.000000000","message":"\"progressive delete\" is a new feature for *the replicator\" (rsync or ssync)\n\nit does still respect handoff_delete and the delete_suffixes interface is new","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":631,"context_line":"                    # or some disk corruption that happened after the sync"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":"            else:"},{"line_number":634,"context_line":"                invalidate_hash(suffix_dir, removed\u003dTrue)"},{"line_number":635,"context_line":""},{"line_number":636,"context_line":"    def delete_partition(self, path):"},{"line_number":637,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":12,"id":"3e8572e8_fb3a2345","line":634,"updated":"2024-01-26 00:49:21.000000000","message":"what are we trying to accomplish with removed here?","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":631,"context_line":"                    # or some disk corruption that happened after the sync"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":"            else:"},{"line_number":634,"context_line":"                invalidate_hash(suffix_dir, removed\u003dTrue)"},{"line_number":635,"context_line":""},{"line_number":636,"context_line":"    def delete_partition(self, path):"},{"line_number":637,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":12,"id":"15b3d9c2_83afad5a","line":634,"in_reply_to":"3685218d_631d266e","updated":"2024-02-07 16:56:50.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":631,"context_line":"                    # or some disk corruption that happened after the sync"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":"            else:"},{"line_number":634,"context_line":"                invalidate_hash(suffix_dir, removed\u003dTrue)"},{"line_number":635,"context_line":""},{"line_number":636,"context_line":"    def delete_partition(self, path):"},{"line_number":637,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":12,"id":"3330089e_dafb9205","line":634,"in_reply_to":"3685218d_631d266e","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":631,"context_line":"                    # or some disk corruption that happened after the sync"},{"line_number":632,"context_line":"                    raise"},{"line_number":633,"context_line":"            else:"},{"line_number":634,"context_line":"                invalidate_hash(suffix_dir, removed\u003dTrue)"},{"line_number":635,"context_line":""},{"line_number":636,"context_line":"    def delete_partition(self, path):"},{"line_number":637,"context_line":"        self.logger.info(\"Removing partition: %s\", path)"}],"source_content_type":"text/x-python","patch_set":12,"id":"3685218d_631d266e","line":634,"in_reply_to":"3e8572e8_fb3a2345","updated":"2024-02-06 15:38:48.000000000","message":"AFAICT the idea seems to be that something outside of swift MAY be monitoring hashes.pkl and after this new progressive-delete is landed you don\u0027t want to have to os.listdir\n\nN.B. there\u0027s no use-case or requirement for this in-tree and nvidia\u0027s SRE suffix monitoring already uses os.listdir (same as the replicator\u0027s update_deleted/revert method itself)","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":555,"context_line":"                            synced_remote_regions[node[\u0027region\u0027]] \u003d viewkeys("},{"line_number":556,"context_line":"                                candidates)"},{"line_number":557,"context_line":"                        responses.append(success)"},{"line_number":558,"context_line":"                    for cand_objs in synced_remote_regions.values():"},{"line_number":559,"context_line":"                        if delete_objs is None:"},{"line_number":560,"context_line":"                            delete_objs \u003d cand_objs"},{"line_number":561,"context_line":"                        else:"}],"source_content_type":"text/x-python","patch_set":13,"id":"a8108f4e_f34c92b8","line":558,"range":{"start_line":558,"start_character":37,"end_line":558,"end_character":58},"updated":"2024-02-07 07:33:47.000000000","message":"I wonder if we should have the SAIO docs recommend splitting to at least two regions...","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":555,"context_line":"                            synced_remote_regions[node[\u0027region\u0027]] \u003d viewkeys("},{"line_number":556,"context_line":"                                candidates)"},{"line_number":557,"context_line":"                        responses.append(success)"},{"line_number":558,"context_line":"                    for cand_objs in synced_remote_regions.values():"},{"line_number":559,"context_line":"                        if delete_objs is None:"},{"line_number":560,"context_line":"                            delete_objs \u003d cand_objs"},{"line_number":561,"context_line":"                        else:"}],"source_content_type":"text/x-python","patch_set":13,"id":"8f579e83_13cbe7ee","line":558,"range":{"start_line":558,"start_character":37,"end_line":558,"end_character":58},"in_reply_to":"a8108f4e_f34c92b8","updated":"2024-02-07 16:56:50.000000000","message":"Acknowledged","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":652,"context_line":"            tpool.execute(shutil.rmtree, object_path, ignore_errors\u003dTrue)"},{"line_number":653,"context_line":"            suffix_dir \u003d dirname(object_path)"},{"line_number":654,"context_line":"            try:"},{"line_number":655,"context_line":"                os.rmdir(suffix_dir)"},{"line_number":656,"context_line":"                success_paths.append(object_path)"},{"line_number":657,"context_line":"            except OSError as e:"},{"line_number":658,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"}],"source_content_type":"text/x-python","patch_set":13,"id":"5e7a2ebe_60052f6c","line":655,"updated":"2024-02-07 07:33:47.000000000","message":"😞 This should probably be in a `tpool.execute`, too...","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":652,"context_line":"            tpool.execute(shutil.rmtree, object_path, ignore_errors\u003dTrue)"},{"line_number":653,"context_line":"            suffix_dir \u003d dirname(object_path)"},{"line_number":654,"context_line":"            try:"},{"line_number":655,"context_line":"                os.rmdir(suffix_dir)"},{"line_number":656,"context_line":"                success_paths.append(object_path)"},{"line_number":657,"context_line":"            except OSError as e:"},{"line_number":658,"context_line":"                if e.errno not in (errno.ENOENT, errno.ENOTEMPTY):"}],"source_content_type":"text/x-python","patch_set":13,"id":"88d254ec_fc638b0c","line":655,"in_reply_to":"5e7a2ebe_60052f6c","updated":"2024-02-07 16:56:50.000000000","message":"Acknowledged","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":660,"context_line":"                    self.logger.exception("},{"line_number":661,"context_line":"                        \"Unexpected error trying to cleanup suffix dir %r\","},{"line_number":662,"context_line":"                        suffix_dir)"},{"line_number":663,"context_line":"        return success_paths, error_paths"},{"line_number":664,"context_line":""},{"line_number":665,"context_line":"    def update(self, job):"},{"line_number":666,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":13,"id":"9216a008_54975c7e","line":663,"updated":"2024-02-07 07:33:47.000000000","message":"Ugh -- I just realized this _does not_ in fact partition `delete_objs` into either `success_paths` or `error_paths` -- sometimes it drops a path on the floor! Which is OK, I guess -- after all, we drop all of `success_paths` on the floor in the one and only caller 🙄","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":660,"context_line":"                    self.logger.exception("},{"line_number":661,"context_line":"                        \"Unexpected error trying to cleanup suffix dir %r\","},{"line_number":662,"context_line":"                        suffix_dir)"},{"line_number":663,"context_line":"        return success_paths, error_paths"},{"line_number":664,"context_line":""},{"line_number":665,"context_line":"    def update(self, job):"},{"line_number":666,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":13,"id":"35feb71c_d3cd1ffd","line":663,"in_reply_to":"9216a008_54975c7e","updated":"2024-02-07 16:56:50.000000000","message":"jank af","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":660,"context_line":"                    self.logger.exception("},{"line_number":661,"context_line":"                        \"Unexpected error trying to cleanup suffix dir %r\","},{"line_number":662,"context_line":"                        suffix_dir)"},{"line_number":663,"context_line":"        return success_paths, error_paths"},{"line_number":664,"context_line":""},{"line_number":665,"context_line":"    def update(self, job):"},{"line_number":666,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":13,"id":"282d9742_e55c1b25","line":663,"in_reply_to":"9216a008_54975c7e","updated":"2024-02-07 16:56:50.000000000","message":"thanks for calling this out as part of the review process - this comment is spot on and probably doesn\u0027t belong in *this* change when we merge it.\n\nHow about we do a method-extract prefactor for this whole ssync mess (maybe fix the self.removed +\u003d 1 even when we don\u0027t remove the part) and then when we get to *this* patch there\u0027s no delta between the existing ssync mess to distract us from what you\u0027re trying to do with batches.","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":410,"context_line":"                continue"},{"line_number":411,"context_line":"            if result.startswith(\u0027\u003c\u0027) and not self.log_rsync_transfers:"},{"line_number":412,"context_line":"                continue"},{"line_number":413,"context_line":"            logged_line \u003d True"},{"line_number":414,"context_line":"            if not ret_val:"},{"line_number":415,"context_line":"                self.logger.debug(result)"},{"line_number":416,"context_line":"            else:"}],"source_content_type":"text/x-python","patch_set":14,"id":"57cba0c6_4174014d","line":413,"updated":"2024-02-07 16:56:50.000000000","message":"i don\u0027t understand the significance of the difference between logged_line and results in the conditional below.  What\u0027s the goal here?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":410,"context_line":"                continue"},{"line_number":411,"context_line":"            if result.startswith(\u0027\u003c\u0027) and not self.log_rsync_transfers:"},{"line_number":412,"context_line":"                continue"},{"line_number":413,"context_line":"            logged_line \u003d True"},{"line_number":414,"context_line":"            if not ret_val:"},{"line_number":415,"context_line":"                self.logger.debug(result)"},{"line_number":416,"context_line":"            else:"}],"source_content_type":"text/x-python","patch_set":14,"id":"634292a1_be3f3e28","line":413,"in_reply_to":"57cba0c6_4174014d","updated":"2024-08-28 19:52:39.000000000","message":"\u003e What\u0027s the goal here?\n\nMake it easier for ops to understand *why* it got logged at a particular level -- it seemed weird to split logging about success between info/debug based off something that won\u0027t even show up in logs. I don\u0027t remember exactly why I went with this part the way it is, but I see three reasonable paths forward:\n\n- log success at info if and only if there was something else worth logging about (i.e., this patch)\n- log success at info if and only if there was some data transferred (i.e., move this up a couple lines, in between the `cd+` and `\u003c` handling)\n- log success at info (or maybe debug?) *all* the time -- it\u0027s *weird* that we split it, but [it\u0027s also super old](https://github.com/openstack/swift/blob/001407b969bc12d48bd7f10960f84f519bb19111/swift/obj/replicator.py#L248-L254)\n\nI think this could be one step in a larger story to improve the signal coming out of logs. At _some point_, we should really work toward not running prod at debug (unless/until there\u0027s some issue that we\u0027re actively diagnosing).","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":465,"context_line":"        data_dir \u003d get_data_dir(job[\u0027policy\u0027])"},{"line_number":466,"context_line":"        args.append(join(rsync_module, node[\u0027device\u0027],"},{"line_number":467,"context_line":"                    data_dir, job[\u0027partition\u0027]))"},{"line_number":468,"context_line":"        success \u003d (self._rsync(args) \u003d\u003d 0)"},{"line_number":469,"context_line":""},{"line_number":470,"context_line":"        # TODO: Catch and swallow (or at least minimize) timeouts when doing"},{"line_number":471,"context_line":"        # an update job; if we don\u0027t manage to notify the remote, we should"}],"source_content_type":"text/x-python","patch_set":14,"id":"ea61442a_404c84f2","line":468,"updated":"2024-02-07 16:56:50.000000000","message":"this method maybe *always* logs?  it was \"assumed\" to be \"once per part\" i.e. \"one batch of 4096\"  I\u0027m not sure what the ideal value for batch size is going to end up being... does this increas logs by 4x?  100x?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"643263023f4df352229befb7dfb35fedbd995db0","unresolved":true,"context_lines":[{"line_number":465,"context_line":"        data_dir \u003d get_data_dir(job[\u0027policy\u0027])"},{"line_number":466,"context_line":"        args.append(join(rsync_module, node[\u0027device\u0027],"},{"line_number":467,"context_line":"                    data_dir, job[\u0027partition\u0027]))"},{"line_number":468,"context_line":"        success \u003d (self._rsync(args) \u003d\u003d 0)"},{"line_number":469,"context_line":""},{"line_number":470,"context_line":"        # TODO: Catch and swallow (or at least minimize) timeouts when doing"},{"line_number":471,"context_line":"        # an update job; if we don\u0027t manage to notify the remote, we should"}],"source_content_type":"text/x-python","patch_set":14,"id":"c4a2d496_78d92873","line":468,"in_reply_to":"3f53ead9_055a8e20","updated":"2025-05-21 15:57:32.000000000","message":"\u003e I\u0027ve been thinking around 10.\n\nso instead of a single rsync of 4096 suffix and one log line we\u0027ll have 10 rsync with ~400 suffix each; k.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":465,"context_line":"        data_dir \u003d get_data_dir(job[\u0027policy\u0027])"},{"line_number":466,"context_line":"        args.append(join(rsync_module, node[\u0027device\u0027],"},{"line_number":467,"context_line":"                    data_dir, job[\u0027partition\u0027]))"},{"line_number":468,"context_line":"        success \u003d (self._rsync(args) \u003d\u003d 0)"},{"line_number":469,"context_line":""},{"line_number":470,"context_line":"        # TODO: Catch and swallow (or at least minimize) timeouts when doing"},{"line_number":471,"context_line":"        # an update job; if we don\u0027t manage to notify the remote, we should"}],"source_content_type":"text/x-python","patch_set":14,"id":"3f53ead9_055a8e20","line":468,"in_reply_to":"ea61442a_404c84f2","updated":"2024-08-28 19:52:39.000000000","message":"\u003e this method maybe always logs?\n\nYup, and always has. Part of why I wanted to find a way to make log levels more useful in filtering out uninteresting messages.\n\n\u003e I\u0027m not sure what the ideal value for batch size is going to end up being...\n\nI\u0027ve been thinking around 10.\n\n\u003e does this increas logs by 4x? 100x?\n\nIncrease which logs? object-replicator logs in particular -- yeah, maybe 10x during a rebalance; depends on how many batches you\u0027ve got configured. Object server logs in general, though? I\u0027d expect it to be a drop in the bucket compared to the logs for actual client-driven usage. And here *we* are, dumping everything into a `/var/log/swift/all.log`...","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":473,"context_line":"        if success or not job[\u0027delete\u0027]:"},{"line_number":474,"context_line":"            headers \u003d dict(self.default_headers)"},{"line_number":475,"context_line":"            headers[\u0027X-Backend-Storage-Policy-Index\u0027] \u003d int(job[\u0027policy\u0027])"},{"line_number":476,"context_line":"            with Timeout(self.http_timeout):"},{"line_number":477,"context_line":"                conn \u003d http_connect("},{"line_number":478,"context_line":"                    node[\u0027replication_ip\u0027], node[\u0027replication_port\u0027],"},{"line_number":479,"context_line":"                    node[\u0027device\u0027], job[\u0027partition\u0027], \u0027REPLICATE\u0027,"}],"source_content_type":"text/x-python","patch_set":14,"id":"3dd00193_ab5ba61c","line":476,"updated":"2024-02-07 16:56:50.000000000","message":"do we want this timeout to pop the whole partition or just one suffix?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":473,"context_line":"        if success or not job[\u0027delete\u0027]:"},{"line_number":474,"context_line":"            headers \u003d dict(self.default_headers)"},{"line_number":475,"context_line":"            headers[\u0027X-Backend-Storage-Policy-Index\u0027] \u003d int(job[\u0027policy\u0027])"},{"line_number":476,"context_line":"            with Timeout(self.http_timeout):"},{"line_number":477,"context_line":"                conn \u003d http_connect("},{"line_number":478,"context_line":"                    node[\u0027replication_ip\u0027], node[\u0027replication_port\u0027],"},{"line_number":479,"context_line":"                    node[\u0027device\u0027], job[\u0027partition\u0027], \u0027REPLICATE\u0027,"}],"source_content_type":"text/x-python","patch_set":14,"id":"a3b97ec3_e0614fc0","line":476,"in_reply_to":"3dd00193_ab5ba61c","updated":"2024-08-28 19:52:39.000000000","message":"I was planning on leaving that question until we actually address the `TODO` up above ;-)\n\nAt the moment, it bombs out for the whole partition, and pretty hard. It\u0027s surely a harsher failure mode than strictly necessary, but will ensure we attempt again on subsequent passes (as it prevents us from deleting anything).","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":478,"context_line":"                    node[\u0027replication_ip\u0027], node[\u0027replication_port\u0027],"},{"line_number":479,"context_line":"                    node[\u0027device\u0027], job[\u0027partition\u0027], \u0027REPLICATE\u0027,"},{"line_number":480,"context_line":"                    \u0027/\u0027 + \u0027-\u0027.join(suffixes), headers\u003dheaders)"},{"line_number":481,"context_line":"                conn.getresponse().read()"},{"line_number":482,"context_line":"        return success, {}"},{"line_number":483,"context_line":""},{"line_number":484,"context_line":"    def ssync(self, node, job, suffixes, remote_check_objs\u003dNone):"}],"source_content_type":"text/x-python","patch_set":14,"id":"df41564b_ee4e6d13","line":481,"updated":"2024-02-07 16:56:50.000000000","message":"do we WANT to call REPLICATE per-batch?  I guess we\u0027re telling the remote which suffixes it needs to rehash each batch.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":478,"context_line":"                    node[\u0027replication_ip\u0027], node[\u0027replication_port\u0027],"},{"line_number":479,"context_line":"                    node[\u0027device\u0027], job[\u0027partition\u0027], \u0027REPLICATE\u0027,"},{"line_number":480,"context_line":"                    \u0027/\u0027 + \u0027-\u0027.join(suffixes), headers\u003dheaders)"},{"line_number":481,"context_line":"                conn.getresponse().read()"},{"line_number":482,"context_line":"        return success, {}"},{"line_number":483,"context_line":""},{"line_number":484,"context_line":"    def ssync(self, node, job, suffixes, remote_check_objs\u003dNone):"}],"source_content_type":"text/x-python","patch_set":14,"id":"e1243220_fb820551","line":481,"in_reply_to":"df41564b_ee4e6d13","updated":"2024-08-28 19:52:39.000000000","message":"I thought half the point of https://github.com/openstack/swift/commit/5eaf1548 was that this is (now) pretty cheap -- I don\u0027t see why we\u0027d change from our invalidate-immediately-after-rsync policy. I think it\u0027s also necessary to feel confident in our being able to do the progressive delete -- we\u0027ll only delete if the transfer was successful *and* we invalidated the transferred suffixes.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":564,"context_line":"                    successes_count \u003d sum(1 for resp in responses if resp)"},{"line_number":565,"context_line":"                    required_successes \u003d self.handoff_delete or len(job[\u0027nodes\u0027])"},{"line_number":566,"context_line":"                    if successes_count \u003e\u003d required_successes:"},{"line_number":567,"context_line":"                        stats.remove +\u003d 1"},{"line_number":568,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":569,"context_line":"                                and delete_objs is not None):"},{"line_number":570,"context_line":"                            # multi-region ssync will send at most one replica"}],"source_content_type":"text/x-python","patch_set":14,"id":"94c7eaa3_266d0819","line":567,"updated":"2024-02-07 16:56:50.000000000","message":"should we be incrementing this every batch or per partition removed?  Should the existing code have been incrementing it regarldess of the ssync special condition?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":564,"context_line":"                    successes_count \u003d sum(1 for resp in responses if resp)"},{"line_number":565,"context_line":"                    required_successes \u003d self.handoff_delete or len(job[\u0027nodes\u0027])"},{"line_number":566,"context_line":"                    if successes_count \u003e\u003d required_successes:"},{"line_number":567,"context_line":"                        stats.remove +\u003d 1"},{"line_number":568,"context_line":"                        if (self.conf.get(\u0027sync_method\u0027, \u0027rsync\u0027) \u003d\u003d \u0027ssync\u0027"},{"line_number":569,"context_line":"                                and delete_objs is not None):"},{"line_number":570,"context_line":"                            # multi-region ssync will send at most one replica"}],"source_content_type":"text/x-python","patch_set":14,"id":"ae61bdcf_1ff30ba2","line":567,"in_reply_to":"94c7eaa3_266d0819","updated":"2024-08-28 19:52:39.000000000","message":"So there are two ways to make sense of the metrics we\u0027ve got today:\n\nThe first is to track them over time. This has questionable value: they can *only* be approximate since they\u0027re going out periodically to logs \u0026 recon, not statsd, and they\u0027re a per-worker, per-cycle gauge so they\u0027re reasonably often resetting.\n\nThe other is to track them relative to each other. At the `revert`-call level, it\u0027s useful to consider `success / attempted`; at the batch level, `remove / rsync` can be vaguely useful (if you keep in mind `handoff_delete` / `replica_count`).\n\nToday, on master, I don\u0027t see how we get much of any value out of `remove` that we don\u0027t already get out of `success`. Which is to say, I\u0027m fine with `remove` meaning \"I just unlinked some object data.\"\n\n\u003e Should the existing code have been incrementing it regarldess of the ssync special condition?\n\nFair question; probably, it should only do it `if delete_objs is None or delete_objs`","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":583,"context_line":"                            # the remote indicated it didn\u0027t need it, or we"},{"line_number":584,"context_line":"                            # would have hit some error during read)."},{"line_number":585,"context_line":"                            # Since it was some kind of failure,  flag the"},{"line_number":586,"context_line":"                            # remotes (!?) in failure_devs_info."},{"line_number":587,"context_line":"                            if error_paths:"},{"line_number":588,"context_line":"                                failure_devs_info.update("},{"line_number":589,"context_line":"                                    [(failure_dev[\u0027replication_ip\u0027],"}],"source_content_type":"text/x-python","patch_set":14,"id":"9b4b1ca7_bc611a47","line":586,"updated":"2024-02-07 16:56:50.000000000","message":"LOL @ \"(!?)\"\n\nthese comments are much improved.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":true,"context_lines":[{"line_number":591,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":592,"context_line":"                        else:"},{"line_number":593,"context_line":"                            self.delete_suffixes("},{"line_number":594,"context_line":"                                job[\u0027path\u0027], suffixes_to_revert)"},{"line_number":595,"context_line":"                    else:"},{"line_number":596,"context_line":"                        all_batches_synced_successfully \u003d False"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":14,"id":"4e6853e9_010656b7","line":594,"updated":"2024-02-07 16:56:50.000000000","message":"you hit this even if there\u0027s only one batch right?  We delete all the suffixes then we delete the whole part?","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"48e457f70501fe4ea15393c01f968c0c6b252486","unresolved":true,"context_lines":[{"line_number":591,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":592,"context_line":"                        else:"},{"line_number":593,"context_line":"                            self.delete_suffixes("},{"line_number":594,"context_line":"                                job[\u0027path\u0027], suffixes_to_revert)"},{"line_number":595,"context_line":"                    else:"},{"line_number":596,"context_line":"                        all_batches_synced_successfully \u003d False"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":14,"id":"50115c36_879c044c","line":594,"in_reply_to":"4e6853e9_010656b7","updated":"2024-08-28 19:52:39.000000000","message":"Yup; `delete_partition` should have dramatically less work to do. It makes it more obvious that there\u0027s a race wherein a proxy could write down new data to the partition in between and it\u0027d get deleted, but that\u0027s always been there.","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"643263023f4df352229befb7dfb35fedbd995db0","unresolved":false,"context_lines":[{"line_number":591,"context_line":"                                     for failure_dev in job[\u0027nodes\u0027]])"},{"line_number":592,"context_line":"                        else:"},{"line_number":593,"context_line":"                            self.delete_suffixes("},{"line_number":594,"context_line":"                                job[\u0027path\u0027], suffixes_to_revert)"},{"line_number":595,"context_line":"                    else:"},{"line_number":596,"context_line":"                        all_batches_synced_successfully \u003d False"},{"line_number":597,"context_line":""}],"source_content_type":"text/x-python","patch_set":14,"id":"075cd983_b9ab1736","line":594,"in_reply_to":"50115c36_879c044c","updated":"2025-05-21 15:57:32.000000000","message":"Acknowledged","commit_id":"4353dc6e513fff9853879d12be88ca01e693610d"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"675a41717bf856e3ebfbf9ba858ccf4dfbdcf01b","unresolved":true,"context_lines":[{"line_number":609,"context_line":"                        all_batches_synced_successfully \u003d False"},{"line_number":610,"context_line":""},{"line_number":611,"context_line":"                if all_batches_synced_successfully:"},{"line_number":612,"context_line":"                    self.delete_partition(job[\u0027path\u0027])"},{"line_number":613,"context_line":"                    handoff_partition_deleted \u003d True"},{"line_number":614,"context_line":"        except PartitionLockTimeout:"},{"line_number":615,"context_line":"            self.logger.info(\"Unable to lock handoff partition %s for \""}],"source_content_type":"text/x-python","patch_set":17,"id":"960f2d98_def17fa5","line":612,"updated":"2026-03-05 00:44:13.000000000","message":"If we change this to `rmdir` instead of `rmtree`, we ought to noticeably improve the situation for https://bugs.launchpad.net/swift/+bug/1408061","commit_id":"0555d3eba874ad35324d230c3878e19936a11e53"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ab7a8febaec574718e937890b4f12d104d5f19d","unresolved":true,"context_lines":[{"line_number":392,"context_line":"            if proc:"},{"line_number":393,"context_line":"                proc.kill()"},{"line_number":394,"context_line":"                try:"},{"line_number":395,"context_line":"                    # Note: Python 2.7\u0027s subprocess.Popen class doesn\u0027t take"},{"line_number":396,"context_line":"                    # any arguments for wait(), but Python 3\u0027s does."},{"line_number":397,"context_line":"                    # However, Eventlet\u0027s replacement Popen takes a timeout"},{"line_number":398,"context_line":"                    # argument regardless of Python version, so we don\u0027t"},{"line_number":399,"context_line":"                    # need any conditional code here."},{"line_number":400,"context_line":"                    proc.wait(timeout\u003d1.0)"},{"line_number":401,"context_line":"                except subprocess.TimeoutExpired:"},{"line_number":402,"context_line":"                    # Sometimes a process won\u0027t die immediately even after a"}],"source_content_type":"text/x-python","patch_set":28,"id":"b2486a51_27538853","line":399,"range":{"start_line":395,"start_character":20,"end_line":399,"end_character":53},"updated":"2026-05-11 05:45:50.000000000","message":"We don\u0027t support py2 anymore, so we probably don\u0027t even need this comment anymore!","commit_id":"e2d46f1fe2e8d19f3cd269d6682fb102678f9eb8"}],"test/unit/obj/test_diskfile.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":7974,"context_line":"            # Sure enough, it\u0027s in there"},{"line_number":7975,"context_line":"            with open(invalidations_file, \u0027r\u0027) as f:"},{"line_number":7976,"context_line":"                self.assertEqual(bad_suffix + \"\\n\", f.read())"},{"line_number":7977,"context_line":"            # ... and it propogates to the pickle when we consolidate"},{"line_number":7978,"context_line":"            hashes \u003d assert_consolidation([bad_suffix])"},{"line_number":7979,"context_line":"            self.assertIn(bad_suffix, hashes)"},{"line_number":7980,"context_line":"            self.assertIsNone(hashes[bad_suffix])"}],"source_content_type":"text/x-python","patch_set":13,"id":"c5cb98f1_7f250374","line":7977,"updated":"2024-02-06 15:38:48.000000000","message":"probably the ideal behavior here would be the invalidations line is ignored.","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":7974,"context_line":"            # Sure enough, it\u0027s in there"},{"line_number":7975,"context_line":"            with open(invalidations_file, \u0027r\u0027) as f:"},{"line_number":7976,"context_line":"                self.assertEqual(bad_suffix + \"\\n\", f.read())"},{"line_number":7977,"context_line":"            # ... and it propogates to the pickle when we consolidate"},{"line_number":7978,"context_line":"            hashes \u003d assert_consolidation([bad_suffix])"},{"line_number":7979,"context_line":"            self.assertIn(bad_suffix, hashes)"},{"line_number":7980,"context_line":"            self.assertIsNone(hashes[bad_suffix])"}],"source_content_type":"text/x-python","patch_set":13,"id":"b5b918e7_0817f4c0","line":7977,"in_reply_to":"5e776a33_461406da","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"6e2c2c9361f2096b69ed2360b130aab798959235","unresolved":true,"context_lines":[{"line_number":7974,"context_line":"            # Sure enough, it\u0027s in there"},{"line_number":7975,"context_line":"            with open(invalidations_file, \u0027r\u0027) as f:"},{"line_number":7976,"context_line":"                self.assertEqual(bad_suffix + \"\\n\", f.read())"},{"line_number":7977,"context_line":"            # ... and it propogates to the pickle when we consolidate"},{"line_number":7978,"context_line":"            hashes \u003d assert_consolidation([bad_suffix])"},{"line_number":7979,"context_line":"            self.assertIn(bad_suffix, hashes)"},{"line_number":7980,"context_line":"            self.assertIsNone(hashes[bad_suffix])"}],"source_content_type":"text/x-python","patch_set":13,"id":"5e776a33_461406da","line":7977,"in_reply_to":"c5cb98f1_7f250374","updated":"2024-02-07 07:33:47.000000000","message":"Can\u0027t change old code, man. Though I suppose we could have `consolidate_hashes` check `valid_suffix` going forward, then patch it out  for these upgrade tests.\n\nhttps://review.opendev.org/c/openstack/swift/+/908253","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":true,"context_lines":[{"line_number":7986,"context_line":"            self.assertIn(suffix, hashes)  # It\u0027s back! We really went to disk"},{"line_number":7987,"context_line":"            self.assertIsNotNone(hashes[suffix])"},{"line_number":7988,"context_line":"            self.assertIn(suffix2, hashes)"},{"line_number":7989,"context_line":"            self.assertIsNotNone(hashes[suffix2])"},{"line_number":7990,"context_line":""},{"line_number":7991,"context_line":"    def test_get_hashes_consolidates_suffix_rehash_once(self):"},{"line_number":7992,"context_line":"        for policy in self.iter_policies():"}],"source_content_type":"text/x-python","patch_set":13,"id":"2dd22420_013a5c75","line":7989,"updated":"2024-02-06 15:38:48.000000000","message":"these are helpful; it would still look better in a separate change","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"a5235bd5e5fe82d4bf77e08f2074021f81f2def0","unresolved":false,"context_lines":[{"line_number":7986,"context_line":"            self.assertIn(suffix, hashes)  # It\u0027s back! We really went to disk"},{"line_number":7987,"context_line":"            self.assertIsNotNone(hashes[suffix])"},{"line_number":7988,"context_line":"            self.assertIn(suffix2, hashes)"},{"line_number":7989,"context_line":"            self.assertIsNotNone(hashes[suffix2])"},{"line_number":7990,"context_line":""},{"line_number":7991,"context_line":"    def test_get_hashes_consolidates_suffix_rehash_once(self):"},{"line_number":7992,"context_line":"        for policy in self.iter_policies():"}],"source_content_type":"text/x-python","patch_set":13,"id":"c1a8e287_73ccc54f","line":7989,"in_reply_to":"2dd22420_013a5c75","updated":"2024-02-07 16:56:50.000000000","message":"Done","commit_id":"df279b2ef18b61e010909bdc4ccd311495feeef4"}],"test/unit/obj/test_replicator.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":1631,"context_line":"                instance.errno \u003d error_no"},{"line_number":1632,"context_line":"                instance.strerror \u003d os.strerror(error_no)"},{"line_number":1633,"context_line":""},{"line_number":1634,"context_line":"                def func(directory, **kwargs):"},{"line_number":1635,"context_line":"                    if directory \u003d\u003d suffix_dir_path:"},{"line_number":1636,"context_line":"                        raise instance"},{"line_number":1637,"context_line":"                    else:"}],"source_content_type":"text/x-python","patch_set":6,"id":"0565a12e_a118d57e","line":1634,"updated":"2022-06-27 21:55:22.000000000","message":"what is this?  it doesn\u0027t seem to be needed on my machine.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"74d66d24c580a196546efb2193d4cb48e91fd5ae","unresolved":true,"context_lines":[{"line_number":1631,"context_line":"                instance.errno \u003d error_no"},{"line_number":1632,"context_line":"                instance.strerror \u003d os.strerror(error_no)"},{"line_number":1633,"context_line":""},{"line_number":1634,"context_line":"                def func(directory, **kwargs):"},{"line_number":1635,"context_line":"                    if directory \u003d\u003d suffix_dir_path:"},{"line_number":1636,"context_line":"                        raise instance"},{"line_number":1637,"context_line":"                    else:"}],"source_content_type":"text/x-python","patch_set":6,"id":"818172bb_3791f462","line":1634,"in_reply_to":"0565a12e_a118d57e","updated":"2022-06-28 06:04:45.000000000","message":"At some point while I was futzing with things, I started getting TypeErrors bubbling up out of shutil.rmtree because this didn\u0027t take a fd kwarg or something. Might not be strictly necessary now, but it seems closer to the real -- or maybe it was necessary to have it fail at all sensibly when i backed out the changes under swift/ ? Also, what version of python were you running?","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":false,"context_lines":[{"line_number":1631,"context_line":"                instance.errno \u003d error_no"},{"line_number":1632,"context_line":"                instance.strerror \u003d os.strerror(error_no)"},{"line_number":1633,"context_line":""},{"line_number":1634,"context_line":"                def func(directory, **kwargs):"},{"line_number":1635,"context_line":"                    if directory \u003d\u003d suffix_dir_path:"},{"line_number":1636,"context_line":"                        raise instance"},{"line_number":1637,"context_line":"                    else:"}],"source_content_type":"text/x-python","patch_set":6,"id":"9e385218_91600585","line":1634,"in_reply_to":"2d549e73_6c10dcde","updated":"2024-01-26 00:49:21.000000000","message":"886541: CI: test under py311 | https://review.opendev.org/c/openstack/swift/+/886541 is merged\nso is 886538: tests: Fix replicator test for py311 | https://review.opendev.org/c/openstack/swift/+/886538\n\ni guess at this point we\u0027re just passing in whatever arguments come into our mocked rmdir to the real rmdir - that seems reasonable.","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"0627d229f02c8164fca7f11dfdc8e48d58ac03eb","unresolved":true,"context_lines":[{"line_number":1631,"context_line":"                instance.errno \u003d error_no"},{"line_number":1632,"context_line":"                instance.strerror \u003d os.strerror(error_no)"},{"line_number":1633,"context_line":""},{"line_number":1634,"context_line":"                def func(directory, **kwargs):"},{"line_number":1635,"context_line":"                    if directory \u003d\u003d suffix_dir_path:"},{"line_number":1636,"context_line":"                        raise instance"},{"line_number":1637,"context_line":"                    else:"}],"source_content_type":"text/x-python","patch_set":6,"id":"2d549e73_6c10dcde","line":1634,"in_reply_to":"5b8eaad8_b3e39032","updated":"2023-06-27 18:17:50.000000000","message":"Oh, whoa! I\u0027d _completely_ forgotten that I\u0027d ever seen that `dir_fd` thing before... I don\u0027t really care which way it goes -- though I suppose it\u0027s a little weird that the way things are now, we *don\u0027t* pass through the `dir_fd` we got if we got one.\n\nSeems like an argument that we ought to merge https://review.opendev.org/c/openstack/swift/+/886541 so we know whether our tests actually need that passed through (looks like no, at least currently?)","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"3ebb27b2e5142cd708e4292c7161d61f5574bb04","unresolved":true,"context_lines":[{"line_number":1631,"context_line":"                instance.errno \u003d error_no"},{"line_number":1632,"context_line":"                instance.strerror \u003d os.strerror(error_no)"},{"line_number":1633,"context_line":""},{"line_number":1634,"context_line":"                def func(directory, **kwargs):"},{"line_number":1635,"context_line":"                    if directory \u003d\u003d suffix_dir_path:"},{"line_number":1636,"context_line":"                        raise instance"},{"line_number":1637,"context_line":"                    else:"}],"source_content_type":"text/x-python","patch_set":6,"id":"5b8eaad8_b3e39032","line":1634,"in_reply_to":"818172bb_3791f462","updated":"2023-06-27 04:40:37.000000000","message":"i got a merge conflict here, do you prefer the weird dir_fd change from https://review.opendev.org/c/openstack/swift/+/886538 ???","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":2140,"context_line":"        delete_jobs.sort(key\u003dlambda j: j[\u0027policy\u0027])"},{"line_number":2141,"context_line":"        job \u003d delete_jobs[0]"},{"line_number":2142,"context_line":"        for suffix in range(11):"},{"line_number":2143,"context_line":"            os.mkdir(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2144,"context_line":"        with mock.patch.object(self.replicator, \u0027sync\u0027,"},{"line_number":2145,"context_line":"                               return_value\u003d(True, {})) as mock_sync, \\"},{"line_number":2146,"context_line":"                mock.patch(\u0027shutil.rmtree\u0027) as mock_rmtree:"}],"source_content_type":"text/x-python","patch_set":6,"id":"e22efab0_6b02256b","line":2143,"updated":"2022-06-27 21:55:22.000000000","message":"ok, so we\u0027re setting up all the suffixes in the part\n\n\t\u003e\u003e\u003e [\u0027%03x\u0027 % i for i in range(11)]\n\t[\u0027000\u0027, \u0027001\u0027, \u0027002\u0027, \u0027003\u0027, \u0027004\u0027, \u0027005\u0027, \u0027006\u0027, \u0027007\u0027, \u0027008\u0027, \u0027009\u0027, \u002700a\u0027]","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":2140,"context_line":"        delete_jobs.sort(key\u003dlambda j: j[\u0027policy\u0027])"},{"line_number":2141,"context_line":"        job \u003d delete_jobs[0]"},{"line_number":2142,"context_line":"        for suffix in range(11):"},{"line_number":2143,"context_line":"            os.mkdir(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2144,"context_line":"        with mock.patch.object(self.replicator, \u0027sync\u0027,"},{"line_number":2145,"context_line":"                               return_value\u003d(True, {})) as mock_sync, \\"},{"line_number":2146,"context_line":"                mock.patch(\u0027shutil.rmtree\u0027) as mock_rmtree:"}],"source_content_type":"text/x-python","patch_set":6,"id":"59110e59_4b037488","line":2143,"in_reply_to":"e22efab0_6b02256b","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":2147,"context_line":"            self.replicator.update_deleted(job)"},{"line_number":2148,"context_line":"        self.assertEqual(len(mock_sync.mock_calls), 12)"},{"line_number":2149,"context_line":"        self.assertEqual(mock_sync.mock_calls, ["},{"line_number":2150,"context_line":"            mock.call(node, job, mock.ANY) for node in job[\u0027nodes\u0027]] * 4)"},{"line_number":2151,"context_line":"        self.assertEqual([len(c[1][2]) for c in mock_sync.mock_calls], ["},{"line_number":2152,"context_line":"            3, 3, 3,"},{"line_number":2153,"context_line":"            3, 3, 3,"}],"source_content_type":"text/x-python","patch_set":6,"id":"ad3505f3_1dee8ed0","line":2150,"updated":"2022-06-27 21:55:22.000000000","message":"len(job[\u0027nodes\u0027]) * 4 \u003d\u003d 12","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":2147,"context_line":"            self.replicator.update_deleted(job)"},{"line_number":2148,"context_line":"        self.assertEqual(len(mock_sync.mock_calls), 12)"},{"line_number":2149,"context_line":"        self.assertEqual(mock_sync.mock_calls, ["},{"line_number":2150,"context_line":"            mock.call(node, job, mock.ANY) for node in job[\u0027nodes\u0027]] * 4)"},{"line_number":2151,"context_line":"        self.assertEqual([len(c[1][2]) for c in mock_sync.mock_calls], ["},{"line_number":2152,"context_line":"            3, 3, 3,"},{"line_number":2153,"context_line":"            3, 3, 3,"}],"source_content_type":"text/x-python","patch_set":6,"id":"c0324552_df9066c7","line":2150,"in_reply_to":"ad3505f3_1dee8ed0","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":2148,"context_line":"        self.assertEqual(len(mock_sync.mock_calls), 12)"},{"line_number":2149,"context_line":"        self.assertEqual(mock_sync.mock_calls, ["},{"line_number":2150,"context_line":"            mock.call(node, job, mock.ANY) for node in job[\u0027nodes\u0027]] * 4)"},{"line_number":2151,"context_line":"        self.assertEqual([len(c[1][2]) for c in mock_sync.mock_calls], ["},{"line_number":2152,"context_line":"            3, 3, 3,"},{"line_number":2153,"context_line":"            3, 3, 3,"},{"line_number":2154,"context_line":"            3, 3, 3,"}],"source_content_type":"text/x-python","patch_set":6,"id":"262eb9aa_0bef00c2","line":2151,"updated":"2022-06-27 21:55:22.000000000","message":"this is an assert on a specific pos arg to the sync method... maybe \u0027suffixes_to_revert\u0027?  oh it\u0027s the *length* of the batch...","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":2148,"context_line":"        self.assertEqual(len(mock_sync.mock_calls), 12)"},{"line_number":2149,"context_line":"        self.assertEqual(mock_sync.mock_calls, ["},{"line_number":2150,"context_line":"            mock.call(node, job, mock.ANY) for node in job[\u0027nodes\u0027]] * 4)"},{"line_number":2151,"context_line":"        self.assertEqual([len(c[1][2]) for c in mock_sync.mock_calls], ["},{"line_number":2152,"context_line":"            3, 3, 3,"},{"line_number":2153,"context_line":"            3, 3, 3,"},{"line_number":2154,"context_line":"            3, 3, 3,"}],"source_content_type":"text/x-python","patch_set":6,"id":"bb3b1816_61778898","line":2151,"in_reply_to":"262eb9aa_0bef00c2","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":2156,"context_line":"        ])"},{"line_number":2157,"context_line":"        self.assertEqual(sorted(mock_rmtree.mock_calls[:-1]), ["},{"line_number":2158,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2159,"context_line":"            for suffix in range(11)])"},{"line_number":2160,"context_line":"        self.assertEqual(mock_rmtree.mock_calls[-1], mock.call(job[\u0027path\u0027]))"},{"line_number":2161,"context_line":""},{"line_number":2162,"context_line":"    def test_replicate_skipped_partpower_increase(self):"}],"source_content_type":"text/x-python","patch_set":6,"id":"98afcdc5_9e90203d","line":2159,"updated":"2022-06-27 21:55:22.000000000","message":"and we call rmtree on every suffix","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":2156,"context_line":"        ])"},{"line_number":2157,"context_line":"        self.assertEqual(sorted(mock_rmtree.mock_calls[:-1]), ["},{"line_number":2158,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2159,"context_line":"            for suffix in range(11)])"},{"line_number":2160,"context_line":"        self.assertEqual(mock_rmtree.mock_calls[-1], mock.call(job[\u0027path\u0027]))"},{"line_number":2161,"context_line":""},{"line_number":2162,"context_line":"    def test_replicate_skipped_partpower_increase(self):"}],"source_content_type":"text/x-python","patch_set":6,"id":"53e56ef6_e81f8582","line":2159,"in_reply_to":"98afcdc5_9e90203d","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e39711a9f0884527eb757c2c45624b3f80696bfa","unresolved":true,"context_lines":[{"line_number":2157,"context_line":"        self.assertEqual(sorted(mock_rmtree.mock_calls[:-1]), ["},{"line_number":2158,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2159,"context_line":"            for suffix in range(11)])"},{"line_number":2160,"context_line":"        self.assertEqual(mock_rmtree.mock_calls[-1], mock.call(job[\u0027path\u0027]))"},{"line_number":2161,"context_line":""},{"line_number":2162,"context_line":"    def test_replicate_skipped_partpower_increase(self):"},{"line_number":2163,"context_line":"        _create_test_rings(self.testdir, next_part_power\u003d4)"}],"source_content_type":"text/x-python","patch_set":6,"id":"29a13b8e_34f59dec","line":2160,"updated":"2022-06-27 21:55:22.000000000","message":"and we still cleanup the part 👍","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"74d66d24c580a196546efb2193d4cb48e91fd5ae","unresolved":true,"context_lines":[{"line_number":2157,"context_line":"        self.assertEqual(sorted(mock_rmtree.mock_calls[:-1]), ["},{"line_number":2158,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2159,"context_line":"            for suffix in range(11)])"},{"line_number":2160,"context_line":"        self.assertEqual(mock_rmtree.mock_calls[-1], mock.call(job[\u0027path\u0027]))"},{"line_number":2161,"context_line":""},{"line_number":2162,"context_line":"    def test_replicate_skipped_partpower_increase(self):"},{"line_number":2163,"context_line":"        _create_test_rings(self.testdir, next_part_power\u003d4)"}],"source_content_type":"text/x-python","patch_set":6,"id":"88d5edd9_265d752b","line":2160,"in_reply_to":"29a13b8e_34f59dec","updated":"2022-06-28 06:04:45.000000000","message":"Not only that, but we do it *last* -- there was a bad bug in an earlier patchset where I\u0027d distribute the suffixes all nice... then unlink the whole partition after each batch :-(","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":34892,"name":"ASHWIN A NAIR","display_name":"indianwhocodes","email":"nairashwin952013@gmail.com","username":"indianwhocodes","status":"Nvidia"},"change_message_id":"c540228aea8d468ac8613ff1a6110e1f722e1f25","unresolved":false,"context_lines":[{"line_number":2157,"context_line":"        self.assertEqual(sorted(mock_rmtree.mock_calls[:-1]), ["},{"line_number":2158,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2159,"context_line":"            for suffix in range(11)])"},{"line_number":2160,"context_line":"        self.assertEqual(mock_rmtree.mock_calls[-1], mock.call(job[\u0027path\u0027]))"},{"line_number":2161,"context_line":""},{"line_number":2162,"context_line":"    def test_replicate_skipped_partpower_increase(self):"},{"line_number":2163,"context_line":"        _create_test_rings(self.testdir, next_part_power\u003d4)"}],"source_content_type":"text/x-python","patch_set":6,"id":"0f51f229_d2197912","line":2160,"in_reply_to":"88d5edd9_265d752b","updated":"2023-08-17 16:15:03.000000000","message":"Ack","commit_id":"8493f1d81a999f5642fe2381008520aa6048b797"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"29b63b159246ab118af377df11fc2db5f8af00f3","unresolved":true,"context_lines":[{"line_number":2419,"context_line":"        ])"},{"line_number":2420,"context_line":"        self.assertEqual(mock_rmtree.mock_calls, ["},{"line_number":2421,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2422,"context_line":"            for suffix in (0, 4, 8, 2, 6, 10, 3, 7)])"},{"line_number":2423,"context_line":""},{"line_number":2424,"context_line":"    def test_replicate_skipped_partpower_increase(self):"},{"line_number":2425,"context_line":"        _create_test_rings(self.testdir, next_part_power\u003d4)"}],"source_content_type":"text/x-python","patch_set":12,"id":"77be96b4_9c6e539c","line":2422,"updated":"2024-01-26 00:49:21.000000000","message":"since we patch shuffle, this order is a side-effect of \"distribute_evenly\"\n\n    \u003e\u003e\u003e from swift.common.utils import distribute_evenly\n    \u003e\u003e\u003e distribute_evenly(range(12), 4)\n    [[0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11]]\n\nN.B. the second batch had the last node fail","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"c551215d3143c73023445e054625725b74f14d36","unresolved":false,"context_lines":[{"line_number":2419,"context_line":"        ])"},{"line_number":2420,"context_line":"        self.assertEqual(mock_rmtree.mock_calls, ["},{"line_number":2421,"context_line":"            mock.call(os.path.join(job[\u0027path\u0027], \"%03x\" % suffix))"},{"line_number":2422,"context_line":"            for suffix in (0, 4, 8, 2, 6, 10, 3, 7)])"},{"line_number":2423,"context_line":""},{"line_number":2424,"context_line":"    def test_replicate_skipped_partpower_increase(self):"},{"line_number":2425,"context_line":"        _create_test_rings(self.testdir, next_part_power\u003d4)"}],"source_content_type":"text/x-python","patch_set":12,"id":"d90e3a05_edd397fb","line":2422,"in_reply_to":"77be96b4_9c6e539c","updated":"2024-02-06 15:38:48.000000000","message":"Acknowledged","commit_id":"7e3f0563854cabc839e93e8716dddb37f0b41979"}]}
