)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":4,"context_line":"Commit:     Matthew Oliver \u003cmatt@oliver.net.au\u003e"},{"line_number":5,"context_line":"CommitDate: 2021-09-21 17:11:54 +1000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Sharding: a remote SR without an epoch can\u0027t replication over one with an epoch"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"We\u0027ve seen this happen a couple of times in production where a root"},{"line_number":10,"context_line":"container suddenly thinks it\u0027s unsharded as it\u0027s own_shard_range is"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"70f6195b_71683b2a","line":7,"range":{"start_line":7,"start_character":45,"end_line":7,"end_character":56},"updated":"2021-09-27 11:49:06.000000000","message":"s/replication/replicate/","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit:     Matthew Oliver \u003cmatt@oliver.net.au\u003e"},{"line_number":5,"context_line":"CommitDate: 2021-09-21 17:11:54 +1000"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Sharding: a remote SR without an epoch can\u0027t replication over one with an epoch"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"We\u0027ve seen this happen a couple of times in production where a root"},{"line_number":10,"context_line":"container suddenly thinks it\u0027s unsharded as it\u0027s own_shard_range is"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"149741f9_ea8ecbf9","line":7,"range":{"start_line":7,"start_character":45,"end_line":7,"end_character":56},"in_reply_to":"70f6195b_71683b2a","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":12,"context_line":"places a container and a new own_shard_range is created and replicated"},{"line_number":13,"context_line":"to older sharded primaries."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"This version move the filtering up to the container replicator where we"},{"line_number":16,"context_line":"only deal with OSRs (own_shard_ranges). Looks much cleaner initially,"},{"line_number":17,"context_line":"however it only really works one way. Block remote osr epoch None overwriting"},{"line_number":18,"context_line":"local epoched OSR."},{"line_number":19,"context_line":"Going the other way it much harder. So as this patch stands it"},{"line_number":20,"context_line":"essentially blocks a primary with a good epoched OSR from pulling in a"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"75a677e0_ade7e1f6","line":17,"range":{"start_line":15,"start_character":0,"end_line":17,"end_character":36},"updated":"2021-09-27 11:49:06.000000000","message":"Comments about patch versions, or comparison with other patchsets, in commit messages lose all context once we chose a solution and merge.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":12,"context_line":"places a container and a new own_shard_range is created and replicated"},{"line_number":13,"context_line":"to older sharded primaries."},{"line_number":14,"context_line":""},{"line_number":15,"context_line":"This version move the filtering up to the container replicator where we"},{"line_number":16,"context_line":"only deal with OSRs (own_shard_ranges). Looks much cleaner initially,"},{"line_number":17,"context_line":"however it only really works one way. Block remote osr epoch None overwriting"},{"line_number":18,"context_line":"local epoched OSR."},{"line_number":19,"context_line":"Going the other way it much harder. So as this patch stands it"},{"line_number":20,"context_line":"essentially blocks a primary with a good epoched OSR from pulling in a"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"2accdcc8_097c4224","line":17,"range":{"start_line":15,"start_character":0,"end_line":17,"end_character":36},"in_reply_to":"75a677e0_ade7e1f6","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":18,"context_line":"local epoched OSR."},{"line_number":19,"context_line":"Going the other way it much harder. So as this patch stands it"},{"line_number":20,"context_line":"essentially blocks a primary with a good epoched OSR from pulling in a"},{"line_number":21,"context_line":"bad non-epoched OSR from a remote. As a conserquence if the bad nodes"},{"line_number":22,"context_line":"non-epoched OSR is newer then the good one it pulls from it\u0027s neighbours"},{"line_number":23,"context_line":"then it\u0027ll be left stuck with it\u0027s bad one."},{"line_number":24,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"55a8ecf2_dd8a75de","line":21,"range":{"start_line":21,"start_character":40,"end_line":21,"end_character":52},"updated":"2021-09-27 11:49:06.000000000","message":"typo: consequence","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":18,"context_line":"local epoched OSR."},{"line_number":19,"context_line":"Going the other way it much harder. So as this patch stands it"},{"line_number":20,"context_line":"essentially blocks a primary with a good epoched OSR from pulling in a"},{"line_number":21,"context_line":"bad non-epoched OSR from a remote. As a conserquence if the bad nodes"},{"line_number":22,"context_line":"non-epoched OSR is newer then the good one it pulls from it\u0027s neighbours"},{"line_number":23,"context_line":"then it\u0027ll be left stuck with it\u0027s bad one."},{"line_number":24,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"e096c2ef_cff28506","line":21,"range":{"start_line":21,"start_character":40,"end_line":21,"end_character":52},"in_reply_to":"55a8ecf2_dd8a75de","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"ad65d9524e6d6d8dd57dfc2f21e1af52fcc3d5d8","unresolved":true,"context_lines":[{"line_number":26,"context_line":"messages as they\u0027d be stuck. If however, the bad OSR is older than the"},{"line_number":27,"context_line":"good OSR then the bad node pulls a good OSR it\u0027ll merge the shard over the"},{"line_number":28,"context_line":"bad one."},{"line_number":29,"context_line":""},{"line_number":30,"context_line":"Change-Id: I069bdbeb430e89074605e40525d955b3a704a44f"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"065695be_4b2e3d60","line":29,"updated":"2021-09-28 10:09:33.000000000","message":"Is there a launchpad bug?","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":true,"context_lines":[{"line_number":26,"context_line":"messages as they\u0027d be stuck. If however, the bad OSR is older than the"},{"line_number":27,"context_line":"good OSR then the bad node pulls a good OSR it\u0027ll merge the shard over the"},{"line_number":28,"context_line":"bad one."},{"line_number":29,"context_line":""},{"line_number":30,"context_line":"Change-Id: I069bdbeb430e89074605e40525d955b3a704a44f"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"2cbd4fe2_432332cc","line":29,"in_reply_to":"065695be_4b2e3d60","updated":"2022-02-28 06:33:28.000000000","message":"don\u0027t think so.. but maybe there should be :)","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"7fafbf6caafc86a52b2b252767ac03dbf89fae44","unresolved":false,"context_lines":[{"line_number":26,"context_line":"messages as they\u0027d be stuck. If however, the bad OSR is older than the"},{"line_number":27,"context_line":"good OSR then the bad node pulls a good OSR it\u0027ll merge the shard over the"},{"line_number":28,"context_line":"bad one."},{"line_number":29,"context_line":""},{"line_number":30,"context_line":"Change-Id: I069bdbeb430e89074605e40525d955b3a704a44f"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":6,"id":"a762fe87_65d65216","line":29,"in_reply_to":"2cbd4fe2_432332cc","updated":"2022-07-01 06:15:24.000000000","message":"There is now.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"58794d2d562c1a8c3fbf7b8003ce466898bac434","unresolved":true,"context_lines":[{"line_number":12,"context_line":""},{"line_number":13,"context_line":"The only way we\u0027ve observed this happen is when a new replica or handoff"},{"line_number":14,"context_line":"node creates a container and it\u0027s new own_shard_range is created without"},{"line_number":15,"context_line":"an epoch and then replicated to older primaries."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":"However, if a bad node with a non-epoched OSR is on a primary, it\u0027s"},{"line_number":18,"context_line":"newer timestamp would prevent pulling the good osr from it\u0027s peers.  So"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"5a38501a_fa510a47","line":15,"updated":"2024-02-02 19:40:50.000000000","message":"Can we replicate this case in a probe test? Maybe create a sharded container, nuke everything from one of the primaries, then do a container PUT?","commit_id":"1607b25175586c6e0055610b85915e4ddeba40b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"58794d2d562c1a8c3fbf7b8003ce466898bac434","unresolved":true,"context_lines":[{"line_number":22,"context_line":"    Ignoring remote osr w/o epoch: x, from: y"},{"line_number":23,"context_line":""},{"line_number":24,"context_line":"These warnings can also be observed, when sharding shards, as"},{"line_number":25,"context_line":"demonstrated by the probetest."},{"line_number":26,"context_line":""},{"line_number":27,"context_line":"Closes-bug: #1980451"},{"line_number":28,"context_line":"Change-Id: I069bdbeb430e89074605e40525d955b3a704a44f"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":14,"id":"ab35cede_2306112d","line":25,"updated":"2024-02-02 19:40:50.000000000","message":"Note that we\u0027ll want to drop this from the commit message if we squash in Al\u0027s follow-up.","commit_id":"1607b25175586c6e0055610b85915e4ddeba40b7"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"53934b7f1ad67ae50e3750f87e14d36c6564b94e","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":8,"id":"bc9a79c0_540aa977","updated":"2022-07-06 05:44:11.000000000","message":"I have a probe test the shows these false positives which I\u0027ll then use to work at stopping the false positives, I\u0027ll push the probe test up in a new patchset.\n\nMight have a play with the fixing in a follow up or seperate patch (at least for now) as this one is including in our prod downstream.","commit_id":"7d1a178771d3d8961c93679c963ff40293055cd5"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"48f1c663fd3b6bd2f4f5d49d438ff83d44005369","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"35a90bcb_7978e9b7","updated":"2022-07-06 07:09:56.000000000","message":"OK I\u0027ve been looking back at the original incidents we had when we have a epochs reset on us. And in all the cases the OSR state was ACTIVE, the epoch was None and there was a state_timestamp.\n\nThat last one is actually very interesting. I delved back into the sharding statemachine. The default OSR makes the state_timestamp None and the only times we set the state_timestamp is when we move to SHARDING, SHRINKING or setting the epoch. This means that the \"reset\" epoch isn\u0027t simply a default OSR. But also means we have a way of detecting the false positives.\n\nA shard OSR (which always exists in a shard) will be ACTIVE, have an epoch of None and a state_timestamp of None. But it\u0027ll also share a timestamp with that of it\u0027s replicas, as the timestamp shouldn\u0027t change unless we change deleted, name, upper or lower.\nThis means I can simply _not_ throw the warning if the replica OSR shard range has the same timestamp and has no state_timestamp. It means it\u0027s just an ACTIVE shard range that hasn\u0027t got its epoch yet.\n\nAlong these lines, I can do something like: https://paste.opendev.org/show/815286/\n\nWhich contains:\n\n  if shard[\u0027name\u0027] \u003d\u003d osr[\u0027name\u0027] and shard[\u0027epoch\u0027] is None:\n      # state_timestamp is only set when marking for sharding, shrinking\n      # or setting an epoch. Therefore we can make an assumption:\n      # 1. if shard has the same timestamp, then it\u0027s a replicas version\n      #    of the OSR (not a new default); and\n      # 2. If the state_timestamp is still None, then it\u0027s still just\n      #   an ACTIVE shard OSR\n      # If both those cases are true, then it\u0027s just another OSR replica\n      # that hasn\u0027t moved to sharding yet, so we can safely ignore\n      # without firing off a warning\n      if not (osr[\u0027timestamp\u0027] \u003d\u003d shard[\u0027timestamp\u0027] and\n               not shard[\u0027state_timestamp\u0027]):\n           logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,\n                          shard, source)\n      continue\n\nSure enough when I add this code to this patch the new false positive probetests start to fail (there are no false positives).\n\nNow that we know that, we might be able to just update the patch, which should remove the false positives and we can get back to monitoring for the warning to see if we actually every trigger the bug again. OR we could move this same logic into merge_shards and have a \"fix\" inside the broker.","commit_id":"7fd883bc64f24089b28937b004353ae21f78301b"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"7cae078c11092d87cc70cefdf8aa89886d5136c8","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":9,"id":"af60b99f_8ce5a855","in_reply_to":"35a90bcb_7978e9b7","updated":"2023-05-23 23:25:36.000000000","message":"So where\u0027d we land on this? I don\u0027t love the false positives, but it\u0027s been like a year and a half now, and the false positives have gotta be better than the bug...","commit_id":"7fd883bc64f24089b28937b004353ae21f78301b"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"28ab4908148447a31564b2c0f1cfb2548fe8aec8","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":11,"id":"5aee975b_eb5c1cb6","updated":"2022-09-21 19:19:30.000000000","message":"this seems to be helping in prod with the associated bug, I\u0027d like to see it merged if that makes sense","commit_id":"7d864b0e730f764e5852e876919061a9997021de"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"99caf9bdcbf9afb628e475e9eaa1f806011bdf85","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":13,"id":"c074d9c1_f82e3d57","updated":"2024-01-24 18:00:35.000000000","message":"recheck\n\nzuul page no longer has test results","commit_id":"126463f13df7219e553da82d359acd5a16e4e49c"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"4ad6fbfa51ad5c86195853fb7b5164f98fbe5197","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":14,"id":"8c5ba39c_75a27435","updated":"2024-02-02 17:05:11.000000000","message":"I think it would be good to eliminate the false postives so put up a follow up patch https://review.opendev.org/c/openstack/swift/+/907622/1?usp\u003drelated-change\n\nTim had previously left a paste to similarly verify if the merge would actually reset the epoch (true positive) but I liked the idea of doing the check pre-emptively in the replicator so that we can include the source in the logged warning.","commit_id":"1607b25175586c6e0055610b85915e4ddeba40b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"58794d2d562c1a8c3fbf7b8003ce466898bac434","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":14,"id":"8e0acb01_167cbf08","updated":"2024-02-02 19:40:50.000000000","message":"We\u0027ve been stewing on this for two and a half years, and carrying it for nearly as long. I\u0027d love to see this landed, especially if we can squash in the false-positive fix.","commit_id":"1607b25175586c6e0055610b85915e4ddeba40b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"f793a31960a89337760fbe74392bc1f9ed8a5de8","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":16,"id":"a17859b5_bfd0d5b9","updated":"2024-02-07 21:39:27.000000000","message":"Just had one small cleanup in tests.","commit_id":"8227f4539cbdf4b9b59def8a344084796622248c"}],"swift/container/replicator.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":36,"context_line":"def merge_remote_own_shard_range(local_osr, remote_osr):"},{"line_number":37,"context_line":"    if isinstance(remote_osr, dict):"},{"line_number":38,"context_line":"        remote_osr \u003d ShardRange(**remote_osr)"},{"line_number":39,"context_line":"    if local_osr and local_osr.epoch and remote_osr.epoch is None:"},{"line_number":40,"context_line":"        return False"},{"line_number":41,"context_line":"    return True"},{"line_number":42,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"ddd18a1b_d1d30397","line":39,"updated":"2021-09-24 21:16:41.000000000","message":"isn\u0027t \"remote_osr.epoch is None\" enough?  when do we have a local_osr with no epoch and even if we did what does it matter then if the remote does?","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"7fafbf6caafc86a52b2b252767ac03dbf89fae44","unresolved":false,"context_lines":[{"line_number":36,"context_line":"def merge_remote_own_shard_range(local_osr, remote_osr):"},{"line_number":37,"context_line":"    if isinstance(remote_osr, dict):"},{"line_number":38,"context_line":"        remote_osr \u003d ShardRange(**remote_osr)"},{"line_number":39,"context_line":"    if local_osr and local_osr.epoch and remote_osr.epoch is None:"},{"line_number":40,"context_line":"        return False"},{"line_number":41,"context_line":"    return True"},{"line_number":42,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"679d8b3d_d0cd8114","line":39,"in_reply_to":"705a1fc3_48a94486","updated":"2022-07-01 06:15:24.000000000","message":"Ack","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ac9893d3a7b7d392b1d882dd984f057ef5c2fed","unresolved":true,"context_lines":[{"line_number":36,"context_line":"def merge_remote_own_shard_range(local_osr, remote_osr):"},{"line_number":37,"context_line":"    if isinstance(remote_osr, dict):"},{"line_number":38,"context_line":"        remote_osr \u003d ShardRange(**remote_osr)"},{"line_number":39,"context_line":"    if local_osr and local_osr.epoch and remote_osr.epoch is None:"},{"line_number":40,"context_line":"        return False"},{"line_number":41,"context_line":"    return True"},{"line_number":42,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"705a1fc3_48a94486","line":39,"in_reply_to":"ddd18a1b_d1d30397","updated":"2021-09-27 06:51:33.000000000","message":"The idea here was to only block merging if the remote\u0027s epoch is None and the local isn\u0027t. Having a rempte epoch and None and a local is totally valid and what shards use. If we remove it shards will not merge each others OSRs which we may need to keep state up to date.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":47,"context_line":"    if remote_osr:"},{"line_number":48,"context_line":"        remote_index, remote_osr \u003d remote_osr[0]"},{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"}],"source_content_type":"text/x-python","patch_set":3,"id":"89e37c63_88f050e4","line":50,"updated":"2021-09-24 21:16:41.000000000","message":"I find the use of another fuction here makes this whole thing harder to follow:\n\nhttps://review.opendev.org/c/openstack/swift/+/810963","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":47,"context_line":"    if remote_osr:"},{"line_number":48,"context_line":"        remote_index, remote_osr \u003d remote_osr[0]"},{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"}],"source_content_type":"text/x-python","patch_set":3,"id":"873e05bb_c6faebaa","line":50,"in_reply_to":"5c29e9b4_c87ea635","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ac9893d3a7b7d392b1d882dd984f057ef5c2fed","unresolved":true,"context_lines":[{"line_number":47,"context_line":"    if remote_osr:"},{"line_number":48,"context_line":"        remote_index, remote_osr \u003d remote_osr[0]"},{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"}],"source_content_type":"text/x-python","patch_set":3,"id":"5c29e9b4_c87ea635","line":50,"in_reply_to":"89e37c63_88f050e4","updated":"2021-09-27 06:51:33.000000000","message":"I thought it might be easier then having the function inline here. That is at least currently as the helper function stands, if we can simplify it then fine 😊","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"72e3fc12d56ac7bcda54d6b1ed7813d506582e08","unresolved":true,"context_lines":[{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"},{"line_number":54,"context_line":"    return shards"},{"line_number":55,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"02200b38_f6dafd88","line":52,"updated":"2021-09-22 17:38:53.000000000","message":"I don\u0027t think DEBUG is the right log level here - that would make it harder to detect.  We run with log level INFO in production.  This is kind of an exceptional behavior, maybe a WARNING?","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ac9893d3a7b7d392b1d882dd984f057ef5c2fed","unresolved":true,"context_lines":[{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"},{"line_number":54,"context_line":"    return shards"},{"line_number":55,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"4bf2f8b0_6baa36f7","line":52,"in_reply_to":"02200b38_f6dafd88","updated":"2021-09-27 06:51:33.000000000","message":"True, seeing as we may run this patch to detect and norrow down the main issue at hand.. yes debug is definitely wrong here.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":49,"context_line":"        if not merge_remote_own_shard_range("},{"line_number":50,"context_line":"                broker.get_own_shard_range(), remote_osr):"},{"line_number":51,"context_line":"            logger.debug(\"Not merging remote own_shard_range \""},{"line_number":52,"context_line":"                         \"into %s\", broker.db_file)"},{"line_number":53,"context_line":"            shards.pop(remote_index)"},{"line_number":54,"context_line":"    return shards"},{"line_number":55,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"d2eb481c_40ed2332","line":52,"in_reply_to":"4bf2f8b0_6baa36f7","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":161,"context_line":"        with Timeout(self.node_timeout):"},{"line_number":162,"context_line":"            response \u003d http.replicate(\u0027get_shard_ranges\u0027)"},{"line_number":163,"context_line":"        if response and is_success(response.status):"},{"line_number":164,"context_line":"            shards \u003d json.loads(response.data.decode(\u0027ascii\u0027))"},{"line_number":165,"context_line":"            shards \u003d check_merge_own_shard_range(shards, broker, self.logger)"},{"line_number":166,"context_line":"            broker.merge_shard_ranges(shards)"},{"line_number":167,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"979b187a_882290f6","line":164,"range":{"start_line":164,"start_character":53,"end_line":164,"end_character":60},"updated":"2021-09-27 11:49:06.000000000","message":"is \u0027ascii\u0027 correct? not sure we enforce ascii when dumping shard data and names \u0026 bounds could have non-ascii","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"ba1e80f841f2e787efbe8e6f71b56cf8976d0fc9","unresolved":true,"context_lines":[{"line_number":161,"context_line":"        with Timeout(self.node_timeout):"},{"line_number":162,"context_line":"            response \u003d http.replicate(\u0027get_shard_ranges\u0027)"},{"line_number":163,"context_line":"        if response and is_success(response.status):"},{"line_number":164,"context_line":"            shards \u003d json.loads(response.data.decode(\u0027ascii\u0027))"},{"line_number":165,"context_line":"            shards \u003d check_merge_own_shard_range(shards, broker, self.logger)"},{"line_number":166,"context_line":"            broker.merge_shard_ranges(shards)"},{"line_number":167,"context_line":""}],"source_content_type":"text/x-python","patch_set":3,"id":"c36d8e25_8800081b","line":164,"range":{"start_line":164,"start_character":53,"end_line":164,"end_character":60},"in_reply_to":"979b187a_882290f6","updated":"2021-09-27 19:51:27.000000000","message":"It\u0027s always been sent to us via json.dumps(), though -- which defaults to ensure_ascii\u003dTrue","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":163,"context_line":"        if response and is_success(response.status):"},{"line_number":164,"context_line":"            shards \u003d json.loads(response.data.decode(\u0027ascii\u0027))"},{"line_number":165,"context_line":"            shards \u003d check_merge_own_shard_range(shards, broker, self.logger)"},{"line_number":166,"context_line":"            broker.merge_shard_ranges(shards)"},{"line_number":167,"context_line":""},{"line_number":168,"context_line":"    def find_local_handoff_for_part(self, part):"},{"line_number":169,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":3,"id":"5130ecd2_61dc3009","line":166,"updated":"2021-09-24 21:16:41.000000000","message":"this is when we pull info from a remote","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"4ad6fbfa51ad5c86195853fb7b5164f98fbe5197","unresolved":true,"context_lines":[{"line_number":163,"context_line":"        if response and is_success(response.status):"},{"line_number":164,"context_line":"            shards \u003d json.loads(response.data.decode(\u0027ascii\u0027))"},{"line_number":165,"context_line":"            shards \u003d check_merge_own_shard_range(shards, broker, self.logger)"},{"line_number":166,"context_line":"            broker.merge_shard_ranges(shards)"},{"line_number":167,"context_line":""},{"line_number":168,"context_line":"    def find_local_handoff_for_part(self, part):"},{"line_number":169,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":3,"id":"f0514196_7b9ad6e4","line":166,"in_reply_to":"3ef4174b_94162952","updated":"2024-02-02 17:05:11.000000000","message":"Out of ~2.6k \"Ignoring remote osr w/o epoch\" in our logs, all are this case.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"dda95b5c2cf0ae0af7f509f546982b0b24528f6a","unresolved":true,"context_lines":[{"line_number":163,"context_line":"        if response and is_success(response.status):"},{"line_number":164,"context_line":"            shards \u003d json.loads(response.data.decode(\u0027ascii\u0027))"},{"line_number":165,"context_line":"            shards \u003d check_merge_own_shard_range(shards, broker, self.logger)"},{"line_number":166,"context_line":"            broker.merge_shard_ranges(shards)"},{"line_number":167,"context_line":""},{"line_number":168,"context_line":"    def find_local_handoff_for_part(self, part):"},{"line_number":169,"context_line":"        \"\"\""}],"source_content_type":"text/x-python","patch_set":3,"id":"3ef4174b_94162952","line":166,"in_reply_to":"5130ecd2_61dc3009","updated":"2022-02-24 17:38:29.000000000","message":"And when we see the warning trip, this is usually the caller (since it shows up with the host/path, never rsync and only rarely repl_req)","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":401,"context_line":"        # Note the following hook will need to change to using a pointer and"},{"line_number":402,"context_line":"        # limit in the future."},{"line_number":403,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":404,"context_line":"        shards \u003d check_merge_own_shard_range(shards, new_broker, self.logger)"},{"line_number":405,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"d012567a_bcc9ba09","line":404,"updated":"2021-09-27 11:49:06.000000000","message":"not covered by unit tests","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":401,"context_line":"        # Note the following hook will need to change to using a pointer and"},{"line_number":402,"context_line":"        # limit in the future."},{"line_number":403,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":404,"context_line":"        shards \u003d check_merge_own_shard_range(shards, new_broker, self.logger)"},{"line_number":405,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"17c5a8bf_f1c6fa7f","line":404,"in_reply_to":"d012567a_bcc9ba09","updated":"2022-02-28 06:33:28.000000000","message":"Done","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"09f7c35936524fc9f0f9082e082e496545f895d2","unresolved":true,"context_lines":[{"line_number":402,"context_line":"        # limit in the future."},{"line_number":403,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":404,"context_line":"        shards \u003d check_merge_own_shard_range(shards, new_broker, self.logger)"},{"line_number":405,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"}],"source_content_type":"text/x-python","patch_set":3,"id":"1ed51bc3_ea542a53","line":405,"updated":"2021-09-21 21:01:13.000000000","message":"This one feels a bit unlike the others... Probably good that we check it, but it *feels* different.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":402,"context_line":"        # limit in the future."},{"line_number":403,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":404,"context_line":"        shards \u003d check_merge_own_shard_range(shards, new_broker, self.logger)"},{"line_number":405,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"}],"source_content_type":"text/x-python","patch_set":3,"id":"2c37a142_d1cbedc7","line":405,"in_reply_to":"1ed51bc3_ea542a53","updated":"2021-09-24 21:16:41.000000000","message":"... and this is the rsync-then-merge","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ac9893d3a7b7d392b1d882dd984f057ef5c2fed","unresolved":true,"context_lines":[{"line_number":402,"context_line":"        # limit in the future."},{"line_number":403,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":404,"context_line":"        shards \u003d check_merge_own_shard_range(shards, new_broker, self.logger)"},{"line_number":405,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"}],"source_content_type":"text/x-python","patch_set":3,"id":"127995f9_55d2d5be","line":405,"in_reply_to":"2c37a142_d1cbedc7","updated":"2021-09-27 06:51:33.000000000","message":"yeah, thought being if we have a new PUT to a handoff or new primary and a new container database is created it\u0027s ROWID will start at 0 right? An old primary will have a ROWID much bigger, which may trigger an rsync-then-merge. We rsync it to tmp (the new_broker), then pull in the shard ranges from the existing broker (in this case the new one). And we\u0027d get the same problem.\n\nHowever, the old primary is SHARDED so maybe the rsync-then-merge wont be triggered. I\u0027ll have to double check this. Either way it was a safety to make sure I filtered in all _possible_ places outside of the merge_shards.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"},{"line_number":409,"context_line":"        broker.merge_shard_ranges(shards)"},{"line_number":410,"context_line":"        return HTTPAccepted()"},{"line_number":411,"context_line":""},{"line_number":412,"context_line":"    def get_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"3dbc1c8e_e56c5569","line":409,"updated":"2021-09-24 21:16:41.000000000","message":"so this is the container-server-rpc; when we\u0027re pushed new shard ranges","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"4ad6fbfa51ad5c86195853fb7b5164f98fbe5197","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"},{"line_number":409,"context_line":"        broker.merge_shard_ranges(shards)"},{"line_number":410,"context_line":"        return HTTPAccepted()"},{"line_number":411,"context_line":""},{"line_number":412,"context_line":"    def get_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"719bbddc_2d557e05","line":409,"in_reply_to":"17b03f4b_750ec5d7","updated":"2024-02-02 17:05:11.000000000","message":"Out of ~2.6k \"Ignoring remote osr w/o epoch\" in our logs, zero are this case.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"},{"line_number":409,"context_line":"        broker.merge_shard_ranges(shards)"},{"line_number":410,"context_line":"        return HTTPAccepted()"},{"line_number":411,"context_line":""},{"line_number":412,"context_line":"    def get_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"17b03f4b_750ec5d7","line":409,"in_reply_to":"319b9180_9011a726","updated":"2021-09-27 11:49:06.000000000","message":"not covered by unit tests","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3ac9893d3a7b7d392b1d882dd984f057ef5c2fed","unresolved":true,"context_lines":[{"line_number":406,"context_line":""},{"line_number":407,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":408,"context_line":"        shards \u003d check_merge_own_shard_range(args[0], broker, self.logger)"},{"line_number":409,"context_line":"        broker.merge_shard_ranges(shards)"},{"line_number":410,"context_line":"        return HTTPAccepted()"},{"line_number":411,"context_line":""},{"line_number":412,"context_line":"    def get_shard_ranges(self, broker, args):"}],"source_content_type":"text/x-python","patch_set":3,"id":"319b9180_9011a726","line":409,"in_reply_to":"3dbc1c8e_e56c5569","updated":"2021-09-27 06:51:33.000000000","message":"Yup, so we sync both ways on every replication.","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"80911d72ada372c286ede2b5ebf37c58435249dd","unresolved":true,"context_lines":[{"line_number":49,"context_line":"    for shard in shards:"},{"line_number":50,"context_line":"        if shard[\u0027name\u0027] \u003d\u003d broker.path and shard[\u0027epoch\u0027] is None:"},{"line_number":51,"context_line":"            logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,"},{"line_number":52,"context_line":"                           shard, source)"},{"line_number":53,"context_line":"            continue"},{"line_number":54,"context_line":"        to_merge.append(shard)"},{"line_number":55,"context_line":"    return to_merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"655ca55a_4fa7f93e","line":52,"updated":"2022-02-23 21:15:03.000000000","message":"might be worth adding broker.db_path to the warning?","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"cda6df1b97e0b2e4bb25de7e220f59517c769312","unresolved":true,"context_lines":[{"line_number":49,"context_line":"    for shard in shards:"},{"line_number":50,"context_line":"        if shard[\u0027name\u0027] \u003d\u003d broker.path and shard[\u0027epoch\u0027] is None:"},{"line_number":51,"context_line":"            logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,"},{"line_number":52,"context_line":"                           shard, source)"},{"line_number":53,"context_line":"            continue"},{"line_number":54,"context_line":"        to_merge.append(shard)"},{"line_number":55,"context_line":"    return to_merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"7f2267f4_7b3198fe","line":52,"in_reply_to":"46bb3a65_7a7b0085","updated":"2022-02-24 20:20:56.000000000","message":"This tends to catch a lot of false-positives, where we initiate sharding on one replica (usually a shard), then the other replicas trip this warning even though the state timestamps should keep us from actually dropping the epoch. Alistair had an idea this morning to reduce the noise: only emit the warning if the epoch we\u0027re overwriting is older than (now - an hour) or something.\n\nI\u0027m kind of tempted to come at it from merge_shards() instead, with something like https://paste.opendev.org/show/baqD3bolWr4Wu41UIZnA/ -- though that breaks a couple unit tests.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"dda95b5c2cf0ae0af7f509f546982b0b24528f6a","unresolved":true,"context_lines":[{"line_number":49,"context_line":"    for shard in shards:"},{"line_number":50,"context_line":"        if shard[\u0027name\u0027] \u003d\u003d broker.path and shard[\u0027epoch\u0027] is None:"},{"line_number":51,"context_line":"            logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,"},{"line_number":52,"context_line":"                           shard, source)"},{"line_number":53,"context_line":"            continue"},{"line_number":54,"context_line":"        to_merge.append(shard)"},{"line_number":55,"context_line":"    return to_merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"46bb3a65_7a7b0085","line":52,"in_reply_to":"655ca55a_4fa7f93e","updated":"2022-02-24 17:38:29.000000000","message":"Not a bad idea, but you *can* derive it together from what\u0027s logged: pass the name from the shard dict to swift-get-nodes to get the part/suffix/hashdir.\n\nEven more useful (I think): add broker.get_own_shard_range(), so we can see what we\u0027re trying to merge *into* and how merge_shards() is failing us. It seems like we must have\n\n shard[\u0027state_timestamp\u0027] \u003e broker.get_own_shard_range()[\u0027state_timestamp\u0027]\n\nbut I\u0027m not *at all* clear on how that could be the case...","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":true,"context_lines":[{"line_number":49,"context_line":"    for shard in shards:"},{"line_number":50,"context_line":"        if shard[\u0027name\u0027] \u003d\u003d broker.path and shard[\u0027epoch\u0027] is None:"},{"line_number":51,"context_line":"            logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,"},{"line_number":52,"context_line":"                           shard, source)"},{"line_number":53,"context_line":"            continue"},{"line_number":54,"context_line":"        to_merge.append(shard)"},{"line_number":55,"context_line":"    return to_merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"88e16dfa_d99cc9e8","line":52,"in_reply_to":"7f2267f4_7b3198fe","updated":"2022-02-28 06:33:28.000000000","message":"Thats an interesting idea. Yeah, the way we auto shard where we are controller driven caused alot of false positives. Time based might work. I\u0027ll have a play.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"7fafbf6caafc86a52b2b252767ac03dbf89fae44","unresolved":true,"context_lines":[{"line_number":49,"context_line":"    for shard in shards:"},{"line_number":50,"context_line":"        if shard[\u0027name\u0027] \u003d\u003d broker.path and shard[\u0027epoch\u0027] is None:"},{"line_number":51,"context_line":"            logger.warning(\u0027Ignoring remote osr w/o epoch: %r, from: %s\u0027,"},{"line_number":52,"context_line":"                           shard, source)"},{"line_number":53,"context_line":"            continue"},{"line_number":54,"context_line":"        to_merge.append(shard)"},{"line_number":55,"context_line":"    return to_merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"123fa6ac_ac97f9b5","line":52,"in_reply_to":"88e16dfa_d99cc9e8","updated":"2022-07-01 06:15:24.000000000","message":"Tim, I do like the merge_shards approach. I think I had a version like this early on. Let me take a closer look and see if that makes more sense.\n\nI do like that we can compare the timestamps, because in theory one of the these \"non epoch maybe default OSRs\" should have a later timestamp, which is why the stomp all over existing ones.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"5d2fe6ea35841eadb920530e2891ac01a71b8611","unresolved":true,"context_lines":[{"line_number":405,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":406,"context_line":"        shards \u003d check_merge_own_shard_range("},{"line_number":407,"context_line":"            shards, new_broker, self.logger, \u0027rsync\u0027)"},{"line_number":408,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":409,"context_line":""},{"line_number":410,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":411,"context_line":"        shards \u003d check_merge_own_shard_range("}],"source_content_type":"text/x-python","patch_set":6,"id":"0c75fddf_a7219b50","line":408,"updated":"2021-09-28 06:14:02.000000000","message":"I actually think this part of the change is completely unrequired. Looked back at the code, and rsync_then_merge will NOT happen if sharding or sharded locally. If an rsync and then merge happens then it\u0027ll be aborted if the remote is sharding. So this code isn\u0027t required. I might just revert this part.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e72a83cd44cafd49eab676956bd664c609218f81","unresolved":true,"context_lines":[{"line_number":405,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":406,"context_line":"        shards \u003d check_merge_own_shard_range("},{"line_number":407,"context_line":"            shards, new_broker, self.logger, \u0027rsync\u0027)"},{"line_number":408,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":409,"context_line":""},{"line_number":410,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":411,"context_line":"        shards \u003d check_merge_own_shard_range("}],"source_content_type":"text/x-python","patch_set":6,"id":"2209e7f4_209010b5","line":408,"in_reply_to":"0c75fddf_a7219b50","updated":"2021-09-28 15:28:05.000000000","message":"\u003e if sharding or sharded locally\n\nbut if osr is None; would it even *know* that it\u0027s sharding or sharded locally?  let\u0027s leave it for belts and braces and consistency","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"297aa009bacf55338f228afb51ce31272f06a0bc","unresolved":true,"context_lines":[{"line_number":405,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":406,"context_line":"        shards \u003d check_merge_own_shard_range("},{"line_number":407,"context_line":"            shards, new_broker, self.logger, \u0027rsync\u0027)"},{"line_number":408,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":409,"context_line":""},{"line_number":410,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":411,"context_line":"        shards \u003d check_merge_own_shard_range("}],"source_content_type":"text/x-python","patch_set":6,"id":"7abb1dc1_39a769c7","line":408,"in_reply_to":"2209e7f4_209010b5","updated":"2021-09-29 00:49:04.000000000","message":"If the local side (ie not opposite side of the RPC) has a broker.sharding_initiated() it\u0027ll sync shard ranges and leave it to the sharder to deal.\nLike you mention, which is the local is a None epoch, or moreso a default OSR, this will not be sharding_initated. But to be a candidate for rsync_then_merge the local needs to be have a higher object ROWID to qaulify (https://github.com/openstack/swift/blob/master/swift/common/db_replicator.py#L514). Which shouldn\u0027t be the case if it was a small handoff or what not. But let\u0027s be devil\u0027s avocate for a second and say it does, then it first pulls the remote shard ranges which should make the OSR in a sharding state, unless the remote OSR is \u003c Local OSR in which case it\u0027ll:\n\n  - It\u0027ll rsync the local DB over to tmp. \n  - On the RPC side of rsync_then_merge it\u0027ll do a self._abort_rsync_then_merge which does check to see if the RPC is broker.sharding_initiated() and if so abort.\n\nSo there is a server side check. It\u0027s kinda annoying that it\u0027ll rsync then check and fail. We do send back the db_state in the replication info, so we should do a check for that and abort _before_ sending a db, but this edgecase does seem pretty small.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"4ad6fbfa51ad5c86195853fb7b5164f98fbe5197","unresolved":true,"context_lines":[{"line_number":405,"context_line":"        shards \u003d existing_broker.get_all_shard_range_data()"},{"line_number":406,"context_line":"        shards \u003d check_merge_own_shard_range("},{"line_number":407,"context_line":"            shards, new_broker, self.logger, \u0027rsync\u0027)"},{"line_number":408,"context_line":"        new_broker.merge_shard_ranges(shards)"},{"line_number":409,"context_line":""},{"line_number":410,"context_line":"    def merge_shard_ranges(self, broker, args):"},{"line_number":411,"context_line":"        shards \u003d check_merge_own_shard_range("}],"source_content_type":"text/x-python","patch_set":6,"id":"b9b7cf5e_2eaeadb5","line":408,"in_reply_to":"7abb1dc1_39a769c7","updated":"2024-02-02 17:05:11.000000000","message":"Out of ~2.6k \"Ignoring remote osr w/o epoch\" in our logs, zero are this case.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"}],"test/probe/test_sharder.py":[{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":3383,"context_line":"        handoff_broker \u003d ContainerBroker(dir_content[\u0027normal_dbs\u0027][0])"},{"line_number":3384,"context_line":"        self.assertEqual(len(handoff_broker.get_shard_ranges()), 2)"},{"line_number":3385,"context_line":"        handoff_osr \u003d handoff_broker.get_own_shard_range(no_default\u003dTrue)"},{"line_number":3386,"context_line":"        self.assertIsNotNone(handoff_osr.epoch)"},{"line_number":3387,"context_line":""},{"line_number":3388,"context_line":"    def test_force_replication_of_a_reset_own_shard_range(self):"},{"line_number":3389,"context_line":"        obj_names \u003d self._make_object_names(100)"}],"source_content_type":"text/x-python","patch_set":3,"id":"df51be2c_e8b7a258","line":3386,"updated":"2021-09-24 21:16:41.000000000","message":"ok, this one passes on master still","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"bec3dc412d9ab53ebb33cf474855e3cfa27024e5","unresolved":true,"context_lines":[{"line_number":3464,"context_line":"        # epoched osr. But it doesn\u0027t do the other way. So it means the"},{"line_number":3465,"context_line":"        # primary with non-epoched OSR get\u0027s stuck with it, if it is newer then"},{"line_number":3466,"context_line":"        # the other epoched versions."},{"line_number":3467,"context_line":"        self.assertIsNotNone(old_osr.epoch)"},{"line_number":3468,"context_line":"        self.assertEqual(old_osr.state, ShardRange.SHARDED)"},{"line_number":3469,"context_line":""},{"line_number":3470,"context_line":"        self.assertIsNone(new_osr.epoch)"}],"source_content_type":"text/x-python","patch_set":3,"id":"9b1e1904_2ea41d66","line":3467,"updated":"2021-09-24 21:16:41.000000000","message":"this fails unless we prevent replication","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e72a83cd44cafd49eab676956bd664c609218f81","unresolved":true,"context_lines":[{"line_number":3463,"context_line":"        # This version stops replicating a remote non-epoch osr over a local"},{"line_number":3464,"context_line":"        # epoched osr. But it doesn\u0027t do the other way. So it means the"},{"line_number":3465,"context_line":"        # primary with non-epoched OSR get\u0027s stuck with it, if it is newer then"},{"line_number":3466,"context_line":"        # the other epoched versions."},{"line_number":3467,"context_line":"        self.assertIsNotNone(old_osr.epoch)"},{"line_number":3468,"context_line":"        self.assertEqual(old_osr.state, ShardRange.SHARDED)"},{"line_number":3469,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"4d60c6f2_bc9626da","line":3466,"updated":"2021-09-28 15:28:05.000000000","message":"*if* it\u0027s newer - that\u0027s the second time that came up in test comments - is the unintuative behavior desirable enough to preserve or simply an observation of how the system works currently?","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"297aa009bacf55338f228afb51ce31272f06a0bc","unresolved":true,"context_lines":[{"line_number":3463,"context_line":"        # This version stops replicating a remote non-epoch osr over a local"},{"line_number":3464,"context_line":"        # epoched osr. But it doesn\u0027t do the other way. So it means the"},{"line_number":3465,"context_line":"        # primary with non-epoched OSR get\u0027s stuck with it, if it is newer then"},{"line_number":3466,"context_line":"        # the other epoched versions."},{"line_number":3467,"context_line":"        self.assertIsNotNone(old_osr.epoch)"},{"line_number":3468,"context_line":"        self.assertEqual(old_osr.state, ShardRange.SHARDED)"},{"line_number":3469,"context_line":""}],"source_content_type":"text/x-python","patch_set":6,"id":"6951fc6f_5baa7601","line":3466,"in_reply_to":"4d60c6f2_bc9626da","updated":"2021-09-29 00:49:04.000000000","message":"Yeah, it\u0027s the current crux of the bug I think. We always merge newer (like the rest of swift). But a default OSR is brand new, so timestamp \u003d\u003d Timestamp.now() which is newer the the old OSR.\n\nIf it the default was older then it\u0027ll just merged away.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"58794d2d562c1a8c3fbf7b8003ce466898bac434","unresolved":true,"context_lines":[{"line_number":4267,"context_line":""},{"line_number":4268,"context_line":"        self.assertTrue(any("},{"line_number":4269,"context_line":"            [True for w in warnings"},{"line_number":4270,"context_line":"             if expected_false_positive_line_snippet in w]))"},{"line_number":4271,"context_line":""},{"line_number":4272,"context_line":"        # But it does send the new OSR with an epoch so the others should all"},{"line_number":4273,"context_line":"        # have it now."}],"source_content_type":"text/x-python","patch_set":14,"id":"f4d9eb4e_8ef6e1e7","line":4270,"updated":"2024-02-02 19:40:50.000000000","message":"Better as\n```\nself.assertTrue([w for w in warnings\n                 if expected_false_positive_line_snippet in w])\n```\n? It certainly seems better if/when we squash in https://review.opendev.org/c/openstack/swift/+/907622 and change it to an `assertFalse`.\n\nWe don\u0027t have any assertions that have _true_ positives, though, right? Is that mostly because it\u0027s still a little mysterious how we found ourselves in that epoch-got-reset situation?","commit_id":"1607b25175586c6e0055610b85915e4ddeba40b7"}],"test/unit/container/test_replicator.py":[{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"6bc2e721f2805c50637116a3f6cff54e2dfd992d","unresolved":true,"context_lines":[{"line_number":1463,"context_line":"                lines \u003d daemon.logger.get_lines_for_level(\u0027debug\u0027)"},{"line_number":1464,"context_line":"                line \u003d (\"Not merging remote own_shard_range into %s\""},{"line_number":1465,"context_line":"                        % broker.db_file)"},{"line_number":1466,"context_line":"                self.assertIn(line, lines)"},{"line_number":1467,"context_line":""},{"line_number":1468,"context_line":"            os.remove(broker.db_file)"},{"line_number":1469,"context_line":"            os.remove(remote_broker.db_file)"}],"source_content_type":"text/x-python","patch_set":3,"id":"043c87fd_56df99c0","line":1466,"updated":"2021-09-27 11:49:06.000000000","message":"we should assert the opposite case too - see https://review.opendev.org/c/openstack/swift/+/811112","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"4f54fca6bb5c3cce5b32c841934d16bb1fb8981d","unresolved":false,"context_lines":[{"line_number":1463,"context_line":"                lines \u003d daemon.logger.get_lines_for_level(\u0027debug\u0027)"},{"line_number":1464,"context_line":"                line \u003d (\"Not merging remote own_shard_range into %s\""},{"line_number":1465,"context_line":"                        % broker.db_file)"},{"line_number":1466,"context_line":"                self.assertIn(line, lines)"},{"line_number":1467,"context_line":""},{"line_number":1468,"context_line":"            os.remove(broker.db_file)"},{"line_number":1469,"context_line":"            os.remove(remote_broker.db_file)"}],"source_content_type":"text/x-python","patch_set":3,"id":"ffaef38e_7b868f6b","line":1466,"in_reply_to":"043c87fd_56df99c0","updated":"2022-02-28 06:33:28.000000000","message":"Ack","commit_id":"8f3f9fe45b5aa6b46c11d6e92b5aaffbcfb93b12"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e72a83cd44cafd49eab676956bd664c609218f81","unresolved":true,"context_lines":[{"line_number":1526,"context_line":"            # to local then sends updated shards to the remote. So if the"},{"line_number":1527,"context_line":"            # OSR on the remote is newer then the default the RPC side will"},{"line_number":1528,"context_line":"            # actually get a merged OSR, ie get the remote one back."},{"line_number":1529,"context_line":"            (default_osr, osr_with_epoch, True, False, False),"},{"line_number":1530,"context_line":"            # But if the local default is newer then the epoched remote side"},{"line_number":1531,"context_line":"            # we\u0027d get an error logged on the RPC side and the local is newer"},{"line_number":1532,"context_line":"            # so wil fail to merge"}],"source_content_type":"text/x-python","patch_set":6,"id":"53eed3ef_979e50ab","line":1529,"updated":"2021-09-28 15:28:05.000000000","message":"if exp_merge is True, we don\u0027t ever exp_*warning","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e72a83cd44cafd49eab676956bd664c609218f81","unresolved":true,"context_lines":[{"line_number":1530,"context_line":"            # But if the local default is newer then the epoched remote side"},{"line_number":1531,"context_line":"            # we\u0027d get an error logged on the RPC side and the local is newer"},{"line_number":1532,"context_line":"            # so wil fail to merge"},{"line_number":1533,"context_line":"            (default_osr_newer, osr_with_epoch, False, False, True),"},{"line_number":1534,"context_line":"        )"},{"line_number":1535,"context_line":"        for params in tests:"},{"line_number":1536,"context_line":"            with annotate_failure(params):"}],"source_content_type":"text/x-python","patch_set":6,"id":"45e02e73_db302015","line":1533,"updated":"2021-09-28 15:28:05.000000000","message":"so the remote osr_with_epoch will log a warning; that makes sense\n\nbut why doesn\u0027t the local merge in the goodness?  it tries and fails because it\u0027s \"newer\"?  Is this a reasonable test?  It\u0027s the ONLY one that says exp_rpc_warning\u003dTrue","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"297aa009bacf55338f228afb51ce31272f06a0bc","unresolved":true,"context_lines":[{"line_number":1530,"context_line":"            # But if the local default is newer then the epoched remote side"},{"line_number":1531,"context_line":"            # we\u0027d get an error logged on the RPC side and the local is newer"},{"line_number":1532,"context_line":"            # so wil fail to merge"},{"line_number":1533,"context_line":"            (default_osr_newer, osr_with_epoch, False, False, True),"},{"line_number":1534,"context_line":"        )"},{"line_number":1535,"context_line":"        for params in tests:"},{"line_number":1536,"context_line":"            with annotate_failure(params):"}],"source_content_type":"text/x-python","patch_set":6,"id":"5a1825da_7e92dd45","line":1533,"in_reply_to":"45e02e73_db302015","updated":"2021-09-29 00:49:04.000000000","message":"This is to test the other code path, but also to test the bad node when pushing it\u0027s badness nothing happens to the good nodes. If the default OSR is older then things are OK, the remote shard_range sync will override it. Again it\u0027s a crux of the problem.. somehow a broker gets out there without an OSR so a default one is created and written to the broker.. because it\u0027s newer it overwrites all the \"good\" ones.\n\nAnother fix would be to make the default OSR have a timestamp of Timestamp(0)... but then when do we update this to the correct time? And whens a safe place to do that? If we do it too soon then we haven\u0027t fixed anything. Which is way trying recreate the bug would be nice to have.. or just don\u0027t merge None over epoch.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"e72a83cd44cafd49eab676956bd664c609218f81","unresolved":true,"context_lines":[{"line_number":1533,"context_line":"            (default_osr_newer, osr_with_epoch, False, False, True),"},{"line_number":1534,"context_line":"        )"},{"line_number":1535,"context_line":"        for params in tests:"},{"line_number":1536,"context_line":"            with annotate_failure(params):"},{"line_number":1537,"context_line":"                do_test(*params)"},{"line_number":1538,"context_line":""},{"line_number":1539,"context_line":"    def test_sync_shard_ranges(self):"}],"source_content_type":"text/x-python","patch_set":6,"id":"2db5992e_d2810838","line":1536,"updated":"2021-09-28 15:28:05.000000000","message":"I love me me some annotate_failure - it\u0027s no DAMP testing, but it at least makes these parameterized Al tests somewhat bareable.","commit_id":"c48f7a6b3b2a930a16ff627d34694c1224071536"}]}
