)]}'
{"etc/object-server.conf-sample":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":409,"context_line":"# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never"},{"line_number":410,"context_line":"# quarantined when sufficient fragments exist to reconstruct the object."},{"line_number":411,"context_line":"# Fragments are not quarantined until they are older than the reclaim_age."},{"line_number":412,"context_line":"# quarantine_threshold \u003d 0"},{"line_number":413,"context_line":"#"},{"line_number":414,"context_line":"# Sets the maximum number of nodes to which requests will be made before"},{"line_number":415,"context_line":"# quarantining a fragment. You can use \u0027* replicas\u0027 at the end to have it use"}],"source_content_type":"application/octet-stream","patch_set":7,"id":"504a2a00_66c0641a","line":412,"updated":"2021-05-07 22:54:00.000000000","message":"Is the expectation that this is the sort of thing you might turn up for a little while, then bring back down to zero after some data\u0027s been quarantined, or that you leave at one indefinitely? I could see pros and cons to both.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":409,"context_line":"# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never"},{"line_number":410,"context_line":"# quarantined when sufficient fragments exist to reconstruct the object."},{"line_number":411,"context_line":"# Fragments are not quarantined until they are older than the reclaim_age."},{"line_number":412,"context_line":"# quarantine_threshold \u003d 0"},{"line_number":413,"context_line":"#"},{"line_number":414,"context_line":"# Sets the maximum number of nodes to which requests will be made before"},{"line_number":415,"context_line":"# quarantining a fragment. You can use \u0027* replicas\u0027 at the end to have it use"}],"source_content_type":"application/octet-stream","patch_set":7,"id":"002bd16c_ff610967","line":412,"in_reply_to":"504a2a00_66c0641a","updated":"2021-05-10 19:47:00.000000000","message":"I could imagine turning it up upon discovering the error logs and then turning down - in the ideal world lonely frags would not occur, so it may not be something we\u0027d recommend being always on.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":409,"context_line":"# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never"},{"line_number":410,"context_line":"# quarantined when sufficient fragments exist to reconstruct the object."},{"line_number":411,"context_line":"# Fragments are not quarantined until they are older than the reclaim_age."},{"line_number":412,"context_line":"# quarantine_threshold \u003d 0"},{"line_number":413,"context_line":"#"},{"line_number":414,"context_line":"# Sets the maximum number of nodes to which requests will be made before"},{"line_number":415,"context_line":"# quarantining a fragment. You can use \u0027* replicas\u0027 at the end to have it use"}],"source_content_type":"application/octet-stream","patch_set":8,"id":"e976744f_a545970c","line":412,"updated":"2021-05-10 18:10:26.000000000","message":"Still interested in some guidance for how we expect ops to use this -- do you envision it as being non-zero for a period of hours/days until a particular issue is resolved, then switched back to zero? Or let it run at one indefinitely?","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":409,"context_line":"# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never"},{"line_number":410,"context_line":"# quarantined when sufficient fragments exist to reconstruct the object."},{"line_number":411,"context_line":"# Fragments are not quarantined until they are older than the reclaim_age."},{"line_number":412,"context_line":"# quarantine_threshold \u003d 0"},{"line_number":413,"context_line":"#"},{"line_number":414,"context_line":"# Sets the maximum number of nodes to which requests will be made before"},{"line_number":415,"context_line":"# quarantining a fragment. You can use \u0027* replicas\u0027 at the end to have it use"}],"source_content_type":"application/octet-stream","patch_set":8,"id":"0fc00121_5441ac00","line":412,"in_reply_to":"e976744f_a545970c","updated":"2021-05-10 19:47:00.000000000","message":"Done","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"}],"swift/obj/reconstructor.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"0fc065e150f5ae2577a37cd6c2308c347bf24c0c","unresolved":true,"context_lines":[{"line_number":556,"context_line":"        primary_nodes \u003d [n for n in ring.get_part_nodes(partition)"},{"line_number":557,"context_line":"                         if n[\u0027id\u0027] !\u003d node[\u0027id\u0027]]"},{"line_number":558,"context_line":"        # TODO: we could tune down the initial batch of requests"},{"line_number":559,"context_line":"        min_requests \u003d ring.replicas - 1"},{"line_number":560,"context_line":"        # rebuild_node_count is the maximum nodes to consume in a normal"},{"line_number":561,"context_line":"        # rebuild attempt when there is no quarantine candidate"},{"line_number":562,"context_line":"        rebuild_node_count \u003d ring.replicas"}],"source_content_type":"text/x-python","patch_set":1,"id":"d7fa7930_5d8c15c2","line":559,"updated":"2021-04-30 07:03:03.000000000","message":"this could also be len(primary_nodes) right, unless the node[\u0027id\u0027] isn\u0027t in the primary_list somehow in which case it could be possible for len(primary_nodes) \u003d\u003d ring.replicas. Otherwise get_part_nodes should always return ring.replicas right?","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"0fc065e150f5ae2577a37cd6c2308c347bf24c0c","unresolved":true,"context_lines":[{"line_number":595,"context_line":"                self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":596,"context_line":"                                  % (fi_to_rebuild, frag_indexes))"},{"line_number":597,"context_line":"                return bucket"},{"line_number":598,"context_line":"            elif (nodes_consumed[0] \u003c rebuild_node_count or"},{"line_number":599,"context_line":"                  (response_count \u003e\u003d min_requests and"},{"line_number":600,"context_line":"                   self.is_quarantine_candidate("},{"line_number":601,"context_line":"                       policy, buckets, error_responses, df))):"}],"source_content_type":"text/x-python","patch_set":1,"id":"bf5d1138_0258e69e","line":598,"updated":"2021-04-30 07:03:03.000000000","message":"Ok so this is just incase we don\u0027t fire off (ring.replica -1) number of requests initally, then we can fire off another? Not sure how this could happen to be honest as we fire off min_request requests, unless there isn\u0027t enough nodes.. but then don\u0027t we have bigger issues.","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"75102e6df5ff9ac688839a2bbb11746c4c28f48e","unresolved":true,"context_lines":[{"line_number":595,"context_line":"                self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":596,"context_line":"                                  % (fi_to_rebuild, frag_indexes))"},{"line_number":597,"context_line":"                return bucket"},{"line_number":598,"context_line":"            elif (nodes_consumed[0] \u003c rebuild_node_count or"},{"line_number":599,"context_line":"                  (response_count \u003e\u003d min_requests and"},{"line_number":600,"context_line":"                   self.is_quarantine_candidate("},{"line_number":601,"context_line":"                       policy, buckets, error_responses, df))):"}],"source_content_type":"text/x-python","patch_set":1,"id":"f681b02d_e4aa0570","line":598,"in_reply_to":"bf5d1138_0258e69e","updated":"2021-04-30 17:19:14.000000000","message":"yes this cannot happen at the moment but I wrote it as is the initial batch size could be reduced ( see the TODO at line 558)","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"681f838e4b6ecbbd8e5a01c5f3b6edcbab54c87d","unresolved":true,"context_lines":[{"line_number":601,"context_line":"                       policy, buckets, error_responses, df))):"},{"line_number":602,"context_line":"                # only go beyond rebuild_node_count, and on to handoffs, once"},{"line_number":603,"context_line":"                # all rebuild nodes have responded *and* we have a quarantine"},{"line_number":604,"context_line":"                # candidate"},{"line_number":605,"context_line":"                for _node in node_iter:"},{"line_number":606,"context_line":"                    full_get_path \u003d _full_path(_node, partition, path, policy)"},{"line_number":607,"context_line":"                    pile.spawn(self._get_response, _node, partition,"}],"source_content_type":"text/x-python","patch_set":1,"id":"d059b75a_598b924a","line":604,"updated":"2021-04-29 22:22:39.000000000","message":"this is still not right 😭\n\n- we don\u0027t issue any extra requests until primaries have all come back and we have a lonely frag, because we only want to go to handoffs if it is a likely quarantine event\n- but then it is trickling requests to handoffs\n\nwe probably want another batch to handoffs to get some concurrency going again, then keep trickling","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"0fc065e150f5ae2577a37cd6c2308c347bf24c0c","unresolved":true,"context_lines":[{"line_number":601,"context_line":"                       policy, buckets, error_responses, df))):"},{"line_number":602,"context_line":"                # only go beyond rebuild_node_count, and on to handoffs, once"},{"line_number":603,"context_line":"                # all rebuild nodes have responded *and* we have a quarantine"},{"line_number":604,"context_line":"                # candidate"},{"line_number":605,"context_line":"                for _node in node_iter:"},{"line_number":606,"context_line":"                    full_get_path \u003d _full_path(_node, partition, path, policy)"},{"line_number":607,"context_line":"                    pile.spawn(self._get_response, _node, partition,"}],"source_content_type":"text/x-python","patch_set":1,"id":"e3bbcc16_4995623d","line":604,"in_reply_to":"d059b75a_598b924a","updated":"2021-04-30 07:03:03.000000000","message":"Might have to let it drain, then start a new spwan cycle + trickle in the quarantine_candidate case:\n\n  for resp in pile:\n     bucket \u003d self._handle_fragment_response(..)\n     if bucket and len(bucket.useful_response ...\n         return bucket\n     elif (nodes_consumed[0] \u003c rebuild_node_count:\n         \u003ctrickle more connections\u003e\n\n  if self.is_quarantine_candidate(...):\n     \u003cspawn a batch\u003e\n     for resp in pile:\n         bucket \u003d self._handle_fragment_response(..)\n         if bucket and len(bucket.useful_response ...\n             return bucket\n         else:\n             \u003ctrickle more connections until we run out of nodes\u003e\n\nOr do we want to fire off to handoff quicker before all the primaries come back? In which case we may need a seperate handoff pile to track extra concurrency. BUT we wont know we have a loney frag until after we hear from all the primaries right. So I guess we just let the pile drain.","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"75102e6df5ff9ac688839a2bbb11746c4c28f48e","unresolved":true,"context_lines":[{"line_number":601,"context_line":"                       policy, buckets, error_responses, df))):"},{"line_number":602,"context_line":"                # only go beyond rebuild_node_count, and on to handoffs, once"},{"line_number":603,"context_line":"                # all rebuild nodes have responded *and* we have a quarantine"},{"line_number":604,"context_line":"                # candidate"},{"line_number":605,"context_line":"                for _node in node_iter:"},{"line_number":606,"context_line":"                    full_get_path \u003d _full_path(_node, partition, path, policy)"},{"line_number":607,"context_line":"                    pile.spawn(self._get_response, _node, partition,"}],"source_content_type":"text/x-python","patch_set":1,"id":"645b989b_e0ee3b56","line":604,"in_reply_to":"e3bbcc16_4995623d","updated":"2021-04-30 17:19:14.000000000","message":"yes, I\u0027ve taken this kind of approach - thanks for the idea","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"0fc065e150f5ae2577a37cd6c2308c347bf24c0c","unresolved":true,"context_lines":[{"line_number":605,"context_line":"                for _node in node_iter:"},{"line_number":606,"context_line":"                    full_get_path \u003d _full_path(_node, partition, path, policy)"},{"line_number":607,"context_line":"                    pile.spawn(self._get_response, _node, partition,"},{"line_number":608,"context_line":"                               path, headers, full_get_path)"},{"line_number":609,"context_line":"                    break"},{"line_number":610,"context_line":"        return None"},{"line_number":611,"context_line":""}],"source_content_type":"text/x-python","patch_set":1,"id":"35f73793_371491f6","line":608,"updated":"2021-04-30 07:03:03.000000000","message":"Because we\u0027re using a pile we generated above with a pool size of (ring.replica -1), we\u0027ll never have ring.replica in flight at the same time, so not really doing in batches, but that\u0027s not really a problem just an observation.","commit_id":"c9e11fec12714e48fb612c2a214540a59e194fba"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":572,"context_line":"            itertools.islice("},{"line_number":573,"context_line":"                itertools.chain(primary_nodes, ring.get_more_nodes(partition)),"},{"line_number":574,"context_line":"                max_node_count - node_count),"},{"line_number":575,"context_line":"            node_count + 1)  # 1st value out of iter is 2nd node consumed"},{"line_number":576,"context_line":""},{"line_number":577,"context_line":"        pile \u003d GreenAsyncPile(concurrency)"},{"line_number":578,"context_line":"        for node_count, _node in itertools.islice(node_iter, concurrency):"}],"source_content_type":"text/x-python","patch_set":2,"id":"a7cd52e3_21133bb7","line":575,"updated":"2021-05-03 04:55:27.000000000","message":"Oh nice, so the first islice get\u0027s up an iter of the max node depth we\u0027d go.\n\nAlso, using enumerate in the iter to could the nodes.. love it.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":577,"context_line":"        pile \u003d GreenAsyncPile(concurrency)"},{"line_number":578,"context_line":"        for node_count, _node in itertools.islice(node_iter, concurrency):"},{"line_number":579,"context_line":"            pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":580,"context_line":"                       path, headers)"},{"line_number":581,"context_line":""},{"line_number":582,"context_line":"        useful_bucket \u003d None"},{"line_number":583,"context_line":"        for resp in pile:"}],"source_content_type":"text/x-python","patch_set":2,"id":"988cd03c_88a64681","line":580,"updated":"2021-05-03 04:55:27.000000000","message":"And this will give us the first concurrency nodes (or rather ring.replicas -1). Though I wonder if it should be len(primary_nodes) which should be the same thing, however makes sure that we are definitely hitting all primaries that we need to hit. Just in case there is some way the comprehension on line 568 doesn\u0027t find the node[\u0027id\u0027], so we still look for all the peices.\n\nI guess worst case though would be we possibly couldn\u0027t rebuild until next cycle, as a lonley quaratine check would finish the primary nodes and onto handoffs in this case.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":600,"context_line":"                    policy, buckets, error_responses, df)):"},{"line_number":601,"context_line":"            for node_count, _node in itertools.islice(node_iter, concurrency):"},{"line_number":602,"context_line":"                pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":603,"context_line":"                           path, headers)"},{"line_number":604,"context_line":"        for resp in pile:"},{"line_number":605,"context_line":"            bucket \u003d self._handle_fragment_response("},{"line_number":606,"context_line":"                node, policy, partition, fi_to_rebuild, path, buckets,"}],"source_content_type":"text/x-python","patch_set":2,"id":"0ffa55cf_d5f8b43d","line":603,"updated":"2021-05-03 04:55:27.000000000","message":"An now we can continue down into the handoff nodes for another ring.replicas -1.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":612,"context_line":"                    policy, buckets, error_responses, df):"},{"line_number":613,"context_line":"                for node_count, _node in node_iter:"},{"line_number":614,"context_line":"                    pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":615,"context_line":"                               path, headers)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":""},{"line_number":618,"context_line":"        return useful_bucket"}],"source_content_type":"text/x-python","patch_set":2,"id":"977e086e_dd2266a1","line":615,"updated":"2021-05-03 04:55:27.000000000","message":"And trickle off some more until we get to the end of our nodes or we find enough nodes to break the quarantine candidate check. Cool.\n\nI guess to play devil\u0027s advocate for a second. I wonder if a push of concurrency requests for the inital connections to primaries, and then suddenly a second push of concurrency requests for the first set of handoffs makes sense. Or should be fire a concurrency set of requests for primaries and then just tricle through the handoffs knowing in most cases if we\u0027ve only found 1 fragment on the primaries we\u0027re really looking to break the is_quarantine_candidate tie. So once we find enough extra frags we can stop making requests, and potentually save on unneeded connections? (I guess this is the way you had it before). Not saying it\u0027s what we should do, justing thinking out loud and trying to cover all bases.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"350e57b9c13ee8209bd13f1c37520fc2c99b3df6","unresolved":true,"context_lines":[{"line_number":612,"context_line":"                    policy, buckets, error_responses, df):"},{"line_number":613,"context_line":"                for node_count, _node in node_iter:"},{"line_number":614,"context_line":"                    pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":615,"context_line":"                               path, headers)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":""},{"line_number":618,"context_line":"        return useful_bucket"}],"source_content_type":"text/x-python","patch_set":2,"id":"929237ab_08d3c4de","line":615,"in_reply_to":"977e086e_dd2266a1","updated":"2021-05-06 12:12:56.000000000","message":"The \u0027batch then trickle\u0027 is meant to be a trade-off between making progress quickly vs firing off unnecessary requests.\n\nBut, yes, a single non-404 will prevent quarantine, and in that case there will be up to concurrency - 1 \u0027wasted\u0027 requests.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":614,"context_line":"                    pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":615,"context_line":"                               path, headers)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":""},{"line_number":618,"context_line":"        return useful_bucket"},{"line_number":619,"context_line":""},{"line_number":620,"context_line":"    def reconstruct_fa(self, job, node, df):"}],"source_content_type":"text/x-python","patch_set":2,"id":"cf1e1e67_dc89eefb","line":617,"updated":"2021-05-03 04:55:27.000000000","message":"I wonder if we should debug level log the final node_count we got. I wonder if that would be an interesting number to look at.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"350e57b9c13ee8209bd13f1c37520fc2c99b3df6","unresolved":false,"context_lines":[{"line_number":614,"context_line":"                    pile.spawn(self._get_response, _node, policy, partition,"},{"line_number":615,"context_line":"                               path, headers)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":""},{"line_number":618,"context_line":"        return useful_bucket"},{"line_number":619,"context_line":""},{"line_number":620,"context_line":"    def reconstruct_fa(self, job, node, df):"}],"source_content_type":"text/x-python","patch_set":2,"id":"7a4f10bd_5f28d8e3","line":617,"in_reply_to":"cf1e1e67_dc89eefb","updated":"2021-05-06 12:12:56.000000000","message":"Ack","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3a0d979e9dc09cd014f31526f3347cc35f702e7a","unresolved":true,"context_lines":[{"line_number":508,"context_line":"            # likely that a tombstone will be reverted to this node, so"},{"line_number":509,"context_line":"            # don\u0027t quarantine it yet: better that it is cleaned up"},{"line_number":510,"context_line":"            # \u0027normally\u0027."},{"line_number":511,"context_line":"            return False"},{"line_number":512,"context_line":""},{"line_number":513,"context_line":"        bucket \u003d buckets[local_timestamp]"},{"line_number":514,"context_line":"        return (bucket.num_responses \u003c\u003d self.quarantine_threshold and"}],"source_content_type":"text/x-python","patch_set":4,"id":"2d64e5a1_8015cd8f","line":511,"updated":"2021-05-07 02:54:09.000000000","message":"Cool, so this will block anything newish from quarantining right away. But for any object that is older then reclaim_age, which would be most of them right, so in most cased things would get cleaned up. If there is a fast put fragment over others then this makes alot of sense. Otherwise it does seem a little redundant.. but happy to have it, belt and braces and all that.\n\nOr maybe I\u0027m missing something.","commit_id":"f88d45028303f47356cadcb0e7537530632d6798"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"169ec860e759257fb2f1641738c6fea6f9b450e0","unresolved":true,"context_lines":[{"line_number":508,"context_line":"            # likely that a tombstone will be reverted to this node, so"},{"line_number":509,"context_line":"            # don\u0027t quarantine it yet: better that it is cleaned up"},{"line_number":510,"context_line":"            # \u0027normally\u0027."},{"line_number":511,"context_line":"            return False"},{"line_number":512,"context_line":""},{"line_number":513,"context_line":"        bucket \u003d buckets[local_timestamp]"},{"line_number":514,"context_line":"        return (bucket.num_responses \u003c\u003d self.quarantine_threshold and"}],"source_content_type":"text/x-python","patch_set":4,"id":"e68983aa_e8c4d813","line":511,"in_reply_to":"2d64e5a1_8015cd8f","updated":"2021-05-07 12:41:39.000000000","message":"I\u0027m trying to avoid quarantining a frag that might otherwise get cleaned up normally. If it is newer than reclaim age then we\u0027d definitely expect there to be tombstones that will eventually replicate over this frag. If there aren\u0027t tombstones then maybe we should keep logging an error and pay attention because we don\u0027t understand the cause of that scenario.\n\nHmmm, maybe I should check all the 404s for timestamp?","commit_id":"f88d45028303f47356cadcb0e7537530632d6798"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"3a0d979e9dc09cd014f31526f3347cc35f702e7a","unresolved":true,"context_lines":[{"line_number":617,"context_line":"                    for node_count, _node in node_iter:"},{"line_number":618,"context_line":"                        pile.spawn(self._get_response, _node, policy,"},{"line_number":619,"context_line":"                                   partition, path, headers)"},{"line_number":620,"context_line":"                        break"},{"line_number":621,"context_line":""},{"line_number":622,"context_line":"        return useful_bucket"},{"line_number":623,"context_line":""}],"source_content_type":"text/x-python","patch_set":4,"id":"db00f958_05916605","line":620,"updated":"2021-05-07 02:54:09.000000000","message":"oh we\u0027re starting to loop then breaking out of it stright away. Wouldn\u0027t it be better to:\n\n  elif self.is_quarantine_candidate(...\n      node_count, _node \u003d next(node_iter)\n      pile.spawn(self._get_response, _node, policy,\n                 partition, path, headers)","commit_id":"f88d45028303f47356cadcb0e7537530632d6798"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"169ec860e759257fb2f1641738c6fea6f9b450e0","unresolved":true,"context_lines":[{"line_number":617,"context_line":"                    for node_count, _node in node_iter:"},{"line_number":618,"context_line":"                        pile.spawn(self._get_response, _node, policy,"},{"line_number":619,"context_line":"                                   partition, path, headers)"},{"line_number":620,"context_line":"                        break"},{"line_number":621,"context_line":""},{"line_number":622,"context_line":"        return useful_bucket"},{"line_number":623,"context_line":""}],"source_content_type":"text/x-python","patch_set":4,"id":"2e13ca52_4c1d3e3a","line":620,"in_reply_to":"db00f958_05916605","updated":"2021-05-07 12:41:39.000000000","message":"I was using the for loop to achieve take-one-if-there-is-one-and-break as a shorthand for catching the StopIteration when the iter is exhausted, but I agree that this form could be confusing at first glance. I\u0027ll change it.","commit_id":"f88d45028303f47356cadcb0e7537530632d6798"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":534,"context_line":"            if bucket and len(bucket.useful_responses) \u003e\u003d policy.ec_ndata:"},{"line_number":535,"context_line":"                frag_indexes \u003d list(bucket.useful_responses.keys())"},{"line_number":536,"context_line":"                self.logger.debug(\u0027Reconstruct frag #%s with frag indexes %s\u0027"},{"line_number":537,"context_line":"                                  % (fi_to_rebuild, frag_indexes))"},{"line_number":538,"context_line":"                return bucket"},{"line_number":539,"context_line":"        return None"},{"line_number":540,"context_line":""}],"source_content_type":"text/x-python","patch_set":7,"id":"7aca7706_ed0cd2da","side":"PARENT","line":537,"updated":"2021-05-07 22:54:00.000000000","message":"Right, so this got pushed up into reconstruct_fa -- seems reasonable.","commit_id":"eeaac713fd9d593336e636af19228e3c17ac8b0e"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":466,"context_line":"                              etag, bucket.etag,"},{"line_number":467,"context_line":"                              _full_path(node, partition, path, policy),"},{"line_number":468,"context_line":"                              fi_to_rebuild)"},{"line_number":469,"context_line":"            return None"},{"line_number":470,"context_line":""},{"line_number":471,"context_line":"        if fi_to_rebuild \u003d\u003d resp_frag_index:"},{"line_number":472,"context_line":"            # TODO: With duplicated EC frags it\u0027s not unreasonable to find the"}],"source_content_type":"text/x-python","patch_set":7,"id":"324aab7b_9d4d24a8","line":469,"updated":"2021-05-07 22:54:00.000000000","message":"Off-topic: I wonder if we ought to put this resp in error_responses before returning here...","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":466,"context_line":"                              etag, bucket.etag,"},{"line_number":467,"context_line":"                              _full_path(node, partition, path, policy),"},{"line_number":468,"context_line":"                              fi_to_rebuild)"},{"line_number":469,"context_line":"            return None"},{"line_number":470,"context_line":""},{"line_number":471,"context_line":"        if fi_to_rebuild \u003d\u003d resp_frag_index:"},{"line_number":472,"context_line":"            # TODO: With duplicated EC frags it\u0027s not unreasonable to find the"}],"source_content_type":"text/x-python","patch_set":7,"id":"aedae600_05311d4a","line":469,"in_reply_to":"324aab7b_9d4d24a8","updated":"2021-05-10 19:47:00.000000000","message":"it is included in the total ok responses e.g. we might see \n\n  Unable to get enough responses (1/10 from 2 ok responses)\n\nand it is included against the quarantine threshold","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":472,"context_line":"            # TODO: With duplicated EC frags it\u0027s not unreasonable to find the"},{"line_number":473,"context_line":"            # very fragment we\u0027re trying to rebuild exists on another primary"},{"line_number":474,"context_line":"            # node.  In this case we should stream it directly from the remote"},{"line_number":475,"context_line":"            # node to our target instead of rebuild.  But instead we ignore it."},{"line_number":476,"context_line":"            self.logger.debug("},{"line_number":477,"context_line":"                \u0027Found existing frag #%s at %s while rebuilding to %s\u0027,"},{"line_number":478,"context_line":"                fi_to_rebuild, resp.full_path,"}],"source_content_type":"text/x-python","patch_set":7,"id":"4371c64e_42a6c228","line":475,"range":{"start_line":475,"start_character":54,"end_line":475,"end_character":79},"updated":"2021-05-07 22:54:00.000000000","message":"This still bugs me... especially since it looks like we could just stick it in the list of frags from which to reconstruct:\n\n \u003e\u003e\u003e import pyeclib.ec_iface\n \u003e\u003e\u003e ec\u003dpyeclib.ec_iface.ECDriver(ec_type\u003d\u0027liberasurecode_rs_vand\u0027, k\u003d4, m\u003d2)\n \u003e\u003e\u003e frags \u003d ec.encode(b\u0027asdf\u0027 * 40)\n \u003e\u003e\u003e ec.reconstruct(frags[1:5], [0]) \u003d\u003d [frags[0]]\n True\n \u003e\u003e\u003e ec.reconstruct(frags[:4], [0]) \u003d\u003d [frags[0]]\n True\n \u003e\u003e\u003e ec.reconstruct(frags[2:], [0]) \u003d\u003d [frags[0]]\n True\n \u003e\u003e\u003e ec.reconstruct(frags[:1] + frags[3:], [0]) \u003d\u003d [frags[0]]\n True\n\nI guess we log about it (https://github.com/openstack/liberasurecode/blob/master/src/erasurecode.c#L859-L884)... but that seems better than quarantining a frag that has more neighbors out there than we suspected.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"054d15b07064da5fbfe62072daafa28dc6dbfa2d","unresolved":true,"context_lines":[{"line_number":472,"context_line":"            # TODO: With duplicated EC frags it\u0027s not unreasonable to find the"},{"line_number":473,"context_line":"            # very fragment we\u0027re trying to rebuild exists on another primary"},{"line_number":474,"context_line":"            # node.  In this case we should stream it directly from the remote"},{"line_number":475,"context_line":"            # node to our target instead of rebuild.  But instead we ignore it."},{"line_number":476,"context_line":"            self.logger.debug("},{"line_number":477,"context_line":"                \u0027Found existing frag #%s at %s while rebuilding to %s\u0027,"},{"line_number":478,"context_line":"                fi_to_rebuild, resp.full_path,"}],"source_content_type":"text/x-python","patch_set":7,"id":"6636bf0a_aeedbf01","line":475,"range":{"start_line":475,"start_character":54,"end_line":475,"end_character":79},"in_reply_to":"4371c64e_42a6c228","updated":"2021-05-10 07:00:50.000000000","message":"oh interesting, if we do that does it become an NOOP in the library?.. though I guess we\u0027re still pulling data from all the other nodes to for the rebuild.\n\nif it\u0027s a NOOP, we save on CPU, but we\u0027re still using network io etc. It\u0027ll be nice to just reutrn the found fragment as an iter to ssync then we only need one network connection and save  CPU io too! Though if we have this set up, we might want to start walking handoffs a little... just in case as it too might save us a bunch of work, or finally start tracking last_primary and always check that to make rebalances better.\n\nSomething like https://review.opendev.org/c/openstack/swift/+/790374 but cleaned up with better belt and braces checks of course. (and something that has been tested, that hasn\u0027t even been run, just thought a patch would be easier to read then a paste diff).","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":472,"context_line":"            # TODO: With duplicated EC frags it\u0027s not unreasonable to find the"},{"line_number":473,"context_line":"            # very fragment we\u0027re trying to rebuild exists on another primary"},{"line_number":474,"context_line":"            # node.  In this case we should stream it directly from the remote"},{"line_number":475,"context_line":"            # node to our target instead of rebuild.  But instead we ignore it."},{"line_number":476,"context_line":"            self.logger.debug("},{"line_number":477,"context_line":"                \u0027Found existing frag #%s at %s while rebuilding to %s\u0027,"},{"line_number":478,"context_line":"                fi_to_rebuild, resp.full_path,"}],"source_content_type":"text/x-python","patch_set":7,"id":"bc93138a_b6c1a319","line":475,"range":{"start_line":475,"start_character":54,"end_line":475,"end_character":79},"in_reply_to":"6636bf0a_aeedbf01","updated":"2021-05-10 19:47:00.000000000","message":"This frag is counted against quarantine_threshold - see test_reconstruct_fa_no_quarantine_more_than_threshold_frags\n\nnum_responses is the condition, not number of useful_responses","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":483,"context_line":"        if durable_timestamp:"},{"line_number":484,"context_line":"            buckets[Timestamp(durable_timestamp)].durable \u003d True"},{"line_number":485,"context_line":""},{"line_number":486,"context_line":"        if resp_frag_index not in bucket.useful_responses:"},{"line_number":487,"context_line":"            bucket.useful_responses[resp_frag_index] \u003d resp"},{"line_number":488,"context_line":"            return bucket"},{"line_number":489,"context_line":"        return None"}],"source_content_type":"text/x-python","patch_set":7,"id":"7b382d1c_bae1b5c3","line":486,"updated":"2021-05-07 22:54:00.000000000","message":"Oh, interesting... so if we\u0027ve got a 2x(4+2) policy, quarantine threshold of 1, and we can only find the two copies of frag index 0 (say), we\u0027ll still quarantine...","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":483,"context_line":"        if durable_timestamp:"},{"line_number":484,"context_line":"            buckets[Timestamp(durable_timestamp)].durable \u003d True"},{"line_number":485,"context_line":""},{"line_number":486,"context_line":"        if resp_frag_index not in bucket.useful_responses:"},{"line_number":487,"context_line":"            bucket.useful_responses[resp_frag_index] \u003d resp"},{"line_number":488,"context_line":"            return bucket"},{"line_number":489,"context_line":"        return None"}],"source_content_type":"text/x-python","patch_set":7,"id":"cf1a26d4_e5ff51a6","line":486,"in_reply_to":"7b382d1c_bae1b5c3","updated":"2021-05-10 19:47:00.000000000","message":"No. Adding a unit test to cover this: the duplicate frag will prevent quarantine. In general my approach has been to only quarantine in the scenario we understand and \u0027expect\u0027. We can always relax the conditions in future.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":507,"context_line":""},{"line_number":508,"context_line":"        if time.time() - float(local_timestamp) \u003c\u003d df.manager.reclaim_age:"},{"line_number":509,"context_line":"            # If the fragment has not yet passed reclaim age then it is"},{"line_number":510,"context_line":"            # likely that a tombstone will be reverted to this node, so"},{"line_number":511,"context_line":"            # don\u0027t quarantine it yet: better that it is cleaned up"},{"line_number":512,"context_line":"            # \u0027normally\u0027."},{"line_number":513,"context_line":"            return False"}],"source_content_type":"text/x-python","patch_set":7,"id":"014289e2_0a43482b","line":510,"range":{"start_line":510,"start_character":26,"end_line":510,"end_character":67},"updated":"2021-05-07 22:54:00.000000000","message":"Or, neighbor frags will get reverted from handoffs to *other* nodes and we\u0027ll discover we *do* have enough to reconstruct.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":507,"context_line":""},{"line_number":508,"context_line":"        if time.time() - float(local_timestamp) \u003c\u003d df.manager.reclaim_age:"},{"line_number":509,"context_line":"            # If the fragment has not yet passed reclaim age then it is"},{"line_number":510,"context_line":"            # likely that a tombstone will be reverted to this node, so"},{"line_number":511,"context_line":"            # don\u0027t quarantine it yet: better that it is cleaned up"},{"line_number":512,"context_line":"            # \u0027normally\u0027."},{"line_number":513,"context_line":"            return False"}],"source_content_type":"text/x-python","patch_set":7,"id":"01ff0fc4_3bbd789d","line":510,"range":{"start_line":510,"start_character":26,"end_line":510,"end_character":67},"in_reply_to":"014289e2_0a43482b","updated":"2021-05-10 19:47:00.000000000","message":"Done","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":564,"context_line":"        filtered_primary_nodes \u003d [n for n in primary_nodes"},{"line_number":565,"context_line":"                                  if n[\u0027id\u0027] !\u003d node[\u0027id\u0027]]"},{"line_number":566,"context_line":"        # concurrency is the number of requests fired off in initial batch"},{"line_number":567,"context_line":"        concurrency \u003d len(filtered_primary_nodes)"},{"line_number":568,"context_line":"        # max_node_count is the maximum number of nodes to consume when"},{"line_number":569,"context_line":"        # verifying a quarantine candidate and is at least primary_node_count"},{"line_number":570,"context_line":"        max_node_count \u003d max(primary_node_count,"}],"source_content_type":"text/x-python","patch_set":7,"id":"bcea3c02_d8f81822","line":567,"updated":"2021-05-07 22:54:00.000000000","message":"Off-topic: Man, this is making me realize how terrible it is to have to reconstruct on a duplicated EC policy... the IO storm can be pretty terrible 😞","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":564,"context_line":"        filtered_primary_nodes \u003d [n for n in primary_nodes"},{"line_number":565,"context_line":"                                  if n[\u0027id\u0027] !\u003d node[\u0027id\u0027]]"},{"line_number":566,"context_line":"        # concurrency is the number of requests fired off in initial batch"},{"line_number":567,"context_line":"        concurrency \u003d len(filtered_primary_nodes)"},{"line_number":568,"context_line":"        # max_node_count is the maximum number of nodes to consume when"},{"line_number":569,"context_line":"        # verifying a quarantine candidate and is at least primary_node_count"},{"line_number":570,"context_line":"        max_node_count \u003d max(primary_node_count,"}],"source_content_type":"text/x-python","patch_set":7,"id":"db12cbd7_80b3755d","line":567,"in_reply_to":"bcea3c02_d8f81822","updated":"2021-05-10 19:47:00.000000000","message":"At first I wrote in the ability to tune down concurrency to ec_ndata, then pulled that out, which partly explains some of the additional complexity below...","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":576,"context_line":"                itertools.chain(filtered_primary_nodes,"},{"line_number":577,"context_line":"                                ring.get_more_nodes(partition)),"},{"line_number":578,"context_line":"                max_node_count - node_count),"},{"line_number":579,"context_line":"            node_count + 1)  # 1st value out of iter is 2nd node consumed"},{"line_number":580,"context_line":""},{"line_number":581,"context_line":"        pile \u003d GreenAsyncPile(concurrency)"},{"line_number":582,"context_line":"        for node_count, _node in itertools.islice(node_iter, concurrency):"}],"source_content_type":"text/x-python","patch_set":7,"id":"3829763f_611f6fa7","line":579,"updated":"2021-05-07 22:54:00.000000000","message":"There\u0027s a lot to unpack here, and most of the details only really matter once we\u0027ve exhausted the primaries. Something like http://paste.openstack.org/show/805052/ might be better?","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":576,"context_line":"                itertools.chain(filtered_primary_nodes,"},{"line_number":577,"context_line":"                                ring.get_more_nodes(partition)),"},{"line_number":578,"context_line":"                max_node_count - node_count),"},{"line_number":579,"context_line":"            node_count + 1)  # 1st value out of iter is 2nd node consumed"},{"line_number":580,"context_line":""},{"line_number":581,"context_line":"        pile \u003d GreenAsyncPile(concurrency)"},{"line_number":582,"context_line":"        for node_count, _node in itertools.islice(node_iter, concurrency):"}],"source_content_type":"text/x-python","patch_set":7,"id":"48399c60_8feaee46","line":579,"in_reply_to":"3829763f_611f6fa7","updated":"2021-05-10 19:47:00.000000000","message":"Thanks. This made more sense when I had the option to reduce concurrency, but agree that it is overly complex for the simpler quarantine case in this patch.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":594,"context_line":""},{"line_number":595,"context_line":"        # Once all rebuild nodes have responded, if we have a quarantine"},{"line_number":596,"context_line":"        # candidate, go beyond primary_node_count and on to handoffs. The"},{"line_number":597,"context_line":"        # first non-404 response will prevent quarantine, but the expected"},{"line_number":598,"context_line":"        # common case is all 404 responses so we use some concurrency to get an"},{"line_number":599,"context_line":"        # outcome faster at the risk of some unnecessary requests in the"},{"line_number":600,"context_line":"        # uncommon case."}],"source_content_type":"text/x-python","patch_set":7,"id":"ff39d643_5b970907","line":597,"range":{"start_line":597,"start_character":33,"end_line":597,"end_character":37},"updated":"2021-05-07 22:54:00.000000000","message":"nit: may","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":594,"context_line":""},{"line_number":595,"context_line":"        # Once all rebuild nodes have responded, if we have a quarantine"},{"line_number":596,"context_line":"        # candidate, go beyond primary_node_count and on to handoffs. The"},{"line_number":597,"context_line":"        # first non-404 response will prevent quarantine, but the expected"},{"line_number":598,"context_line":"        # common case is all 404 responses so we use some concurrency to get an"},{"line_number":599,"context_line":"        # outcome faster at the risk of some unnecessary requests in the"},{"line_number":600,"context_line":"        # uncommon case."}],"source_content_type":"text/x-python","patch_set":7,"id":"0b669835_f4b05ca7","line":597,"range":{"start_line":597,"start_character":33,"end_line":597,"end_character":37},"in_reply_to":"ff39d643_5b970907","updated":"2021-05-10 19:47:00.000000000","message":"I *think* we catch any non-404? is there a case I am not seeing?","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":612,"context_line":"                    useful_bucket \u003d bucket"},{"line_number":613,"context_line":"                    self.logger.debug("},{"line_number":614,"context_line":"                        \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":615,"context_line":"                        % node_count)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":"                elif self._is_quarantine_candidate("},{"line_number":618,"context_line":"                        policy, buckets, error_responses, df):"}],"source_content_type":"text/x-python","patch_set":7,"id":"1f4e0050_ea9ce52e","line":615,"range":{"start_line":615,"start_character":26,"end_line":615,"end_character":36},"updated":"2021-05-07 22:54:00.000000000","message":"This is the only reason we\u0027re tracking node_count, yeah?","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":612,"context_line":"                    useful_bucket \u003d bucket"},{"line_number":613,"context_line":"                    self.logger.debug("},{"line_number":614,"context_line":"                        \u0027Reconstructing frag from handoffs, node_count\u003d%d\u0027"},{"line_number":615,"context_line":"                        % node_count)"},{"line_number":616,"context_line":"                    break"},{"line_number":617,"context_line":"                elif self._is_quarantine_candidate("},{"line_number":618,"context_line":"                        policy, buckets, error_responses, df):"}],"source_content_type":"text/x-python","patch_set":7,"id":"d939ba6d_990e554e","line":615,"range":{"start_line":615,"start_character":26,"end_line":615,"end_character":36},"in_reply_to":"1f4e0050_ea9ce52e","updated":"2021-05-10 19:47:00.000000000","message":"yes, it is, in this version of the patch. IIRC somewhere along the way there was a comment about logging this unexpected event.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":454,"context_line":"            self.logger.warning("},{"line_number":455,"context_line":"                \u0027Invalid resp from %s, frag index %s (missing Etag)\u0027,"},{"line_number":456,"context_line":"                resp.full_path, resp_frag_index)"},{"line_number":457,"context_line":"            error_responses[resp.status].append(resp)"},{"line_number":458,"context_line":"            return None"},{"line_number":459,"context_line":""},{"line_number":460,"context_line":"        bucket \u003d buckets[timestamp]"}],"source_content_type":"text/x-python","patch_set":8,"id":"d95c0b09_d114423c","line":457,"updated":"2021-05-10 18:10:26.000000000","message":"IDK -- I was OK with these getting grouped in with refused connections and timeouts... we may have gotten a 200, but it seems we weren\u0027t talking to the service we thought we were...","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":454,"context_line":"            self.logger.warning("},{"line_number":455,"context_line":"                \u0027Invalid resp from %s, frag index %s (missing Etag)\u0027,"},{"line_number":456,"context_line":"                resp.full_path, resp_frag_index)"},{"line_number":457,"context_line":"            error_responses[resp.status].append(resp)"},{"line_number":458,"context_line":"            return None"},{"line_number":459,"context_line":""},{"line_number":460,"context_line":"        bucket \u003d buckets[timestamp]"}],"source_content_type":"text/x-python","patch_set":8,"id":"1c2922a6_46b48b7a","line":457,"in_reply_to":"d95c0b09_d114423c","updated":"2021-05-10 19:47:00.000000000","message":"I liked the idea of getting a little more info in the logs, but in practice it makes the logs potentially confusing. \n\n  Unable to get enough responses (1 x 200, 9 x 404 error responses)\n\n#willrevert","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":458,"context_line":"            return None"},{"line_number":459,"context_line":""},{"line_number":460,"context_line":"        bucket \u003d buckets[timestamp]"},{"line_number":461,"context_line":"        bucket.num_responses +\u003d 1"},{"line_number":462,"context_line":"        if bucket.etag is None:"},{"line_number":463,"context_line":"            bucket.etag \u003d etag"},{"line_number":464,"context_line":"        elif bucket.etag !\u003d etag:"}],"source_content_type":"text/x-python","patch_set":8,"id":"fc7cb4b0_92a3dbb6","line":461,"updated":"2021-05-10 18:10:26.000000000","message":"OK, OK -- I see it now -- duplicated frags and frags we\u0027re trying to rebuild increment here, but prevent us from returning a \"useful\" bucket. Got it.","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":485,"context_line":"            bucket.useful_responses[resp_frag_index] \u003d resp"},{"line_number":486,"context_line":"        # else: duplicate frag_index isn\u0027t useful for rebuilding"},{"line_number":487,"context_line":""},{"line_number":488,"context_line":"        return bucket"},{"line_number":489,"context_line":""},{"line_number":490,"context_line":"    def _is_quarantine_candidate(self, policy, buckets, error_responses, df):"},{"line_number":491,"context_line":"        # This condition is deliberately strict because it determines if"}],"source_content_type":"text/x-python","patch_set":8,"id":"ae55b5da_048fd259","line":488,"range":{"start_line":488,"start_character":15,"end_line":488,"end_character":21},"updated":"2021-05-10 18:10:26.000000000","message":"Yeah, I think I prefer returning the bucket here, even if we\u0027re not moving the needle on useful_responses. Matches the documented return better, and both of the callers have an explicit check on number of useful responses after they get a bucket in hand.","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":514,"context_line":""},{"line_number":515,"context_line":"        bucket \u003d buckets[local_timestamp]"},{"line_number":516,"context_line":"        return (bucket.num_responses \u003c\u003d self.quarantine_threshold and"},{"line_number":517,"context_line":"                bucket.num_responses \u003c policy.ec_ndata and"},{"line_number":518,"context_line":"                df._frag_index in bucket.useful_responses)"},{"line_number":519,"context_line":""},{"line_number":520,"context_line":"    def _make_fragment_requests(self, job, node, df, buckets, error_responses):"}],"source_content_type":"text/x-python","patch_set":8,"id":"0522f59e_0aac0543","line":517,"updated":"2021-05-10 18:10:26.000000000","message":"Right, so we can have bucket.num_responses \u003e policy.ec_ndata but still be unable to reconstruct -- and we\u0027ll be unwilling to quarantine. Cool!","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"}],"test/probe/test_reconstructor_rebuild.py":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":400,"context_line":""},{"line_number":401,"context_line":"        # revive the failed device and check it has a fragment"},{"line_number":402,"context_line":"        self.revive_drive(device_path)"},{"line_number":403,"context_line":"        self.assert_direct_get_succeeds(failed_node, self.opart)"},{"line_number":404,"context_line":""},{"line_number":405,"context_line":"        # run the reconstructor without quarantine_threshold set"},{"line_number":406,"context_line":"        error_lines \u003d []"}],"source_content_type":"text/x-python","patch_set":7,"id":"01de762c_d201ad97","line":403,"updated":"2021-05-07 22:54:00.000000000","message":"Might want to send some requests through the proxy, too -- show how a HEAD 200s but then a GET 503s.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":400,"context_line":""},{"line_number":401,"context_line":"        # revive the failed device and check it has a fragment"},{"line_number":402,"context_line":"        self.revive_drive(device_path)"},{"line_number":403,"context_line":"        self.assert_direct_get_succeeds(failed_node, self.opart)"},{"line_number":404,"context_line":""},{"line_number":405,"context_line":"        # run the reconstructor without quarantine_threshold set"},{"line_number":406,"context_line":"        error_lines \u003d []"}],"source_content_type":"text/x-python","patch_set":7,"id":"d3be210e_2b2df174","line":403,"in_reply_to":"01de762c_d201ad97","updated":"2021-05-10 19:47:00.000000000","message":"node error limiting seems to prevent these assertions being made immediately :( I see 404 from the GET due to all 404s from backend, and the HEAD also 404s.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":420,"context_line":"                continue"},{"line_number":421,"context_line":"            self.assertFalse(found_lines, error_lines)"},{"line_number":422,"context_line":"            found_lines \u003d True"},{"line_number":423,"context_line":"            for line in itertools.islice(lines, 0, 2, 6):"},{"line_number":424,"context_line":"                self.assertIn("},{"line_number":425,"context_line":"                    \u0027Unable to get enough responses (1/4 from 1 ok \u0027"},{"line_number":426,"context_line":"                    \u0027responses)\u0027, line, lines)"}],"source_content_type":"text/x-python","patch_set":7,"id":"ed7263d3_8e48cf5f","line":423,"range":{"start_line":423,"start_character":49,"end_line":423,"end_character":55},"updated":"2021-05-07 22:54:00.000000000","message":"Isn\u0027t this just going to be checking lines[0]?\n\n \u003e\u003e\u003e list(itertools.islice(range(30), 0, 2, 6))\n [0]","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":420,"context_line":"                continue"},{"line_number":421,"context_line":"            self.assertFalse(found_lines, error_lines)"},{"line_number":422,"context_line":"            found_lines \u003d True"},{"line_number":423,"context_line":"            for line in itertools.islice(lines, 0, 2, 6):"},{"line_number":424,"context_line":"                self.assertIn("},{"line_number":425,"context_line":"                    \u0027Unable to get enough responses (1/4 from 1 ok \u0027"},{"line_number":426,"context_line":"                    \u0027responses)\u0027, line, lines)"}],"source_content_type":"text/x-python","patch_set":7,"id":"19b1fc6f_1ac46c60","line":423,"range":{"start_line":423,"start_character":49,"end_line":423,"end_character":55},"in_reply_to":"ed7263d3_8e48cf5f","updated":"2021-05-10 19:47:00.000000000","message":"oops, got my steps and stops muddled! good catch","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":424,"context_line":"                self.assertIn("},{"line_number":425,"context_line":"                    \u0027Unable to get enough responses (1/4 from 1 ok \u0027"},{"line_number":426,"context_line":"                    \u0027responses)\u0027, line, lines)"},{"line_number":427,"context_line":"            for line in itertools.islice(lines, 1, 2, 7):"},{"line_number":428,"context_line":"                self.assertIn("},{"line_number":429,"context_line":"                    \u0027Unable to get enough responses (4 x 404 error \u0027"},{"line_number":430,"context_line":"                    \u0027responses)\u0027, line, lines)"}],"source_content_type":"text/x-python","patch_set":7,"id":"91309b19_6906e7af","line":427,"range":{"start_line":427,"start_character":51,"end_line":427,"end_character":55},"updated":"2021-05-07 22:54:00.000000000","message":"Similar issue.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":424,"context_line":"                self.assertIn("},{"line_number":425,"context_line":"                    \u0027Unable to get enough responses (1/4 from 1 ok \u0027"},{"line_number":426,"context_line":"                    \u0027responses)\u0027, line, lines)"},{"line_number":427,"context_line":"            for line in itertools.islice(lines, 1, 2, 7):"},{"line_number":428,"context_line":"                self.assertIn("},{"line_number":429,"context_line":"                    \u0027Unable to get enough responses (4 x 404 error \u0027"},{"line_number":430,"context_line":"                    \u0027responses)\u0027, line, lines)"}],"source_content_type":"text/x-python","patch_set":7,"id":"59c84588_f4a661fa","line":427,"range":{"start_line":427,"start_character":51,"end_line":427,"end_character":55},"in_reply_to":"91309b19_6906e7af","updated":"2021-05-10 19:47:00.000000000","message":"Done","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":481,"context_line":"        # check we have nothing"},{"line_number":482,"context_line":"        for node in self.onodes:"},{"line_number":483,"context_line":"            err \u003d self.assert_direct_get_fails(node, self.opart, 404)"},{"line_number":484,"context_line":"            self.assertNotIn(\u0027X-Backend-Timestamp\u0027, err.http_headers)"},{"line_number":485,"context_line":""},{"line_number":486,"context_line":"        # run the reconstructor once more - should see no errors in logs!"},{"line_number":487,"context_line":"        error_lines \u003d []"}],"source_content_type":"text/x-python","patch_set":7,"id":"9c56cd89_b9ccde42","line":484,"updated":"2021-05-07 22:54:00.000000000","message":"And now HEADs and GETs through the proxy both 404.","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":false,"context_lines":[{"line_number":481,"context_line":"        # check we have nothing"},{"line_number":482,"context_line":"        for node in self.onodes:"},{"line_number":483,"context_line":"            err \u003d self.assert_direct_get_fails(node, self.opart, 404)"},{"line_number":484,"context_line":"            self.assertNotIn(\u0027X-Backend-Timestamp\u0027, err.http_headers)"},{"line_number":485,"context_line":""},{"line_number":486,"context_line":"        # run the reconstructor once more - should see no errors in logs!"},{"line_number":487,"context_line":"        error_lines \u003d []"}],"source_content_type":"text/x-python","patch_set":7,"id":"8fe9983b_c2e23ceb","line":484,"in_reply_to":"9c56cd89_b9ccde42","updated":"2021-05-10 19:47:00.000000000","message":"Done","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"},{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"56dce4281b44381896325e6b439662add7b46f88","unresolved":true,"context_lines":[{"line_number":402,"context_line":"        self.revive_drive(device_path)"},{"line_number":403,"context_line":"        self.assert_direct_get_succeeds(failed_node, self.opart)"},{"line_number":404,"context_line":""},{"line_number":405,"context_line":"        # client GET will fail with 503, or 404 if the failed node is"},{"line_number":406,"context_line":"        # error-limited"},{"line_number":407,"context_line":"        with self.assertRaises(ClientException) as cm:"},{"line_number":408,"context_line":"            client.get_object(self.url, self.token, self.container_name,"},{"line_number":409,"context_line":"                              self.object_name)"}],"source_content_type":"text/x-python","patch_set":8,"id":"1c3fb721_8622bc69","line":406,"range":{"start_line":405,"start_character":44,"end_line":406,"end_character":23},"updated":"2021-05-10 18:10:26.000000000","message":"*That\u0027s* an interesting point... :-/\n\nI don\u0027t like the ambiguity in the assertion below, but OTOH, I see how the 404 should be a \"reasonable\" decision here.\n\nI wonder -- should we restart the proxy to ensure error-limiting is cleared?","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"15bd0be3f4689b9d994fd439da9c95ae83a0a149","unresolved":true,"context_lines":[{"line_number":402,"context_line":"        self.revive_drive(device_path)"},{"line_number":403,"context_line":"        self.assert_direct_get_succeeds(failed_node, self.opart)"},{"line_number":404,"context_line":""},{"line_number":405,"context_line":"        # client GET will fail with 503, or 404 if the failed node is"},{"line_number":406,"context_line":"        # error-limited"},{"line_number":407,"context_line":"        with self.assertRaises(ClientException) as cm:"},{"line_number":408,"context_line":"            client.get_object(self.url, self.token, self.container_name,"},{"line_number":409,"context_line":"                              self.object_name)"}],"source_content_type":"text/x-python","patch_set":8,"id":"1a57aed0_5b17fa39","line":406,"range":{"start_line":405,"start_character":44,"end_line":406,"end_character":23},"in_reply_to":"1c3fb721_8622bc69","updated":"2021-05-10 19:47:00.000000000","message":"good idea!","commit_id":"370853e543cb6341043308ed43c01d006686a6b7"}],"test/probe/test_sharder.py":[{"author":{"_account_id":15343,"name":"Tim Burke","email":"tburke@nvidia.com","username":"tburke"},"change_message_id":"9c357cba4b43c008dab00d8556ea7c6e39c2c24f","unresolved":true,"context_lines":[{"line_number":414,"context_line":"                               additional_args\u003d\u0027--partitions\u003d%s\u0027 % part)"},{"line_number":415,"context_line":""},{"line_number":416,"context_line":"    def run_custom_sharder(self, conf_index, custom_conf, **kwargs):"},{"line_number":417,"context_line":"        return self.run_custom_daemon(ContainerSharder, \u0027container-sharder\u0027,"},{"line_number":418,"context_line":"                                      conf_index, custom_conf, **kwargs)"},{"line_number":419,"context_line":""},{"line_number":420,"context_line":""}],"source_content_type":"text/x-python","patch_set":7,"id":"223f59ef_dc2a30b5","line":417,"range":{"start_line":417,"start_character":38,"end_line":417,"end_character":75},"updated":"2021-05-07 22:54:00.000000000","message":"I wonder if we could eliminate this redundancy...","commit_id":"bf6e8de8648b53cd2e46d8bc33c53f3aa2cb7926"}],"test/unit/common/test_utils.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":3168,"context_line":"        do_test(\u00271 * replicas\u0027, 11, 11)"},{"line_number":3169,"context_line":"        do_test(\u00272 * replicas\u0027, 3, 6)"},{"line_number":3170,"context_line":"        do_test(\u00272 * replicas\u0027, 11, 22)"},{"line_number":3171,"context_line":"        do_test(\u002711\u0027, 11, 11)"},{"line_number":3172,"context_line":""},{"line_number":3173,"context_line":"        for bad in (\u00271.1\u0027, 1.1, \u0027auto\u0027, \u0027bad\u0027,"},{"line_number":3174,"context_line":"                    \u00272.5 * replicas\u0027, \u0027two * replicas\u0027):"}],"source_content_type":"text/x-python","patch_set":2,"id":"a90bb677_fc3abe40","line":3171,"updated":"2021-05-03 04:55:27.000000000","message":"maybe for completeness we should add something like:\n\n   do_test(\u002710\u0027, 11, 10)\n   do_test(\u002712\u0027, 11, 12)\n\nTo make sure we\u0027ve covered the bounds.","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"}],"test/unit/proxy/test_server.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"d70ea329a4148f9b6128e54e7580a8fb666a57e5","unresolved":true,"context_lines":[{"line_number":4713,"context_line":""},{"line_number":4714,"context_line":"        do_test(\u00273\u0027, 4, 3)"},{"line_number":4715,"context_line":"        do_test(\u00271 * replicas\u0027, 4, 4)"},{"line_number":4716,"context_line":"        do_test(\u00272 * replicas\u0027, 4, 8)"},{"line_number":4717,"context_line":""},{"line_number":4718,"context_line":"        for bad in (\u00271.1\u0027, 1.1, \u0027auto\u0027, \u0027bad\u0027,"},{"line_number":4719,"context_line":"                    \u00272.5 * replicas\u0027, \u0027two * replicas\u0027):"}],"source_content_type":"text/x-python","patch_set":2,"id":"ac307b34_a70ee942","line":4716,"updated":"2021-05-03 04:55:27.000000000","message":"Probaby could do with some bounds testing here (number \u003d\u003d replicas and number \u003e replicas).","commit_id":"42b4856da1e87c9a51d3f9d125fe39c724a3b362"}]}
