)]}'
{"/PATCHSET_LEVEL":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"c86a5b910efd357c0b1208e4b0acee676a75fd5c","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"e1ac658e_f1ea2645","updated":"2025-07-04 04:21:03.000000000","message":"Cool, changing to `` to 0 and I get:\n```\nvagrant@saio7:~$ swift-init object-replicator once\nWARNING: Unable to modify max process limit.  Running as non-root?\nRunning object-replicator once...(/etc/swift/object-server/1.conf.d)\nRunning object-replicator once...(/etc/swift/object-server/2.conf.d)\nRunning object-replicator once...(/etc/swift/object-server/3.conf.d)\nRunning object-replicator once...(/etc/swift/object-server/4.conf.d)\nJul 04 04:07:05 saio7 object-server-6040[7754]: STDERR: (7754) accepted (\u0027127.0.0.1\u0027, 40776)\nJul 04 04:07:05 saio7 object-server-6040[7754]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb4/443\" 200 56 \"-\" \"-\" \"object-replicator 7767\" -0.6197 \"-\" 7754 0\nJul 04 04:07:05 saio7 object-server-6040[7754]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb4/443 HTTP/1.1\" 200 172 -0.618788\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: (7749) accepted (\u0027127.0.0.1\u0027, 58030)\nJul 04 04:07:05 saio7 object-server-6030[7748]: STDERR: (7748) accepted (\u0027127.0.0.1\u0027, 41642)\nJul 04 04:07:05 saio7 object-server-6010[7749]: Slow REPLICATE (0.131s) for /sdb1/443, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.131s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6010[7749]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb1/443\" 200 56 \"-\" \"-\" \"object-replicator 7767\" 0.1321 \"-\" 7749 0\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb1/443 HTTP/1.1\" 200 172 0.132803\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: (7749) accepted (\u0027127.0.0.1\u0027, 58034)\nJul 04 04:07:05 saio7 object-server-6010[7749]: Slow REPLICATE (0.001s) for /sdb5/882, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.001s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6010[7749]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb5/882\" 200 56 \"-\" \"-\" \"object-replicator 7767\" 0.0007 \"-\" 7749 0\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb5/882 HTTP/1.1\" 200 172 0.001043\nJul 04 04:07:05 saio7 object-server-6020[7756]: STDERR: (7756) accepted (\u0027127.0.0.1\u0027, 51598)\nJul 04 04:07:05 saio7 object-server-6040[7754]: STDERR: (7754) accepted (\u0027127.0.0.1\u0027, 40788)\nJul 04 04:07:05 saio7 object-server-6040[7754]: Slow REPLICATE (0.001s) for /sdb4/267, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.000s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6030[7748]: Slow REPLICATE (0.014s) for /sdb7/882, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.014s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6040[7754]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb4/267\" 200 56 \"-\" \"-\" \"object-replicator 7765\" 0.0008 \"-\" 7754 0\nJul 04 04:07:05 saio7 object-server-6040[7754]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb4/267 HTTP/1.1\" 200 172 0.001132\nJul 04 04:07:05 saio7 object-server-6030[7748]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb7/882\" 200 56 \"-\" \"-\" \"object-replicator 7766\" 0.0144 \"-\" 7748 0\nJul 04 04:07:05 saio7 object-server-6030[7748]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb7/882 HTTP/1.1\" 200 172 0.015730\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: (7749) accepted (\u0027127.0.0.1\u0027, 58038)\nJul 04 04:07:05 saio7 object-server-6010[7749]: Slow REPLICATE (0.000s) for /sdb5/882, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.000s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6010[7749]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb5/882\" 200 56 \"-\" \"-\" \"object-replicator 7766\" 0.0006 \"-\" 7749 0\nJul 04 04:07:05 saio7 object-server-6010[7749]: STDERR: 127.0.0.1 - - [04/Jul/2025 04:07:05] \"REPLICATE /sdb5/882 HTTP/1.1\" 200 172 0.000946\nJul 04 04:07:05 saio7 object-server-6040[7754]: STDERR: (7754) accepted (\u0027127.0.0.1\u0027, 40798)\nJul 04 04:07:05 saio7 object-server-6040[7754]: Slow REPLICATE (0.001s) for /sdb8/710, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.001s, pickle_serialization\u003d0.000s\nJul 04 04:07:05 saio7 object-server-6040[7754]: 127.0.0.1 - - [04/Jul/2025:04:07:05 +0000] \"REPLICATE /sdb8/710\" 200 56 \"-\" \"-\" \"object-replicator 7766\" 0.0009 \"-\" 7754 0\n```\n\nWhich is pretty cool. It\u0027s understand the relationship between the timing on a line. And there seems to be some unaccounted for. Well I guess it isn\u0027t because if it isn\u0027t covered by the others it\u0027s whatever is missing.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"9ee4ecf6f30830574fa5dc1ffd33a1229b389e70","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":3,"id":"400ecf89_b00265aa","updated":"2025-07-07 13:39:35.000000000","message":"like Matt I noticed some time in the total that is unaccounted for. I added invalidate_hash but its not thath. I wondered if I could instrument the call to tpool.execute(get_hashes) but that would require passing timing_start as well as timing_breakdown...which is convincing me more that timing_breakdown should be a small class that encapsulates the absolute time.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"357411920f0ae194b6ca34afb06c2bb2b5a44a9b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":4,"id":"6ef012be_c8ae4283","updated":"2025-07-10 09:19:14.000000000","message":"despite my comments on the parent patch about the implementation of the timing breakdown mechanism, I think this would be ok to carry as an experiment to gather data.","commit_id":"2f4eeb178f004dba10403b1c22ad7124a274f1d6"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"1f35b3bf21fe2a2c7d0df1963e4fb9db1c3745c1","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":6,"id":"a7967f84_72e985ae","updated":"2025-07-14 04:08:15.000000000","message":"@clay.gerrard@gmail.com \u0027s patch https://review.opendev.org/c/openstack/swift/+/954379 inspired me to add more log lines which we can dig all the way into the bottom functions if we are going to enable DEBUG logging on one node to get more complete information of get_hashes().","commit_id":"c8b1daaf3956bfd0dd5e636f9e977abbf2474ea0"},{"author":{"_account_id":1179,"name":"Clay Gerrard","email":"clay.gerrard@gmail.com","username":"clay-gerrard"},"change_message_id":"8448a2e44f542d6086dd4a1d0a86821288fb0d0a","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":7,"id":"f738599c_e3a72807","updated":"2025-07-14 17:28:08.000000000","message":"IME you can\u0027t add a bunch of debug logging into a threadpool without dead locking\n\nNormally I see messages like:\n\n```\ngreenlet.error: Cannot switch to a different thread\nAssertionError: Cannot switch to MAINLOOP from MAINLOOP\n```\n\n... it seems to have something to do with the logger instance in thread pool having a patched connection to journald\n\nIn this patch when I created over a thousand parts/objects:\n\n```\nvagrant@saio:~$ ls /srv/node1/sdb*/objects-1 | wc -l\n1534\n```\n\n... and removed any previous hashes:\n\n```\nfind /srv/node*/sdb*/object* -name \\*.pkl -delete\n```\n\n... and then ran a foreground reconstructor:\n\n```\nswift-object-reconstructor /etc/swift/object-server/1.conf.d/ once -v\n```\n\nit did deadlock for me:\n\n```\nobject-reconstructor-6010: __get_hashes: Completed consolidate_hashes with device\u003dsdb5, partition\u003d470, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed hash calculation for device sdb5 partition 470 , hashed 0 suffixes, modified\u003dFalse\nobject-reconstructor-6010: __get_hashes: Scheduled to run with device\u003dsdb5, partition\u003d470, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed consolidate_hashes with device\u003dsdb5, partition\u003d470, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed hash calculation for device sdb5 partition 470 , hashed 0 suffixes, modified\u003dFalse\nobject-reconstructor-6010: __get_hashes: Scheduled to run with device\u003dsdb1, partition\u003d287, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed consolidate_hashes with device\u003dsdb1, partition\u003d287, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed listdir on /srv/node1/sdb1/objects-1/287\nobject-reconstructor-6010: __get_hashes: Completed hash calculation for device sdb1 partition 287 , hashed 0 suffixes, modified\u003dTrue\nobject-reconstructor-6010: __get_hashes: Scheduled to run with device\u003dsdb5, partition\u003d881, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed consolidate_hashes with device\u003dsdb5, partition\u003d881, policy\u003dECStoragePolicy(1, [\u0027ec\u0027], is_default\u003dFalse, is_deprecated\u003dFalse, policy_type\u003d\u0027erasure_coding\u0027), EC config(ec_type\u003dliberasurecode_rs_vand, ec_segment_size\u003d1048576, ec_ndata\u003d4, ec_nparity\u003d2)\nobject-reconstructor-6010: __get_hashes: Completed hash calculation for device sdb5 partition 881 , hashed 0 suffixes, modified\u003dFalse\n\n\n\n\n\n^C^C^C^C^C^CError in sys.excepthook:\n\n^C^C\nOriginal exception was:\n^C\n\n\n^C^C^C\n\n\n^\\Quit (core dumped)\n\n```\n\nalthough w/o any of the normal \"can not switch\" messages, so that might be interesting...\n\nah, it did NOT lockup when I turned off DEBUG logging (in object-server \u0026 object-reconstructor), but when I turned DEBUG back on for *just* the reconstructor I see:\n\n```\nobject-reconstructor-6010: Exception in top-level reconstruction loop: \nTraceback (most recent call last):\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 476, in fire_timers\n    timer()\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/timer.py\", line 59, in __call__\n    cb(*args, **kw)\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/semaphore.py\", line 147, in _do_acquire\n    waiter.switch()\ngreenlet.error: Cannot switch to a different thread\n    Current:  \u003cgreenlet.greenlet object at 0x7fad0d354040 (otid\u003d0x7fad0d59dc20) suspended active started main\u003e\n  Expected: \u003cgreenlet.greenlet object at 0x7fad0d3561c0 (otid\u003d0x7fad0d59d770) suspended active started main\u003e\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n  File \"/vagrant/swift/swift/obj/reconstructor.py\", line 1461, in reconstruct\n    jobs \u003d self.build_reconstruction_jobs(part_info)\n  File \"/vagrant/swift/swift/obj/reconstructor.py\", line 1411, in build_reconstruction_jobs\n    jobs \u003d self._get_part_jobs(**part_info)\n  File \"/vagrant/swift/swift/obj/reconstructor.py\", line 1154, in _get_part_jobs\n    hashes \u003d self._get_hashes(local_dev[\u0027device\u0027], partition, policy,\n  File \"/vagrant/swift/swift/obj/reconstructor.py\", line 834, in _get_hashes\n    hashed, suffix_hashes \u003d tpool.execute(\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/tpool.py\", line 132, in execute\n    six.reraise(c, e, tb)\n  File \"/usr/lib/python3/dist-packages/six.py\", line 719, in reraise\n    raise value\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/tpool.py\", line 86, in tworker\n    rv \u003d meth(*args, **kwargs)\n  File \"/vagrant/swift/swift/obj/diskfile.py\", line 1326, in _get_hashes\n    hashed, hashes \u003d self.__get_hashes(*args, **kwargs)\n  File \"/vagrant/swift/swift/obj/diskfile.py\", line 1419, in __get_hashes\n    hashes[suffix] \u003d self._hash_suffix(\n  File \"/vagrant/swift/swift/obj/diskfile.py\", line 3982, in _hash_suffix\n    hash_per_fi \u003d self._hash_suffix_dir(path, policy)\n  File \"/vagrant/swift/swift/obj/diskfile.py\", line 1299, in _hash_suffix_dir\n    self.logger.debug(\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1835, in debug\n    self.log(DEBUG, msg, *args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1879, in log\n    self.logger.log(level, msg, *args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1547, in log\n    self._log(level, msg, args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1624, in _log\n    self.handle(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1634, in handle\n    self.callHandlers(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1696, in callHandlers\n    hdlr.handle(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 966, in handle\n    self.acquire()\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 917, in acquire\n    self.lock.acquire()\n  File \"/usr/lib/python3.10/threading.py\", line 168, in acquire\n    rc \u003d self._block.acquire(blocking, timeout)\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/semaphore.py\", line 115, in acquire\n    hubs.get_hub().switch()\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 313, in switch\n    return self.greenlet.switch()\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 355, in run\n    self.fire_timers(self.clock())\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 480, in fire_timers\n    self.squelch_timer_exception(timer, sys.exc_info())\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 404, in squelch_timer_exception\n    traceback.print_exception(*exc_info)\n  File \"/usr/lib/python3.10/traceback.py\", line 121, in print_exception\n    print(line, file\u003dfile, end\u003d\"\")\n  File \"/vagrant/swift/swift/common/utils/logs.py\", line 439, in write\n    self.logger.error(\u0027%(type)s: %(value)s\u0027,\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1858, in error\n    self.log(ERROR, msg, *args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1879, in log\n    self.logger.log(level, msg, *args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1547, in log\n    self._log(level, msg, args, **kwargs)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1624, in _log\n    self.handle(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1634, in handle\n    self.callHandlers(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 1696, in callHandlers\n    hdlr.handle(record)\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 966, in handle\n    self.acquire()\n  File \"/usr/lib/python3.10/logging/__init__.py\", line 917, in acquire\n    self.lock.acquire()\n  File \"/usr/lib/python3.10/threading.py\", line 168, in acquire\n    rc \u003d self._block.acquire(blocking, timeout)\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/semaphore.py\", line 115, in acquire\n    hubs.get_hub().switch()\n  File \"/usr/local/lib/python3.10/dist-packages/eventlet/hubs/hub.py\", line 299, in switch\n    assert cur is not self.greenlet, \u0027Cannot switch to MAINLOOP from MAINLOOP\u0027\nAssertionError: Cannot switch to MAINLOOP from MAINLOOP\n\n```\n\n... which is *exactly* what I expect.  You can NOT call debug from w/i a threadpool - it\u0027s a foot gun, and I think there\u0027s some pre-existing debug messages in the threadpool already... very dangerous.","commit_id":"43d2f1a320b07524e78874ecb98776cd385f1576"}],"swift/obj/diskfile.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"c86a5b910efd357c0b1208e4b0acee676a75fd5c","unresolved":true,"context_lines":[{"line_number":1333,"context_line":"            self.logger.warning(\u0027Unable to read %r\u0027, hashes_file,"},{"line_number":1334,"context_line":"                                exc_info\u003dTrue)"},{"line_number":1335,"context_line":"        timing_start \u003d record_slow_timing("},{"line_number":1336,"context_line":"            timing_breakdown, \u0027consolidate_hashes\u0027, timing_start)"},{"line_number":1337,"context_line":""},{"line_number":1338,"context_line":"        if not orig_hashes[\u0027valid\u0027]:"},{"line_number":1339,"context_line":"            # This is the only path to a valid hashes from invalid read (e.g."}],"source_content_type":"text/x-python","patch_set":3,"id":"a60a173d_e92116a8","line":1336,"updated":"2025-07-04 04:21:03.000000000","message":"I do wonder if this could be used as a context manager.. though I guess we need to be able to get the next time out, so that\u0027s probably why you didn\u0027t.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"9ee4ecf6f30830574fa5dc1ffd33a1229b389e70","unresolved":true,"context_lines":[{"line_number":1667,"context_line":""},{"line_number":1668,"context_line":"        if skip_rehash:"},{"line_number":1669,"context_line":"            for suffix in suffixes:"},{"line_number":1670,"context_line":"                self.invalidate_hash(os.path.join(partition_path, suffix))"},{"line_number":1671,"context_line":"            hashes \u003d None"},{"line_number":1672,"context_line":"        elif not os.path.exists(partition_path):"},{"line_number":1673,"context_line":"            hashes \u003d {}"}],"source_content_type":"text/x-python","patch_set":3,"id":"4893ec97_85f912b3","line":1670,"updated":"2025-07-07 13:39:35.000000000","message":"this seems like it should be instrumented, especially since it takes a lock","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"e8f979f2fa2c038f567ed86a49ec10a7412f575f","unresolved":false,"context_lines":[{"line_number":1667,"context_line":""},{"line_number":1668,"context_line":"        if skip_rehash:"},{"line_number":1669,"context_line":"            for suffix in suffixes:"},{"line_number":1670,"context_line":"                self.invalidate_hash(os.path.join(partition_path, suffix))"},{"line_number":1671,"context_line":"            hashes \u003d None"},{"line_number":1672,"context_line":"        elif not os.path.exists(partition_path):"},{"line_number":1673,"context_line":"            hashes \u003d {}"}],"source_content_type":"text/x-python","patch_set":3,"id":"ed62d138_0d9b625d","line":1670,"in_reply_to":"4893ec97_85f912b3","updated":"2025-07-08 22:41:28.000000000","message":"Done","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"}],"swift/obj/server.py":[{"author":{"_account_id":7233,"name":"Matthew Oliver","email":"matt@oliver.net.au","username":"mattoliverau"},"change_message_id":"c86a5b910efd357c0b1208e4b0acee676a75fd5c","unresolved":true,"context_lines":[{"line_number":1414,"context_line":"                skip_rehash\u003dskip_rehash, timing_breakdown\u003dtiming_breakdown)"},{"line_number":1415,"context_line":"        except DiskFileDeviceUnavailable:"},{"line_number":1416,"context_line":"            record_slow_timing("},{"line_number":1417,"context_line":"                timing_breakdown, \u0027get_hashes_total\u0027, timing_start)"},{"line_number":1418,"context_line":"            resp \u003d HTTPInsufficientStorage(drive\u003ddevice, request\u003drequest)"},{"line_number":1419,"context_line":"        else:"},{"line_number":1420,"context_line":"            timing_start \u003d record_slow_timing("}],"source_content_type":"text/x-python","patch_set":3,"id":"fc58c626_ce815032","line":1417,"updated":"2025-07-04 04:21:03.000000000","message":"So this get_hashes_total, is the total time, so when we look at a log line:\n\n```\nSlow REPLICATE (0.014s) for /sdb7/882, status 200, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.014s,\n```\n\nThe `get_hashes_total` took 0.014s, yet the timings inside seem to all be 0? so they don\u0027t add up.\n\nDoes that mean we\u0027re loosing time that isn\u0027t being tracked in `get_hashes` and/or the beginning of `_get_hashes`? Maybe that needs to be instrumented too.\n\nI loke the log line, but you need to know where they are in the code for the values to make sense.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":34930,"name":"Jianjian Huo","email":"jhuo@nvidia.com","username":"jhuo"},"change_message_id":"bd2bfc54405e834c81fde1618aed75194e0d9b61","unresolved":true,"context_lines":[{"line_number":1414,"context_line":"                skip_rehash\u003dskip_rehash, timing_breakdown\u003dtiming_breakdown)"},{"line_number":1415,"context_line":"        except DiskFileDeviceUnavailable:"},{"line_number":1416,"context_line":"            record_slow_timing("},{"line_number":1417,"context_line":"                timing_breakdown, \u0027get_hashes_total\u0027, timing_start)"},{"line_number":1418,"context_line":"            resp \u003d HTTPInsufficientStorage(drive\u003ddevice, request\u003drequest)"},{"line_number":1419,"context_line":"        else:"},{"line_number":1420,"context_line":"            timing_start \u003d record_slow_timing("}],"source_content_type":"text/x-python","patch_set":3,"id":"95b07d10_17fcbb1f","line":1417,"in_reply_to":"437e95e1_bcedbb34","updated":"2025-07-08 21:26:37.000000000","message":"\u003e\u003e The get_hashes_total took 0.014s, yet the timings inside seem to all be 0? so they don\u0027t add up.\n\u003e\u003e\n\u003e\u003e Does that mean we\u0027re loosing time that isn\u0027t being tracked in get_hashes and/or the beginning of _get_hashes? Maybe that needs to be instrumented too.\n\nyes, I knew I skipped one place which is the ``tpool.execute`` scheduling delay, let\u0027s verify if the missing time was caused by it.\n\nFor debugging purpose, I added three new timings into the dict: ``invalidate_hash``, ``_tpool_submit_time`` and ``tpool_scheduling``, and ``_tpool_submit_time`` will be deleted after the function call. This test patch is at: https://review.opendev.org/c/openstack/swift/+/954375\n\nThen I reproduced the issue which Matt saw, and it was caused by the ``tpool_scheduling\u003d0.008s``.\n```\nJul  8 15:48:21 vagrant object-server-6040: Slow REPLICATE (0.010s) for /sdb4/14, status 200, tpool_scheduling\u003d0.008s, consolidate_hashes\u003d0.000s, listdir\u003d0.000s, hash_calculation\u003d0.000s, get_hashes_total\u003d0.009s, pickle_serialization\u003d0.000s\n```\n\nIn order to merge all patches upstream, while addressing Al\u0027s comments, I am going to refactor the base patch and include ``tpool_scheduling``, a simple way would be storing the ``_timing_start`` to the ``timing_breakdown`` dict itself, reuse it for each ``record_slow_timing`` call in order to save calls of ``time.time()``, and remove ``_timing_start`` after the whole function call finishes.\n\nBut my short term goal is to make this patch ready for carrying, which I think we only need to include the new timing ``invalidate_hash``, because we can get ``tpool_scheduling`` \u003d ``get_hashes_total`` - all other timings within ``get_hashes()``.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"9ee4ecf6f30830574fa5dc1ffd33a1229b389e70","unresolved":true,"context_lines":[{"line_number":1414,"context_line":"                skip_rehash\u003dskip_rehash, timing_breakdown\u003dtiming_breakdown)"},{"line_number":1415,"context_line":"        except DiskFileDeviceUnavailable:"},{"line_number":1416,"context_line":"            record_slow_timing("},{"line_number":1417,"context_line":"                timing_breakdown, \u0027get_hashes_total\u0027, timing_start)"},{"line_number":1418,"context_line":"            resp \u003d HTTPInsufficientStorage(drive\u003ddevice, request\u003drequest)"},{"line_number":1419,"context_line":"        else:"},{"line_number":1420,"context_line":"            timing_start \u003d record_slow_timing("}],"source_content_type":"text/x-python","patch_set":3,"id":"437e95e1_bcedbb34","line":1417,"in_reply_to":"fc58c626_ce815032","updated":"2025-07-07 13:39:35.000000000","message":"it would be cool if the timing_breakdown passed into get_hashes somehow had a get_hashes prefix to add to all the timings that are added in get_hashes, which would hep make sense of then getting a total and what the total covered.\n\ne.g.\n```\nget_hashes.consolidate_hashes\u003d0.01, get_hashes.something\u003d0.02, get_hashes\u003d0.03\n```\n\nwhich relates to my comment on the parent patch that timing_breakdown could be a class with a simpler interface for its users.","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"},{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"9ee4ecf6f30830574fa5dc1ffd33a1229b389e70","unresolved":true,"context_lines":[{"line_number":1418,"context_line":"            resp \u003d HTTPInsufficientStorage(drive\u003ddevice, request\u003drequest)"},{"line_number":1419,"context_line":"        else:"},{"line_number":1420,"context_line":"            timing_start \u003d record_slow_timing("},{"line_number":1421,"context_line":"                timing_breakdown, \u0027get_hashes_total\u0027, timing_start)"},{"line_number":1422,"context_line":"            # force pickle protocol for compatibility with py2 nodes"},{"line_number":1423,"context_line":"            resp \u003d Response(body\u003dpickle.dumps(hashes, protocol\u003d2))"},{"line_number":1424,"context_line":"            record_slow_timing("}],"source_content_type":"text/x-python","patch_set":3,"id":"154b878c_ea4d5738","line":1421,"updated":"2025-07-07 13:39:35.000000000","message":"you can have a single record of \u0027get_hashes_total\u0027 in a ``finally`` clause","commit_id":"d9393ccbace98a5689d6fd11a292ab67c0c978b2"}],"test/unit/obj/test_server.py":[{"author":{"_account_id":7847,"name":"Alistair Coles","email":"alistairncoles@gmail.com","username":"acoles"},"change_message_id":"357411920f0ae194b6ca34afb06c2bb2b5a44a9b","unresolved":true,"context_lines":[{"line_number":9273,"context_line":"        # Get the suffix directories that were created"},{"line_number":9274,"context_line":"        suffixes \u003d []"},{"line_number":9275,"context_line":"        partition_path \u003d os.path.join(self.testdir, \u0027sda1\u0027, \u0027objects\u0027, \u0027p\u0027)"},{"line_number":9276,"context_line":"        if os.path.exists(partition_path):"},{"line_number":9277,"context_line":"            suffixes \u003d [d for d in os.listdir(partition_path)"},{"line_number":9278,"context_line":"                        if os.path.isdir(os.path.join(partition_path, d))]"},{"line_number":9279,"context_line":""}],"source_content_type":"text/x-python","patch_set":4,"id":"7a86b946_9f720e9c","line":9276,"range":{"start_line":9276,"start_character":8,"end_line":9276,"end_character":41},"updated":"2025-07-10 09:19:14.000000000","message":"is it valid for the partition dir to not exist ? should this be a \"sanity check\" assertion\n\n```\nself.assertTrue(os.path.exists(partition_path))  # sanity check\n```","commit_id":"2f4eeb178f004dba10403b1c22ad7124a274f1d6"}]}
