)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9a98a1e55218b40d25323c1b1b4c134be32d551e","unresolved":true,"context_lines":[{"line_number":20,"context_line":"other tasks to work even though the resource check task takes long time."},{"line_number":21,"context_line":"The force context switch can prevent lack of any heartbeat operation."},{"line_number":22,"context_line":""},{"line_number":23,"context_line":"Related-Bug: #2085709"},{"line_number":24,"context_line":"Change-Id: Ieefc50d91672948faa2cb6e9b676f11c6f88988b"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":3,"id":"74b2ff8f_dc810a1b","line":23,"updated":"2025-01-20 14:40:04.000000000","message":"as noted the real fix is likely https://review.opendev.org/c/openstack/nova/+/939317\n\nbut i dont think this really hurts so im ok with takeign  multiple approchs to resolve this stalling problem.","commit_id":"a1e5df5d2d8458c752f128e0fda3708058619ff2"}],"/PATCHSET_LEVEL":[{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"666553a784c56762db5af22d83493fdd65bcbac0","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":1,"id":"d303e9a3_e8b8fc55","updated":"2024-12-30 08:34:34.000000000","message":"I\u0027ll test this on our test env","commit_id":"8a456edd547fd7226caffbd00f4cbdc4ff6dc696"},{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"8f0326b4b6318155e3e756dc38cb293c24692eb5","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"2f077fde_5a39ccf6","updated":"2025-01-11 20:12:47.000000000","message":"@masahito.muroi@linecorp.com I deployed the patch on Jan 8th, since then it looks fine. In our case, the issue started to manifest after about a week, so I\u0027m still monitoring.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"f22f4d01ac373759cf945cf711ca2413f11ebef2","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"9b5223bd_8c8bbe8a","updated":"2024-12-30 12:09:07.000000000","message":"@masahito.muroi@linecorp.com I pushed a slight rephrase of the comment. I\u0027ll get back here with the feedback once I test it on our env if you\u0027d like :)","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"ee28309c8a81a24e118faf41a4b192983dd7471b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"18bca0cd_e6f67229","updated":"2025-01-09 06:27:09.000000000","message":"Hi, Jakub.  Thank you for the rephrasing the comment.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":6476,"name":"Thomas Goirand","email":"thomas@goirand.fr","username":"thomas-goirand"},"change_message_id":"f3557320430ec6c6bf3de16dacff8d81bf5328db","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"ef2cf869_396d6456","updated":"2025-01-03 08:46:11.000000000","message":"Hi. We\u0027ve been experiencing the issue in production, with some compute nodes repeatedly appearing down for a short period. So we\u0027ve put this patch in production (under Caracal) on 6 of our compute nodes out of 18 (one AZ). Since then, only the unpatched compute continue to have the problem. This has been running for 2 days so far, so we\u0027ll continue to monitor the problem, but so far so good. So +1 so far.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"6e9316d888b76832d5cd634eed14c47a34a47ccb","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"894cb744_4c39df9a","updated":"2025-01-20 14:11:21.000000000","message":"Replaced the `greenthread` with `time`.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":36998,"name":"Ilia Baikov","display_name":"Ilia Baikov","email":"ilia.baikov@ib.systems","username":"frct1"},"change_message_id":"315aa78fad8096536c1bc99d0c5eceabc84254dd","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"6e0240db_a637ce46","updated":"2025-01-11 13:03:26.000000000","message":"Testing this patch around related bug (https://bugs.launchpad.net/nova/+bug/2092297). \nIntro: we have 2 sites running the same version 2024.2, patch really helps but not for a long time for unknown reason. \nFirst site after patch: we have 3 nodes with around 180 instances each. Patch really helps a lot for about 30 hours (patched at 4am, first nova-compute flap occurs at 10am next day) then 2 out of 3 nova-compute are flapping.\nhttps://imgur.com/a/iqrXjI6\nwarn logs: https://paste.opendev.org/show/bFVTCMTE2RDw5kQ5Hepc/\nAlso there is heartbeat error logs and ovs-related about broken pipe:\nhttps://paste.opendev.org/show/b68AMBCD9FlCVfupk5Gx/\n\nSecond site: 3 nodes running about 400 instances each, CPU load is about 50%. Flap occured for a single nova-compute instance yet after about 33 hours since patch.\nhttps://imgur.com/a/site-2-WXFCoff","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"22accfd160ddc071a6fa3401cbc60a0d4bb7502b","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"35167ab0_05b205ec","updated":"2025-01-17 03:39:11.000000000","message":"This change is more straitforward change to solve the problem.  Once every one checks the change works well, I\u0027m thinking close my commit.\n\nhttps://review.opendev.org/c/openstack/nova/+/939317","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":6476,"name":"Thomas Goirand","email":"thomas@goirand.fr","username":"thomas-goirand"},"change_message_id":"8ac81d0eff97671461527792f98e97e08a002562","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"e9f93fd7_e68d604e","updated":"2025-01-03 08:43:30.000000000","message":"recheck","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"f774d5ad4eb19537ebf901f951f61b81671a3a79","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"eca14ddc_a30500ea","updated":"2025-01-18 01:53:29.000000000","message":"this might help but the actully isseu is likely fixed by https://review.opendev.org/c/openstack/nova/+/939317\n\nthe code you have writtn is not incorrect but i woudl prefer if we used time.sleep(0) instead.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":36998,"name":"Ilia Baikov","display_name":"Ilia Baikov","email":"ilia.baikov@ib.systems","username":"frct1"},"change_message_id":"c202464fcf5f902f6c1a5326e0d4fa2e077f3279","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"293c489e_a891619e","in_reply_to":"32fb3fce_09fcb0c6","updated":"2025-01-13 18:36:20.000000000","message":"Yep, the same thing occurs after almost 2 days since nova-compute has been restarted meanwhile it only affects single node yet.\nnova-compute statuses across both sites: https://imgur.com/a/7fisXpK","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":36998,"name":"Ilia Baikov","display_name":"Ilia Baikov","email":"ilia.baikov@ib.systems","username":"frct1"},"change_message_id":"4db15879b6ba91d91fe2b9faf9fae3df78fd80f9","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"d54a0f10_53c8204f","in_reply_to":"35167ab0_05b205ec","updated":"2025-01-17 09:33:46.000000000","message":"Probably, anyway just deployed at the one more loaded site for monitoring. I will report back after 2 or 3 days","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":36998,"name":"Ilia Baikov","display_name":"Ilia Baikov","email":"ilia.baikov@ib.systems","username":"frct1"},"change_message_id":"2ac38db7800535508fac145fee67d88db26adcd0","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"b130cac7_a18a99c6","in_reply_to":"6e0240db_a637ce46","updated":"2025-01-11 13:05:08.000000000","message":"related bug: https://bugs.launchpad.net/nova/+bug/2092297 (fixed link)","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"a8e042f308d26fee90150e9272382e69446ff615","unresolved":true,"context_lines":[],"source_content_type":"","patch_set":2,"id":"32fb3fce_09fcb0c6","in_reply_to":"b130cac7_a18a99c6","updated":"2025-01-13 10:14:06.000000000","message":"I\u0027d assume that this patch, while optimizing the operations, does not address the real root cause - but gives us a bit more time before the service starts having issues. I\u0027ll report back here when I\u0027ll have some more results - testing since Jan 8th.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"b88cf325d2caed7210f4c29aa627f1231fe66ae6","unresolved":false,"context_lines":[],"source_content_type":"","patch_set":2,"id":"08ed2ef4_191ea51f","in_reply_to":"eca14ddc_a30500ea","updated":"2025-01-20 02:36:53.000000000","message":"Yes. You\u0027re right.\n\n\u003e This change is more straitforward change to solve the problem. Once every one checks the change works well, I\u0027m thinking close my commit.\n\nSorry for the confusion, the \"This change\" points out your patch[1], not this patch [2].  I completely agree adding wrap proxy is more straitfoward way.\n\n1. https://review.opendev.org/c/openstack/nova/+/939317\n2. https://review.opendev.org/c/openstack/nova/+/938215","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"}],"nova/virt/libvirt/driver.py":[{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"e70e376ed74b8ae3fba9b1d754fff0a64dbb95f3","unresolved":true,"context_lines":[{"line_number":8692,"context_line":"            except libvirt.libvirtError:"},{"line_number":8693,"context_line":"                return []"},{"line_number":8694,"context_line":""},{"line_number":8695,"context_line":"        net_devs \u003d []"},{"line_number":8696,"context_line":"        vdpa_devs \u003d []"},{"line_number":8697,"context_line":"        pci_devs \u003d {}"},{"line_number":8698,"context_line":"        for name, dev in devices.items():"}],"source_content_type":"text/x-python","patch_set":1,"id":"47ef5b65_bd8bbb4d","line":8695,"updated":"2024-12-30 08:08:33.000000000","message":"Lines 8695 to 8705 are cosmetic change, not related to the bug fix?","commit_id":"8a456edd547fd7226caffbd00f4cbdc4ff6dc696"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"ee28309c8a81a24e118faf41a4b192983dd7471b","unresolved":true,"context_lines":[{"line_number":8692,"context_line":"            except libvirt.libvirtError:"},{"line_number":8693,"context_line":"                return []"},{"line_number":8694,"context_line":""},{"line_number":8695,"context_line":"        net_devs \u003d []"},{"line_number":8696,"context_line":"        vdpa_devs \u003d []"},{"line_number":8697,"context_line":"        pci_devs \u003d {}"},{"line_number":8698,"context_line":"        for name, dev in devices.items():"}],"source_content_type":"text/x-python","patch_set":1,"id":"dc9ff7cf_ba3bd0b7","line":8695,"in_reply_to":"1ba45c78_2cde5f68","updated":"2025-01-09 06:27:09.000000000","message":"Yes. As Jens mentioned, the for-loop style change reduces number of libvirt API calls. If the devices has 10 items, the old style makes 10 * 3 \u003d 30 libvirt API ,`listCaps()`, calls but the newer one call only 10 API calls.\n\nMy main motivation is to give more context switch chance with reasonable code compared to the following code.\n\n```\n        net_devs \u003d [\n            dev for dev in devices.values() if \"net\" in _safe_list_caps(dev)\n        ]\n        greenthread.sleep(0)         \n        vdpa_devs \u003d [\n            dev for dev in devices.values() if \"vdpa\" in _safe_list_caps(dev)\n        ]\n        greenthread.sleep(0)\n        pci_devs \u003d {\n            name: dev for name, dev in devices.items()\n                    if \"pci\" in _safe_list_caps(dev)}\n        greenthread.sleep(0)\n```","commit_id":"8a456edd547fd7226caffbd00f4cbdc4ff6dc696"},{"author":{"_account_id":13252,"name":"Dr. Jens Harbott","display_name":"Jens Harbott (frickler)","email":"frickler@offenerstapel.de","username":"jrosenboom"},"change_message_id":"1c1c4cb6cfe05e5f5f9a67935ba89aafdd9de9a9","unresolved":true,"context_lines":[{"line_number":8692,"context_line":"            except libvirt.libvirtError:"},{"line_number":8693,"context_line":"                return []"},{"line_number":8694,"context_line":""},{"line_number":8695,"context_line":"        net_devs \u003d []"},{"line_number":8696,"context_line":"        vdpa_devs \u003d []"},{"line_number":8697,"context_line":"        pci_devs \u003d {}"},{"line_number":8698,"context_line":"        for name, dev in devices.items():"}],"source_content_type":"text/x-python","patch_set":1,"id":"1ba45c78_2cde5f68","line":8695,"in_reply_to":"47ef5b65_bd8bbb4d","updated":"2025-01-03 13:53:51.000000000","message":"well the change makes _safe_list_caps() get called only once per device instead of three times, assuming that that call is really taking a lot of time, this may be part of the positive effect this patch has. but indeed it would seem worthwhile to mention this in the commit message","commit_id":"8a456edd547fd7226caffbd00f4cbdc4ff6dc696"},{"author":{"_account_id":34076,"name":"Jakub Darmach","email":"jakub@stackhpc.com","username":"darmach"},"change_message_id":"a8e042f308d26fee90150e9272382e69446ff615","unresolved":false,"context_lines":[{"line_number":8692,"context_line":"            except libvirt.libvirtError:"},{"line_number":8693,"context_line":"                return []"},{"line_number":8694,"context_line":""},{"line_number":8695,"context_line":"        net_devs \u003d []"},{"line_number":8696,"context_line":"        vdpa_devs \u003d []"},{"line_number":8697,"context_line":"        pci_devs \u003d {}"},{"line_number":8698,"context_line":"        for name, dev in devices.items():"}],"source_content_type":"text/x-python","patch_set":1,"id":"803cc64e_b2dedebf","line":8695,"in_reply_to":"dc9ff7cf_ba3bd0b7","updated":"2025-01-13 10:14:06.000000000","message":"Ah, of course I somehow misread that. Thanks for clarifying.","commit_id":"8a456edd547fd7226caffbd00f4cbdc4ff6dc696"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"f774d5ad4eb19537ebf901f951f61b81671a3a79","unresolved":true,"context_lines":[{"line_number":8705,"context_line":"                pci_devs[name] \u003d dev"},{"line_number":8706,"context_line":""},{"line_number":8707,"context_line":"            # Let other tasks work during libvirt connection task"},{"line_number":8708,"context_line":"            greenthread.sleep(0)"},{"line_number":8709,"context_line":""},{"line_number":8710,"context_line":"        pci_info \u003d ["},{"line_number":8711,"context_line":"            self._host._get_pcidev_info("}],"source_content_type":"text/x-python","patch_set":2,"id":"440affa8_a71c2dab","line":8708,"range":{"start_line":8708,"start_character":12,"end_line":8708,"end_character":32},"updated":"2025-01-18 01:53:29.000000000","message":"we are currently in the process of removign eventelt \n\nfor nwo we shoudl avoid any explitc addtions of greenthread.sleep(0)\n\nand prefer time.sleep(0)\n\ni have a patch somnewhere that add a copperative_yeild function but that has not merged so for now time.sleep(0) is prefer.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"700c1f2faf28451769e8edb9f13ddd1d3ed418cd","unresolved":true,"context_lines":[{"line_number":8705,"context_line":"                pci_devs[name] \u003d dev"},{"line_number":8706,"context_line":""},{"line_number":8707,"context_line":"            # Let other tasks work during libvirt connection task"},{"line_number":8708,"context_line":"            greenthread.sleep(0)"},{"line_number":8709,"context_line":""},{"line_number":8710,"context_line":"        pci_info \u003d ["},{"line_number":8711,"context_line":"            self._host._get_pcidev_info("}],"source_content_type":"text/x-python","patch_set":2,"id":"689a3a44_0b873cbd","line":8708,"range":{"start_line":8708,"start_character":12,"end_line":8708,"end_character":32},"in_reply_to":"440affa8_a71c2dab","updated":"2025-01-18 01:54:38.000000000","message":"in theory, we should also have testing to assert that it yields before we merge this.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"b88cf325d2caed7210f4c29aa627f1231fe66ae6","unresolved":true,"context_lines":[{"line_number":8705,"context_line":"                pci_devs[name] \u003d dev"},{"line_number":8706,"context_line":""},{"line_number":8707,"context_line":"            # Let other tasks work during libvirt connection task"},{"line_number":8708,"context_line":"            greenthread.sleep(0)"},{"line_number":8709,"context_line":""},{"line_number":8710,"context_line":"        pci_info \u003d ["},{"line_number":8711,"context_line":"            self._host._get_pcidev_info("}],"source_content_type":"text/x-python","patch_set":2,"id":"f74d2f96_1c14597f","line":8708,"range":{"start_line":8708,"start_character":12,"end_line":8708,"end_character":32},"in_reply_to":"689a3a44_0b873cbd","updated":"2025-01-20 02:36:53.000000000","message":"Oops, I missed the eventlet removal activity.  Let me update this patch to use `time` instead of `greenthread`. Your patch could solve the problem and this patch doesn\u0027t need for the long running task problem.\n\nI could update this patch just for reducing loop-ing count if the adding context switch chance is extra operations in this change.","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":8878,"name":"Masahito Muroi","email":"masahito.muroi@linecorp.com","username":"masa"},"change_message_id":"6e9316d888b76832d5cd634eed14c47a34a47ccb","unresolved":false,"context_lines":[{"line_number":8705,"context_line":"                pci_devs[name] \u003d dev"},{"line_number":8706,"context_line":""},{"line_number":8707,"context_line":"            # Let other tasks work during libvirt connection task"},{"line_number":8708,"context_line":"            greenthread.sleep(0)"},{"line_number":8709,"context_line":""},{"line_number":8710,"context_line":"        pci_info \u003d ["},{"line_number":8711,"context_line":"            self._host._get_pcidev_info("}],"source_content_type":"text/x-python","patch_set":2,"id":"abd6d619_abb8704e","line":8708,"range":{"start_line":8708,"start_character":12,"end_line":8708,"end_character":32},"in_reply_to":"f74d2f96_1c14597f","updated":"2025-01-20 14:11:21.000000000","message":"Done","commit_id":"99da0eca058ca617f65cb012a1004d6e68475dc7"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"9a98a1e55218b40d25323c1b1b4c134be32d551e","unresolved":true,"context_lines":[{"line_number":8705,"context_line":"                pci_devs[name] \u003d dev"},{"line_number":8706,"context_line":""},{"line_number":8707,"context_line":"            # Give other tasks to work during libvirt connection task"},{"line_number":8708,"context_line":"            time.sleep(0)"},{"line_number":8709,"context_line":""},{"line_number":8710,"context_line":"        pci_info \u003d ["},{"line_number":8711,"context_line":"            self._host._get_pcidev_info("}],"source_content_type":"text/x-python","patch_set":3,"id":"e4433bed_d8fa1bba","line":8708,"updated":"2025-01-20 14:40:04.000000000","message":"+1 for now to get input form others.\n\ni think we likely do want at elast one unit test that mocks time.sleep(0) and asserts its called.\n\nwith that said if others are ok without that then we can proceed.","commit_id":"a1e5df5d2d8458c752f128e0fda3708058619ff2"}]}
