)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":6,"context_line":""},{"line_number":7,"context_line":"NVMe monitoring and healing agent for NVMe connector."},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Change-Id: I9b76fc4b1f13ddf07769136ec975148e1e109ca8"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":4,"id":"5549703c_e39c9b5b","line":9,"updated":"2020-12-15 17:33:59.000000000","message":"-1: Missing footers:\n\nDepends-On: https://review.opendev.org/c/openstack/cinder-specs/+/766730\nImplements: blueprint nvmeof-client-raid-healing-agent","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"}],"specs/wallaby/nvme-agent.rst":[{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"9aca290f5e784bd3ad886d53305322844f59c174","unresolved":true,"context_lines":[{"line_number":4,"context_line":""},{"line_number":5,"context_line":" http://creativecommons.org/licenses/by/3.0/legalcode"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":8,"context_line":"NVMe Connector Healing Agent"},{"line_number":9,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":10,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"7666d704_3e9bfa62","line":7,"range":{"start_line":7,"start_character":28,"end_line":7,"end_character":42},"updated":"2020-12-15 03:42:34.000000000","message":"nit: should be same length as the heading (same thing for line 9)","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"f7f1eb3c943abc93fc6ffce950f58916fd12bb47","unresolved":false,"context_lines":[{"line_number":4,"context_line":""},{"line_number":5,"context_line":" http://creativecommons.org/licenses/by/3.0/legalcode"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":8,"context_line":"NVMe Connector Healing Agent"},{"line_number":9,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":10,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"7efb1548_9e7cb980","line":7,"range":{"start_line":7,"start_character":28,"end_line":7,"end_character":42},"in_reply_to":"7666d704_3e9bfa62","updated":"2020-12-15 14:11:00.000000000","message":"Done","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"9aca290f5e784bd3ad886d53305322844f59c174","unresolved":true,"context_lines":[{"line_number":51,"context_line":"Alternatives"},{"line_number":52,"context_line":"------------"},{"line_number":53,"context_line":""},{"line_number":54,"context_line":"None?"},{"line_number":55,"context_line":""},{"line_number":56,"context_line":"Data model impact"},{"line_number":57,"context_line":"-----------------"}],"source_content_type":"text/x-rst","patch_set":3,"id":"e5e5f1d8_362c2729","line":54,"updated":"2020-12-15 03:42:34.000000000","message":"I think this is right.  If we want the replication to be transparent to the entity holding the attachment, then it\u0027s got to be monitored by some kind of external agent.","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"f7f1eb3c943abc93fc6ffce950f58916fd12bb47","unresolved":false,"context_lines":[{"line_number":51,"context_line":"Alternatives"},{"line_number":52,"context_line":"------------"},{"line_number":53,"context_line":""},{"line_number":54,"context_line":"None?"},{"line_number":55,"context_line":""},{"line_number":56,"context_line":"Data model impact"},{"line_number":57,"context_line":"-----------------"}],"source_content_type":"text/x-rst","patch_set":3,"id":"9914d594_6688f74e","line":54,"in_reply_to":"e5e5f1d8_362c2729","updated":"2020-12-15 14:11:00.000000000","message":"Done","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"9aca290f5e784bd3ad886d53305322844f59c174","unresolved":true,"context_lines":[{"line_number":89,"context_line":"None"},{"line_number":90,"context_line":""},{"line_number":91,"context_line":"Other deployer impact"},{"line_number":92,"context_line":"---------------------"},{"line_number":93,"context_line":""},{"line_number":94,"context_line":"None"},{"line_number":95,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"98d9a043_783b054b","line":92,"updated":"2020-12-15 03:42:34.000000000","message":"I think you need to answer the question: If this change is a new binary, how would it be deployed?","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"f7f1eb3c943abc93fc6ffce950f58916fd12bb47","unresolved":true,"context_lines":[{"line_number":89,"context_line":"None"},{"line_number":90,"context_line":""},{"line_number":91,"context_line":"Other deployer impact"},{"line_number":92,"context_line":"---------------------"},{"line_number":93,"context_line":""},{"line_number":94,"context_line":"None"},{"line_number":95,"context_line":""}],"source_content_type":"text/x-rst","patch_set":3,"id":"54ec7331_706c028b","line":92,"in_reply_to":"98d9a043_783b054b","updated":"2020-12-15 14:11:00.000000000","message":"This change is a new class that will get initialized by the NVMe connector on a first connection, and will schedule itself to run periodically.","commit_id":"a4d0ba584d9b955d440013d0a7ee902c9b7fabcb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":18,"context_line":"Problem description"},{"line_number":19,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"When the NVMe connector connects a replicated volume, OpenStack will see it"},{"line_number":22,"context_line":"as one volume, and has no way of monitoring managing and healing the replicas"},{"line_number":23,"context_line":"in these MDRAID arrays. This agent will take care of that."},{"line_number":24,"context_line":""},{"line_number":25,"context_line":"It will monitor the state of the MDRAID arrays and reconcile their physical"},{"line_number":26,"context_line":"state on the host with expected state from the volume provisioner, replacing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"b07e6ecf_8a82096e","line":23,"range":{"start_line":21,"start_character":0,"end_line":23,"end_character":58},"updated":"2020-12-15 17:33:59.000000000","message":"nit: I think we should try to be a little more verbose when explaining the issue.  I believe the key points are:\n\n- For backend volume replicas, it\u0027s the storage array that takes care of monitoring and replacing unhealthy replicas.\n- NVMe MDRAID moves the data replication responsibility from the backend to the consumer.\n- There\u0027s no mechanism to monitor and heal these replicated volumes.\n- We cannot do it on the Cinder side, because even if the Cinder driver detected the issue and created a replacing volume, we have no mechanism to report the connection information of the replacing volume to the consumer.\n- So the monitoring and healing needs to be on the volume consumer side.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":18,"context_line":"Problem description"},{"line_number":19,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"When the NVMe connector connects a replicated volume, OpenStack will see it"},{"line_number":22,"context_line":"as one volume, and has no way of monitoring managing and healing the replicas"},{"line_number":23,"context_line":"in these MDRAID arrays. This agent will take care of that."},{"line_number":24,"context_line":""},{"line_number":25,"context_line":"It will monitor the state of the MDRAID arrays and reconcile their physical"},{"line_number":26,"context_line":"state on the host with expected state from the volume provisioner, replacing"}],"source_content_type":"text/x-rst","patch_set":4,"id":"556c6e83_47172719","line":23,"range":{"start_line":21,"start_character":0,"end_line":23,"end_character":58},"in_reply_to":"b07e6ecf_8a82096e","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":30,"context_line":"Use Cases"},{"line_number":31,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":32,"context_line":""},{"line_number":33,"context_line":"When working with replicated NVMe volumes, one of the replicas may go faulty,"},{"line_number":34,"context_line":"this agent will detect it and attempt to replace it (self heal the MDRAID)"},{"line_number":35,"context_line":""},{"line_number":36,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"0d37b43c_09434a83","line":33,"range":{"start_line":33,"start_character":34,"end_line":33,"end_character":41},"updated":"2020-12-15 17:33:59.000000000","message":"nit: volumes that are attached to an instance for a long time,","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":30,"context_line":"Use Cases"},{"line_number":31,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":32,"context_line":""},{"line_number":33,"context_line":"When working with replicated NVMe volumes, one of the replicas may go faulty,"},{"line_number":34,"context_line":"this agent will detect it and attempt to replace it (self heal the MDRAID)"},{"line_number":35,"context_line":""},{"line_number":36,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"ea817cb8_2e34d79c","line":33,"range":{"start_line":33,"start_character":34,"end_line":33,"end_character":41},"in_reply_to":"0d37b43c_09434a83","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":31,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":32,"context_line":""},{"line_number":33,"context_line":"When working with replicated NVMe volumes, one of the replicas may go faulty,"},{"line_number":34,"context_line":"this agent will detect it and attempt to replace it (self heal the MDRAID)"},{"line_number":35,"context_line":""},{"line_number":36,"context_line":""},{"line_number":37,"context_line":"Proposed change"}],"source_content_type":"text/x-rst","patch_set":4,"id":"216784a9_eacfbbdc","line":34,"range":{"start_line":34,"start_character":73,"end_line":34,"end_character":74},"updated":"2020-12-15 17:33:59.000000000","message":"nit: ) without the need to detach and re-attach the volume.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":31,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":32,"context_line":""},{"line_number":33,"context_line":"When working with replicated NVMe volumes, one of the replicas may go faulty,"},{"line_number":34,"context_line":"this agent will detect it and attempt to replace it (self heal the MDRAID)"},{"line_number":35,"context_line":""},{"line_number":36,"context_line":""},{"line_number":37,"context_line":"Proposed change"}],"source_content_type":"text/x-rst","patch_set":4,"id":"e0dbac32_9ce982c8","line":34,"range":{"start_line":34,"start_character":73,"end_line":34,"end_character":74},"in_reply_to":"216784a9_eacfbbdc","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":37,"context_line":"Proposed change"},{"line_number":38,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":39,"context_line":""},{"line_number":40,"context_line":"Add a monitoring task that will be launched on a host by the NVMe connector"},{"line_number":41,"context_line":"when connecting to a volume."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"bcfb65fe_bcf561c5","line":41,"range":{"start_line":40,"start_character":0,"end_line":41,"end_character":27},"updated":"2020-12-15 17:33:59.000000000","message":"?: Please be explicit on when and how it will be launched, because we don\u0027t want to launch it for non MDRAID volumes, or for MDRAID volumes that don\u0027t support this healing mechanism.\n\nWill we have a single monitoring \"task\" for all the consumer services?  Or will we have one for each service?  For example if we have cinder-volume and cinder-backup running on the same host, or nova-compute and cinder-volume on the same host.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":37,"context_line":"Proposed change"},{"line_number":38,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":39,"context_line":""},{"line_number":40,"context_line":"Add a monitoring task that will be launched on a host by the NVMe connector"},{"line_number":41,"context_line":"when connecting to a volume."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"ebce219f_1bef9d31","line":41,"range":{"start_line":40,"start_character":0,"end_line":41,"end_character":27},"in_reply_to":"bcfb65fe_bcf561c5","updated":"2020-12-16 12:01:53.000000000","message":"added clarification.\n\nin the case of multiple services on same host, then this task may be ran per service, this may make sense if different services are consuming different backends.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":40,"context_line":"Add a monitoring task that will be launched on a host by the NVMe connector"},{"line_number":41,"context_line":"when connecting to a volume."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"aefed9e4_ddebc3f7","line":43,"range":{"start_line":43,"start_character":0,"end_line":43,"end_character":69},"updated":"2020-12-15 17:33:59.000000000","message":"?: Which NVMe devices will monitor?  All? Or only those connected via os-brick?\n\nHow will it know which devices it needs to monitor if the \"task\" is restarted?","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":40,"context_line":"Add a monitoring task that will be launched on a host by the NVMe connector"},{"line_number":41,"context_line":"when connecting to a volume."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"c1ca5701_5a69310e","line":43,"range":{"start_line":43,"start_character":0,"end_line":43,"end_character":69},"in_reply_to":"aefed9e4_ddebc3f7","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""},{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."}],"source_content_type":"text/x-rst","patch_set":4,"id":"b475b0cf_9615c691","line":45,"range":{"start_line":45,"start_character":15,"end_line":45,"end_character":34},"updated":"2020-12-15 17:33:59.000000000","message":"?: By volume provisioner do you mean Cinder? If I remember correctly the idea was that this monitoring service would connect directly to the \"backend\", and the credentials would need to be stored on the consumer beforehand.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":42,"context_line":""},{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""},{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."}],"source_content_type":"text/x-rst","patch_set":4,"id":"958456ba_357d7b80","line":45,"range":{"start_line":45,"start_character":15,"end_line":45,"end_character":34},"in_reply_to":"b475b0cf_9615c691","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""},{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."},{"line_number":49,"context_line":""},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":4,"id":"8ea69fda_59c3e32c","line":48,"range":{"start_line":46,"start_character":0,"end_line":48,"end_character":14},"updated":"2020-12-15 17:33:59.000000000","message":"?: I believe it\u0027s necessary an explanation on how it will support the case where all the replicas have been replaced and we now reuse this connection information on the same node (I think this happens on one of the Nova operations).","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":43,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":44,"context_line":""},{"line_number":45,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":46,"context_line":""},{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."},{"line_number":49,"context_line":""},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":4,"id":"114f24e4_ebdf026e","line":48,"range":{"start_line":46,"start_character":0,"end_line":48,"end_character":14},"in_reply_to":"8ea69fda_59c3e32c","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."},{"line_number":49,"context_line":""},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"Alternatives"},{"line_number":52,"context_line":"------------"},{"line_number":53,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"4f61be43_2be6c79c","line":50,"updated":"2020-12-15 17:33:59.000000000","message":"There is no mention on how this would support multiple vendor implementations.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":47,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":48,"context_line":"in the MDRAID."},{"line_number":49,"context_line":""},{"line_number":50,"context_line":""},{"line_number":51,"context_line":"Alternatives"},{"line_number":52,"context_line":"------------"},{"line_number":53,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"f0e6f033_1f2f09a2","line":50,"in_reply_to":"4f61be43_2be6c79c","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":67,"context_line":"---------------"},{"line_number":68,"context_line":""},{"line_number":69,"context_line":"Will call NVMe connector methods that do sudo executions of `nvme` and `mdadm`"},{"line_number":70,"context_line":""},{"line_number":71,"context_line":"Active/Active HA impact"},{"line_number":72,"context_line":"-----------------------"},{"line_number":73,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"030dc5c5_131ebd03","line":70,"updated":"2020-12-15 17:33:59.000000000","message":"We should highlight that those calls will be made from a new \"daemon/agent\" that will be spawned from os-brick.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":67,"context_line":"---------------"},{"line_number":68,"context_line":""},{"line_number":69,"context_line":"Will call NVMe connector methods that do sudo executions of `nvme` and `mdadm`"},{"line_number":70,"context_line":""},{"line_number":71,"context_line":"Active/Active HA impact"},{"line_number":72,"context_line":"-----------------------"},{"line_number":73,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"9412579c_6d97b08f","line":70,"in_reply_to":"030dc5c5_131ebd03","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":111,"context_line":"Work Items"},{"line_number":112,"context_line":"----------"},{"line_number":113,"context_line":""},{"line_number":114,"context_line":"NVMe connector launch monitoring task on connect_volume."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":117,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"01d34b7f_2f50227d","line":114,"range":{"start_line":114,"start_character":15,"end_line":114,"end_character":38},"updated":"2020-12-15 17:33:59.000000000","message":"nit: if it isn\u0027t already running (iirc this wasn\u0027t a task but an independent process)","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":111,"context_line":"Work Items"},{"line_number":112,"context_line":"----------"},{"line_number":113,"context_line":""},{"line_number":114,"context_line":"NVMe connector launch monitoring task on connect_volume."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":117,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"49b14a47_a64285be","line":114,"range":{"start_line":114,"start_character":15,"end_line":114,"end_character":38},"in_reply_to":"01d34b7f_2f50227d","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":112,"context_line":"----------"},{"line_number":113,"context_line":""},{"line_number":114,"context_line":"NVMe connector launch monitoring task on connect_volume."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":117,"context_line":""},{"line_number":118,"context_line":"When a replica goes faulty (as well as other events such as disconnects)"},{"line_number":119,"context_line":"call interface method for notifying volume provisioner."}],"source_content_type":"text/x-rst","patch_set":4,"id":"6438cacb_0f5a7e8f","line":116,"range":{"start_line":115,"start_character":0,"end_line":116,"end_character":70},"updated":"2020-12-15 17:33:59.000000000","message":"?: iirc this agent was in charge of updating the connection_information before we start doing the actual connection on the host.\n\nThat way if the whole connection_info is outdated (we had replaced all the replicas since the connection info was generated by the Cinder driver) we would still be able to attach the volume with the updated one we get from the agent.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":112,"context_line":"----------"},{"line_number":113,"context_line":""},{"line_number":114,"context_line":"NVMe connector launch monitoring task on connect_volume."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":117,"context_line":""},{"line_number":118,"context_line":"When a replica goes faulty (as well as other events such as disconnects)"},{"line_number":119,"context_line":"call interface method for notifying volume provisioner."}],"source_content_type":"text/x-rst","patch_set":4,"id":"018d01b0_764e5de3","line":116,"range":{"start_line":115,"start_character":0,"end_line":116,"end_character":70},"in_reply_to":"6438cacb_0f5a7e8f","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"662dc8f39157570361f6f0df17403d329ceb23d8","unresolved":true,"context_lines":[{"line_number":131,"context_line":"Testing"},{"line_number":132,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":133,"context_line":""},{"line_number":134,"context_line":"In order to properly test this in tempest, programmatic access will be needed"},{"line_number":135,"context_line":"to the storage backend. For example, to fail one of the drives of a replicated"},{"line_number":136,"context_line":"volume."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"Since OpenStack is not aware of volume replicas of the new NVMe connector"},{"line_number":139,"context_line":"replicated volumes, would need to verify physical state of NVMe devices and"},{"line_number":140,"context_line":"MDRAID arrays on the host."},{"line_number":141,"context_line":""},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"Documentation Impact"}],"source_content_type":"text/x-rst","patch_set":4,"id":"e4aaeb1a_d9b398dc","line":140,"range":{"start_line":134,"start_character":0,"end_line":140,"end_character":26},"updated":"2020-12-15 17:33:59.000000000","message":"I think it would be too much to ask to have proper testing for this in tempest, as it would be a lot of work.\n\nWe should be able to accept it just with unit-tests.\n\nThough functional tests could be added, and are probably more appropriate.","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"b973a37aab3740ab8eb243dee1fd1aa76d6d0df0","unresolved":false,"context_lines":[{"line_number":131,"context_line":"Testing"},{"line_number":132,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":133,"context_line":""},{"line_number":134,"context_line":"In order to properly test this in tempest, programmatic access will be needed"},{"line_number":135,"context_line":"to the storage backend. For example, to fail one of the drives of a replicated"},{"line_number":136,"context_line":"volume."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"Since OpenStack is not aware of volume replicas of the new NVMe connector"},{"line_number":139,"context_line":"replicated volumes, would need to verify physical state of NVMe devices and"},{"line_number":140,"context_line":"MDRAID arrays on the host."},{"line_number":141,"context_line":""},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"Documentation Impact"}],"source_content_type":"text/x-rst","patch_set":4,"id":"9a165f80_a6354224","line":140,"range":{"start_line":134,"start_character":0,"end_line":140,"end_character":26},"in_reply_to":"e4aaeb1a_d9b398dc","updated":"2020-12-16 12:01:53.000000000","message":"Done","commit_id":"d148afcc8a01cff6436ad09b5709ace606b4bf05"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":40,"context_line":""},{"line_number":41,"context_line":"So the monitoring and healing needs to be on the volume consumer side."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":""},{"line_number":44,"context_line":"Use Cases"},{"line_number":45,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":46,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"6e5a5162_6d4fa3e4","line":43,"updated":"2020-12-16 18:41:21.000000000","message":"I believe with the healing agent we will be able to reduce considerably the possibility of the two scenarios I wanted to highlight on the MDRAID spec, because now the agent will be able to mark/delete/replace the volumes that it doesn\u0027t have access to and they won\u0027t be available for attaching on the other host.\n\nI think it\u0027s worth mentioning this great benefit in this spec.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":40,"context_line":""},{"line_number":41,"context_line":"So the monitoring and healing needs to be on the volume consumer side."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":""},{"line_number":44,"context_line":"Use Cases"},{"line_number":45,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":46,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"c058aa20_acee7008","line":43,"in_reply_to":"6e5a5162_6d4fa3e4","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":47,"context_line":"When working with replicated NVMe volumes that are attached to an instance"},{"line_number":48,"context_line":"for a long time, one of the replicas may go faulty."},{"line_number":49,"context_line":"This agent will detect it and attempt to replace it (self heal the MDRAID,"},{"line_number":50,"context_line":"without the need to detach and re-attach the volume.)"},{"line_number":51,"context_line":""},{"line_number":52,"context_line":""},{"line_number":53,"context_line":"Proposed change"}],"source_content_type":"text/x-rst","patch_set":7,"id":"9dbea01e_ae87ff1a","line":50,"range":{"start_line":50,"start_character":51,"end_line":50,"end_character":53},"updated":"2020-12-16 18:41:21.000000000","message":"nit: The period should go after the closing parenthesis.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":47,"context_line":"When working with replicated NVMe volumes that are attached to an instance"},{"line_number":48,"context_line":"for a long time, one of the replicas may go faulty."},{"line_number":49,"context_line":"This agent will detect it and attempt to replace it (self heal the MDRAID,"},{"line_number":50,"context_line":"without the need to detach and re-attach the volume.)"},{"line_number":51,"context_line":""},{"line_number":52,"context_line":""},{"line_number":53,"context_line":"Proposed change"}],"source_content_type":"text/x-rst","patch_set":7,"id":"374e0096_1d0202ab","line":50,"range":{"start_line":50,"start_character":51,"end_line":50,"end_character":53},"in_reply_to":"9dbea01e_ae87ff1a","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":56,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":57,"context_line":"during volume connection on a host."},{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."},{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"39074442_c15da154","line":59,"range":{"start_line":59,"start_character":24,"end_line":59,"end_character":55},"updated":"2020-12-16 18:41:21.000000000","message":"-1: This sounds like a thread within the same process, and I don\u0027t like that approach because it has some drawbacks:\n\n- We are using greenthreads, so any blocking on that task will affect the parent process (Nova, Cinder, Glance)\n- If Nova is stopped for an upgrade (instances keep running on the hypervisor) we would lose the monitoring and healing.\n- Nova is unlikely to accept os-brick running a task within its process (a different story is a different process like we do in privsep).\n\nIf I misinterpreted and you were not referring to a thread within the process, please make it more explicit so others won\u0027t make my same mistake.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":true,"context_lines":[{"line_number":56,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":57,"context_line":"during volume connection on a host."},{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."},{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"cc135e8c_f89797b3","line":59,"range":{"start_line":59,"start_character":24,"end_line":59,"end_character":55},"in_reply_to":"39074442_c15da154","updated":"2020-12-16 20:41:44.000000000","message":"We were proposing a thread within a process scheduled by using python `sched`\n\nWe are definitely open to other suggestions here, looking forward for feedback!","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":true,"context_lines":[{"line_number":56,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":57,"context_line":"during volume connection on a host."},{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."},{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"df63c407_64afb463","line":59,"range":{"start_line":59,"start_character":24,"end_line":59,"end_character":55},"in_reply_to":"481fc660_ac4c1d75","updated":"2020-12-19 05:46:51.000000000","message":"Noted, thank you.\n\nWill mention that first proposal is to use native thread if possible, or otherwise spawn an independent process.\n\nQuestions:\nSo this means not using python `sched.scheduler` in the main process?\nWhat would be an example of native thread usage?","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":56,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":57,"context_line":"during volume connection on a host."},{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."},{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"481fc660_ac4c1d75","line":59,"range":{"start_line":59,"start_character":24,"end_line":59,"end_character":55},"in_reply_to":"cc135e8c_f89797b3","updated":"2020-12-18 16:35:53.000000000","message":"A thread will *not* be acceptable from the OpenStack perspective.  A bug in that part of the code that blocked something would be problematic to Nova, for example.\n\nIn my opinion it should be an independent process, like we have with privsep.\n\nMaybe it would be acceptable if it was a native thread, but I\u0027m not sure if you would run into other problems when interacting with privsep.\n\nIt is OK to not have all the answers now, but we should explicitly mention these things, even if just to say that a native thread will be used if possible and will only go into using the independent process alternative if necessary.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""},{"line_number":63,"context_line":"It will know which NVMe devices and MDRAID arrays to monitor based on metadata"},{"line_number":64,"context_line":"from the volume provisioner (backend) - which it will have a custom interface"},{"line_number":65,"context_line":"to."},{"line_number":66,"context_line":""},{"line_number":67,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":68,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"1cc84d5d_5eaa8fc0","line":65,"range":{"start_line":63,"start_character":0,"end_line":65,"end_character":3},"updated":"2020-12-16 18:41:21.000000000","message":"I believe this contradicts your comment on patchset #4 where you said there would be a per service agent, because the volume provisioner won\u0027t be able to tell the difference between volumes used by Nova and Cinder when they are sharing the same host.\n\nI think we either have a single agent, or for the connect_volume method to pass this information to the agent.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":true,"context_lines":[{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""},{"line_number":63,"context_line":"It will know which NVMe devices and MDRAID arrays to monitor based on metadata"},{"line_number":64,"context_line":"from the volume provisioner (backend) - which it will have a custom interface"},{"line_number":65,"context_line":"to."},{"line_number":66,"context_line":""},{"line_number":67,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":68,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"90a07e76_68819e72","line":65,"range":{"start_line":63,"start_character":0,"end_line":65,"end_character":3},"in_reply_to":"1cc84d5d_5eaa8fc0","updated":"2020-12-16 20:41:44.000000000","message":"Good point! The reason I mentioned per service agent was because of spinning out a thread from each process invoking os-brick connector connect_volume. If there is a way to run 1 agent per host that would be best!\n\nconnect_volume could pass this info to the agent but we would need a way to communicate to the process if we are going inter-process I think.\n\nEither way, we are also proposing the agent to keep heartbeating the backend and receive metadata about volumes belonging to its host. This will keep it in sync about volumes that belong to it even if the original connection properties go stale.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":60,"context_line":""},{"line_number":61,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":62,"context_line":""},{"line_number":63,"context_line":"It will know which NVMe devices and MDRAID arrays to monitor based on metadata"},{"line_number":64,"context_line":"from the volume provisioner (backend) - which it will have a custom interface"},{"line_number":65,"context_line":"to."},{"line_number":66,"context_line":""},{"line_number":67,"context_line":"It will notify volume provisioner if necessary of failed devices."},{"line_number":68,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"a77ff70f_46600656","line":65,"range":{"start_line":63,"start_character":0,"end_line":65,"end_character":3},"in_reply_to":"90a07e76_68819e72","updated":"2020-12-18 16:35:53.000000000","message":"I recommend you look at the privsep service code, where you can see that they spawn the process and then communicate with it using sockets (iirc).","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":69,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":70,"context_line":"in the MDRAID."},{"line_number":71,"context_line":""},{"line_number":72,"context_line":"If connection to any of volume replicas fails, pull replicas metadata from"},{"line_number":73,"context_line":"volume provisioner and use that to connect."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"To allow multiple vendor implementations, the specific methods for:"},{"line_number":76,"context_line":"- host probe volume provisioner"}],"source_content_type":"text/x-rst","patch_set":7,"id":"7ed9ad3a_1da63672","line":73,"range":{"start_line":72,"start_character":0,"end_line":73,"end_character":43},"updated":"2020-12-16 18:41:21.000000000","message":"nit: mention what problem does this solve.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":69,"context_line":"It will attempt to connect to new NVMe devices / replicas, replacing them"},{"line_number":70,"context_line":"in the MDRAID."},{"line_number":71,"context_line":""},{"line_number":72,"context_line":"If connection to any of volume replicas fails, pull replicas metadata from"},{"line_number":73,"context_line":"volume provisioner and use that to connect."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"To allow multiple vendor implementations, the specific methods for:"},{"line_number":76,"context_line":"- host probe volume provisioner"}],"source_content_type":"text/x-rst","patch_set":7,"id":"dfc52828_23302e04","line":73,"range":{"start_line":72,"start_character":0,"end_line":73,"end_character":43},"in_reply_to":"7ed9ad3a_1da63672","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":73,"context_line":"volume provisioner and use that to connect."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"To allow multiple vendor implementations, the specific methods for:"},{"line_number":76,"context_line":"- host probe volume provisioner"},{"line_number":77,"context_line":"- get replicas metadata from volume provisioner"},{"line_number":78,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":79,"context_line":""},{"line_number":80,"context_line":""},{"line_number":81,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":7,"id":"78108b46_7ee8c93a","line":78,"range":{"start_line":76,"start_character":0,"end_line":78,"end_character":50},"updated":"2020-12-16 18:41:21.000000000","message":"?: Could you clarify what these 2 methods do? It doesn\u0027t look like they do the healing part, and other vendors would need that as well.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":73,"context_line":"volume provisioner and use that to connect."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"To allow multiple vendor implementations, the specific methods for:"},{"line_number":76,"context_line":"- host probe volume provisioner"},{"line_number":77,"context_line":"- get replicas metadata from volume provisioner"},{"line_number":78,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":79,"context_line":""},{"line_number":80,"context_line":""},{"line_number":81,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":7,"id":"9455f856_ce4d402f","line":78,"range":{"start_line":76,"start_character":0,"end_line":78,"end_character":50},"in_reply_to":"78108b46_7ee8c93a","updated":"2020-12-16 20:41:44.000000000","message":"Yes, host probe is a heartbeat probe, it can be optional, but recommended.\n\nBesides letting the volume provisioner / storage backend this host is up and active, it can also pull (connection) information about the volumes belonging to the host (and importantly their replicas.)\n\nThis is part of the mechanism for self healing, since the information about a newly replaced replica has to be updated from the backend side, after it has been notified of a failure and a certain grace period is passed, so then it will return this information to the agent which will then carry out the replica replacement.\n\nmoved the above to \"developer impact\" section and clarified","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":152,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"}],"source_content_type":"text/x-rst","patch_set":7,"id":"a16b34c3_960eff6c","line":155,"range":{"start_line":155,"start_character":57,"end_line":155,"end_character":73},"updated":"2020-12-16 18:41:21.000000000","message":"-1: passed to the `connect_volume` method","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":152,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"}],"source_content_type":"text/x-rst","patch_set":7,"id":"c37f3762_c92e8f52","line":155,"range":{"start_line":155,"start_character":24,"end_line":155,"end_character":34},"updated":"2020-12-16 18:41:21.000000000","message":"nit: to any volume","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":152,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"}],"source_content_type":"text/x-rst","patch_set":7,"id":"b8eaca3f_b426e604","line":155,"range":{"start_line":155,"start_character":57,"end_line":155,"end_character":73},"in_reply_to":"a16b34c3_960eff6c","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":152,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"}],"source_content_type":"text/x-rst","patch_set":7,"id":"b1d6fc41_b569d49e","line":155,"range":{"start_line":155,"start_character":24,"end_line":155,"end_character":34},"in_reply_to":"c37f3762_c92e8f52","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"db262eba_e2288475","line":156,"range":{"start_line":156,"start_character":0,"end_line":156,"end_character":20},"updated":"2020-12-16 18:41:21.000000000","message":"?: Sorry, I\u0027m not following you here. If we have 4 replicas and only 1 fails, then the connection as a whole didn\u0027t fail (we connected to 3 volumes and can build the MDRAID).","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":true,"context_lines":[{"line_number":153,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"0dbfce0e_b7e07e9b","line":156,"range":{"start_line":156,"start_character":0,"end_line":156,"end_character":20},"in_reply_to":"db262eba_e2288475","updated":"2020-12-16 20:41:44.000000000","message":"yes, but one of the NVMe connects failed, so we can now handle this failure (and for example notify the provisioner backend.) Also, this applies to the connector, which is where I should have put this.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"With that being said, we should be able to accept it with just unit tests."},{"line_number":161,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"5e60ab5a_5f50eecc","line":158,"range":{"start_line":158,"start_character":63,"end_line":158,"end_character":66},"updated":"2020-12-16 18:41:21.000000000","message":"nit: has not been","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":155,"context_line":"When failing to connect to volumes using connection info stored by Cinder,"},{"line_number":156,"context_line":"and connection fails, pull replicas metadata from volume provisioner and"},{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"With that being said, we should be able to accept it with just unit tests."},{"line_number":161,"context_line":""}],"source_content_type":"text/x-rst","patch_set":7,"id":"d9556542_0d98a727","line":158,"range":{"start_line":158,"start_character":63,"end_line":158,"end_character":66},"in_reply_to":"5e60ab5a_5f50eecc","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"With that being said, we should be able to accept it with just unit tests."},{"line_number":161,"context_line":""},{"line_number":162,"context_line":""},{"line_number":163,"context_line":"Dependencies"}],"source_content_type":"text/x-rst","patch_set":7,"id":"d6093279_1ce2c6f9","line":160,"range":{"start_line":160,"start_character":0,"end_line":160,"end_character":74},"updated":"2020-12-16 18:41:21.000000000","message":"-1: Was this meant to go on L179?","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":157,"context_line":"use that to connect (this is for the case of long attached volumes with"},{"line_number":158,"context_line":"replicas that were already replaced but Cinder connection info not refereshed)"},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"With that being said, we should be able to accept it with just unit tests."},{"line_number":161,"context_line":""},{"line_number":162,"context_line":""},{"line_number":163,"context_line":"Dependencies"}],"source_content_type":"text/x-rst","patch_set":7,"id":"563a67c2_4bdad483","line":160,"range":{"start_line":160,"start_character":0,"end_line":160,"end_character":74},"in_reply_to":"d6093279_1ce2c6f9","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"ed9417a32aef3c075038a5b036ad450481865f13","unresolved":true,"context_lines":[{"line_number":169,"context_line":"Testing"},{"line_number":170,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":171,"context_line":""},{"line_number":172,"context_line":"In order to properly test this in tempest, programmatic access will be needed"},{"line_number":173,"context_line":"to the storage backend. For example, to fail one of the drives of a replicated"},{"line_number":174,"context_line":"volume."},{"line_number":175,"context_line":""},{"line_number":176,"context_line":"Since OpenStack is not aware of volume replicas of the new NVMe connector"},{"line_number":177,"context_line":"replicated volumes, would need to verify physical state of NVMe devices and"},{"line_number":178,"context_line":"MDRAID arrays on the host."},{"line_number":179,"context_line":""},{"line_number":180,"context_line":""},{"line_number":181,"context_line":"Documentation Impact"}],"source_content_type":"text/x-rst","patch_set":7,"id":"deb60107_38c29584","line":178,"range":{"start_line":172,"start_character":0,"end_line":178,"end_character":26},"updated":"2020-12-16 18:41:21.000000000","message":"If you leave this here people may ask you to actually implement it in tempest, and that\u0027s probably going to take longer than the actual implementation.","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"83ecd023a7fe0f63ee12c4e1fb46ec792c4733a9","unresolved":false,"context_lines":[{"line_number":169,"context_line":"Testing"},{"line_number":170,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":171,"context_line":""},{"line_number":172,"context_line":"In order to properly test this in tempest, programmatic access will be needed"},{"line_number":173,"context_line":"to the storage backend. For example, to fail one of the drives of a replicated"},{"line_number":174,"context_line":"volume."},{"line_number":175,"context_line":""},{"line_number":176,"context_line":"Since OpenStack is not aware of volume replicas of the new NVMe connector"},{"line_number":177,"context_line":"replicated volumes, would need to verify physical state of NVMe devices and"},{"line_number":178,"context_line":"MDRAID arrays on the host."},{"line_number":179,"context_line":""},{"line_number":180,"context_line":""},{"line_number":181,"context_line":"Documentation Impact"}],"source_content_type":"text/x-rst","patch_set":7,"id":"251a43cd_513d7024","line":178,"range":{"start_line":172,"start_character":0,"end_line":178,"end_character":26},"in_reply_to":"deb60107_38c29584","updated":"2020-12-16 20:41:44.000000000","message":"Done","commit_id":"98ce1ba99926ccdf6246f0afc6e4aea32541cb60"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":40,"context_line":""},{"line_number":41,"context_line":"So the monitoring and healing needs to be on the volume consumer side."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"}],"source_content_type":"text/x-rst","patch_set":8,"id":"72a6ae18_da2717cf","line":43,"range":{"start_line":43,"start_character":10,"end_line":43,"end_character":21},"updated":"2020-12-18 16:35:53.000000000","message":"nit: verb is missing: \"will also be\"","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":40,"context_line":""},{"line_number":41,"context_line":"So the monitoring and healing needs to be on the volume consumer side."},{"line_number":42,"context_line":""},{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"}],"source_content_type":"text/x-rst","patch_set":8,"id":"f6a2c444_2aaa64d2","line":43,"range":{"start_line":43,"start_character":10,"end_line":43,"end_character":21},"in_reply_to":"72a6ae18_da2717cf","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":42,"context_line":""},{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"},{"line_number":47,"context_line":"them entirely."},{"line_number":48,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"66a0d262_1b5ff73d","line":45,"range":{"start_line":45,"start_character":11,"end_line":45,"end_character":30},"updated":"2020-12-18 16:35:53.000000000","message":"nit: It might be easier to read without this","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":42,"context_line":""},{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"},{"line_number":47,"context_line":"them entirely."},{"line_number":48,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"2f5209cf_503876e5","line":45,"range":{"start_line":45,"start_character":11,"end_line":45,"end_character":30},"in_reply_to":"66a0d262_1b5ff73d","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"},{"line_number":47,"context_line":"them entirely."},{"line_number":48,"context_line":""},{"line_number":49,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"28b71b8d_8290f910","line":46,"range":{"start_line":46,"start_character":16,"end_line":46,"end_character":33},"updated":"2020-12-18 16:35:53.000000000","message":"nit: Replace with \"using\"","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":43,"context_line":"This agent will also greatly beneficial for scenarios where certain replicas"},{"line_number":44,"context_line":"of an attached replicated volume go faulty, by notifying the volume"},{"line_number":45,"context_line":"provisioner / storage backend of the faulty devices, they can be marked as"},{"line_number":46,"context_line":"faulty to avoid data corruption / old data on re-attachments and to replace"},{"line_number":47,"context_line":"them entirely."},{"line_number":48,"context_line":""},{"line_number":49,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"e10dac14_dcbf1f37","line":46,"range":{"start_line":46,"start_character":16,"end_line":46,"end_character":33},"in_reply_to":"28b71b8d_8290f910","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2648d0c480d17cfe2d8469bdfc280bc8acf7745b","unresolved":true,"context_lines":[{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Proposed change"},{"line_number":60,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":63,"context_line":"during volume connection on a host."},{"line_number":64,"context_line":""},{"line_number":65,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."}],"source_content_type":"text/x-rst","patch_set":8,"id":"62fcf104_3f6066c2","line":62,"range":{"start_line":61,"start_character":0,"end_line":62,"end_character":26},"updated":"2020-12-17 03:19:38.000000000","message":"I think this approach (making the agent a class in os-brick) makes sense and gets around the problem of where this thing would live and how it would get deployed.","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Proposed change"},{"line_number":60,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":63,"context_line":"during volume connection on a host."},{"line_number":64,"context_line":""},{"line_number":65,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."}],"source_content_type":"text/x-rst","patch_set":8,"id":"f740be25_3259c12c","line":62,"range":{"start_line":61,"start_character":0,"end_line":62,"end_character":26},"in_reply_to":"57d7a1ab_49d98121","updated":"2020-12-19 05:46:51.000000000","message":"Ack","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":58,"context_line":""},{"line_number":59,"context_line":"Proposed change"},{"line_number":60,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":61,"context_line":""},{"line_number":62,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":63,"context_line":"during volume connection on a host."},{"line_number":64,"context_line":""},{"line_number":65,"context_line":"Initializing this agent will schedule a monitoring task to run periodically."}],"source_content_type":"text/x-rst","patch_set":8,"id":"57d7a1ab_49d98121","line":62,"range":{"start_line":61,"start_character":0,"end_line":62,"end_character":26},"in_reply_to":"62fcf104_3f6066c2","updated":"2020-12-18 16:35:53.000000000","message":"The code should live in os-brick, but the executions should be in a different process, and maybe it should not be a child of the current one (or have a mechanism to stop and refresh when new code is running).\n\nThat way we can\u0027t block the service that called us (cinder, nova, glance), and we can continue the monitoring and healing even when the service is brought down for upgrades.","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":78,"context_line":"If connection to any of volume replicas fails, pull replicas metadata from"},{"line_number":79,"context_line":"volume provisioner and use that to connect, to resolve stale connection info"},{"line_number":80,"context_line":"issue."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":""},{"line_number":83,"context_line":""},{"line_number":84,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":8,"id":"e136a481_e47e5152","line":81,"updated":"2020-12-18 16:35:53.000000000","message":"-1: We need to spell out how the agent is going to:\n\n- get the configuration to access the volume provisioner.\n- how the agent is going to be run (native thread or as an independent process tied or not to the parent) and how the communication with this process is going to happen.  Or at the very least mention the alternatives and how one will be chosen.","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":78,"context_line":"If connection to any of volume replicas fails, pull replicas metadata from"},{"line_number":79,"context_line":"volume provisioner and use that to connect, to resolve stale connection info"},{"line_number":80,"context_line":"issue."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":""},{"line_number":83,"context_line":""},{"line_number":84,"context_line":"Alternatives"}],"source_content_type":"text/x-rst","patch_set":8,"id":"8500bac6_63a8a3c9","line":81,"in_reply_to":"e136a481_e47e5152","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2648d0c480d17cfe2d8469bdfc280bc8acf7745b","unresolved":true,"context_lines":[{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"560398c8_8d3f9aac","line":139,"range":{"start_line":139,"start_character":57,"end_line":139,"end_character":61},"updated":"2020-12-17 03:19:38.000000000","message":"know this","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"162e42e7_f820f8c9","line":139,"range":{"start_line":139,"start_character":57,"end_line":139,"end_character":61},"in_reply_to":"560398c8_8d3f9aac","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2648d0c480d17cfe2d8469bdfc280bc8acf7745b","unresolved":true,"context_lines":[{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"}],"source_content_type":"text/x-rst","patch_set":8,"id":"c2d6363a_fafd8205","line":141,"range":{"start_line":141,"start_character":43,"end_line":141,"end_character":45},"updated":"2020-12-17 03:19:38.000000000","message":"nit: period outside the parenthesis","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"}],"source_content_type":"text/x-rst","patch_set":8,"id":"12ac3f43_135677d6","line":141,"range":{"start_line":141,"start_character":43,"end_line":141,"end_character":45},"in_reply_to":"c2d6363a_fafd8205","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":9535,"name":"Gorka Eguileor","email":"geguileo@redhat.com","username":"Gorka"},"change_message_id":"73db90c23325e0eafc5826a39d330d59b255fc53","unresolved":true,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"Developer impact"},{"line_number":131,"context_line":"----------------"},{"line_number":132,"context_line":""},{"line_number":133,"context_line":"To allow multiple vendor implementations, the specific methods / logic for:"},{"line_number":134,"context_line":"- probing the volume provisioner / backend"},{"line_number":135,"context_line":"- parsing replica metadata from provisioner / backend"},{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"},{"line_number":145,"context_line":"been notified of a failure and a certain grace period is passed, so then it"},{"line_number":146,"context_line":"will return this information to the agent which will then carry out the"},{"line_number":147,"context_line":"replica replacement."},{"line_number":148,"context_line":""},{"line_number":149,"context_line":""},{"line_number":150,"context_line":"Implementation"},{"line_number":151,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":8,"id":"cc446020_4beb3313","line":148,"range":{"start_line":132,"start_character":0,"end_line":148,"end_character":0},"updated":"2020-12-18 16:35:53.000000000","message":"I\u0027m having trouble understanding this whole part. Any chance you could rephrase it some other way?\n\nAlso, I think I expressed myself poorly in some of my earlier comments.  Aren\u0027t we worried about the effects of a broken path to a volume (the volume on the array is healthy)?\n\nHow is the agent going to handle those cases?","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"Developer impact"},{"line_number":131,"context_line":"----------------"},{"line_number":132,"context_line":""},{"line_number":133,"context_line":"To allow multiple vendor implementations, the specific methods / logic for:"},{"line_number":134,"context_line":"- probing the volume provisioner / backend"},{"line_number":135,"context_line":"- parsing replica metadata from provisioner / backend"},{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"},{"line_number":145,"context_line":"been notified of a failure and a certain grace period is passed, so then it"},{"line_number":146,"context_line":"will return this information to the agent which will then carry out the"},{"line_number":147,"context_line":"replica replacement."},{"line_number":148,"context_line":""},{"line_number":149,"context_line":""},{"line_number":150,"context_line":"Implementation"},{"line_number":151,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":8,"id":"f8f6753c_32fcbc50","line":148,"range":{"start_line":132,"start_character":0,"end_line":148,"end_character":0},"in_reply_to":"077c2528_41d40301","updated":"2020-12-22 04:54:49.000000000","message":"The current proposed changes section mentions that the agent detects and attempts to heal both faults in the \"logical\" MDRAID array, and in specific NVMe connections.\n\nI still don\u0027t see the point of \"the connection from the logical volume to the replicas is broken\" and how is that different from the scenarios already described.\n\nCan you please clarify again (I apologize for my difficulty of understaing.)\n\nBesides detecting and handling broken NVMe devices (whether they are a replica of a logical volume or not) - what other scenario are you refering to?","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":true,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"Developer impact"},{"line_number":131,"context_line":"----------------"},{"line_number":132,"context_line":""},{"line_number":133,"context_line":"To allow multiple vendor implementations, the specific methods / logic for:"},{"line_number":134,"context_line":"- probing the volume provisioner / backend"},{"line_number":135,"context_line":"- parsing replica metadata from provisioner / backend"},{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"},{"line_number":145,"context_line":"been notified of a failure and a certain grace period is passed, so then it"},{"line_number":146,"context_line":"will return this information to the agent which will then carry out the"},{"line_number":147,"context_line":"replica replacement."},{"line_number":148,"context_line":""},{"line_number":149,"context_line":""},{"line_number":150,"context_line":"Implementation"},{"line_number":151,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":8,"id":"f06aa0a2_7c1147cd","line":148,"range":{"start_line":132,"start_character":0,"end_line":148,"end_character":0},"in_reply_to":"cc446020_4beb3313","updated":"2020-12-19 05:46:51.000000000","message":"I turned this into a \"typical self healing flow\" section which I put in proposed changes, this essentially describes why pulling and parsing volume information from the provisioner is important for self healing. (the pulling and parsing is vendor-specific implementation, that is why these items were under developer impact.)\n\nAs to your question, what do you mean by \"broken path to a volume\" ?\nIf you mean an NVMeoF volume has become unreachable, then agent will report that to the provisioner, and will attempt to re-connect to it based on metadata pulled from provisioner (which can be updated due to certain events such as replacements).\n\nOr am I misunderstanding your question? If so please clarify.","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"ebe3f5a5d0c4efc94e96bfda1b7c7cfdb181c31a","unresolved":true,"context_lines":[{"line_number":129,"context_line":""},{"line_number":130,"context_line":"Developer impact"},{"line_number":131,"context_line":"----------------"},{"line_number":132,"context_line":""},{"line_number":133,"context_line":"To allow multiple vendor implementations, the specific methods / logic for:"},{"line_number":134,"context_line":"- probing the volume provisioner / backend"},{"line_number":135,"context_line":"- parsing replica metadata from provisioner / backend"},{"line_number":136,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":137,"context_line":""},{"line_number":138,"context_line":"host probe is a heartbeat probe, it can be optional, but recommended."},{"line_number":139,"context_line":"Besides letting the volume provisioner / storage backend this host is up and"},{"line_number":140,"context_line":"active, it can also pull (connection) information about the volumes belonging"},{"line_number":141,"context_line":"to the host (and importantly their replicas.)"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"This is part of the mechanism for self healing, since the information about a"},{"line_number":144,"context_line":"newly replaced replica has to be updated from the backend side, after it has"},{"line_number":145,"context_line":"been notified of a failure and a certain grace period is passed, so then it"},{"line_number":146,"context_line":"will return this information to the agent which will then carry out the"},{"line_number":147,"context_line":"replica replacement."},{"line_number":148,"context_line":""},{"line_number":149,"context_line":""},{"line_number":150,"context_line":"Implementation"},{"line_number":151,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":8,"id":"077c2528_41d40301","line":148,"range":{"start_line":132,"start_character":0,"end_line":148,"end_character":0},"in_reply_to":"f06aa0a2_7c1147cd","updated":"2020-12-21 13:32:38.000000000","message":"If you read lines 62-80, it sounds like all the agent does is monitor the health of the backend volumes.  I think Gorka\u0027s point is that it\u0027s also possible that the backend volume health is fine, but the connection from the logical volume to the replicas (or to one of them) is broken.  Is the healing agent also going to detect this, or is that handled in a different way?","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2648d0c480d17cfe2d8469bdfc280bc8acf7745b","unresolved":true,"context_lines":[{"line_number":173,"context_line":"`connect_volume` method, pull replicas metadata from volume provisioner backend"},{"line_number":174,"context_line":"and use that to connect (this is for the case of long attached volumes with"},{"line_number":175,"context_line":"replicas that were already replaced but Cinder connection info has not been"},{"line_number":176,"context_line":"refreshed.)"},{"line_number":177,"context_line":""},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"Dependencies"}],"source_content_type":"text/x-rst","patch_set":8,"id":"a55a3302_20082a1e","line":176,"range":{"start_line":176,"start_character":9,"end_line":176,"end_character":11},"updated":"2020-12-17 03:19:38.000000000","message":"same as line 141","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"91314886a6760d36186daa2e71f083138dc504f7","unresolved":false,"context_lines":[{"line_number":173,"context_line":"`connect_volume` method, pull replicas metadata from volume provisioner backend"},{"line_number":174,"context_line":"and use that to connect (this is for the case of long attached volumes with"},{"line_number":175,"context_line":"replicas that were already replaced but Cinder connection info has not been"},{"line_number":176,"context_line":"refreshed.)"},{"line_number":177,"context_line":""},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"Dependencies"}],"source_content_type":"text/x-rst","patch_set":8,"id":"9383ef02_45020c40","line":176,"range":{"start_line":176,"start_character":9,"end_line":176,"end_character":11},"in_reply_to":"a55a3302_20082a1e","updated":"2020-12-19 05:46:51.000000000","message":"Done","commit_id":"42bdc8480d1f756f6331cb1dc792f2005fee26cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"ebe3f5a5d0c4efc94e96bfda1b7c7cfdb181c31a","unresolved":true,"context_lines":[{"line_number":65,"context_line":"periodically. We are proposing this to be a native thread if possible,"},{"line_number":66,"context_line":"but if necessary it can be an independent process."},{"line_number":67,"context_line":""},{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"},{"line_number":69,"context_line":"alternatives, such as spawning a separate process communicated to via socket,"},{"line_number":70,"context_line":"may be chosen instead."},{"line_number":71,"context_line":""},{"line_number":72,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":73,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":74,"context_line":"format, the content of which should be provided there by the systems operator."}],"source_content_type":"text/x-rst","patch_set":9,"id":"6eee64eb_0ee25889","line":71,"range":{"start_line":68,"start_character":0,"end_line":71,"end_character":0},"updated":"2020-12-21 13:32:38.000000000","message":"I think it would be good to mention a key problem that will drive this selection, namely, during an upgrade, it\u0027s possible for the compute to go down while the VMs continue operating, and we don\u0027t want to lose the monitoring agent while volumes are in use.","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":65,"context_line":"periodically. We are proposing this to be a native thread if possible,"},{"line_number":66,"context_line":"but if necessary it can be an independent process."},{"line_number":67,"context_line":""},{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"},{"line_number":69,"context_line":"alternatives, such as spawning a separate process communicated to via socket,"},{"line_number":70,"context_line":"may be chosen instead."},{"line_number":71,"context_line":""},{"line_number":72,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":73,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":74,"context_line":"format, the content of which should be provided there by the systems operator."}],"source_content_type":"text/x-rst","patch_set":9,"id":"8ff890f9_e86ec9c0","line":71,"range":{"start_line":68,"start_character":0,"end_line":71,"end_character":0},"in_reply_to":"6eee64eb_0ee25889","updated":"2020-12-22 04:54:49.000000000","message":"That\u0027s a great scenario to consider. Mentioning it here.","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":1736,"name":"Ivan Kolodyazhny","email":"e0ne@e0ne.info","username":"e0ne"},"change_message_id":"d02165cb0bf6fd1f0ceefb5df3ad53e1bcae3a40","unresolved":true,"context_lines":[{"line_number":72,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":73,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":74,"context_line":"format, the content of which should be provided there by the systems operator."},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"It will know which NVMe devices and MDRAID arrays to monitor based on metadata"}],"source_content_type":"text/x-rst","patch_set":9,"id":"452d53bb_3b657386","line":75,"updated":"2020-12-21 15:03:30.000000000","message":"It\u0027s not clear to me how this agent will work: will it be a some service or a script which should be scheduled as a cron task?","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":72,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":73,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":74,"context_line":"format, the content of which should be provided there by the systems operator."},{"line_number":75,"context_line":""},{"line_number":76,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"It will know which NVMe devices and MDRAID arrays to monitor based on metadata"}],"source_content_type":"text/x-rst","patch_set":9,"id":"edfc63c8_e7a48819","line":75,"in_reply_to":"452d53bb_3b657386","updated":"2020-12-22 04:54:49.000000000","message":"This is mentioned in lines 61-70\n\"Add an \"NVMe agent\" class that will be initialized by the NVMe connector\nduring volume connection on a host.\n\nInitializing this agent will spawn a monitoring task which will repeat\nperiodically. We are proposing this to be a native thread if possible,\nbut if necessary it can be an independent process.\n\nFirst proposal was to use python Event Scheduler `sched.scheduler`, but other\nalternatives, such as spawning a separate process communicated to via socket,\nmay be chosen instead.\"","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":1736,"name":"Ivan Kolodyazhny","email":"e0ne@e0ne.info","username":"e0ne"},"change_message_id":"d02165cb0bf6fd1f0ceefb5df3ad53e1bcae3a40","unresolved":true,"context_lines":[{"line_number":94,"context_line":"   from provisioner, so it sends explicit request to replace replica"},{"line_number":95,"context_line":"6. provisioner replaces replica and updates volume information"},{"line_number":96,"context_line":"7. agent pulls volume replica information, notices a replica has changed"},{"line_number":97,"context_line":"8. agent carries out replica replacement"},{"line_number":98,"context_line":""},{"line_number":99,"context_line":"Alternatives"},{"line_number":100,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"329d8fcd_ae7e3eb1","line":97,"updated":"2020-12-21 15:03:30.000000000","message":"Please, describe how it will communicate with cinder","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":94,"context_line":"   from provisioner, so it sends explicit request to replace replica"},{"line_number":95,"context_line":"6. provisioner replaces replica and updates volume information"},{"line_number":96,"context_line":"7. agent pulls volume replica information, notices a replica has changed"},{"line_number":97,"context_line":"8. agent carries out replica replacement"},{"line_number":98,"context_line":""},{"line_number":99,"context_line":"Alternatives"},{"line_number":100,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"2a98e27f_82f0c35b","line":97,"in_reply_to":"329d8fcd_ae7e3eb1","updated":"2020-12-22 04:54:49.000000000","message":"It does not communicate with cinder.\n\nIt does communicate with the backend volume provisioner, via an interface that is vendor-specific.\n\nMention of this interface is under developer impact.","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":1736,"name":"Ivan Kolodyazhny","email":"e0ne@e0ne.info","username":"e0ne"},"change_message_id":"d02165cb0bf6fd1f0ceefb5df3ad53e1bcae3a40","unresolved":true,"context_lines":[{"line_number":99,"context_line":"Alternatives"},{"line_number":100,"context_line":"------------"},{"line_number":101,"context_line":""},{"line_number":102,"context_line":"None"},{"line_number":103,"context_line":""},{"line_number":104,"context_line":"Data model impact"},{"line_number":105,"context_line":"-----------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"5e87bc78_84889d26","line":102,"range":{"start_line":102,"start_character":0,"end_line":102,"end_character":4},"updated":"2020-12-21 15:03:30.000000000","message":"As alternative, operator could use some own script to monitor connections and fix them manually","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":99,"context_line":"Alternatives"},{"line_number":100,"context_line":"------------"},{"line_number":101,"context_line":""},{"line_number":102,"context_line":"None"},{"line_number":103,"context_line":""},{"line_number":104,"context_line":"Data model impact"},{"line_number":105,"context_line":"-----------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"3951e4c0_13fbe3ef","line":102,"range":{"start_line":102,"start_character":0,"end_line":102,"end_character":4},"in_reply_to":"5e87bc78_84889d26","updated":"2020-12-22 04:54:49.000000000","message":"Done","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":1736,"name":"Ivan Kolodyazhny","email":"e0ne@e0ne.info","username":"e0ne"},"change_message_id":"d02165cb0bf6fd1f0ceefb5df3ad53e1bcae3a40","unresolved":true,"context_lines":[{"line_number":140,"context_line":"Other deployer impact"},{"line_number":141,"context_line":"---------------------"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"None"},{"line_number":144,"context_line":""},{"line_number":145,"context_line":"Developer impact"},{"line_number":146,"context_line":"----------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"d7b7993c_1af96597","line":143,"range":{"start_line":143,"start_character":0,"end_line":143,"end_character":4},"updated":"2020-12-21 15:03:30.000000000","message":"Please, add a note that agent should be deployed on the compute nodes","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":140,"context_line":"Other deployer impact"},{"line_number":141,"context_line":"---------------------"},{"line_number":142,"context_line":""},{"line_number":143,"context_line":"None"},{"line_number":144,"context_line":""},{"line_number":145,"context_line":"Developer impact"},{"line_number":146,"context_line":"----------------"}],"source_content_type":"text/x-rst","patch_set":9,"id":"a0bea0c6_01a30a4b","line":143,"range":{"start_line":143,"start_character":0,"end_line":143,"end_character":4},"in_reply_to":"d7b7993c_1af96597","updated":"2020-12-22 04:54:49.000000000","message":"We are proposing to keep the agent in os-brick repository and activate it during NVMeoF connector connections, so no need for any other / external deployment.","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"ebe3f5a5d0c4efc94e96bfda1b7c7cfdb181c31a","unresolved":true,"context_lines":[{"line_number":152,"context_line":"- reporting volume state changes to provisioner"},{"line_number":153,"context_line":"- requesting provisioner to replace replica"},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":156,"context_line":""},{"line_number":157,"context_line":""},{"line_number":158,"context_line":"Implementation"}],"source_content_type":"text/x-rst","patch_set":9,"id":"d9107420_39fa12c0","line":155,"updated":"2020-12-21 13:32:38.000000000","message":"Might be worth stating here that your architecture will be a general class that will provide the interface and that the kioxia healing agent will provide an example implementation.","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a8fa9183d1bb60a6d161c3c014959f10f7b3187c","unresolved":false,"context_lines":[{"line_number":152,"context_line":"- reporting volume state changes to provisioner"},{"line_number":153,"context_line":"- requesting provisioner to replace replica"},{"line_number":154,"context_line":""},{"line_number":155,"context_line":"will need to be implemented on a per vendor basis."},{"line_number":156,"context_line":""},{"line_number":157,"context_line":""},{"line_number":158,"context_line":"Implementation"}],"source_content_type":"text/x-rst","patch_set":9,"id":"44fc3d0a_7b49af3c","line":155,"in_reply_to":"d9107420_39fa12c0","updated":"2020-12-22 04:54:49.000000000","message":"Done","commit_id":"681e0b1030e659a7624aa4c684b6b486a019a6c8"},{"author":{"_account_id":27615,"name":"Rajat Dhasmana","email":"rajatdhasmana@gmail.com","username":"whoami-rajat"},"change_message_id":"1653022d2c926c999dc67b166b3391b78eb402db","unresolved":true,"context_lines":[{"line_number":157,"context_line":""},{"line_number":158,"context_line":"Will need to be implemented on a per vendor basis."},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"The architecture is such that the agent will be a general class that will"},{"line_number":161,"context_line":"provide the interface, and the kioxia implementation will be the first"},{"line_number":162,"context_line":"example of vendor-specific implementation."},{"line_number":163,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"7ed72fea_3f13e297","line":160,"range":{"start_line":160,"start_character":50,"end_line":160,"end_character":57},"updated":"2020-12-22 15:44:02.000000000","message":"generic","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"1b00b8316aafdf1736cb463d1084947eb8b60518","unresolved":false,"context_lines":[{"line_number":157,"context_line":""},{"line_number":158,"context_line":"Will need to be implemented on a per vendor basis."},{"line_number":159,"context_line":""},{"line_number":160,"context_line":"The architecture is such that the agent will be a general class that will"},{"line_number":161,"context_line":"provide the interface, and the kioxia implementation will be the first"},{"line_number":162,"context_line":"example of vendor-specific implementation."},{"line_number":163,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"6f038285_914300a9","line":160,"range":{"start_line":160,"start_character":50,"end_line":160,"end_character":57},"in_reply_to":"7ed72fea_3f13e297","updated":"2020-12-23 04:43:09.000000000","message":"Done","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":27615,"name":"Rajat Dhasmana","email":"rajatdhasmana@gmail.com","username":"whoami-rajat"},"change_message_id":"1653022d2c926c999dc67b166b3391b78eb402db","unresolved":true,"context_lines":[{"line_number":174,"context_line":"Work Items"},{"line_number":175,"context_line":"----------"},{"line_number":176,"context_line":""},{"line_number":177,"context_line":"NVMe connector launch monitoring task on connect_volume if not running."},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":180,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"1a685027_d07240fe","line":177,"range":{"start_line":177,"start_character":0,"end_line":177,"end_character":15},"updated":"2020-12-22 15:44:02.000000000","message":"NVMe connector will?","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"1b00b8316aafdf1736cb463d1084947eb8b60518","unresolved":false,"context_lines":[{"line_number":174,"context_line":"Work Items"},{"line_number":175,"context_line":"----------"},{"line_number":176,"context_line":""},{"line_number":177,"context_line":"NVMe connector launch monitoring task on connect_volume if not running."},{"line_number":178,"context_line":""},{"line_number":179,"context_line":"Task monitors NVMe devices and MDRAID arrays created by the connector."},{"line_number":180,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"cba2e422_6a8e2a44","line":177,"range":{"start_line":177,"start_character":0,"end_line":177,"end_character":15},"in_reply_to":"1a685027_d07240fe","updated":"2020-12-23 04:43:09.000000000","message":"Done","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":27615,"name":"Rajat Dhasmana","email":"rajatdhasmana@gmail.com","username":"whoami-rajat"},"change_message_id":"1653022d2c926c999dc67b166b3391b78eb402db","unresolved":true,"context_lines":[{"line_number":181,"context_line":"When a replica goes faulty (as well as other events such as disconnects)"},{"line_number":182,"context_line":"call interface method for notifying volume provisioner."},{"line_number":183,"context_line":""},{"line_number":184,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":185,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":186,"context_line":""},{"line_number":187,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"3b33d9b4_80101f0f","line":184,"range":{"start_line":184,"start_character":35,"end_line":184,"end_character":42},"updated":"2020-12-22 15:44:02.000000000","message":"what do you mean by changed here? does it mean replaced when the any replica goes faulty?","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"1b00b8316aafdf1736cb463d1084947eb8b60518","unresolved":false,"context_lines":[{"line_number":181,"context_line":"When a replica goes faulty (as well as other events such as disconnects)"},{"line_number":182,"context_line":"call interface method for notifying volume provisioner."},{"line_number":183,"context_line":""},{"line_number":184,"context_line":"When replicated volume devices are changed by the volume provisioner,"},{"line_number":185,"context_line":"reconcile the physical state of NVMe devices and MDRAID arrays on the host."},{"line_number":186,"context_line":""},{"line_number":187,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"8d3ed69f_22860866","line":184,"range":{"start_line":184,"start_character":35,"end_line":184,"end_character":42},"in_reply_to":"3b33d9b4_80101f0f","updated":"2020-12-23 04:43:09.000000000","message":"yes, the backend volume provisioner can replace a replica due to certain events (the replacement is in the storage backend) - so once one of the replicas on the backend changes, the agent will become aware of it, and reconcile the state on the host initiator.","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":27615,"name":"Rajat Dhasmana","email":"rajatdhasmana@gmail.com","username":"whoami-rajat"},"change_message_id":"1653022d2c926c999dc67b166b3391b78eb402db","unresolved":true,"context_lines":[{"line_number":200,"context_line":"Documentation Impact"},{"line_number":201,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":202,"context_line":""},{"line_number":203,"context_line":"Document that using NVMe connector with replicated volumes will optionally"},{"line_number":204,"context_line":"launch this agent."},{"line_number":205,"context_line":""},{"line_number":206,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"b0811bce_3376d92a","line":203,"range":{"start_line":203,"start_character":64,"end_line":203,"end_character":74},"updated":"2020-12-22 15:44:02.000000000","message":"what do we mean by optionally here? My understanding is during connect_volume call, the monitoring agent will be launched always or is it backend dependent?","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"1b00b8316aafdf1736cb463d1084947eb8b60518","unresolved":false,"context_lines":[{"line_number":200,"context_line":"Documentation Impact"},{"line_number":201,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":202,"context_line":""},{"line_number":203,"context_line":"Document that using NVMe connector with replicated volumes will optionally"},{"line_number":204,"context_line":"launch this agent."},{"line_number":205,"context_line":""},{"line_number":206,"context_line":""}],"source_content_type":"text/x-rst","patch_set":11,"id":"725e3ff6_04a4dcf9","line":203,"range":{"start_line":203,"start_character":64,"end_line":203,"end_character":74},"in_reply_to":"b0811bce_3376d92a","updated":"2020-12-23 04:43:09.000000000","message":"This is a good point and good question. Ideally we would like to make this configurable in some way, such that an operator that is using the NVMeoF connector but does not want the agent running will have an option to do so. Implementation-wise we have not decided how to achieve this configurability yet but giving that choice to the operator is the vision.","commit_id":"eefa6f8100e5028ba14623a8a86e5f8c85b44c0e"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ab38193bffa9c720b307a4c4254e6615969e818e","unresolved":true,"context_lines":[{"line_number":15,"context_line":"and replaces faulted replicas with new ones."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":""},{"line_number":18,"context_line":"Problem description"},{"line_number":19,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"When the NVMe connector connects a replicated volume, OpenStack will see it"}],"source_content_type":"text/x-rst","patch_set":12,"id":"ed048b3e_fe8db617","line":18,"range":{"start_line":18,"start_character":0,"end_line":18,"end_character":19},"updated":"2021-01-05 15:46:01.000000000","message":"reading the problem statement below this feels like something that is not in the scope\nof os-brck to provide and shoudl be in a differt repo/delverable.","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a0666e41cec1805bc46ff724158ee29d90fc4cd7","unresolved":true,"context_lines":[{"line_number":15,"context_line":"and replaces faulted replicas with new ones."},{"line_number":16,"context_line":""},{"line_number":17,"context_line":""},{"line_number":18,"context_line":"Problem description"},{"line_number":19,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":20,"context_line":""},{"line_number":21,"context_line":"When the NVMe connector connects a replicated volume, OpenStack will see it"}],"source_content_type":"text/x-rst","patch_set":12,"id":"a9416ef6_977cac66","line":18,"range":{"start_line":18,"start_character":0,"end_line":18,"end_character":19},"in_reply_to":"ed048b3e_fe8db617","updated":"2021-01-20 06:57:11.000000000","message":"I can see your point here, and this is likely true.\nCurrently, we found os-brick to be the best place to keep it for lack of a better place, and due to the nvmeof connector being the triggering hook and counterpart for this agent.","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ab38193bffa9c720b307a4c4254e6615969e818e","unresolved":true,"context_lines":[{"line_number":61,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":62,"context_line":"during volume connection on a host."},{"line_number":63,"context_line":""},{"line_number":64,"context_line":"Initializing this agent will spawn a monitoring task which will repeat"},{"line_number":65,"context_line":"periodically. We are proposing this to be a native thread if possible,"},{"line_number":66,"context_line":"but if necessary it can be an independent process."},{"line_number":67,"context_line":""},{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"}],"source_content_type":"text/x-rst","patch_set":12,"id":"8bb39a42_8b963b44","line":65,"range":{"start_line":64,"start_character":0,"end_line":65,"end_character":13},"updated":"2021-01-05 15:46:01.000000000","message":"could this just be a perodic task provide by os-brick instead of an agent?\n\nhttps://github.com/openstack/oslo.service/blob/1.38.0/oslo_service/periodic_task.py#L177","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a0666e41cec1805bc46ff724158ee29d90fc4cd7","unresolved":true,"context_lines":[{"line_number":61,"context_line":"Add an \"NVMe agent\" class that will be initialized by the NVMe connector"},{"line_number":62,"context_line":"during volume connection on a host."},{"line_number":63,"context_line":""},{"line_number":64,"context_line":"Initializing this agent will spawn a monitoring task which will repeat"},{"line_number":65,"context_line":"periodically. We are proposing this to be a native thread if possible,"},{"line_number":66,"context_line":"but if necessary it can be an independent process."},{"line_number":67,"context_line":""},{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"}],"source_content_type":"text/x-rst","patch_set":12,"id":"bd0860f0_60b9d267","line":65,"range":{"start_line":64,"start_character":0,"end_line":65,"end_character":13},"in_reply_to":"8bb39a42_8b963b44","updated":"2021-01-20 06:57:11.000000000","message":"Yes this is definitely a great option, however, will this allow for an independent process / isolation as is desired per other comments?","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ab38193bffa9c720b307a4c4254e6615969e818e","unresolved":true,"context_lines":[{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"},{"line_number":69,"context_line":"alternatives, such as spawning a separate process communicated to via socket,"},{"line_number":70,"context_line":"may be chosen instead."},{"line_number":71,"context_line":"One key problem that would need to be addressed by this selection is a scenario"},{"line_number":72,"context_line":"where compute service goes down, while the VMs continue operating (and their"},{"line_number":73,"context_line":"volumes remain attached) - we don\u0027t want to lose this agent in this case."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":76,"context_line":"provisioner from a pre-determined config file location, with vendor specific"}],"source_content_type":"text/x-rst","patch_set":12,"id":"3f5fbdfb_c8403508","line":73,"range":{"start_line":71,"start_character":0,"end_line":73,"end_character":73},"updated":"2021-01-05 15:46:01.000000000","message":"this is the main motivatior for a speart process.\n\nthis requirement rules out the possiblity of using a thread or anwy other lighter weight\nconcurancy mechanium as all thread, fibers, greantread or corotiens will be terminated when the parten process exits.\n\nthe only way to have a fully indepented life cycle if this was spwaned form the nova compute agnet would be a process. the alternitive here is to make this agent a seperately installable binary that is run via systemd or as a sperate container. personally i think that would be a better approch.\n\nif this requirement is removed then we could take a privsep like approch where we spawn a seperate process and comunicated with it via a unix socket and have that process exeit when either all client disconnect and all nvme connections on the host are disconnected.\n\ni.e. it would remain running if there is  a cinnet connected (nova-compute agent) or there is a managed nvme connection establisted.","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a0666e41cec1805bc46ff724158ee29d90fc4cd7","unresolved":true,"context_lines":[{"line_number":68,"context_line":"First proposal was to use python Event Scheduler `sched.scheduler`, but other"},{"line_number":69,"context_line":"alternatives, such as spawning a separate process communicated to via socket,"},{"line_number":70,"context_line":"may be chosen instead."},{"line_number":71,"context_line":"One key problem that would need to be addressed by this selection is a scenario"},{"line_number":72,"context_line":"where compute service goes down, while the VMs continue operating (and their"},{"line_number":73,"context_line":"volumes remain attached) - we don\u0027t want to lose this agent in this case."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":76,"context_line":"provisioner from a pre-determined config file location, with vendor specific"}],"source_content_type":"text/x-rst","patch_set":12,"id":"1daa5620_b9c779a5","line":73,"range":{"start_line":71,"start_character":0,"end_line":73,"end_character":73},"in_reply_to":"3f5fbdfb_c8403508","updated":"2021-01-20 06:57:11.000000000","message":"agreed.\n\nabout a separately installable binary, this is an option but it does not play well in cases where operators are reluctant to install any custom bits on their systems, choosing to use only the bits that come with upstream OpenStack\u0027s \"seal of approval\"\n\nwould spawning from nova compute agent mean not doing it in the connector / os-brick?\n\nwe are still exploring the best options here, please lets keep this discussion going and try to reach a decision for this development cycle (this of course can always be more deeply redesigned / refactor with more proper architecture implementation in future cycles)","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ab38193bffa9c720b307a4c4254e6615969e818e","unresolved":true,"context_lines":[{"line_number":72,"context_line":"where compute service goes down, while the VMs continue operating (and their"},{"line_number":73,"context_line":"volumes remain attached) - we don\u0027t want to lose this agent in this case."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":76,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":77,"context_line":"format, the content of which should be provided there by the systems operator."},{"line_number":78,"context_line":""},{"line_number":79,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":80,"context_line":""}],"source_content_type":"text/x-rst","patch_set":12,"id":"bff6b409_a16c6a2b","line":77,"range":{"start_line":75,"start_character":0,"end_line":77,"end_character":78},"updated":"2021-01-05 15:46:01.000000000","message":"im not sure this makes sesne form a security point of view.\n\nthis agent is crossing two trust domains. the storage backend domain to directly\nmanimaly raid volumes and the openstack cloud adminstrators trust domain.\n\nit really feels like this agent should be invoking the cinder rest or rpc api to have the cinder volume servcie/driver perfrome the backend operations and the new agent should mealy have the service authetication info for cinder.","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a0666e41cec1805bc46ff724158ee29d90fc4cd7","unresolved":true,"context_lines":[{"line_number":72,"context_line":"where compute service goes down, while the VMs continue operating (and their"},{"line_number":73,"context_line":"volumes remain attached) - we don\u0027t want to lose this agent in this case."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"When initialized, the agent will read access information to the volume"},{"line_number":76,"context_line":"provisioner from a pre-determined config file location, with vendor specific"},{"line_number":77,"context_line":"format, the content of which should be provided there by the systems operator."},{"line_number":78,"context_line":""},{"line_number":79,"context_line":"The task will monitor NVMe devices and MDRAID arrays built over them."},{"line_number":80,"context_line":""}],"source_content_type":"text/x-rst","patch_set":12,"id":"bbf46b68_63522ba7","line":77,"range":{"start_line":75,"start_character":0,"end_line":77,"end_character":78},"in_reply_to":"bff6b409_a16c6a2b","updated":"2021-01-20 06:57:11.000000000","message":"this is a great idea.\n\nhowever, this will require additions to Cinder API and likely some major changes in general.\n\nFor now we chose to keep the agent self contained since this is a brand new feature, we want to see how well it is received by the OpenStack community before we delve deeper into building a whole framework that would be needed to support this properly (with the design implementation you pointed out included)","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"ab38193bffa9c720b307a4c4254e6615969e818e","unresolved":true,"context_lines":[{"line_number":117,"context_line":"Security impact"},{"line_number":118,"context_line":"---------------"},{"line_number":119,"context_line":""},{"line_number":120,"context_line":"Will call NVMe connector methods that do sudo executions of `nvme` and `mdadm`"},{"line_number":121,"context_line":"This will happen in the new agent task that will be spawned from os-brick."},{"line_number":122,"context_line":""},{"line_number":123,"context_line":"Active/Active HA impact"},{"line_number":124,"context_line":"-----------------------"}],"source_content_type":"text/x-rst","patch_set":12,"id":"6339ed9b_4e24498c","line":121,"range":{"start_line":120,"start_character":0,"end_line":121,"end_character":73},"updated":"2021-01-05 15:46:01.000000000","message":"this sound a lot like adding more instaces of using rootwrap.\n\nwhich goes aganst one of the goals of removall all use of rootwap form nova\npresently os-brick is the only use of rootwrap so i would be -1 on this approch.\n\nthese privaldaged operations shoudl use privsep so no directly invokation of sudo.\n\nnot all systems have sudo by the way so this is also an interuperablity issue.\n\nopenstack does not strictly require sudo to work we just need a privalge escalation mechniums\nso we shoudl never assuem the sudo executable exists.\n\nprivsep for example default to sudo but does not assume it \n\nhttps://github.com/openstack/oslo.privsep/blob/83870bd2655f3250bb5d5aed7c9865ba0b5e4770/oslo_privsep/priv_context.py#L60-L71\n\nit also suports a entirely different execution model where you start with privaltes and dthen drop them\n\nhttps://github.com/openstack/oslo.privsep/blob/83870bd2655f3250bb5d5aed7c9865ba0b5e4770/oslo_privsep/daemon.py#L15-L44\n\nbut that is less commonaly used.\n\nin anycase we shoudl avoid new uses of rootwrap and shoudl not add any direct use of sudo via process_utils.exec or similar to run arbitary commands.","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"},{"author":{"_account_id":16721,"name":"Zohar Mamedov","email":"zohar.cloud@gmail.com","username":"zohar"},"change_message_id":"a0666e41cec1805bc46ff724158ee29d90fc4cd7","unresolved":true,"context_lines":[{"line_number":117,"context_line":"Security impact"},{"line_number":118,"context_line":"---------------"},{"line_number":119,"context_line":""},{"line_number":120,"context_line":"Will call NVMe connector methods that do sudo executions of `nvme` and `mdadm`"},{"line_number":121,"context_line":"This will happen in the new agent task that will be spawned from os-brick."},{"line_number":122,"context_line":""},{"line_number":123,"context_line":"Active/Active HA impact"},{"line_number":124,"context_line":"-----------------------"}],"source_content_type":"text/x-rst","patch_set":12,"id":"6364bd8a_5c41a5ae","line":121,"range":{"start_line":120,"start_character":0,"end_line":121,"end_character":73},"in_reply_to":"6339ed9b_4e24498c","updated":"2021-01-20 06:57:11.000000000","message":"I am not sure if we are doing this properly as per your suggestion. I think for this it would be best to refer to the code itself to see if there are any occurences of rootwrap vs the newly proper way of doing it. (i could use some pointers on how to distinguish it)\n\nhttps://review.opendev.org/c/openstack/os-brick/+/768576","commit_id":"ce4aa864538d4590d010582c0df3f24472d6fc87"}]}
