)]}'
{"/COMMIT_MSG":[{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":7,"context_line":"Glance rolling upgrades"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"This spec proposes a strategy to implement rolling upgrades for"},{"line_number":10,"context_line":"Glance."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Change-Id: I40c2b85e9d9a806d528d5be5d97ca8a730631463"},{"line_number":13,"context_line":"Co-authored-by: Hemanth Makkapati \u003chemanth.makkapati@rackspace.com\u003e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":8,"id":"5a74a57a_49fe9139","line":10,"range":{"start_line":10,"start_character":6,"end_line":10,"end_character":7},"updated":"2016-11-29 17:16:16.000000000","message":"nit: This reads like we\u0027re proposing a standard \u0027we\u0027re gonna implement some code\u0027 spec.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"7842f08ec200a4d7da1e422f10efac12bf7cc017","unresolved":false,"context_lines":[{"line_number":7,"context_line":"Glance rolling upgrades"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"This spec proposes a strategy to implement rolling upgrades for"},{"line_number":10,"context_line":"Glance."},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Change-Id: I40c2b85e9d9a806d528d5be5d97ca8a730631463"},{"line_number":13,"context_line":"Co-authored-by: Hemanth Makkapati \u003chemanth.makkapati@rackspace.com\u003e"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":8,"id":"3a71b18c_2e2cd449","line":10,"range":{"start_line":10,"start_character":6,"end_line":10,"end_character":7},"in_reply_to":"5a74a57a_49fe9139","updated":"2016-12-01 22:37:59.000000000","message":"Done","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"}],"specs/newton/approved/glance/rolling-upgrades.rst":[{"author":{"_account_id":16157,"name":"Ken Johnston","email":"kenny@kencjohnston.com","username":"kencjohnston"},"change_message_id":"0e62d6b3ea76896f816da170201bee37025a9ad6","unresolved":false,"context_lines":[{"line_number":107,"context_line":""},{"line_number":108,"context_line":"      * turn off consistency mode on the release N nodes"},{"line_number":109,"context_line":""},{"line_number":110,"context_line":"#. Testing"},{"line_number":111,"context_line":""},{"line_number":112,"context_line":"   Full stack integration testing with services arranged in a mid-upgrade"},{"line_number":113,"context_line":"   manner is performed on every proposed commit to validate that mixed-version"},{"line_number":114,"context_line":"   services work together properly."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"   * This testing must be performed on configurations that the project"},{"line_number":117,"context_line":"     considers to be its reference implementations."},{"line_number":118,"context_line":""},{"line_number":119,"context_line":"   * The arrangement(s) tested will depend on the project (i.e. should be"},{"line_number":120,"context_line":"     representative of a meaningful-to-operators rolling upgrade scenario) and"},{"line_number":121,"context_line":"     available testing resources."},{"line_number":122,"context_line":""},{"line_number":123,"context_line":"   * At least one representative arrangement must be tested full-stack in the"},{"line_number":124,"context_line":"     gate."},{"line_number":125,"context_line":""},{"line_number":126,"context_line":"Alternatives"},{"line_number":127,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"5a9d85d2_afeb51af","line":124,"range":{"start_line":110,"start_character":0,"end_line":124,"end_character":10},"updated":"2016-06-21 14:35:12.000000000","message":"Is this in reference to Grenade?","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"5b07c199c65d7e1a81b1f1ff060be81bb7b01771","unresolved":false,"context_lines":[{"line_number":107,"context_line":""},{"line_number":108,"context_line":"      * turn off consistency mode on the release N nodes"},{"line_number":109,"context_line":""},{"line_number":110,"context_line":"#. Testing"},{"line_number":111,"context_line":""},{"line_number":112,"context_line":"   Full stack integration testing with services arranged in a mid-upgrade"},{"line_number":113,"context_line":"   manner is performed on every proposed commit to validate that mixed-version"},{"line_number":114,"context_line":"   services work together properly."},{"line_number":115,"context_line":""},{"line_number":116,"context_line":"   * This testing must be performed on configurations that the project"},{"line_number":117,"context_line":"     considers to be its reference implementations."},{"line_number":118,"context_line":""},{"line_number":119,"context_line":"   * The arrangement(s) tested will depend on the project (i.e. should be"},{"line_number":120,"context_line":"     representative of a meaningful-to-operators rolling upgrade scenario) and"},{"line_number":121,"context_line":"     available testing resources."},{"line_number":122,"context_line":""},{"line_number":123,"context_line":"   * At least one representative arrangement must be tested full-stack in the"},{"line_number":124,"context_line":"     gate."},{"line_number":125,"context_line":""},{"line_number":126,"context_line":"Alternatives"},{"line_number":127,"context_line":"------------"}],"source_content_type":"text/x-rst","patch_set":4,"id":"1aa78d24_0447ccbc","line":124,"range":{"start_line":110,"start_character":0,"end_line":124,"end_character":10},"in_reply_to":"5a9d85d2_afeb51af","updated":"2016-07-11 19:14:57.000000000","message":"I guess there\u0027s no reason to be coy about it, I\u0027m pretty sure we\u0027ll use Grenade, so I might as well say so!","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":16157,"name":"Ken Johnston","email":"kenny@kencjohnston.com","username":"kencjohnston"},"change_message_id":"0e62d6b3ea76896f816da170201bee37025a9ad6","unresolved":false,"context_lines":[{"line_number":197,"context_line":""},{"line_number":198,"context_line":"* Write documentation for rolling upgrade (operator docs)."},{"line_number":199,"context_line":""},{"line_number":200,"context_line":""},{"line_number":201,"context_line":"Dependencies"},{"line_number":202,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":203,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"5a9d85d2_4ffe2d38","line":200,"updated":"2016-06-21 14:35:12.000000000","message":"You could add as a work item the submission to governance for the tag.","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"5b07c199c65d7e1a81b1f1ff060be81bb7b01771","unresolved":false,"context_lines":[{"line_number":197,"context_line":""},{"line_number":198,"context_line":"* Write documentation for rolling upgrade (operator docs)."},{"line_number":199,"context_line":""},{"line_number":200,"context_line":""},{"line_number":201,"context_line":"Dependencies"},{"line_number":202,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":203,"context_line":""}],"source_content_type":"text/x-rst","patch_set":4,"id":"1aa78d24_6fb2e9e6","line":200,"in_reply_to":"5a9d85d2_4ffe2d38","updated":"2016-07-11 19:14:57.000000000","message":"Done","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":16157,"name":"Ken Johnston","email":"kenny@kencjohnston.com","username":"kencjohnston"},"change_message_id":"0e62d6b3ea76896f816da170201bee37025a9ad6","unresolved":false,"context_lines":[{"line_number":201,"context_line":"Dependencies"},{"line_number":202,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":203,"context_line":""},{"line_number":204,"context_line":"None"},{"line_number":205,"context_line":""},{"line_number":206,"context_line":"Testing"},{"line_number":207,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":4,"id":"5a9d85d2_afd511ba","line":204,"range":{"start_line":204,"start_character":0,"end_line":204,"end_character":4},"updated":"2016-06-21 14:35:12.000000000","message":"Is there a dependency on the \"Database Strategy for Rolling Upgrades\" spec?","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"5b07c199c65d7e1a81b1f1ff060be81bb7b01771","unresolved":false,"context_lines":[{"line_number":201,"context_line":"Dependencies"},{"line_number":202,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"},{"line_number":203,"context_line":""},{"line_number":204,"context_line":"None"},{"line_number":205,"context_line":""},{"line_number":206,"context_line":"Testing"},{"line_number":207,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":4,"id":"1aa78d24_0fd9a522","line":204,"range":{"start_line":204,"start_character":0,"end_line":204,"end_character":4},"in_reply_to":"5a9d85d2_afd511ba","updated":"2016-07-11 19:14:57.000000000","message":"I think you\u0027re right.","commit_id":"95c586503fbef54fe5bbc794b6d5a1068bff6dff"},{"author":{"_account_id":17491,"name":"Lujin Luo","email":"luo.lujin@jp.fujitsu.com","username":"Lujin"},"change_message_id":"04fe0c0809b5559d5b86064a0b5c12085cd84191","unresolved":false,"context_lines":[{"line_number":90,"context_line":"        deploy\")"},{"line_number":91,"context_line":"      * initial database schema migration"},{"line_number":92,"context_line":""},{"line_number":93,"context_line":"   #. **Multi-version interoperabilty**"},{"line_number":94,"context_line":""},{"line_number":95,"context_line":"      * start the release N nodes in compatability mode"},{"line_number":96,"context_line":"      * take the release N-1 nodes out of rotation, allowing them to drain"}],"source_content_type":"text/x-rst","patch_set":5,"id":"7a8ec9b2_da07747f","line":93,"updated":"2016-09-15 04:16:36.000000000","message":"sorry, a newbie question. so Glance does not have distributed components across multiple nodes communicating with each other, right? and multiple nodes are running independent Glance services, which only share db. thus this spec is not discussing things like RPC versioning.","commit_id":"84e314bab217b62226ae514af654d06f3336fb89"}],"specs/ocata/approved/glance/rolling-upgrades.rst":[{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"32a0153e50414123fb7f20c02ebfe8a61d5a669b","unresolved":false,"context_lines":[{"line_number":72,"context_line":"    intended to be exposed to end users or other OpenStack services; it is"},{"line_number":73,"context_line":"    expressly designed for internal Glance use only."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_38cb3c20","line":75,"range":{"start_line":75,"start_character":19,"end_line":75,"end_character":76},"updated":"2016-11-30 11:13:35.000000000","message":"One thing I should answer is \u0027why can\u0027t zero downtime reload\u0027 be used for the rolling upgrade case\u0027?\n\nie why is it not possible to do this:\n\n1) update software (and configuration)\n2) trigger zero downtime reload\n\nThe reason is:\n\nThe glance api would be running with an indeterminate mix of the old and new software.\n\nFrom memory this is how the reload works:\n\n1. The glance service starts. A parent process \u0027P\u0027 forks some child processes \u0027C1\u0027, \u0027C2\u0027.\n\nWhat actually happens here is the parent process binds to the relevant socket (IP address/port) and then forks. The children inherit the open socket.\n\n(This kind of thing is a fairly standard way of working, though with the introduction of SO_REUSEPORT in Linux 3.9 https://lwn.net/Articles/542629 it\u0027s arguably not needed anymore.)\n\n2. A SIGHUP is received by the parent\n\n3. The parent sends the old children a SIGHUP\n\n4. The old children stop accepting new requests, complete any existing requests and then exit.\n\n5. (at the same time as \u00274\u0027) The parent P spawns new children \u0027c1\u0027, \u0027c2\u0027 -- these will be using the new config settings\n\n\nIn order for any new requests to be handled at any time the socket needs to remain open at all times. (This will be true except in the unlikely corner case where the listening port config has been changed where the socket must be closed and opened on the new port.)\n\nIn the standard case where the listening port doesn\u0027t change, the parent process P will continue to exist after the reload, having kept the socket open all the time. If the code had been updated before the reload, this parent process will not pick up the new code.\n\nAlso, since the children will have been spawned from a process that was not running the new code it\u0027s not clear what state the children would be in. Would they inherit some old code from the parent and some new code by loading new libraries as they run? I guess you\u0027d describe them as being in a (scary) indeterminate state ... possibly a nasty kind of \u0027it seems to work\u0027 until it doesn\u0027t situation.\n\nIn terms of configuration changes, if the new code has added configuration changes that the parent process can\u0027t parse that would also cause problems if we tried to use reload.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":72,"context_line":"    intended to be exposed to end users or other OpenStack services; it is"},{"line_number":73,"context_line":"    expressly designed for internal Glance use only."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_2995d587","line":75,"range":{"start_line":75,"start_character":19,"end_line":75,"end_character":53},"updated":"2016-11-29 17:16:16.000000000","message":"Something similar -- but not identical -- to the zero downtime configuration reload may be required.\n\nImagine someone uploading a largish image over a slow internet connection. Let\u0027s say that API call takes ~ 20 minutes. In an ideal world of true zero downtime upgrade that API call would complete. (Yes, many folks may not care, but go with me for now....)\n\nThe zero downtime *reload* works in this case because that twenty minute upload will complete before its parent process is reaped.\n\nFor true zero downtime *upgrade* I think you need to be able to allow your API servers to \u0027graceful-stop\u0027. ie complete existing requests then exit. I\u0027m pretty sure that having something very close to the \u0027reload\u0027 behaviour, that just differs in not spawning up new processes should be really trivial to implement. Basically, rather than accepting the SIGHUP signal accept, eg the SIGTERM or SIGSTOP signal, complete requests like HUP, but don\u0027t spawn new processes.\n\n So if you had two API servers it would look like this:\n\n1) server A and server B are handling requests\n2) remove server A from the load balancer (no new requests hit server A)\n3) send server A a \u0027graceful-stop\u0027 signal, it completes existing requests then (eventually) exits\n4) once it has exited (healthcheck should have stopped), upgrade server A\n5) put A back in the load balancer\n6) take B out of the load balancer\n7) send B a \u0027graceful-stop\u0027 signal\n8) upgrade B\n9) add B to load balancer\n\n(I\u0027m assuming the api service isn\u0027t running in a container.)","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9ff6d7fab6d4aaf0381b76877d690f3b129e38bd","unresolved":false,"context_lines":[{"line_number":72,"context_line":"    intended to be exposed to end users or other OpenStack services; it is"},{"line_number":73,"context_line":"    expressly designed for internal Glance use only."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"}],"source_content_type":"text/x-rst","patch_set":8,"id":"3a71b18c_aa7f8964","line":75,"range":{"start_line":75,"start_character":19,"end_line":75,"end_character":76},"in_reply_to":"3a71b18c_64442b74","updated":"2016-12-05 12:04:03.000000000","message":"Ah, I\u0027d forgotten about the \u0027disabled by file\u0027 part of the middlware!\n\nI guess the difference between a \u0027graceful shutdown\u0027 and using the middleware is that one is event driven where the other one requires you to pick a \u0027long time\u0027 and assume that after that there are no requests being handled.\n\nWhich would work ... but the code to implement the graceful shutdown could be pretty easy (potentially just a few lines of code), so might be worth thinking about.\n\nIt\u0027s probably just a matter of changing these functions in wsgi.py:\n\n\n    def wait_on_children(self):\n        while self.running:\n            try:\n                pid, status \u003d os.wait()\n                if os.WIFEXITED(status) or os.WIFSIGNALED(status):\n                    self._remove_children(pid)\n                    self._verify_and_respawn_children(pid, status)\n            except OSError as err:\n                if err.errno not in (errno.EINTR, errno.ECHILD):\n                    raise\n            except KeyboardInterrupt:\n                LOG.info(_LI(\u0027Caught keyboard interrupt. Exiting.\u0027))\n                break\n            except exception.SIGHUPInterrupt:\n                self.reload()\n                continue\n        eventlet.greenio.shutdown_safe(self.sock)\n        self.sock.close()\n        LOG.debug(\u0027Exited\u0027)\n\n\n    def run_child(self):\n        def child_hup(*args):\n            \"\"\"Shuts down child processes, existing requests are handled.\"\"\"\n            signal.signal(signal.SIGHUP, signal.SIG_IGN)\n            eventlet.wsgi.is_accepting \u003d False\n            self.sock.close()\n\n        pid \u003d os.fork()\n        if pid \u003d\u003d 0:\n            signal.signal(signal.SIGHUP, child_hup)\n            signal.signal(signal.SIGTERM, signal.SIG_DFL)\n\nto look like this:\n\n    def wait_on_children(self):\n        while self.running:\n            try:\n                pid, status \u003d os.wait()\n                if os.WIFEXITED(status) or os.WIFSIGNALED(status):\n                    self._remove_children(pid)\n                    self._verify_and_respawn_children(pid, status)\n            except OSError as err:\n                if err.errno not in (errno.EINTR, errno.ECHILD):\n                    raise\n            except KeyboardInterrupt:\n                LOG.info(_LI(\u0027Caught keyboard interrupt. Exiting.\u0027))\n                break\n            except exception.SIGHUPInterrupt:\n                self.reload()\n                continue\n            except exception.SIGSTOPInterrupt: \u003c\u003c\u003c\n                self.graceful_shutdown()       \u003c\u003c\u003c\n                continue                       \u003c\u003c\u003c\n        eventlet.greenio.shutdown_safe(self.sock)\n        self.sock.close()\n        LOG.debug(\u0027Exited\u0027)\n\n\n    def run_child(self):\n        def child_hup(*args):\n            \"\"\"Shuts down child processes, existing requests are handled.\"\"\"\n            signal.signal(signal.SIGHUP, signal.SIG_IGN)\n            eventlet.wsgi.is_accepting \u003d False\n            self.sock.close()\n\n        def child_stop(*args):          \u003c\u003c\u003c\n            \"\"\"Shuts down child processes, existing requests are handled.\"\"\"  \u003c\u003c\u003c\n            signal.signal(signal.SIGSTOP, signal.SIG_IGN) \u003c\u003c\u003c \n            eventlet.wsgi.is_accepting \u003d False \u003c\u003c\u003c \n            self.sock.close() \u003c\u003c\u003c\n\n        pid \u003d os.fork()\n        if pid \u003d\u003d 0:\n            signal.signal(signal.SIGHUP, child_hup)\n            signal.signal(signal.SIGSTOP, child_stop) \u003c\u003c\u003c\n            signal.signal(signal.SIGTERM, signal.SIG_DFL)\n\n\nAnd add this one line function:\n\n    def graceful_shutdown(self):\n        \"\"\"\n        Send signal to child processes to handle current requests then exit\n        \"\"\"\n        os.killpg(self.pgid, signal.SIGSTOP)\n\n\nI haven\u0027t tested the above, I think it might be sufficient because the existing code will do the accounting of the number of children and exit when self.children hits zero, but it\u0027d need testing.\n\nYou\u0027d do \"kill -s STOP \u003cglance-parent-process-pid\u003e\"","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"7842f08ec200a4d7da1e422f10efac12bf7cc017","unresolved":false,"context_lines":[{"line_number":72,"context_line":"    intended to be exposed to end users or other OpenStack services; it is"},{"line_number":73,"context_line":"    expressly designed for internal Glance use only."},{"line_number":74,"context_line":""},{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"}],"source_content_type":"text/x-rst","patch_set":8,"id":"3a71b18c_64442b74","line":75,"range":{"start_line":75,"start_character":19,"end_line":75,"end_character":76},"in_reply_to":"5a74a57a_38cb3c20","updated":"2016-12-01 22:37:59.000000000","message":"Thanks for the detailed explanation, Stuart.  I think I\u0027ve sent you down a bad rabbit hole here due to not removing this point when revising the spec.\n\nIn one of the earlier versions of the rolling upgrade database strategy patch [0], we were going to have the \"new\" code be capable of running in various \"modes\" that would be triggered by config reloads.  That\u0027s not the case anymore.\n\nSo the scenario we have now is this:\n\n0. Newton code is running\n\n1. Database expansion and data migration (see [0] for details)\n\n2. Deploy Ocata code in small batches: take some Newton nodes out of rotation, wait for them to drain, upgrade to Ocata, and return to rotation.\n\n3. When all nodes are running Ocata code, perform the database contraction (see [0] for details).\n\nAs far as handling step 2, the oslo healthcheck middleware allows you to create a file in a configurable location; if that file exists, the healthcheck endpoint returns 503 Disabled By File, and that indicates to the load balancer to stop sending requests to the node.  As you point out, Glance nodes can take a long time to drain (I\u0027m aware of a public cloud in which image uploads can take 12-18 hours).  It\u0027s up to the operator whether to wait for all connections to be closed, or to just go ahead and kill the service and waste any in-flight images.\n\nSince the node\u0027s out of rotation, it won\u0027t be receiving any new requests.  So the \"disabled-by-file\" functionality of the healthcheck middleware eliminates the requirement for \"graceful-stop\" signal handling.  (I\u0027m not saying it\u0027s not a good idea, just that I don\u0027t think it\u0027s necessary to perform rolling upgrades.)\n\nAdditionally, on this scenario, a Glance node will only be running one version of Glance at a time, so we don\u0027t have to worry about mixed versions of Glance running on a single node.  (There will be two versions running in the deployment, but [0] explains how we can handle that.)\n\n\n[0] https://review.openstack.org/331740","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"},{"line_number":79,"context_line":"    balancer that an API node is out of service since the Liberty release"},{"line_number":80,"context_line":"    [GLA2]_."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* Full stack integration testing with services arranged in a mid-upgrade manner"}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_3bcd1f5e","line":79,"range":{"start_line":78,"start_character":59,"end_line":79,"end_character":47},"updated":"2016-11-29 17:16:16.000000000","message":"Strictly speaking the healthcheck middleware signals that an API node is alive. The absense of a signal can be used by the load balancer to infer that there may be a problem.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"7842f08ec200a4d7da1e422f10efac12bf7cc017","unresolved":false,"context_lines":[{"line_number":75,"context_line":"    Glance has had zero downtime configuration reload since the Kilo release"},{"line_number":76,"context_line":"    [GLA1]_."},{"line_number":77,"context_line":""},{"line_number":78,"context_line":"    Glance has had healthcheck middleware that can be used to signal to a load"},{"line_number":79,"context_line":"    balancer that an API node is out of service since the Liberty release"},{"line_number":80,"context_line":"    [GLA2]_."},{"line_number":81,"context_line":""},{"line_number":82,"context_line":"* Full stack integration testing with services arranged in a mid-upgrade manner"}],"source_content_type":"text/x-rst","patch_set":8,"id":"3a71b18c_e4583b59","line":79,"range":{"start_line":78,"start_character":59,"end_line":79,"end_character":47},"in_reply_to":"5a74a57a_3bcd1f5e","updated":"2016-12-01 22:37:59.000000000","message":"The olso healthcheck middleware we use has the disabled-by-file functionality described above; the 503 response is a signal to take the node out of rotation.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":193,"context_line":""},{"line_number":194,"context_line":"   #. **Upgrade Orchestration - Deploy**"},{"line_number":195,"context_line":""},{"line_number":196,"context_line":"      * stage the code for release N to new Glance nodes"},{"line_number":197,"context_line":""},{"line_number":198,"context_line":"   #. **Online Schema Migration** - Part 1"},{"line_number":199,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_fb472775","line":196,"range":{"start_line":196,"start_character":40,"end_line":196,"end_character":56},"updated":"2016-11-29 17:16:16.000000000","message":"If your registry and api services run on the same node using standard (eg rpm) packages then it may not be possible to upgrade the registry and api software independently.\n\nI guess I\u0027m just saying we need to handle the case where the registry is being used and what assumptions we\u0027re making about whether you have separate registry/api nodes or not. (It may be the case that using the registry buys us something in terms of rolling upgrades and database -- ie sql connection -- timeouts.)","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"7842f08ec200a4d7da1e422f10efac12bf7cc017","unresolved":false,"context_lines":[{"line_number":193,"context_line":""},{"line_number":194,"context_line":"   #. **Upgrade Orchestration - Deploy**"},{"line_number":195,"context_line":""},{"line_number":196,"context_line":"      * stage the code for release N to new Glance nodes"},{"line_number":197,"context_line":""},{"line_number":198,"context_line":"   #. **Online Schema Migration** - Part 1"},{"line_number":199,"context_line":""}],"source_content_type":"text/x-rst","patch_set":8,"id":"3a71b18c_ca57c900","line":196,"range":{"start_line":196,"start_character":40,"end_line":196,"end_character":56},"in_reply_to":"5a74a57a_fb472775","updated":"2016-12-01 22:37:59.000000000","message":"That\u0027s a good point.  The OpenStack guidelines only require that there be a deployment configuration under which rolling upgrade is possible, so we\u0027ll have to make it clear that we\u0027re assuming that each Glance service is running on a dedicated node.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":194,"context_line":"   #. **Upgrade Orchestration - Deploy**"},{"line_number":195,"context_line":""},{"line_number":196,"context_line":"      * stage the code for release N to new Glance nodes"},{"line_number":197,"context_line":""},{"line_number":198,"context_line":"   #. **Online Schema Migration** - Part 1"},{"line_number":199,"context_line":""},{"line_number":200,"context_line":"      * initial database schema migration (the \"expand\" phase as described"}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_6461b4d3","line":197,"updated":"2016-11-29 17:16:16.000000000","message":"It\u0027s not clear how much detail needs to be in the spec, versus how much can be worked through in the \u0027code\u0027 (ie documentation in this case.)","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"7842f08ec200a4d7da1e422f10efac12bf7cc017","unresolved":false,"context_lines":[{"line_number":194,"context_line":"   #. **Upgrade Orchestration - Deploy**"},{"line_number":195,"context_line":""},{"line_number":196,"context_line":"      * stage the code for release N to new Glance nodes"},{"line_number":197,"context_line":""},{"line_number":198,"context_line":"   #. **Online Schema Migration** - Part 1"},{"line_number":199,"context_line":""},{"line_number":200,"context_line":"      * initial database schema migration (the \"expand\" phase as described"}],"source_content_type":"text/x-rst","patch_set":8,"id":"3a71b18c_8a5eb1e7","line":197,"in_reply_to":"5a74a57a_6461b4d3","updated":"2016-12-01 22:37:59.000000000","message":"I think that as long as it\u0027s fairly plausible that what\u0027s been outlined here will work, we can approve the spec and then I can get to work on the \"code\".  As you can tell from my responses, the serious code work is being done by Alex and Hemanth in the \"database strategy for rockin\u0027 and rollin\u0027 upgrades\" spec.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":455,"name":"Stuart McLaren","email":"stuart.mclaren@hpe.com","username":"stuart-mclaren"},"change_message_id":"9f49556a02f1defa830d493fc647463a770b6d05","unresolved":false,"context_lines":[{"line_number":311,"context_line":"* Grenade tests."},{"line_number":312,"context_line":""},{"line_number":313,"context_line":"* Assert the tag and notify the OpenStack Technical Committee."},{"line_number":314,"context_line":""},{"line_number":315,"context_line":""},{"line_number":316,"context_line":"Dependencies"},{"line_number":317,"context_line":"\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d\u003d"}],"source_content_type":"text/x-rst","patch_set":8,"id":"5a74a57a_c4ec482c","line":314,"updated":"2016-11-29 17:16:16.000000000","message":"Need to decide if there is a \u0027graceful-stop\u0027 code change required.","commit_id":"ff5bbc8c67f1a9cefe64703bc0c697229315c269"},{"author":{"_account_id":22448,"name":"Alexander Bashmakov","email":"abashmak@yahoo.com","username":"abashmak"},"change_message_id":"dced40436880c8485c0be58ae6523ed96c4202e2","unresolved":false,"context_lines":[{"line_number":83,"context_line":"    of rotation while nodes running \"new\" code are brought in."},{"line_number":84,"context_line":""},{"line_number":85,"context_line":"    Note that while this proposal will allow a mixed deployment of API versions"},{"line_number":86,"context_line":"    to run simultaneously, it does not envision that this will included"},{"line_number":87,"context_line":"    multiple versions of the API running *on the same node* simultaneously.  In"},{"line_number":88,"context_line":"    other words, we do not intend to support a scenario in which \"new\" API code"},{"line_number":89,"context_line":"    is deployed to a node while \"old\" Glance processes are running on that"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_e288b673","line":86,"range":{"start_line":86,"start_character":63,"end_line":86,"end_character":71},"updated":"2016-12-05 18:04:47.000000000","message":"nit: s/included/include/","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2e193fd3ff211e16606703598b2e978ef0d4f566","unresolved":false,"context_lines":[{"line_number":83,"context_line":"    of rotation while nodes running \"new\" code are brought in."},{"line_number":84,"context_line":""},{"line_number":85,"context_line":"    Note that while this proposal will allow a mixed deployment of API versions"},{"line_number":86,"context_line":"    to run simultaneously, it does not envision that this will included"},{"line_number":87,"context_line":"    multiple versions of the API running *on the same node* simultaneously.  In"},{"line_number":88,"context_line":"    other words, we do not intend to support a scenario in which \"new\" API code"},{"line_number":89,"context_line":"    is deployed to a node while \"old\" Glance processes are running on that"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_42e96ae5","line":86,"range":{"start_line":86,"start_character":63,"end_line":86,"end_character":71},"in_reply_to":"3a71b18c_e288b673","updated":"2016-12-05 18:07:43.000000000","message":"Done","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"},{"author":{"_account_id":22448,"name":"Alexander Bashmakov","email":"abashmak@yahoo.com","username":"abashmak"},"change_message_id":"dced40436880c8485c0be58ae6523ed96c4202e2","unresolved":false,"context_lines":[{"line_number":88,"context_line":"    other words, we do not intend to support a scenario in which \"new\" API code"},{"line_number":89,"context_line":"    is deployed to a node while \"old\" Glance processes are running on that"},{"line_number":90,"context_line":"    node.  Instead, we expect operators to allow the \"old\" nodes to drain"},{"line_number":91,"context_line":"    completely and all processes running the \"old\" code to stopped before the"},{"line_number":92,"context_line":"    \"new\" code is deployed to that node.  (If the Glance nodes are VMs, a"},{"line_number":93,"context_line":"    completely drained node could simply be deleted and be replaced by a fresh"},{"line_number":94,"context_line":"    VM containing the \"new\" code.)"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_62958685","line":91,"range":{"start_line":91,"start_character":56,"end_line":91,"end_character":66},"updated":"2016-12-05 18:04:47.000000000","message":"nit: s/to stopped/to be stopped/","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2e193fd3ff211e16606703598b2e978ef0d4f566","unresolved":false,"context_lines":[{"line_number":88,"context_line":"    other words, we do not intend to support a scenario in which \"new\" API code"},{"line_number":89,"context_line":"    is deployed to a node while \"old\" Glance processes are running on that"},{"line_number":90,"context_line":"    node.  Instead, we expect operators to allow the \"old\" nodes to drain"},{"line_number":91,"context_line":"    completely and all processes running the \"old\" code to stopped before the"},{"line_number":92,"context_line":"    \"new\" code is deployed to that node.  (If the Glance nodes are VMs, a"},{"line_number":93,"context_line":"    completely drained node could simply be deleted and be replaced by a fresh"},{"line_number":94,"context_line":"    VM containing the \"new\" code.)"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_62e66616","line":91,"range":{"start_line":91,"start_character":56,"end_line":91,"end_character":66},"in_reply_to":"3a71b18c_62958685","updated":"2016-12-05 18:07:43.000000000","message":"Done","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"},{"author":{"_account_id":22448,"name":"Alexander Bashmakov","email":"abashmak@yahoo.com","username":"abashmak"},"change_message_id":"dced40436880c8485c0be58ae6523ed96c4202e2","unresolved":false,"context_lines":[{"line_number":262,"context_line":"   of rotation and allow them to drain.  Stuart McLaren has suggested an"},{"line_number":263,"context_line":"   alternative, namely to piggyback on the zero downtime configuration reload"},{"line_number":264,"context_line":"   feature of Glance (available since the Kilo release [GLA1]_) and create a"},{"line_number":265,"context_line":"   \"graceful stop\" function that would accept a signal shut down child"},{"line_number":266,"context_line":"   processes as they complete.  (See [GSP2]_ for details.)"},{"line_number":267,"context_line":""},{"line_number":268,"context_line":"   Since we\u0027ve got the \"disable by file\" functionality available, this"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_22cd4eb7","line":265,"range":{"start_line":265,"start_character":55,"end_line":265,"end_character":59},"updated":"2016-12-05 18:04:47.000000000","message":"nit: s/signal shut down/signal to shut down/","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"},{"author":{"_account_id":5314,"name":"Brian Rosmaita","email":"rosmaita.fossdev@gmail.com","username":"brian-rosmaita"},"change_message_id":"2e193fd3ff211e16606703598b2e978ef0d4f566","unresolved":false,"context_lines":[{"line_number":262,"context_line":"   of rotation and allow them to drain.  Stuart McLaren has suggested an"},{"line_number":263,"context_line":"   alternative, namely to piggyback on the zero downtime configuration reload"},{"line_number":264,"context_line":"   feature of Glance (available since the Kilo release [GLA1]_) and create a"},{"line_number":265,"context_line":"   \"graceful stop\" function that would accept a signal shut down child"},{"line_number":266,"context_line":"   processes as they complete.  (See [GSP2]_ for details.)"},{"line_number":267,"context_line":""},{"line_number":268,"context_line":"   Since we\u0027ve got the \"disable by file\" functionality available, this"}],"source_content_type":"text/x-rst","patch_set":10,"id":"3a71b18c_82ef02e8","line":265,"range":{"start_line":265,"start_character":55,"end_line":265,"end_character":59},"in_reply_to":"3a71b18c_22cd4eb7","updated":"2016-12-05 18:07:43.000000000","message":"Done","commit_id":"df3da3099df7244e4ab44c84c41b0afc23e879cb"}]}
