)]}' {"/COMMIT_MSG":[{"author":{"_account_id":8898,"name":"Chris Wedgwood","email":"cw@f00f.org","username":"anticw"},"change_message_id":"436f77c99f1b30d12f8951a6aa5996a582bb877c","unresolved":false,"context_lines":[{"line_number":7,"context_line":"allocate_data_node"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"- allocate_data_node() should execute after drain_data_node()"},{"line_number":10,"context_line":"- enable alloation moved outside the wait_to_join check loop"},{"line_number":11,"context_line":""},{"line_number":12,"context_line":"Change-Id: Ie42af89551bd8804b87fe936c676e85130564187"}],"source_content_type":"text/x-gerrit-commit-message","patch_set":2,"id":"bf51134e_75a382b2","line":10,"range":{"start_line":10,"start_character":9,"end_line":10,"end_character":18},"updated":"2020-07-05 17:36:45.000000000","message":"allocation","commit_id":"5087b16d240fc2bd69dde11e8472b6f5155b2b91"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"cd5f93a6de1ea2024737494e8616817a947c0a78","unresolved":false,"context_lines":[{"line_number":4,"context_line":"Commit: willxz \u003cxz8905@att.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2020-07-07 12:07:03 -0400"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"allocate_data_node"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"remove \"if\" condition of allocate_data_node"},{"line_number":10,"context_line":""}],"source_content_type":"text/x-gerrit-commit-message","patch_set":5,"id":"bf51134e_79d07cf8","line":7,"range":{"start_line":7,"start_character":0,"end_line":7,"end_character":18},"updated":"2020-07-07 16:16:12.000000000","message":"I would write a more descriptive title. Something that mentions re-allocation bug or something like that","commit_id":"995c5323d76058bad42204ad67ee45d6965b228e"}],"elasticsearch/templates/bin/_elasticsearch.sh.tpl":[{"author":{"_account_id":17591,"name":"Steve Wilkerson","email":"wilkers.steve@gmail.com","username":"srwilkers"},"change_message_id":"3660f7e6da0f72c36a98f033ee5ed88b1c12ea11","unresolved":false,"context_lines":[{"line_number":120,"context_line":" kill -TERM 1"},{"line_number":121,"context_line":" }"},{"line_number":122,"context_line":" trap drain_data_node TERM EXIT HUP INT"},{"line_number":123,"context_line":" allocate_data_node \u0026"},{"line_number":124,"context_line":" wait"},{"line_number":125,"context_line":"}"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-smarty","patch_set":1,"id":"bf51134e_5ae95f7e","line":123,"range":{"start_line":123,"start_character":2,"end_line":123,"end_character":22},"updated":"2020-07-02 19:26:59.000000000","message":"The drain_data_node function is just a trap on the listed signals - it doesn\u0027t get executed before allocate in this case","commit_id":"73222528a338c066a7de89beed6fa8078edb0cd1"},{"author":{"_account_id":17591,"name":"Steve Wilkerson","email":"wilkers.steve@gmail.com","username":"srwilkers"},"change_message_id":"f3ec83e052b8b4a2b7af058fcdc3cda44847ac21","unresolved":false,"context_lines":[{"line_number":120,"context_line":" kill -TERM 1"},{"line_number":121,"context_line":" }"},{"line_number":122,"context_line":" trap drain_data_node TERM EXIT HUP INT"},{"line_number":123,"context_line":" allocate_data_node \u0026"},{"line_number":124,"context_line":" wait"},{"line_number":125,"context_line":"}"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-smarty","patch_set":1,"id":"bf51134e_15b7b8da","line":123,"range":{"start_line":123,"start_character":2,"end_line":123,"end_character":22},"in_reply_to":"bf51134e_5a131f67","updated":"2020-07-02 20:24:38.000000000","message":"I\u0027d also suggest reconsidering the existence of a file being the determining factor of whether you allocate the node or do nothing. If it takes longer to drain the node than the graceful termination period allows, it\u0027s likely that the kubelet is issuing SIGKILL because your nodes are taking too long to drain, which means that the file you\u0027re expecting will not exist when the pods are recreated, explaining the scenario where nothing happens","commit_id":"73222528a338c066a7de89beed6fa8078edb0cd1"},{"author":{"_account_id":17591,"name":"Steve Wilkerson","email":"wilkers.steve@gmail.com","username":"srwilkers"},"change_message_id":"8a1802ceff09fbce290b12c71f53bb9ca18d03d3","unresolved":false,"context_lines":[{"line_number":120,"context_line":" kill -TERM 1"},{"line_number":121,"context_line":" }"},{"line_number":122,"context_line":" trap drain_data_node TERM EXIT HUP INT"},{"line_number":123,"context_line":" allocate_data_node \u0026"},{"line_number":124,"context_line":" wait"},{"line_number":125,"context_line":"}"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-smarty","patch_set":1,"id":"bf51134e_b59a4c6b","line":123,"range":{"start_line":123,"start_character":2,"end_line":123,"end_character":22},"in_reply_to":"bf51134e_5a131f67","updated":"2020-07-02 20:21:26.000000000","message":"You absolutely do want to trap the signal and drain the data node, specifically in upgrade scenarios - failure to do so will result in unallocated shards, which will lead to significantly longer recovery times when the pods are recreated and come back online.","commit_id":"73222528a338c066a7de89beed6fa8078edb0cd1"},{"author":{"_account_id":31713,"name":"William Zhang","email":"xz8905@att.com","username":"willxz"},"change_message_id":"6b1509645c950e83da7ddb8b3db96d1ace432a05","unresolved":false,"context_lines":[{"line_number":120,"context_line":" kill -TERM 1"},{"line_number":121,"context_line":" }"},{"line_number":122,"context_line":" trap drain_data_node TERM EXIT HUP INT"},{"line_number":123,"context_line":" allocate_data_node \u0026"},{"line_number":124,"context_line":" wait"},{"line_number":125,"context_line":"}"},{"line_number":126,"context_line":""}],"source_content_type":"text/x-smarty","patch_set":1,"id":"bf51134e_5a131f67","line":123,"range":{"start_line":123,"start_character":2,"end_line":123,"end_character":22},"in_reply_to":"bf51134e_5ae95f7e","updated":"2020-07-02 19:42:31.000000000","message":"Currently there is an issue that some labs after upgrade the allocation setting remains persistent. The only function call this setting is in \"drain_data_node function. the only function to reset the setting is in allocate_data_node. allocate_data_node() checks file /data/restarting and if no such file, it will do nothing. I think there is logical bug that if allocate_data_node() starts too soon, it will finish before drain_data_node create \"/data/restarting\". So we need set the order. I don\u0027t think we need consider the exit signal for drain_data_node, just need to be sure allocation_data_node() is after drain_data_node","commit_id":"73222528a338c066a7de89beed6fa8078edb0cd1"},{"author":{"_account_id":8898,"name":"Chris Wedgwood","email":"cw@f00f.org","username":"anticw"},"change_message_id":"436f77c99f1b30d12f8951a6aa5996a582bb877c","unresolved":false,"context_lines":[{"line_number":50,"context_line":" wait_to_join"},{"line_number":51,"context_line":" fi"},{"line_number":52,"context_line":" echo \"Re-enabling Replica Shard Allocation\""},{"line_number":53,"context_line":" curl -s -K- \u003c\u003c\u003c \"--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}\" -XPUT -H \u0027Content-Type: application/json\u0027 \\"},{"line_number":54,"context_line":" \"${ELASTICSEARCH_ENDPOINT}/_cluster/settings\" -d \"{"},{"line_number":55,"context_line":" \\\"persistent\\\": {"},{"line_number":56,"context_line":" \\\"cluster.routing.allocation.enable\\\": null"}],"source_content_type":"text/x-smarty","patch_set":2,"id":"bf51134e_b5ad1a9d","line":53,"range":{"start_line":53,"start_character":10,"end_line":53,"end_character":78},"updated":"2020-07-05 17:36:45.000000000","message":"does\n\nscheme://user:password@host/...\n\nwork?","commit_id":"5087b16d240fc2bd69dde11e8472b6f5155b2b91"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"5e0e7125a8a4b80b432eaf6f62ccb5f97902933f","unresolved":false,"context_lines":[{"line_number":97,"context_line":" # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/restart-cluster.html#restart-cluster-rolling"},{"line_number":98,"context_line":""},{"line_number":99,"context_line":" echo \"Disabling Replica Shard Allocation\""},{"line_number":100,"context_line":" curl -s -K- \u003c\u003c\u003c \"-XPUT -H \u0027Content-Type: application/json\u0027 \\"},{"line_number":101,"context_line":" \"${ELASTICSEARCH_ENDPOINT}/_cluster/settings\" -d \"{"},{"line_number":102,"context_line":" \\\"persistent\\\": {"},{"line_number":103,"context_line":" \\\"cluster.routing.allocation.enable\\\": \\\"primaries\\\""}],"source_content_type":"text/x-smarty","patch_set":3,"id":"bf51134e_3c1c4469","line":100,"range":{"start_line":100,"start_character":20,"end_line":100,"end_character":21},"updated":"2020-07-06 21:17:33.000000000","message":"Extra \"","commit_id":"63786eab6b8e78a0de690acbb970ec7d5eb75896"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"cd5f93a6de1ea2024737494e8616817a947c0a78","unresolved":false,"context_lines":[{"line_number":51,"context_line":"}"},{"line_number":52,"context_line":""},{"line_number":53,"context_line":"function allocate_data_node () {"},{"line_number":54,"context_line":" #if [ -f /data/restarting ]; then"},{"line_number":55,"context_line":" # rm /data/restarting"},{"line_number":56,"context_line":" # echo \"Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster.\""},{"line_number":57,"context_line":" # wait_to_join"},{"line_number":58,"context_line":" #fi"},{"line_number":59,"context_line":" echo \"Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster.\""},{"line_number":60,"context_line":" wait_to_join"},{"line_number":61,"context_line":" echo \"Re-enabling Replica Shard Allocation\""}],"source_content_type":"text/x-smarty","patch_set":5,"id":"bf51134e_199ce05b","line":58,"range":{"start_line":54,"start_character":0,"end_line":58,"end_character":5},"updated":"2020-07-07 16:16:12.000000000","message":"Let\u0027s remove this","commit_id":"995c5323d76058bad42204ad67ee45d6965b228e"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"cd5f93a6de1ea2024737494e8616817a947c0a78","unresolved":false,"context_lines":[{"line_number":56,"context_line":" # echo \"Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster.\""},{"line_number":57,"context_line":" # wait_to_join"},{"line_number":58,"context_line":" #fi"},{"line_number":59,"context_line":" echo \"Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster.\""},{"line_number":60,"context_line":" wait_to_join"},{"line_number":61,"context_line":" echo \"Re-enabling Replica Shard Allocation\""},{"line_number":62,"context_line":" curl -s -K- \u003c\u003c\u003c \"--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}\" -XPUT -H \u0027Content-Type: application/json\u0027 \\"}],"source_content_type":"text/x-smarty","patch_set":5,"id":"bf51134e_79cb5c52","line":59,"range":{"start_line":59,"start_character":30,"end_line":59,"end_character":39},"updated":"2020-07-07 16:16:12.000000000","message":"Since we\u0027re not checking for a restart condition anymore, we should probably change to this to \u0027started\u0027 for clarity","commit_id":"995c5323d76058bad42204ad67ee45d6965b228e"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"cd5f93a6de1ea2024737494e8616817a947c0a78","unresolved":false,"context_lines":[{"line_number":117,"context_line":" action\u003d\"_flush\""},{"line_number":118,"context_line":" fi"},{"line_number":119,"context_line":""},{"line_number":120,"context_line":" #curl -s -K- \u003c\u003c\u003c \"-XPOST \"${ELASTICSEARCH_ENDPOINT}/$action\""},{"line_number":121,"context_line":" curl -s -K- \u003c\u003c\u003c \"--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}\" -XPOST \"${ELASTICSEARCH_ENDPOINT}/$action\""},{"line_number":122,"context_line":""},{"line_number":123,"context_line":" # TODO: Check the response of synced flush operations to make sure there are no failures."}],"source_content_type":"text/x-smarty","patch_set":5,"id":"bf51134e_599db855","line":120,"range":{"start_line":120,"start_character":4,"end_line":120,"end_character":64},"updated":"2020-07-07 16:16:12.000000000","message":"ditto","commit_id":"995c5323d76058bad42204ad67ee45d6965b228e"},{"author":{"_account_id":30777,"name":"Steven Fitzpatrick","email":"steven@fitzpatrick.wtf","username":"sf280x"},"change_message_id":"cd5f93a6de1ea2024737494e8616817a947c0a78","unresolved":false,"context_lines":[{"line_number":125,"context_line":" # although the request itself still returns a 200 OK status. If there are failures, reissue the request."},{"line_number":126,"context_line":" # (The only side effect of not doing so is slower start up times. See flush documentation linked above)"},{"line_number":127,"context_line":""},{"line_number":128,"context_line":" #touch /data/restarting"},{"line_number":129,"context_line":" echo \"Node ${NODE_NAME} is ready to shutdown\""},{"line_number":130,"context_line":" kill -TERM 1"},{"line_number":131,"context_line":" }"}],"source_content_type":"text/x-smarty","patch_set":5,"id":"bf51134e_99a750a8","line":128,"range":{"start_line":128,"start_character":3,"end_line":128,"end_character":27},"updated":"2020-07-07 16:16:12.000000000","message":"ditto","commit_id":"995c5323d76058bad42204ad67ee45d6965b228e"}]}