{"/COMMIT_MSG":[{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"17da01107646393eba9f45c6fc62772e3771842b","unresolved":true,"context_lines":[{"line_number":4,"context_line":"Commit:     Rodrigo Barbieri \u003crodrigo.barbieri2010@gmail.com\u003e"},{"line_number":5,"context_line":"CommitDate: 2021-04-09 17:14:30 -0300"},{"line_number":6,"context_line":""},{"line_number":7,"context_line":"Error anti-affinity violation on live migrations"},{"line_number":8,"context_line":""},{"line_number":9,"context_line":"Error-out the migrations (cold and live) whenever the"},{"line_number":10,"context_line":"anti-affinity policy is violated."}],"source_content_type":"text/x-gerrit-commit-message","patch_set":2,"id":"b6ade5af_86d198be","line":7,"range":{"start_line":7,"start_character":33,"end_line":7,"end_character":37},"updated":"2021-04-09 21:17:04.000000000","message":"actually migrations in general. \"live\" should be removed","commit_id":"d7998e8a8465f9d0ba1fed9fe38240391f40fe52"}],"nova/compute/manager.py":[{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"e7081143df334457748c7a533c927f295f1b85c1","unresolved":true,"context_lines":[{"line_number":5240,"context_line":"                # to anti-affinity. At this point the migration is already"},{"line_number":5241,"context_line":"                # in-progress, so this is the definitive moment to abort due to"},{"line_number":5242,"context_line":"                # the policy violation. Also, exploding here is covered by the"},{"line_number":5243,"context_line":"                # cleanup methods in except block."},{"line_number":5244,"context_line":"                self._validate_instance_group_policy(context, instance)"},{"line_number":5245,"context_line":""},{"line_number":5246,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":1,"id":"4b2625e9_3f6418e5","line":5243,"range":{"start_line":5243,"start_character":18,"end_line":5243,"end_character":50},"updated":"2021-04-01 16:08:08.000000000","message":"interestingly, the instance ended up in error state.","commit_id":"0616293b63ed37ef6e06b2339f9b0c44459a9d3e"},{"author":{"_account_id":4690,"name":"melanie witt","display_name":"melwitt","email":"melwittt@gmail.com","username":"melwitt"},"change_message_id":"951d328a5d6f26a7d72e992414808b558daf27ff","unresolved":true,"context_lines":[{"line_number":1613,"context_line":"                                        scheduler_hints\u003dNone):"},{"line_number":1614,"context_line":""},{"line_number":1615,"context_line":"        if CONF.workarounds.disable_group_policy_check_upcall:"},{"line_number":1616,"context_line":"            # Why would we do anything below if it is disabled?"},{"line_number":1617,"context_line":"            return"},{"line_number":1618,"context_line":""},{"line_number":1619,"context_line":"        # NOTE(russellb) Instance group policy is enforced by the scheduler."}],"source_content_type":"text/x-python","patch_set":2,"id":"f89bd8a2_adf37e0e","line":1616,"updated":"2021-04-09 21:30:33.000000000","message":"We pretty much don\u0027t ... the only things we do outside the check are scheduler_hints.get and isinstance(hint, list). Not harmful but also not useful. 
Line 1616, patch set 2 — Rodrigo Barbieri (ganso), 2021-04-12, reply:
  Thanks for the clarification, I removed the note.

Line 1694, patch set 2 — melanie witt (melwitt), 2021-04-09 [on the draft comment explaining why the per-host lock "doesn't help at all"]:
  True, the lock only helps with anti-affinity.

Line 1700, patch set 2 — melanie witt (melwitt), 2021-04-09 [on reason #2 in the draft comment: group.get_hosts racing with _set_instance_host_and_node]:
  Note to self: IIUC what this is describing is a scenario where instance A has its host set and gets to this check while instance B is in the middle of building into the same server group on a different host, but instance B does not have its host set yet. So a call to group.get_hosts returns only one host (and that host is excluded because of exclude=[instance.uuid] on L1715). So group_hosts is an empty list. So this will not raise an affinity violation.

  That makes sense I think, that this instance (instance A) should "win" and get to continue building since it set its host first. And then once instance B has its host set and does this check, it raises the affinity violation because of "self.host not in group_hosts".

Line 1700, patch set 2 — Rodrigo Barbieri (ganso), 2021-04-12, reply:
  ah, yes. Sorry about all the confusion. Despite it being right in my face, I missed the "exclude=..." kwarg in get_hosts below and interpreted it as if it were not there. Please ignore items (2) and (3). In the end it is just the fact that the lock does not work because the requests are being serviced by different computes, and instances A and B can both query the DB at the same time and receive the same response; therefore either none or both end up passing the affinity check.

Line 1711, patch set 2 — melanie witt (melwitt), 2021-04-09 [on the draft comment "The only fix seems to be a DB lock to perform the check whenever setting the host field to an instance"]:
  I think I don't understand this part. If two instances are racing on different compute hosts and both have their host set, get_hosts for instance A would return host B and get_hosts for instance B would return host A (because of the 'exclude' kwarg below). Then self.host not in group_hosts would be true in both cases and both instances would be rejected.
Line 1711, patch set 2 — Rodrigo Barbieri (ganso), 2021-04-12, reply:
  see response to comment above. In the end, this cannot be fully solved without a DB lock. I am not familiar with the usage of DB locks in nova (I know cinder uses some, but they also use etcd).

  What could be done, perhaps, is to be more strict in the condition and fail more. Something like:

    if group_hosts and (self.host not in group_hosts or len(group_hosts) > 1):

  Currently what happens is that, when both instances A and B have passed the check due to the race condition, instances C, D, E and so on will see 2 hosts in the list. So if you are scheduling 10 instances, you will end up with around 5 instances on each of hosts A and B. By being more strict, we would have 2 successful builds and 8 failures in this case. It is slightly easier for the admin to spot the violation and fix it.
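For context, here is a minimal sketch of how the stricter condition suggested above would slot into the affinity branch quoted in the review context (group.get_hosts(exclude=[instance.uuid]) followed by the "self.host not in group_hosts" test). The raised exception and message are assumptions based on the RescheduledException handling discussed later in this review, not a verbatim copy of the patch:

    # Sketch only: affinity branch of the validation with the stricter check.
    group_hosts = group.get_hosts(exclude=[instance.uuid])

    # Original condition: fail only when the group already has hosts and this
    # host is not one of them. Two racing builds can both see an empty
    # group_hosts list and both pass.
    #
    # Stricter variant: also fail when the group already spans more than one
    # host, so later instances (C, D, E, ...) surface the violation instead of
    # being spread across both hosts A and B.
    if group_hosts and (self.host not in group_hosts or len(group_hosts) > 1):
        msg = ("Affinity server group policy was violated for instance %s"
               % instance.uuid)
        raise exception.RescheduledException(
            instance_uuid=instance.uuid, reason=msg)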
Line 1644, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on the TODO about passing request_spec in the live migration RPC payloads]:
  I'm not sure how that would work. If there was a concurrent operation, the data would be stale by the time we got to the compute node.

  We could get the server group from the request spec, sure, but group.members could be out of date if an instance that was added to the group has been spawned concurrently.

  You moved the setting of group outside of the lock here and above so that you could use the group uuid as a lock instead, but that in itself adds a race condition.

Line 1644, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  we would just need the group id for locking, that's all. All the code in this else clause is just for the case where we don't have the request_spec (live_migration, pre_live_migration, check_can_live_migrate_destination). By having the group id we can lock the following method and then refresh the group (for updated members and rules). So, it is just to save 1 DB call.

  Sorry, what race condition does changing the lock to the group uuid add? We are not doing any set/update or decision making outside of the locked code. (The locked code, however, is doing decision making with the stale data, but that will be addressed by refreshing the group within the locked code, as I mentioned in the other comments below.)

Line 1651, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on "@utils.synchronized(group['uuid'])" decorating the nested _do_validation function]:
  this is quite inefficient by the way: we are redefining the _do_validation function and re-decorating it on every invocation of _validate_instance_group_policy. It works, but this feels somewhat wasteful.

Line 1651, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  I don't understand how it is worse than it was before. Also, the group ID is the common resource we want locked. Alternatively we could lock based on group_id + hostname, but since the lock is already only local, there is no point. I would agree it is inefficient if it were a distributed lock (but then again, we may want a distributed lock to solve affinity, as I highlighted below, in that case).

Line 1651, patch set 3 — Balazs Gibizer (gibi), 2021-04-16, reply:
  Yeah, if it is inefficient then it was inefficient before, and this patch did not make it worse. Sure, it can be moved to the top level if we fear the cost of defining a function at each call. Btw, it is a common pattern in nova to define the locked function as a nested one.

Line 1651, patch set 3 — sean mooney (sean-k-mooney), 2021-04-22, reply:
  it was inefficient before, yes, and yep, the pattern is fairly common in nova, and every time I see it I think the same thing. We don't need to change this here. It just distracts me every time I see it.
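As an illustration of the alternative gibi mentions (moving the locked helper to the top level so it is not redefined and re-decorated on every call), a sketch is shown below. This is not what the patch does; it is one possible shape, and it assumes oslo.concurrency's lockutils.lock context manager as the locking primitive instead of the nova.utils.synchronized decorator used in the quoted code:

    from oslo_concurrency import lockutils

    from nova import exception
    from nova import objects


    def _do_validation(context, instance, group, host):
        # Placeholder for the checks performed inside the lock in the patch
        # (anti-affinity member counting and the affinity group_hosts check).
        pass


    class ComputeManager(object):  # illustrative fragment only

        def _validate_instance_group_policy(self, context, instance,
                                            scheduler_hints=None):
            try:
                group = objects.InstanceGroup.get_by_instance_uuid(
                    context, instance.uuid)
            except exception.InstanceGroupNotFound:
                return
            # One lock per server group, equivalent in effect to the
            # @utils.synchronized(group['uuid']) decorator on the nested
            # function, but the helper above is defined only once.
            with lockutils.lock(group.uuid):
                _do_validation(context, instance, group, self.host)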
Line 1672, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on "members = set(group.members)"]:
  so because you did not have the lock when you looked up the data, you need to actually look up the group members again here.

Line 1672, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  good point. I will add that to the locked code as a refresh.

Line 1679, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on "rules = group.rules"]:
  and you technically should look up the rules too, so on line 1653 the first thing you should do is refresh the group object by looking it up in the DB by uuid.

Line 1679, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  it is unlikely the rule would have changed through the API, as that would require human intervention. But yes, for completeness it makes sense. Will include that in the group refresh as stated earlier.
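A short sketch of the refresh agreed on above, inside the nested function quoted in the review context. The decorator and the anti-affinity branch are from the quoted snippet; the refresh line and the use of objects.InstanceGroup.get_by_uuid are assumptions about how the agreed change would look:

    @utils.synchronized(group['uuid'])
    def _do_validation(context, instance, group):
        # Re-read the group now that the lock is held, so that members and
        # rules reflect any instance added to the group (or any rule changed
        # through the API) after the unlocked lookup above. Sketch; assumes
        # get_by_uuid as the refresh call.
        group = objects.InstanceGroup.get_by_uuid(context, group.uuid)

        if group.policy and 'anti-affinity' == group.policy:
            members = set(group.members)   # fresh membership, per the review
            rules = group.rules            # fresh rules, per the review
            ...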
Line 1695, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on the comment "The only fix seems to be a DB lock to perform the check whenever setting the host field to an instance"]:
  a db lock really is not what we want; if we are locking in the db we could have just claimed things in the db instead. There are other ways to do this than a db lock.

  For example, we could use placement aggregates to model affinity and anti-affinity. We could also use a custom trait.

  The real cause of the current affinity issue is that we don't create the instance claims in the scheduler/conductor; they are done on the compute node. If we atomically created the instance claim earlier, we would not need the late affinity check, as only one request would be able to complete the claim.

  Using placement to do this with aggregates or traits would be an even better solution than doing the claim in the conductor/scheduler.

  The other way to resolve this is to do an actual distributed lock like cinder does, but I'm not sure that is what we want.

Line 1695, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  indeed, the ideal solution to these race conditions, one that would allow migrations to succeed without violating the policy, is a complete redesign of the (anti-)affinity feature on top of placement. We discussed this in a meeting and I signed up for the redesign work. Unfortunately, things turned out to be much more complex than I anticipated.

  I was almost finished with the redesign and PoC implementation using aggregates, but the fact that there are customizable rules ("max_server_per_host") per individual group makes it much more complex. I wouldn't propose a redesign that loses that functionality (even if it is not widely used).

  The redesign I came up with consisted of using the aggregate UUID as the group UUID, as the instances and resource providers need a resource in common to check against, and the generation ID of the resource providers ensured no two requests would perform the same action in parallel on different resource providers while sharing the aggregate (I added a new placement API to create/update the aggregate based on aggregate UUID instead of resource provider UUID). Traits would also require a new placement API to operate on them instead of on resource providers, so the two are almost equivalent, but the semantics using traits look too "hacky", while using aggregates makes much more logical sense. A lot of stitching was required to prevent the requests themselves from being racy, because it all relied on a cascade of successes:
  1) it would need to set the aggregate first (a parallel request would fail and then re-evaluate the candidates)
  2) it would then need to claim the allocations, which could fail if the host didn't have enough resources during the claim (they could have been claimed in parallel), in which case the aggregate update needs to be reverted
  3) finally the instance would need to build successfully on the compute, otherwise the claim needs to be reverted, and then the aggregate also reverted.

  Steps #2 and #3 can lead to exponential cascading failures, where the parallel request that previously failed step #1 would re-evaluate and agree to land on the same resource provider (if policy=affinity) as picked by the first request that created the aggregate (which, however, had not yet successfully built its instance). Then, if the first request failed step #2 or #3, not only would that need to be reverted, but the parallel requests would also need to fail and be reverted. Affinity ended up being much harder than anti-affinity.

  Finally, introducing max_server_per_host to the mix: the closest semantic was a resource semantic, like MEMORY_MB, VCPU, etc., that gets subtracted. So this is a variable that could be changed on the fly by the operator and then needs to be updated in placement on all the affected resource providers. Once you add it to the mix, you have a 4th item in the cascade above that needs to succeed, while also needing extra requests for adding the resource to the resource provider when appropriate and subsequently updating it. The more requests, the racier. I could solve part of those issues with some extra APIs to bundle requests together. However, the final nail in the complexity coffin was that there was no generation ID to prevent max_server_per_host from being racy, so the placement semantics would need to be updated to address that.

  Anyway, it turned out to be a tremendous amount of work, so I decided to try something simpler (this patch). For my customer's use case (which is only about anti-affinity), this patch solves the race condition problem.

Line 1695, patch set 3 — Balazs Gibizer (gibi), 2021-04-16, reply:
  Nice analysis Rodrigo! Thanks for documenting your findings from the PoC; it provides valuable insight into how complex affinity-in-placement is.

  @Sean: I do agree that we should try to do the right implementation. But I also agree with Rodrigo that if that implementation is so extremely costly, then we need to look into alternatives. This patch is a bugfix that solves some real operator problems. It can be used as a stop gap until we 1) find a solution that seems feasible to implement in placement and 2) find a developer who has time to do that implementation.

Line 1695, patch set 3 — sean mooney (sean-k-mooney), 2021-04-22, reply:
  ok, we can fix this without placement, by the way, by doing the instance claim in the conductor. We discussed this as a way to avoid tracking NUMA in placement too, and said we should try to do that for that use case.

  So yes, let's proceed with the interim fix and then revisit this.

Line 5224, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on the comment "the migration is already in-progress" above the _validate_instance_group_policy call before _prep_resize]:
  it would be in-progress in the db, but in reality the vm is still running on the source host at this point, right? We have not started the cold migration yet, so if we fail at this point and revert, it won't impact the running guest.
Line 5224, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  yes, and we are actually accounting for it twice on purpose, to be more strict. If the VM migration is in progress from host A to host B, the check considers the instance to be on both hosts at the same time while it runs. Therefore we don't want any new VMs, or anything else that would violate the policy, to be allowed on host A or host B.

Line 5224, patch set 3 — sean mooney (sean-k-mooney), 2021-04-22, reply:
  right, I was just pointing out that it is in progress from the nova db point of view, but from a libvirt/hypervisor point of view the cold migration has not started yet.
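To make the "counted on both hosts" point concrete, here is the anti-affinity counting logic reconstructed from the snippets quoted earlier in this review (around lines 1669-1682 of the patch), with explanatory comments added. The default max_server value and the final raise are assumptions, not quoted code:

    # Instances already running on this host, plus instances whose in-progress
    # migrations target this host. An instance being migrated therefore counts
    # against both its source and its destination host while the migration runs.
    total_instances = migration_vm_uuids | ins_on_host

    members = set(group.members)
    # Group members on this host other than the instance being validated.
    members_on_host = (total_instances & members -
                       set([instance.uuid]))

    rules = group.rules
    if rules and 'max_server_per_host' in rules:
        max_server = rules['max_server_per_host']
    else:
        max_server = 1  # assumed default when no explicit rule is set

    # Assumed shape of the violation check: refuse the new instance or
    # incoming migration if the host is already at its per-host limit.
    if len(members_on_host) >= max_server:
        msg = ("Anti-affinity server group policy was violated on host %s"
               % self.host)
        raise exception.RescheduledException(
            instance_uuid=instance.uuid, reason=msg)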
Line 7971, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on the comment "multiple requests may be happening concurrently and miss the lock ... error becomes NoValidHost and instance status remains ACTIVE" above the check in check_can_live_migrate_destination]:
  ya, this is racy. I'm conflicted on whether this is an improvement, as it is basically adding a bunch of upcalls to get the group info. When this destination host was selected it was a valid host, but at this point it could be invalid.

Line 7971, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  So, for live migration there are 3 checks in total:

  1) scheduler
  2) check_can_live_migrate_destination
  3) pre_live_migration

  We don't strictly need check #2, as it is not very accurate, but when it works it provides a better user experience. Without it, the migration request will always be accepted and fail afterwards if it violates the policy.

Line 8019, patch set 3 — sean mooney (sean-k-mooney), 2021-04-13 [on the _live_migration_claim signature]:
  it might be better to check the affinity here as part of the claim process.

Line 8019, patch set 3 — Rodrigo Barbieri (ganso), 2021-04-14, reply:
  this would work if (anti-)affinity were redesigned to use placement; see my other comment for context.

Line 8019, patch set 3 — sean mooney (sean-k-mooney), 2021-04-22, reply:
  actually I was referring to resource tracker claims, not placement allocations; they are two different but related things. Placement would be optimal, but doing claims in the conductor would also fix this without needing to use placement.

Line 1644, patch set 4 — Balazs Gibizer (gibi), 2021-04-16 [on the TODO about passing request_spec in the live migration RPC payloads]:
  Hm, I don't see how we can save any db call. We need to get the InstanceGroup object from the DB either by hint or by instance_uuid.

  // later

  Oh, you mean that based on the hint we know whether the instance belongs to any group at all, so we don't have to get the group from the DB. Yeah, that is probably true.

Line 1644, patch set 4 — Rodrigo Barbieri (ganso), 2021-04-16, reply:
  yes, this "else" clause wouldn't need to exist, as this method would always be invoked with the hint.

Line 1644, patch set 4 — Balazs Gibizer (gibi), 2021-04-22, reply [resolved]:
  Ack

Line 1651, patch set 4 — Balazs Gibizer (gibi), 2021-04-16 [on locking on group['uuid']]:
  yepp, this is better when some hints refer to the same group by name while others do so by uuid.
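A sketch of the lookup logic discussed in the line 1644 thread above: when the scheduler hints are available, the group hint alone tells us whether a group lookup is needed at all, which is the DB call the TODO wants to save for the live-migration paths. The get_by_instance_uuid fallback follows the snippet quoted in the review; the hint branch and objects.InstanceGroup.get_by_hint are assumptions used for illustration:

    if scheduler_hints is not None:
        # The caller (e.g. the resize path) passed the hints from the request
        # spec: if the instance is not in any group we can return without
        # touching the database at all.
        group_hint = scheduler_hints.get('group')
        if not group_hint:
            return
        group = objects.InstanceGroup.get_by_hint(context, group_hint)
    else:
        # live_migration, pre_live_migration and
        # check_can_live_migrate_destination do not carry the request spec in
        # their RPC payload today, so fall back to a DB lookup by instance.
        try:
            group = objects.InstanceGroup.get_by_instance_uuid(
                context, instance.uuid)
        except exception.InstanceGroupNotFound:
            return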
Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1ea8cf4b963df41cb7b29d955003af2da899e4d3","unresolved":true,"context_lines":[{"line_number":1659,"context_line":""},{"line_number":1660,"context_line":"                # instance param is just for logging, the nodename obtained is"},{"line_number":1661,"context_line":"                # not actually related to the instance at all"},{"line_number":1662,"context_line":"                nodename \u003d self._get_nodename(instance)"},{"line_number":1663,"context_line":""},{"line_number":1664,"context_line":"                # instances being migrated to host"},{"line_number":1665,"context_line":"                migrations \u003d ("}],"source_content_type":"text/x-python","patch_set":4,"id":"758cbb49_d6698618","line":1662,"updated":"2021-04-16 15:01:20.000000000","message":"this _get_nodename call needs a serious refactor. What is logs does not make any sense to me. (Can be done separately)","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1ea8cf4b963df41cb7b29d955003af2da899e4d3","unresolved":true,"context_lines":[{"line_number":5232,"context_line":"                try:"},{"line_number":5233,"context_line":"                    self._validate_instance_group_policy(context, instance,"},{"line_number":5234,"context_line":"                                                         scheduler_hints)"},{"line_number":5235,"context_line":"                except exception.RescheduledException as e:"},{"line_number":5236,"context_line":"                    raise exception.InstanceFaultRollback(inner_exception\u003de)"},{"line_number":5237,"context_line":""},{"line_number":5238,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":4,"id":"8a2c6dd1_2adf67c5","line":5235,"updated":"2021-04-16 15:01:20.000000000","message":"From the func documentation of pre_resize:\n\n\"If the claim fails then a reschedule to another host may be attempted which involves calling back to conductor to start the process over again.\"\n\nSo in theory we could initiate reschedule, see L5256 for the re-schedule triggering.\n\n// later\n\nlooking into _reschedule_resize_or_reraise below I think this exception does trigger re-schedule. 
But then we don\u0027t need to translate the exception here.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"32f714e790e30daee09e5321e15c6e9ec30f5576","unresolved":true,"context_lines":[{"line_number":5232,"context_line":"                try:"},{"line_number":5233,"context_line":"                    self._validate_instance_group_policy(context, instance,"},{"line_number":5234,"context_line":"                                                         scheduler_hints)"},{"line_number":5235,"context_line":"                except exception.RescheduledException as e:"},{"line_number":5236,"context_line":"                    raise exception.InstanceFaultRollback(inner_exception\u003de)"},{"line_number":5237,"context_line":""},{"line_number":5238,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":4,"id":"ecd5549e_729c0cd2","line":5235,"in_reply_to":"028d4ccc_0983c04a","updated":"2021-05-28 20:45:05.000000000","message":"The requirement for InstanceFaultRollback (as I understand) is explained in:\n\n[0] https://github.com/openstack/nova/blob/d5ed968826895d362f4f2aa21decfdebb9b1fd84/nova/compute/manager.py#L3423\n[1] https://github.com/openstack/nova/blob/d5ed968826895d362f4f2aa21decfdebb9b1fd84/nova/compute/manager.py#L5061\n[2] https://github.com/openstack/nova/blob/d5ed968826895d362f4f2aa21decfdebb9b1fd84/nova/compute/manager.py#L10067\n\nIf I understand it correctly, in [0] and [1] basically it says that if \"_error_out_instance_on_exception\" is used, which is the case here just above in line 5217, it needs to be encapsulated in a InstanceFaultRollback.\nIn [2], it explains that it will perform the state changes and re-raise the encapsulated exception. 
In the end, it is the RescheduledException that bubbles up (I assume this is what you were expecting).\n\nFollowing your suggestion, I removed the translation as you described (as per my understanding, basically removed the \"try:\" and the except block, letting the RescheduledException bubble up), but the end result was that the instance ended in ERROR state.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"6dc6ce43ca7751dd854f553330ff4be705549096","unresolved":true,"context_lines":[{"line_number":5232,"context_line":"                try:"},{"line_number":5233,"context_line":"                    self._validate_instance_group_policy(context, instance,"},{"line_number":5234,"context_line":"                                                         scheduler_hints)"},{"line_number":5235,"context_line":"                except exception.RescheduledException as e:"},{"line_number":5236,"context_line":"                    raise exception.InstanceFaultRollback(inner_exception\u003de)"},{"line_number":5237,"context_line":""},{"line_number":5238,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":4,"id":"028d4ccc_0983c04a","line":5235,"in_reply_to":"574df7ea_10cbb0ab","updated":"2021-04-22 09:02:42.000000000","message":"I don\u0027t see why we need to wrap the RescheduledException into a InstanceFaultRollback as far as I see _reschedule_resize_or_reraise() does not care about the type of the exception.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"0c380d7a6faeeaebd7fbb43ade6ee5e6eadc1ae6","unresolved":true,"context_lines":[{"line_number":5232,"context_line":"                try:"},{"line_number":5233,"context_line":"                    self._validate_instance_group_policy(context, instance,"},{"line_number":5234,"context_line":"                                                         scheduler_hints)"},{"line_number":5235,"context_line":"                except exception.RescheduledException as e:"},{"line_number":5236,"context_line":"                    raise exception.InstanceFaultRollback(inner_exception\u003de)"},{"line_number":5237,"context_line":""},{"line_number":5238,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":4,"id":"574df7ea_10cbb0ab","line":5235,"in_reply_to":"8a2c6dd1_2adf67c5","updated":"2021-04-16 17:39:48.000000000","message":"yes it triggers re-schedule but it needs to be a InstanceFaultRollback with the RescheduledException as an inner exception. Sorry I didn\u0027t understand the part \"But then we don\u0027t need to translate the exception here\". 
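For reference, the wrapping debated in the thread above, as quoted in the patch set 4 context; the code lines are from the quoted snippet, while the comments summarize the thread's conclusion and are not part of the patch:

    try:
        self._validate_instance_group_policy(context, instance,
                                             scheduler_hints)
    except exception.RescheduledException as e:
        # Per the discussion above: _reschedule_resize_or_reraise does not
        # care about the exception type, so the reschedule happens either
        # way. Wrapping in InstanceFaultRollback is what lets the
        # surrounding _error_out_instance_on_exception clean up properly
        # (recording the fault and re-raising the inner
        # RescheduledException) if the reschedule fails; letting the
        # RescheduledException bubble up directly was tried and left the
        # instance in ERROR state.
        raise exception.InstanceFaultRollback(inner_exception=e)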
What do you mean by translate?","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"fa40990eedc3b81567d04bdb2db03734e6dd6d94","unresolved":false,"context_lines":[{"line_number":5232,"context_line":"                try:"},{"line_number":5233,"context_line":"                    self._validate_instance_group_policy(context, instance,"},{"line_number":5234,"context_line":"                                                         scheduler_hints)"},{"line_number":5235,"context_line":"                except exception.RescheduledException as e:"},{"line_number":5236,"context_line":"                    raise exception.InstanceFaultRollback(inner_exception\u003de)"},{"line_number":5237,"context_line":""},{"line_number":5238,"context_line":"                self._prep_resize(context, image, instance,"}],"source_content_type":"text/x-python","patch_set":4,"id":"0648bf6a_66d38c8c","line":5235,"in_reply_to":"ecd5549e_729c0cd2","updated":"2021-05-31 16:52:25.000000000","message":"Thanks. You are correct. I was misled by the fact that we use RescheduledException but the InstanceFaultRollback is not here to drive the reschedule but to properly clean up if the reschedule fails.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"32f714e790e30daee09e5321e15c6e9ec30f5576","unresolved":true,"context_lines":[{"line_number":7975,"context_line":"        # NoValidHost and instance status remains ACTIVE."},{"line_number":7976,"context_line":"        try:"},{"line_number":7977,"context_line":"            self._validate_instance_group_policy(ctxt, instance)"},{"line_number":7978,"context_line":"        except exception.RescheduledException as e:"},{"line_number":7979,"context_line":"            msg \u003d (\"Failed to validate instance group policy \""},{"line_number":7980,"context_line":"                   \"due to: {}\".format(e))"},{"line_number":7981,"context_line":"            raise exception.MigrationPreCheckError(reason\u003dmsg)"}],"source_content_type":"text/x-python","patch_set":4,"id":"c0675ed9_550b2901","line":7978,"range":{"start_line":7978,"start_character":24,"end_line":7978,"end_character":44},"updated":"2021-05-28 20:45:05.000000000","message":"this is the one that gets rescheduled, only when it is detected early while the checks are being performed for each host returned by the scheduler.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":11604,"name":"sean mooney","email":"smooney@redhat.com","username":"sean-k-mooney"},"change_message_id":"2019dcf78165bdd0c838c63c339dd40601a31316","unresolved":true,"context_lines":[{"line_number":7975,"context_line":"        # NoValidHost and instance status remains ACTIVE."},{"line_number":7976,"context_line":"        try:"},{"line_number":7977,"context_line":"            self._validate_instance_group_policy(ctxt, instance)"},{"line_number":7978,"context_line":"        except exception.RescheduledException as e:"},{"line_number":7979,"context_line":"            msg \u003d (\"Failed to validate instance group policy \""},{"line_number":7980,"context_line":"                   \"due to: {}\".format(e))"},{"line_number":7981,"context_line":"            raise 
exception.MigrationPreCheckError(reason\u003dmsg)"}],"source_content_type":"text/x-python","patch_set":4,"id":"9dc2e6df_53f4f324","line":7978,"range":{"start_line":7978,"start_character":24,"end_line":7978,"end_character":44},"in_reply_to":"c0675ed9_550b2901","updated":"2021-06-01 13:49:30.000000000","message":"Ack","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"1ea8cf4b963df41cb7b29d955003af2da899e4d3","unresolved":true,"context_lines":[{"line_number":8123,"context_line":"        # so this is the definitive moment to abort due to the policy"},{"line_number":8124,"context_line":"        # violation. Also, it should be safe to explode here. The instance"},{"line_number":8125,"context_line":"        # status remains ACTIVE, migration status failed."},{"line_number":8126,"context_line":"        self._validate_instance_group_policy(context, instance)"},{"line_number":8127,"context_line":""},{"line_number":8128,"context_line":"        migrate_data.old_vol_attachment_ids \u003d {}"},{"line_number":8129,"context_line":"        bdms \u003d objects.BlockDeviceMappingList.get_by_instance_uuid("}],"source_content_type":"text/x-python","patch_set":4,"id":"7c953fcb_c87cb7f4","line":8126,"updated":"2021-04-16 15:01:20.000000000","message":"OK, the caller of this RPC has an except Exception block to handle the RescheduledException cleanly.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":14567,"name":"Rodrigo Barbieri","email":"rodrigo.barbieri2010@gmail.com","username":"ganso"},"change_message_id":"32f714e790e30daee09e5321e15c6e9ec30f5576","unresolved":true,"context_lines":[{"line_number":8123,"context_line":"        # so this is the definitive moment to abort due to the policy"},{"line_number":8124,"context_line":"        # violation. Also, it should be safe to explode here. The instance"},{"line_number":8125,"context_line":"        # status remains ACTIVE, migration status failed."},{"line_number":8126,"context_line":"        self._validate_instance_group_policy(context, instance)"},{"line_number":8127,"context_line":""},{"line_number":8128,"context_line":"        migrate_data.old_vol_attachment_ids \u003d {}"},{"line_number":8129,"context_line":"        bdms \u003d objects.BlockDeviceMappingList.get_by_instance_uuid("}],"source_content_type":"text/x-python","patch_set":4,"id":"d43edc8c_497c6a44","line":8126,"in_reply_to":"7c953fcb_c87cb7f4","updated":"2021-05-28 20:45:05.000000000","message":"actually no, I think you misinterpreted this. There is no RescheduledException handled in pre_live_migration. Once it reaches this point, it is considered as \"started\", therefore any failure will trigger cleanup, status reverts back to ACTIVE, and migration is considered failed. It is not re-scheduled.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"},{"author":{"_account_id":9708,"name":"Balazs Gibizer","display_name":"gibi","email":"gibizer@gmail.com","username":"gibi"},"change_message_id":"fa40990eedc3b81567d04bdb2db03734e6dd6d94","unresolved":false,"context_lines":[{"line_number":8123,"context_line":"        # so this is the definitive moment to abort due to the policy"},{"line_number":8124,"context_line":"        # violation. Also, it should be safe to explode here. 
The instance"},{"line_number":8125,"context_line":"        # status remains ACTIVE, migration status failed."},{"line_number":8126,"context_line":"        self._validate_instance_group_policy(context, instance)"},{"line_number":8127,"context_line":""},{"line_number":8128,"context_line":"        migrate_data.old_vol_attachment_ids \u003d {}"},{"line_number":8129,"context_line":"        bdms \u003d objects.BlockDeviceMappingList.get_by_instance_uuid("}],"source_content_type":"text/x-python","patch_set":4,"id":"6386a405_f6c9c631","line":8126,"in_reply_to":"d43edc8c_497c6a44","updated":"2021-05-31 16:52:25.000000000","message":"We are on the same page with this. I meant that there is an Exception handling block that cleans up if we are failing here. I did not mean that there will be re-scheduling.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"}],"releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml":[{"author":{"_account_id":7166,"name":"Sylvain Bauza","email":"sbauza@redhat.com","username":"sbauza"},"change_message_id":"6b0d612284dd474e1a5a03593d90bee0f3165706","unresolved":true,"context_lines":[{"line_number":3,"context_line":"  - |"},{"line_number":4,"context_line":"    Improved detection of anti-affinity policy violation when performing live"},{"line_number":5,"context_line":"    and cold migrations. Most of the violations caused by race conditions due"},{"line_number":6,"context_line":"    to performing concurrent live or cold migrations should now be addressed."}],"source_content_type":"text/x-yaml","patch_set":4,"id":"077034f9_53774645","line":6,"updated":"2021-06-01 14:00:08.000000000","message":"nit : As gibi and you discussed, you should be explaining why sometimes we reschedule in case of a violation and sometimes not. This could be a follow-up.","commit_id":"d060e30dd1af16e45ecd5994831965f5fdd0c5cc"}]}
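
Sketch of the resize-path pattern that the comments on manager.py line 5235 converge on. The exception classes and the _validate_instance_group_policy / _error_out_instance_on_exception / _prep_resize names are taken from the review context above; the free-standing function below is a simplified placeholder, not the actual Nova method. The late anti-affinity re-check raises RescheduledException, and wrapping it in InstanceFaultRollback lets the surrounding _error_out_instance_on_exception() handling record the fault, perform the rollback state changes, and re-raise the inner RescheduledException instead of leaving the instance in ERROR.

# Sketch only, simplified from the diff quoted in the review context above.
from nova import exception


def check_policy_then_prep_resize(manager, context, image, instance,
                                  scheduler_hints):
    # Late re-check of the server group policy. The scheduler already
    # enforced it, but a concurrent migration can still violate
    # anti-affinity by the time execution reaches this point.
    try:
        manager._validate_instance_group_policy(context, instance,
                                                scheduler_hints)
    except exception.RescheduledException as e:
        # Per the review discussion: because _error_out_instance_on_exception
        # is used higher up in the real method, the exception must be
        # encapsulated in an InstanceFaultRollback, which performs the
        # rollback/state changes and re-raises the inner
        # RescheduledException; without the wrap the instance ends up
        # in ERROR.
        raise exception.InstanceFaultRollback(inner_exception=e)

    manager._prep_resize(context, image, instance, scheduler_hints)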
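For the live-migration path (manager.py lines 7978 and 8126 in the review context), the same helper is invoked at two points with different failure semantics, sketched below with abbreviated signatures (the real methods take additional parameters). In check_can_live_migrate_destination() the RescheduledException is translated into MigrationPreCheckError, which surfaces as NoValidHost so another destination can be tried while the instance stays ACTIVE; in pre_live_migration() the migration is already in progress, so a violation simply raises, the caller's exception handling cleans up, the migration is marked failed, and the instance remains ACTIVE with no reschedule.

# Sketch only; signatures abbreviated relative to the real ComputeManager
# methods, and _validate_instance_group_policy is assumed to be the helper
# shown in the review context.
from nova import exception


def check_can_live_migrate_destination(manager, ctxt, instance):
    # Early check, while the scheduler-selected destination is validated.
    # MigrationPreCheckError here means "this destination is not valid";
    # the instance status remains ACTIVE and another host can be tried.
    try:
        manager._validate_instance_group_policy(ctxt, instance)
    except exception.RescheduledException as e:
        msg = ("Failed to validate instance group policy "
               "due to: {}".format(e))
        raise exception.MigrationPreCheckError(reason=msg)


def pre_live_migration(manager, context, instance, migrate_data):
    # Definitive, in-progress check: a violation here is not re-scheduled.
    # The exception propagates, the caller's cleanup runs, the migration
    # status becomes "failed", and the instance stays ACTIVE.
    manager._validate_instance_group_policy(context, instance)
    return migrate_data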
