These changes are the raw update of the kernel sources to linux-4.4.6-rt14.
[kvmfornfv.git] kernel/drivers/hv/channel_mgmt.c
index 0eeb1b3..652afd1 100644
@@ -32,6 +32,9 @@
 
 #include "hyperv_vmbus.h"
 
+static void init_vp_index(struct vmbus_channel *channel,
+                         const uuid_le *type_guid);
+
 /**
  * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
  * @icmsghdrp: Pointer to msg header structure
@@ -201,22 +204,38 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
                spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
                list_del(&channel->listentry);
                spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+
+               primary_channel = channel;
        } else {
                primary_channel = channel->primary_channel;
                spin_lock_irqsave(&primary_channel->lock, flags);
                list_del(&channel->sc_list);
+               primary_channel->num_sc--;
                spin_unlock_irqrestore(&primary_channel->lock, flags);
        }
+
+       /*
+        * We need to free the bit for init_vp_index() to work in the case
+        * of sub-channel, when we reload drivers like hv_netvsc.
+        */
+       cpumask_clear_cpu(channel->target_cpu,
+                         &primary_channel->alloced_cpus_in_node);
+
        free_channel(channel);
 }
 
 void vmbus_free_channels(void)
 {
-       struct vmbus_channel *channel;
+       struct vmbus_channel *channel, *tmp;
+
+       list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
+               listentry) {
+               /* if we don't set rescind to true, vmbus_close_internal()
+                * won't invoke hv_process_channel_removal().
+                */
+               channel->rescind = true;
 
-       list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
                vmbus_device_unregister(channel->device_obj);
-               free_channel(channel);
        }
 }
 
@@ -228,7 +247,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 {
        struct vmbus_channel *channel;
        bool fnew = true;
-       bool enq = false;
        unsigned long flags;
 
        /* Make sure this is a new offer */
@@ -244,25 +262,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
                }
        }
 
-       if (fnew) {
+       if (fnew)
                list_add_tail(&newchannel->listentry,
                              &vmbus_connection.chn_list);
-               enq = true;
-       }
 
        spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
 
-       if (enq) {
-               if (newchannel->target_cpu != get_cpu()) {
-                       put_cpu();
-                       smp_call_function_single(newchannel->target_cpu,
-                                                percpu_channel_enq,
-                                                newchannel, true);
-               } else {
-                       percpu_channel_enq(newchannel);
-                       put_cpu();
-               }
-       }
        if (!fnew) {
                /*
                 * Check to see if this is a sub-channel.
@@ -274,27 +279,22 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
                        newchannel->primary_channel = channel;
                        spin_lock_irqsave(&channel->lock, flags);
                        list_add_tail(&newchannel->sc_list, &channel->sc_list);
-                       spin_unlock_irqrestore(&channel->lock, flags);
-
-                       if (newchannel->target_cpu != get_cpu()) {
-                               put_cpu();
-                               smp_call_function_single(newchannel->target_cpu,
-                                                        percpu_channel_enq,
-                                                        newchannel, true);
-                       } else {
-                               percpu_channel_enq(newchannel);
-                               put_cpu();
-                       }
-
-                       newchannel->state = CHANNEL_OPEN_STATE;
                        channel->num_sc++;
-                       if (channel->sc_creation_callback != NULL)
-                               channel->sc_creation_callback(newchannel);
+                       spin_unlock_irqrestore(&channel->lock, flags);
+               } else
+                       goto err_free_chan;
+       }
 
-                       return;
-               }
+       init_vp_index(newchannel, &newchannel->offermsg.offer.if_type);
 
-               goto err_free_chan;
+       if (newchannel->target_cpu != get_cpu()) {
+               put_cpu();
+               smp_call_function_single(newchannel->target_cpu,
+                                        percpu_channel_enq,
+                                        newchannel, true);
+       } else {
+               percpu_channel_enq(newchannel);
+               put_cpu();
        }
 
        /*
@@ -304,6 +304,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
         */
        newchannel->state = CHANNEL_OPEN_STATE;
 
+       if (!fnew) {
+               if (channel->sc_creation_callback != NULL)
+                       channel->sc_creation_callback(newchannel);
+               return;
+       }
+
        /*
         * Start the process of binding this offer to the driver
         * We need to set the DeviceObject field before calling
@@ -351,6 +357,7 @@ enum {
        IDE = 0,
        SCSI,
        NIC,
+       ND_NIC,
        MAX_PERF_CHN,
 };
 
@@ -374,23 +381,28 @@ static const struct hv_vmbus_device_id hp_devs[] = {
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
-static u32  next_vp;
+static int next_numa_node_id;
 
 /*
  * Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
  */
 static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
 {
        u32 cur_cpu;
        int i;
        bool perf_chn = false;
-       u32 max_cpus = num_online_cpus();
+       struct vmbus_channel *primary = channel->primary_channel;
+       int next_node;
+       struct cpumask available_mask;
+       struct cpumask *alloced_mask;
 
        for (i = IDE; i < MAX_PERF_CHN; i++) {
                if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -407,15 +419,103 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
                 * Also if the channel is not a performance critical
                 * channel, bind it to cpu 0.
                 */
+               channel->numa_node = 0;
                channel->target_cpu = 0;
-               channel->target_vp = 0;
+               channel->target_vp = hv_context.vp_index[0];
                return;
        }
-       cur_cpu = (++next_vp % max_cpus);
+
+       /*
+        * We distribute primary channels evenly across all the available
+        * NUMA nodes and within the assigned NUMA node we will assign the
+        * first available CPU to the primary channel.
+        * The sub-channels will be assigned to the CPUs available in the
+        * NUMA node evenly.
+        */
+       if (!primary) {
+               while (true) {
+                       next_node = next_numa_node_id++;
+                       if (next_node == nr_node_ids)
+                               next_node = next_numa_node_id = 0;
+                       if (cpumask_empty(cpumask_of_node(next_node)))
+                               continue;
+                       break;
+               }
+               channel->numa_node = next_node;
+               primary = channel;
+       }
+       alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+
+       if (cpumask_weight(alloced_mask) ==
+           cpumask_weight(cpumask_of_node(primary->numa_node))) {
+               /*
+                * We have cycled through all the CPUs in the node;
+                * reset the alloced map.
+                */
+               cpumask_clear(alloced_mask);
+       }
+
+       cpumask_xor(&available_mask, alloced_mask,
+                   cpumask_of_node(primary->numa_node));
+
+       cur_cpu = -1;
+       while (true) {
+               cur_cpu = cpumask_next(cur_cpu, &available_mask);
+               if (cur_cpu >= nr_cpu_ids) {
+                       cur_cpu = -1;
+                       cpumask_copy(&available_mask,
+                                    cpumask_of_node(primary->numa_node));
+                       continue;
+               }
+
+               /*
+                * NOTE: in the case of sub-channel, we clear the sub-channel
+                * related bit(s) in primary->alloced_cpus_in_node in
+                * hv_process_channel_removal(), so when we reload drivers
+                * like hv_netvsc in SMP guest, here we're able to re-allocate
+                * bit from primary->alloced_cpus_in_node.
+                */
+               if (!cpumask_test_cpu(cur_cpu,
+                               &primary->alloced_cpus_in_node)) {
+                       cpumask_set_cpu(cur_cpu,
+                                       &primary->alloced_cpus_in_node);
+                       cpumask_set_cpu(cur_cpu, alloced_mask);
+                       break;
+               }
+       }
+
        channel->target_cpu = cur_cpu;
        channel->target_vp = hv_context.vp_index[cur_cpu];
 }
 
+/*
+ * vmbus_unload_response - Handler for the unload response.
+ */
+static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
+{
+       /*
+        * This is a global event; just wakeup the waiting thread.
+        * Once we successfully unload, we can cleanup the monitor state.
+        */
+       complete(&vmbus_connection.unload_event);
+}
+
+void vmbus_initiate_unload(void)
+{
+       struct vmbus_channel_message_header hdr;
+
+       /* Pre-Win2012R2 hosts don't support reconnect */
+       if (vmbus_proto_version < VERSION_WIN8_1)
+               return;
+
+       init_completion(&vmbus_connection.unload_event);
+       memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
+       hdr.msgtype = CHANNELMSG_UNLOAD;
+       vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header));
+
+       wait_for_completion(&vmbus_connection.unload_event);
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
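
The placement policy implemented above is easier to follow with a concrete model. Below is a hedged, self-contained userspace simulation with hypothetical counts (2 nodes of 4 CPUs), plain bitmask ints instead of cpumasks, hv_numa_map and alloced_cpus_in_node collapsed into one map, and no empty-node check; it only illustrates the round-robin order, not the kernel implementation:

/* Hypothetical userspace model of the NUMA-aware channel placement. */
#include <stdio.h>

#define NR_NODES      2
#define CPUS_PER_NODE 4

static int next_node;			/* models next_numa_node_id        */
static int alloced[NR_NODES];		/* models the per-node CPU bitmaps */

/* Pick a node for a primary channel, round-robin across nodes. */
static int pick_node(void)
{
	return next_node++ % NR_NODES;
}

/* Pick a CPU inside 'node', cycling through that node's CPUs. */
static int pick_cpu(int node)
{
	int cpu;

	if (alloced[node] == (1 << CPUS_PER_NODE) - 1)
		alloced[node] = 0;	/* all CPUs used: reset, like cpumask_clear() */

	for (cpu = 0; cpu < CPUS_PER_NODE; cpu++) {
		if (!(alloced[node] & (1 << cpu))) {
			alloced[node] |= 1 << cpu;
			return node * CPUS_PER_NODE + cpu;
		}
	}
	return node * CPUS_PER_NODE;	/* not reached */
}

int main(void)
{
	/* One NIC: a primary channel plus three sub-channels. */
	int node = pick_node();

	printf("primary -> cpu %d\n", pick_cpu(node));
	for (int i = 0; i < 3; i++)
		printf("subchan -> cpu %d\n", pick_cpu(node));
	return 0;
}

With these assumed counts, the primary lands on CPU 0 of node 0 and its sub-channels on CPUs 1-3 of the same node; a second device's primary would go to node 1.
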
@@ -461,8 +561,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
                                offer->connection_id;
        }
 
-       init_vp_index(newchannel, &offer->offer.if_type);
-
        memcpy(&newchannel->offermsg, offer,
               sizeof(struct vmbus_channel_offer_channel));
        newchannel->monitor_grp = (u8)offer->monitorid / 32;
@@ -712,6 +810,7 @@ struct vmbus_channel_message_table_entry
        {CHANNELMSG_INITIATE_CONTACT,           0, NULL},
        {CHANNELMSG_VERSION_RESPONSE,           1, vmbus_onversion_response},
        {CHANNELMSG_UNLOAD,                     0, NULL},
+       {CHANNELMSG_UNLOAD_RESPONSE,            1, vmbus_unload_response},
 };
 
 /*