To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 17/17] Add support for automatically creating and destroying bypass rings in response to observed traffic.
From: <steven.smith@xxxxxxxxxx>
Date: Sun, 4 Oct 2009 16:04:33 +0100
Cc: Steven Smith <steven.smith@xxxxxxxxxx>, keir.frasier@xxxxxxxxxx, jeremy@xxxxxxxx, joserenato.santos@xxxxxx
In-reply-to: <cover.1254667618.git.ssmith@xxxxxxxxxxxxxxxxxxxxxxxxxx>
References: <cover.1254667618.git.ssmith@xxxxxxxxxxxxxxxxxxxxxxxxxx>

This is designed to minimise the overhead of the autobypass state
machine, and in particular to minimise the overhead in dom0,
potentially at the cost of not always detecting that a bypass would
be useful.  In particular, it isn't triggered by
transmit_policy_small packets, so if your traffic consists mostly of
very small packets then no bypass will be created.
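
With the constants used in autobypass.c (AUTOBYPASS_PERIOD = 1024,
AUTOBYPASS_RATIO = 2, AUTOBYPASS_RATIO2 = 4,
AUTOBYPASS_MAX_PERIOD_JIFFIES = HZ/2), a suggestion is only sent once
roughly 1024 bypass-candidate packets have arrived within half a
second (about 2048 packets per second) in two consecutive windows,
with at most twice as many non-candidate packets over the same
interval, and with a single source MAC accounting for more than a
quarter of the window's traffic.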

Signed-off-by: Steven Smith <steven.smith@xxxxxxxxxx>
---
 drivers/net/Kconfig                            |    8 +
 drivers/net/xen-netchannel2/Makefile           |    4 +
 drivers/net/xen-netchannel2/autobypass.c       |  316 ++++++++++++++++++++++++
 drivers/net/xen-netchannel2/bypass.c           |   40 +++-
 drivers/net/xen-netchannel2/bypassee.c         |   65 +++++
 drivers/net/xen-netchannel2/chan.c             |    6 +
 drivers/net/xen-netchannel2/netchannel2_core.h |  108 ++++++++-
 drivers/net/xen-netchannel2/recv_packet.c      |   13 +
 drivers/net/xen-netchannel2/rscb.c             |   19 ++
 include/xen/interface/io/netchannel2.h         |   13 +
 10 files changed, 585 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/xen-netchannel2/autobypass.c
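
As an aside for reviewers, here is a condensed userspace model of the
detection state machine implemented by autobypass.c below.  It is a
sketch rather than the driver code: names are shortened, locking is
omitted, the jiffies-based window timing is reduced to a
caller-supplied took_too_long flag, and the window total is taken
explicitly before the counters are reset.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PERIOD  1024    /* AUTOBYPASS_PERIOD */
#define RATIO   2       /* AUTOBYPASS_RATIO */
#define RATIO2  4       /* AUTOBYPASS_RATIO2 */
#define MAX_HOT 8       /* AUTOBYPASS_MAX_HOT_MACS */

enum ab_state { AB_NORMAL, AB_CONSIDERING, AB_DEBOUNCE };

struct ab {
        enum ab_state state;
        uint64_t nr_bypass, nr_non_bypass;
        unsigned nr_hot;
        struct { unsigned char mac[6]; uint16_t count; } hot[MAX_HOT];
};

/* A window has ended: decide whether it was busy enough to be worth
   acting on, then restart the counters. */
static int window_busy(struct ab *ab, int took_too_long)
{
        int busy = !took_too_long &&
                ab->nr_non_bypass <= (uint64_t)PERIOD * RATIO;
        ab->nr_bypass = ab->nr_non_bypass = 0;
        return busy;
}

/* Count a candidate against its source MAC; once the table is full,
   further MACs are ignored for this window. */
static void count_mac(struct ab *ab, const unsigned char *mac)
{
        unsigned x;

        for (x = 0; x < ab->nr_hot; x++)
                if (!memcmp(ab->hot[x].mac, mac, 6)) {
                        ab->hot[x].count++;
                        return;
                }
        if (x == MAX_HOT)
                return;
        memcpy(ab->hot[x].mac, mac, 6);
        ab->hot[x].count = 1;
        ab->nr_hot++;
}

/* One bypass-candidate packet received from @mac.  Non-candidate
   packets would just increment nr_non_bypass. */
static void rx_candidate(struct ab *ab, const unsigned char *mac,
                         int took_too_long)
{
        uint64_t total;
        unsigned x;

        ab->nr_bypass++;
        switch (ab->state) {
        case AB_NORMAL:
                if (ab->nr_bypass != PERIOD)
                        return;
                if (window_busy(ab, took_too_long)) {
                        ab->nr_hot = 0;
                        ab->state = AB_CONSIDERING;
                }
                return;
        case AB_CONSIDERING:
                count_mac(ab, mac);
                if (ab->nr_bypass != PERIOD)
                        return;
                /* Take the window total before window_busy() resets
                   the counters; the suggestion threshold is a
                   fraction of this window's traffic. */
                total = ab->nr_bypass + ab->nr_non_bypass;
                if (window_busy(ab, took_too_long))
                        for (x = 0; x < ab->nr_hot; x++)
                                if (ab->hot[x].count > total / RATIO2)
                                        printf("suggest bypass to peer ending %02x\n",
                                               ab->hot[x].mac[5]);
                ab->state = AB_DEBOUNCE;
                return;
        case AB_DEBOUNCE:
                if (ab->nr_bypass == PERIOD) {
                        ab->state = AB_NORMAL;
                        ab->nr_bypass = ab->nr_non_bypass = 0;
                }
                return;
        }
}

int main(void)
{
        struct ab ab = { .state = AB_NORMAL };
        const unsigned char peer[6] = { 0x02, 0, 0, 0, 0, 0x01 };
        int i;

        /* A sustained flow dominated by one peer: prints a single
           suggestion at the end of the CONSIDERING window. */
        for (i = 0; i < 3 * PERIOD; i++)
                rx_candidate(&ab, peer, 0);
        return 0;
}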

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9ac12a8..eb02e57 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2818,6 +2818,14 @@ config XEN_NETDEV2_BYPASS_ENDPOINT
          Bypasses allow faster inter-domain communication, provided
          every VM supports them.
 
+config XEN_NETDEV2_AUTOMATIC_BYPASS
+       bool "Automatically manage netchannel2 bypasses"
+       depends on XEN_NETDEV2_BYPASS_ENDPOINT
+       default y
+       help
+         Try to detect when bypasses would be useful, and manage
+         them automatically.
+
 config ISERIES_VETH
        tristate "iSeries Virtual Ethernet driver support"
        depends on PPC_ISERIES
diff --git a/drivers/net/xen-netchannel2/Makefile b/drivers/net/xen-netchannel2/Makefile
index 5aa3410..9c4f97a 100644
--- a/drivers/net/xen-netchannel2/Makefile
+++ b/drivers/net/xen-netchannel2/Makefile
@@ -19,3 +19,7 @@ endif
 ifeq ($(CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT),y)
 netchannel2-objs += bypass.o
 endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS),y)
+netchannel2-objs += autobypass.o
+endif
diff --git a/drivers/net/xen-netchannel2/autobypass.c b/drivers/net/xen-netchannel2/autobypass.c
new file mode 100644
index 0000000..c83dac6
--- /dev/null
+++ b/drivers/net/xen-netchannel2/autobypass.c
@@ -0,0 +1,316 @@
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include "netchannel2_core.h"
+
+/* The state machine works like this:
+
+   -- We start in state NORMAL.  In this state, we count how many
+      bypass and non-bypass packets we receive, and don't do anything
+      else.
+
+   -- After receiving AUTOBYPASS_PERIOD packets, we look at the
+      bypass-candidate to non-bypass-candidate ratio.  If the number
+      of non-bypass packets exceeds the number of bypass packets by
+      more than a factor of AUTOBYPASS_RATIO, reset the counters and
+      go back to state NORMAL. Otherwise, go to state CONSIDERING.
+      We also reset and go back to normal if it took more than
+      AUTOBYPASS_MAX_PERIOD_JIFFIES jiffies to get here.
+
+   -- In state CONSIDERING, continue to count up the bypass and
+      non-bypass packets.  In addition, whenever we get a bypass
+      packet, pull the source MAC address out of the header and
+      compare it to the hot list.  If it's in the hot list, increment
+      that entry's count.
+
+   -- After another AUTOBYPASS_PERIOD, check the packet counts again.
+      Provided the total bypass ratio is good enough (see the NORMAL
+      exit criteria), walk the hot list, and if any entry accounts for
+      more than AUTOBYPASS_RATIO2 of the total traffic, suggest to
+      dom0 that it create a new bypass for us.  Then go to DEBOUNCE.
+
+   -- In DEBOUNCE, wait until we've received at least
+      AUTOBYPASS_DEBOUNCE_PERIOD bypass packets, then go to NORMAL.
+
+   So, we establish a bypass if total traffic exceeds PERIOD packets
+   per MAX_PERIOD jiffies, of which at least 1/(RATIO+1) are bypass
+   candidates and more than 1/RATIO2 are for one specific peer.
+   This needs to be sustained for at least PERIOD*2 packets before
+   we actually establish a bypass.
+*/
+
+/* If you increase this past 65536, consider changing the type of
+   auto_bypass.hot_macs[...].count, to avoid overflow. */
+#define AUTOBYPASS_PERIOD 1024
+#define AUTOBYPASS_RATIO 2
+#define AUTOBYPASS_RATIO2 4
+#define AUTOBYPASS_DEBOUNCE_PERIOD 1024
+#define AUTOBYPASS_MAX_PERIOD_JIFFIES (HZ/2)
+
+
+#define TEARDOWN_PERIOD_JIFFIES (HZ*5)
+#define TEARDOWN_MIN_PACKETS (256*TEARDOWN_PERIOD_JIFFIES)
+
+static void autoteardown_timer_fn(unsigned long ignore);
+
+static DEFINE_SPINLOCK(autoteardown_lock);
+static LIST_HEAD(autoteardown_list);
+static DEFINE_TIMER(autoteardown_timer, autoteardown_timer_fn, 0, 0);
+
+static void autoteardown_timer_fn(unsigned long ignore)
+{
+       struct nc2_alternate_ring *nar;
+
+       spin_lock(&autoteardown_lock);
+       list_for_each_entry(nar, &autoteardown_list,
+                           autoteardown.autoteardown_list) {
+               if (nar->autoteardown.seen_count < 2) {
+                       /* Give it at least two periods to get started,
+                          to avoid flapping. */
+                       /* One period isn't enough, because we reset
+                          the seen_count without holding the teardown
+                          lock from
+                          nc2_aux_ring_start_disable_sequence, and
+                          there's a risk that we'll see it non-zero
+                          when it should be zero.  However, the
+                          chances of that happening twice in a row
+                          are so small that we can ignore them.  Even
+                          if it does go wrong twice, the worst case
+                          is that we drop a few packets by forcing a
+                          detach when the remote is behaving
+                          correctly. */
+                       nar->autoteardown.seen_count++;
+                       continue;
+               }
+               switch (nar->state) {
+               case nc2_alt_ring_frontend_sent_ready:
+                       /* Interesting.  We're ready to go, but the
+                          backend isn't.  Furthermore, this isn't the
+                          first time we've seen this interface, so
+                          we've been trying to establish it for at
+                          least TEARDOWN_PERIOD_JIFFIES.  Conclude
+                          that the backend is misbehaving and start a
+                          disable sequence. */
+                       nc2_aux_ring_start_disable_sequence(nar);
+                       break;
+               case nc2_alt_ring_ready:
+                       if (nar->autoteardown.nr_packets <
+                           TEARDOWN_MIN_PACKETS) {
+                               /* This interface isn't busy enough ->
+                                  needs to be torn down. */
+                               nc2_aux_ring_start_disable_sequence(nar);
+                       }
+                       nar->autoteardown.nr_packets = 0;
+                       break;
+               case nc2_alt_ring_disabling:
+                       /* We seem to have gotten stuck trying to
+                          disable the ring, probably because the
+                          remote isn't sending FINISH messages fast
+                          enough.  Be a bit more aggressive. */
+                       nc2_aux_ring_start_detach_sequence(nar);
+                       break;
+               default:
+                       /* Other states are waiting either for the
+                          local operating system to complete work
+                          items, or for the upstream interface to
+                          process messages.  Upstream is always
+                          trusted, so just assume that this'll fix
+                          itself sooner or later. */
+                       break;
+               }
+       }
+       if (!list_empty(&autoteardown_list)) {
+               mod_timer(&autoteardown_timer,
+                         jiffies + TEARDOWN_PERIOD_JIFFIES);
+       }
+       spin_unlock(&autoteardown_lock);
+}
+
+void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+       spin_lock_bh(&autoteardown_lock);
+       if (list_empty(&autoteardown_list))
+               mod_timer(&autoteardown_timer,
+                         jiffies + TEARDOWN_PERIOD_JIFFIES);
+       list_move(&nar->autoteardown.autoteardown_list, &autoteardown_list);
+       spin_unlock_bh(&autoteardown_lock);
+}
+
+void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+       spin_lock_bh(&autoteardown_lock);
+       list_del_init(&nar->autoteardown.autoteardown_list);
+       if (list_empty(&autoteardown_list))
+               del_timer(&autoteardown_timer);
+       spin_unlock_bh(&autoteardown_lock);
+}
+
+static int busy_enough_for_bypass(struct netchannel2 *nc)
+{
+       uint64_t nr_non_bypass;
+       unsigned long start_jiffies;
+
+       nr_non_bypass = nc->auto_bypass.nr_non_bypass_packets;
+       start_jiffies = nc->auto_bypass.start_jiffies;
+       nc->auto_bypass.nr_non_bypass_packets = 0;
+       nc->auto_bypass.nr_bypass_packets = 0;
+       if (nr_non_bypass > AUTOBYPASS_PERIOD * AUTOBYPASS_RATIO ||
+           jiffies - start_jiffies > AUTOBYPASS_MAX_PERIOD_JIFFIES) {
+               /* Either took too long to collect the bypass
+                  packets, or too many non-bypass relative to
+                  number of bypasses.  Either way, not a good
+                  time to consider doing bypasses. */
+               nc->auto_bypass.start_jiffies = jiffies;
+               return 0;
+       } else {
+               return 1;
+       }
+}
+
+static void record_source_mac(struct netchannel2 *nc, struct sk_buff *skb)
+{
+       struct ethhdr *eh;
+       unsigned x;
+
+       if (skb_headlen(skb) < sizeof(struct ethhdr))
+               return;
+       eh = (struct ethhdr *)skb->data;
+       for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) {
+               if (!memcmp(eh->h_source, nc->auto_bypass.hot_macs[x].mac,
+                           sizeof(eh->h_source))) {
+                       nc->auto_bypass.hot_macs[x].count++;
+                       return;
+               }
+       }
+       if (x == AUTOBYPASS_MAX_HOT_MACS) {
+               /* Communicating with too many bypass candidates ->
+                  can't keep track of them all -> ignore MACs which
+                  aren't already in the hot list. */
+               return;
+       }
+       nc->auto_bypass.hot_macs[x].count = 1;
+       memcpy(nc->auto_bypass.hot_macs[x].mac,
+              eh->h_source,
+              sizeof(eh->h_source));
+       nc->auto_bypass.nr_hot_macs++;
+}
+
+static void queue_suggested_bypass(struct netchannel2 *nc,
+                                  const char *mac)
+{
+       int ind;
+
+       ind = nc->auto_bypass.suggestion_head % AUTOBYPASS_SUGG_QUEUE_SIZE;
+       if (nc->auto_bypass.suggestion_head ==
+           nc->auto_bypass.suggestion_tail + AUTOBYPASS_SUGG_QUEUE_SIZE) {
+               /* We've overflowed the suggestion queue.  That means
+                  that, even though we're receiving a massive number
+                  of packets, we've never had enough free ring space
+                  to actually send a suggestion message.  I'm not
+                  convinced that's actually possible, but it's
+                  trivial to handle, so we might as well. */
+               /* Drop the oldest pending suggestion, since it's the
+                  most likely to be out of date and therefore
+                  useless. */
+               nc->auto_bypass.suggestion_tail++;
+       }
+       nc->auto_bypass.suggestion_head++;
+       memcpy(&nc->auto_bypass.suggestions[ind],
+              mac,
+              ETH_ALEN);
+}
+
+static void suggest_bypasses(struct netchannel2 *nc)
+{
+       unsigned x;
+       unsigned threshold;
+
+       BUG_ON(nc->auto_bypass.nr_hot_macs == 0);
+       threshold =
+               (nc->auto_bypass.nr_non_bypass_packets +
+                nc->auto_bypass.nr_bypass_packets) / AUTOBYPASS_RATIO2;
+       for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) {
+               if (nc->auto_bypass.hot_macs[x].count > threshold) {
+                       queue_suggested_bypass(
+                               nc,
+                               nc->auto_bypass.hot_macs[x].mac);
+               }
+       }
+}
+
+/* Called under the master ring lock whenever we receive a packet with
+   NC2_PACKET_FLAG_bypass_candidate set. */
+void nc2_received_bypass_candidate_packet(struct netchannel2 *nc,
+                                         struct sk_buff *skb)
+{
+       nc->auto_bypass.nr_bypass_packets++;
+       switch (nc->auto_bypass.state) {
+       case autobypass_state_normal:
+               if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD)
+                       return;
+               if (!busy_enough_for_bypass(nc))
+                       return;
+               nc->auto_bypass.nr_hot_macs = 0;
+               nc->auto_bypass.state = autobypass_state_considering;
+               break;
+       case autobypass_state_considering:
+               record_source_mac(nc, skb);
+               if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD)
+                       return;
+               if (busy_enough_for_bypass(nc))
+                       suggest_bypasses(nc);
+               nc->auto_bypass.state = autobypass_state_debounce;
+               break;
+       case autobypass_state_debounce:
+               if (nc->auto_bypass.nr_bypass_packets == AUTOBYPASS_PERIOD) {
+                       nc->auto_bypass.state = autobypass_state_normal;
+                       nc->auto_bypass.nr_non_bypass_packets = 0;
+                       nc->auto_bypass.nr_bypass_packets = 0;
+                       nc->auto_bypass.start_jiffies = jiffies;
+               }
+               break;
+       }
+}
+
+static int send_suggestion(struct netchannel2_ring_pair *ncrp,
+                          const char *mac)
+{
+       struct netchannel2_msg_suggest_bypass msg;
+
+       if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg)))
+               return 0;
+
+       memset(&msg, 0, sizeof(msg));
+       memcpy(msg.mac, mac, ETH_ALEN);
+       nc2_send_message(&ncrp->prod_ring,
+                        NETCHANNEL2_MSG_SUGGEST_BYPASS,
+                        0,
+                        &msg,
+                        sizeof(msg));
+       ncrp->pending_time_sensitive_messages = 1;
+       return 1;
+}
+
+void _nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+       struct nc2_auto_bypass *nab = &nc->auto_bypass;
+       struct netchannel2_ring_pair *ncrp = &nc->rings;
+       unsigned ind;
+
+       while (nab->suggestion_tail != nab->suggestion_head) {
+               BUG_ON(nab->suggestion_head - nab->suggestion_tail >
+                      AUTOBYPASS_SUGG_QUEUE_SIZE);
+               ind = nab->suggestion_tail % AUTOBYPASS_SUGG_QUEUE_SIZE;
+               if (!send_suggestion(ncrp, nab->suggestions[ind].mac))
+                       break;
+               nab->suggestion_tail++;
+       }
+}
+
+void nc2_shutdown_autoteardown(void)
+{
+       /* There shouldn't be any interfaces at all, so there
+          certainly won't be any bypasses, and we don't have to worry
+          about the timer getting requeued.  Make sure it's finished
+          and then get out. */
+       del_timer_sync(&autoteardown_timer);
+}
diff --git a/drivers/net/xen-netchannel2/bypass.c b/drivers/net/xen-netchannel2/bypass.c
index 05cb4d5..477f78d 100644
--- a/drivers/net/xen-netchannel2/bypass.c
+++ b/drivers/net/xen-netchannel2/bypass.c
@@ -65,6 +65,10 @@ int bypass_xmit_packet(struct netchannel2 *nc,
                return 1;
        }
 
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       ncr->autoteardown.nr_packets++;
+#endif
+
        queue_packet_to_interface(skb, rings);
 
        spin_unlock(&rings->lock);
@@ -76,6 +80,12 @@ void nc2_aux_ring_start_disable_sequence(struct nc2_alternate_ring *nar)
 {
        spin_lock(&nar->rings.lock);
        if (nar->state < nc2_alt_ring_disabling) {
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+               /* We should really hold the autoteardown lock for
+                  this, but see the big comment in
+                  autoteardown_timer_fn() */
+               nar->autoteardown.seen_count = 0;
+#endif
                nar->state = nc2_alt_ring_disabling;
                nc2_kick(&nar->rings);
        }
@@ -105,6 +115,8 @@ static void start_detach_worker(struct work_struct *ws)
                unbind_from_irqhandler(ncr->rings.irq, &ncr->rings);
        ncr->rings.irq = -1;
 
+       nc2_unregister_bypass_for_autoteardown(ncr);
+
        spin_lock_bh(&ncr->rings.lock);
        ncr->state = nc2_alt_ring_detached_pending;
        ncr->rings.interface->need_aux_ring_state_machine = 1;
@@ -328,7 +340,7 @@ static void send_ready_message(struct nc2_alternate_ring *ncr)
                /* This shouldn't happen, because the producer ring
                   should be essentially empty at this stage.  If it
                   does, it probably means the other end is playing
-                  silly buggers with the ring indexes.  Drop the
+                  silly buggers with the ring indexes.          Drop the
                   message. */
                printk(KERN_WARNING "Failed to send bypass ring ready message.\n");
        }
@@ -347,8 +359,12 @@ void nc2_handle_bypass_ready(struct netchannel2 *nc,
        ncr = container_of(ncrp, struct nc2_alternate_ring, rings);
        /* We're now allowed to start sending packets over this
         * ring. */
-       if (ncr->state == nc2_alt_ring_frontend_sent_ready)
+       if (ncr->state == nc2_alt_ring_frontend_sent_ready) {
                ncr->state = nc2_alt_ring_ready;
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+               ncr->autoteardown.seen_count = 0;
+#endif
+       }
 }
 
 /* Called holding the aux ring lock. */
@@ -396,6 +412,8 @@ static void initialise_bypass_frontend_work_item(struct work_struct *ws)
        nc2_kick(&interface->rings);
        spin_unlock_bh(&interface->rings.lock);
 
+       nc2_register_bypass_for_autoteardown(ncr);
+
        return;
 
 err:
@@ -440,12 +458,18 @@ static void initialise_bypass_backend_work_item(struct work_struct *ws)
 
        send_ready_message(ncr);
 
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       ncr->autoteardown.seen_count = 0;
+#endif
+
        spin_lock_bh(&ncr->rings.lock);
        ncr->state = nc2_alt_ring_ready;
        spin_unlock_bh(&ncr->rings.lock);
 
        nc2_kick(&ncr->rings);
 
+       nc2_register_bypass_for_autoteardown(ncr);
+
        return;
 
 err:
@@ -525,11 +549,14 @@ err:
                                   work->frontend_setup_msg.common.ring_pages);
 
        work->state = nc2_alt_ring_frontend_preparing;
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       INIT_LIST_HEAD(&work->autoteardown.autoteardown_list);
+#endif
        init_waitqueue_head(&work->eventq);
        work->handle = work->frontend_setup_msg.common.handle;
        INIT_WORK(&work->work_item, initialise_bypass_frontend_work_item);
        if (init_ring_pair(&work->rings, nc) < 0)
-               goto err;
+           goto err;
        work->rings.filter_mac = 1;
 
        list_add(&work->rings_by_interface, &nc->alternate_rings);
@@ -590,12 +617,15 @@ err:
                                   sizeof(uint32_t) *
                                   work->backend_setup_msg.common.ring_pages);
 
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       INIT_LIST_HEAD(&work->autoteardown.autoteardown_list);
+#endif
        work->state = nc2_alt_ring_backend_preparing;
        init_waitqueue_head(&work->eventq);
        work->handle = work->backend_setup_msg.common.handle;
        INIT_WORK(&work->work_item, initialise_bypass_backend_work_item);
        if (init_ring_pair(&work->rings, nc) < 0)
-               goto err;
+           goto err;
        work->rings.filter_mac = 1;
 
        list_add(&work->rings_by_interface, &nc->alternate_rings);
@@ -724,7 +754,7 @@ void detach_all_bypasses(struct netchannel2 *nc)
                           machine, which will eventually destroy the
                           bypass. */
                        /* nc2_alt_ring_frontend_sent_ready is a bit
-                          odd.  We are frontend-like, and we've told
+                          odd.  We are frontend-like, and we've told
                           the backend who we are, but we haven't yet
                           received a READY from the backend.  We
                           don't necessarily trust the backend, so we
diff --git a/drivers/net/xen-netchannel2/bypassee.c b/drivers/net/xen-netchannel2/bypassee.c
index f0cda24..e8166ef 100644
--- a/drivers/net/xen-netchannel2/bypassee.c
+++ b/drivers/net/xen-netchannel2/bypassee.c
@@ -277,6 +277,63 @@ void nc2_handle_bypass_detached(struct netchannel2 *nc,
                 msg.handle);
 }
 
+static void process_suggestion_queue_workitem(struct work_struct *ws)
+{
+       struct netchannel2 *nc =
+               container_of(ws, struct netchannel2,
+                            incoming_bypass_suggestions.workitem);
+       struct nc2_incoming_bypass_suggestions *sugg =
+               &nc->incoming_bypass_suggestions;
+       unsigned ind;
+       unsigned char mac[ETH_ALEN];
+
+       spin_lock_bh(&sugg->lock);
+       while (sugg->tail != sugg->head) {
+               ind = sugg->tail % NC2_BYPASS_SUGG_QUEUE_SIZE;
+               memcpy(mac, sugg->queue[ind].mac, ETH_ALEN);
+               sugg->tail++;
+               spin_unlock_bh(&sugg->lock);
+
+               nb2_handle_suggested_bypass(nc, mac);
+
+               spin_lock_bh(&sugg->lock);
+       }
+       spin_unlock_bh(&sugg->lock);
+}
+
+void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                              struct netchannel2_ring_pair *ncrp,
+                              struct netchannel2_msg_hdr *hdr)
+{
+       struct nc2_incoming_bypass_suggestions *sugg =
+               &nc->incoming_bypass_suggestions;
+       struct netchannel2_msg_suggest_bypass msg;
+       unsigned ind;
+
+       if (hdr->size != sizeof(msg)) {
+               pr_debug("strange size suggest bypass message; %d != %zd\n",
+                        hdr->size, sizeof(msg));
+               return;
+       }
+       if (ncrp != &nc->rings) {
+               pr_debug("suggest bypass on bypass ring?\n");
+               return;
+       }
+       nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+
+       spin_lock(&sugg->lock);
+       ind = sugg->head % NC2_BYPASS_SUGG_QUEUE_SIZE;
+       /* Drop if we've overflowed the queue */
+       if (sugg->head == sugg->tail + NC2_BYPASS_SUGG_QUEUE_SIZE)
+               sugg->tail++;
+       memcpy(&sugg->queue[ind].mac, msg.mac, ETH_ALEN);
+       if (sugg->head == sugg->tail)
+               schedule_work(&sugg->workitem);
+       sugg->head++;
+       spin_unlock(&sugg->lock);
+}
+
+
 static int send_disable_bypass_msg(struct netchannel2 *nc,
                                   struct nc2_bypass *bypass)
 {
@@ -736,3 +793,11 @@ void release_bypasses(struct netchannel2 *nc)
 
        flush_scheduled_work();
 }
+
+void nc2_init_incoming_bypass_suggestions(
+       struct netchannel2 *nc2,
+       struct nc2_incoming_bypass_suggestions *nibs)
+{
+       spin_lock_init(&nibs->lock);
+       INIT_WORK(&nibs->workitem, process_suggestion_queue_workitem);
+}
diff --git a/drivers/net/xen-netchannel2/chan.c b/drivers/net/xen-netchannel2/chan.c
index 5ceacc3..a26a917 100644
--- a/drivers/net/xen-netchannel2/chan.c
+++ b/drivers/net/xen-netchannel2/chan.c
@@ -114,6 +114,9 @@ retry:
                case NETCHANNEL2_MSG_BYPASS_READY:
                        nc2_handle_bypass_ready(nc, ncrp, &hdr);
                        break;
+               case NETCHANNEL2_MSG_SUGGEST_BYPASS:
+                       nc2_handle_suggest_bypass(nc, ncrp, &hdr);
+                       break;
                case NETCHANNEL2_MSG_PAD:
                        break;
                default:
@@ -174,6 +177,7 @@ static void flush_rings(struct netchannel2_ring_pair *ncrp)
                        advertise_offloads(nc);
                nc2_advertise_bypasses(nc);
                nc2_crank_aux_ring_state_machine(nc);
+               nc2_autobypass_make_suggestions(nc);
        } else {
                nc2_alternate_ring_disable_finish(ncrp);
        }
@@ -446,6 +450,8 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd)
 #ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
        INIT_LIST_HEAD(&nc->bypasses_a);
        INIT_LIST_HEAD(&nc->bypasses_b);
+       nc2_init_incoming_bypass_suggestions(nc,
+                                            &nc->incoming_bypass_suggestions);
        nc->max_bypasses = max_bypasses;
 #endif
 #ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
diff --git a/drivers/net/xen-netchannel2/netchannel2_core.h b/drivers/net/xen-netchannel2/netchannel2_core.h
index a116e5c..b2c3f8c 100644
--- a/drivers/net/xen-netchannel2/netchannel2_core.h
+++ b/drivers/net/xen-netchannel2/netchannel2_core.h
@@ -108,6 +108,79 @@ static inline struct skb_cb_overlay *get_skb_overlay(struct sk_buff *skb)
        return (struct skb_cb_overlay *)skb->cb;
 }
 
+struct nc2_alternate_ring;
+struct netchannel2;
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+#define AUTOBYPASS_MAX_HOT_MACS 8
+#define AUTOBYPASS_SUGG_QUEUE_SIZE 8
+struct nc2_auto_bypass {
+       enum {
+               autobypass_state_normal,
+               autobypass_state_considering,
+               autobypass_state_debounce
+       } state;
+       uint32_t nr_bypass_packets;
+       uint64_t nr_non_bypass_packets;
+       unsigned long start_jiffies;
+       unsigned nr_hot_macs;
+       struct {
+               unsigned char mac[ETH_ALEN];
+               /* This won't overflow because the autobypass period
+                  is less than 65536. */
+               uint16_t count;
+       } hot_macs[AUTOBYPASS_MAX_HOT_MACS];
+       unsigned suggestion_head;
+       unsigned suggestion_tail;
+       struct {
+               unsigned char mac[ETH_ALEN];
+       } suggestions[AUTOBYPASS_SUGG_QUEUE_SIZE];
+};
+void nc2_received_bypass_candidate_packet(struct netchannel2 *nc,
+                                         struct sk_buff *skb);
+
+struct nc2_bypass_autoteardown {
+       struct list_head autoteardown_list;
+       uint64_t nr_packets;
+       unsigned seen_count;
+};
+
+void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar);
+void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar);
+void nc2_shutdown_autoteardown(void);
+#else
+static inline void nc2_shutdown_autoteardown(void)
+{
+}
+static inline void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+}
+static inline void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+#define NC2_BYPASS_SUGG_QUEUE_SIZE 8
+struct nc2_incoming_bypass_suggestions {
+       spinlock_t lock;
+
+       unsigned head;
+       unsigned tail;
+
+       struct work_struct workitem;
+
+       struct {
+               unsigned char mac[ETH_ALEN];
+       } queue[NC2_BYPASS_SUGG_QUEUE_SIZE];
+};
+
+void nc2_init_incoming_bypass_suggestions(
+       struct netchannel2 *nc,
+       struct nc2_incoming_bypass_suggestions *nibs);
+#endif
+
+
 /* Packets for which we need to send FINISH_PACKET messages for as
    soon as possible. */
 struct pending_finish_packets {
@@ -152,8 +225,8 @@ struct netchannel2_ring_pair {
        grant_ref_t gref_pool;
 
        /* The IRQ corresponding to the event channel which is
-          connected to the other end.  This only changes from the
-          xenbus state change handler.          It is notified from lots of
+          connected to the other end.  This only changes from the
+          xenbus state change handler.  It is notified from lots of
           other places.  Fortunately, it's safe to notify on an irq
           after it's been released, so the lack of synchronisation
           doesn't matter. */
@@ -294,12 +367,17 @@ struct netchannel2 {
        struct list_head bypasses_b;
 
        struct nc2_bypass *current_bypass_frontend;
+       struct nc2_incoming_bypass_suggestions incoming_bypass_suggestions;
 #endif
 
        /* Updates are protected by the lock.  This can be read at any
         * time without holding any locks, and the rest of Linux is
         * expected to cope. */
        struct net_device_stats stats;
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       struct nc2_auto_bypass auto_bypass;
+#endif
 };
 
 #ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
@@ -367,6 +445,9 @@ void nc2_handle_bypass_disabled(struct netchannel2 *nc,
 void nc2_handle_bypass_detached(struct netchannel2 *nc,
                                struct netchannel2_ring_pair *ncrp,
                                struct netchannel2_msg_hdr *hdr);
+void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                              struct netchannel2_ring_pair *ncrp,
+                              struct netchannel2_msg_hdr *hdr);
 void release_bypasses(struct netchannel2 *nc);
 void nb2_handle_suggested_bypass(struct netchannel2 *a_chan,
                                 const char *mac_b);
@@ -392,6 +473,11 @@ static inline void nc2_handle_bypass_detached(struct netchannel2 *nc,
                                              struct netchannel2_msg_hdr *hdr)
 {
 }
+static inline void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                                            struct netchannel2_ring_pair *ncrp,
+                                            struct netchannel2_msg_hdr *hdr)
+{
+}
 #endif
 
 #ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
@@ -455,6 +541,10 @@ struct nc2_alternate_ring {
        struct netchannel2_msg_bypass_backend backend_setup_msg;
        uint32_t cons_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE];
        uint32_t prod_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE];
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+       struct nc2_bypass_autoteardown autoteardown;
+#endif
 };
 
 void nc2_handle_bypass_ready(struct netchannel2 *nc,
@@ -537,6 +627,18 @@ static inline void nc2_handle_bypass_ready(struct netchannel2 *nc,
 }
 #endif
 
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+void _nc2_autobypass_make_suggestions(struct netchannel2 *nc);
+static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+       if (nc->auto_bypass.suggestion_tail != nc->auto_bypass.suggestion_head)
+               _nc2_autobypass_make_suggestions(nc);
+}
+#else
+static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+}
+#endif
 
 static inline void flush_prepared_grant_copies(struct hypercall_batcher *hb,
                                               void (*on_fail)(void *ctxt,
@@ -653,6 +755,8 @@ void deinit_receive_map_mode(void);
 void suspend_receive_map_mode(void);
 void resume_receive_map_mode(void);
 
+struct netchannel2 *nc2_get_interface_for_page(struct page *p);
+
 int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
                          struct sk_buff *skb);
diff --git a/drivers/net/xen-netchannel2/recv_packet.c b/drivers/net/xen-netchannel2/recv_packet.c
index 749c70e..94aa127 100644
--- a/drivers/net/xen-netchannel2/recv_packet.c
+++ b/drivers/net/xen-netchannel2/recv_packet.c
@@ -200,6 +200,18 @@ void nc2_handle_packet_msg(struct netchannel2 *nc,
                        break;
                }
 
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+               if (ncrp == &nc->rings) {
+                       if (msg.flags & NC2_PACKET_FLAG_bypass_candidate)
+                               nc2_received_bypass_candidate_packet(nc, skb);
+                       else
+                               nc->auto_bypass.nr_non_bypass_packets++;
+               } else {
+                       container_of(ncrp, struct nc2_alternate_ring, rings)->
+                               autoteardown.nr_packets++;
+               }
+#endif
+
                switch (msg.segmentation_type) {
                case NC2_PACKET_SEGMENTATION_TYPE_none:
                        break;
@@ -316,5 +328,6 @@ int __init nc2_init(void)
 
 void __exit nc2_exit(void)
 {
+       nc2_shutdown_autoteardown();
        deinit_receive_map_mode();
 }
diff --git a/drivers/net/xen-netchannel2/rscb.c b/drivers/net/xen-netchannel2/rscb.c
index c929c73..945acf5 100644
--- a/drivers/net/xen-netchannel2/rscb.c
+++ b/drivers/net/xen-netchannel2/rscb.c
@@ -212,6 +212,9 @@ struct grant_packet_plan {
        grant_ref_t gref_pool;
        int use_subpage_grants;
        unsigned prefix_avail;
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+       int could_have_used_bypass;
+#endif
 };
 
 static inline int nfrags_skb(struct sk_buff *skb, int prefix_size)
@@ -312,6 +315,9 @@ static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp,
        domid_t trans_domid;
        grant_ref_t trans_gref;
        grant_ref_t gref;
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+       struct netchannel2 *orig_iface;
+#endif
 
        if (size <= plan->prefix_avail) {
                /* This fragment is going to be inline -> nothing to
@@ -329,6 +335,12 @@ static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp,
        gref = gnttab_claim_grant_reference(&plan->gref_pool);
        frag->receiver_copy.gref = gref;
        if (page_is_tracked(page)) {
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+               orig_iface = nc2_get_interface_for_page(page);
+               if (orig_iface &&
+                   orig_iface->extant_bypasses < orig_iface->max_bypasses)
+                       plan->could_have_used_bypass = 1;
+#endif
                lookup_tracker_page(page, &trans_domid, &trans_gref);
                gnttab_grant_foreign_access_ref_trans(gref,
                                                      ncrp->otherend_id,
@@ -412,5 +424,12 @@ void xmit_grant(struct netchannel2_ring_pair *ncrp,
        }
 
        skb_co->nr_fragments = plan.out_fragment - msg->frags;
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+       if (plan.could_have_used_bypass &&
+           ncrp == &ncrp->interface->rings &&
+           ncrp->interface->extant_bypasses < ncrp->interface->max_bypasses)
+               msg->flags |= NC2_PACKET_FLAG_bypass_candidate;
+#endif
 }
 
diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h
index f3cabe8..075658d 100644
--- a/include/xen/interface/io/netchannel2.h
+++ b/include/xen/interface/io/netchannel2.h
@@ -84,6 +84,11 @@ struct netchannel2_msg_packet {
  * regardless of any SET_OFFLOAD messages which may or may not have
  * been sent. */
 #define NC2_PACKET_FLAG_data_validated 2
+/* If set, this flag indicates that this packet could have used a
+ * bypass if one had been available, and so it should be sent to the
+ * autobypass state machine.
+ */
+#define NC2_PACKET_FLAG_bypass_candidate 4
 /* If set, the transmitting domain requires an event urgently when
  * this packet's finish message is sent.  Otherwise, the event can be
  * delayed. */
@@ -326,4 +331,12 @@ struct netchannel2_msg_bypass_detached {
        uint32_t handle;
 };
 
+#define NETCHANNEL2_MSG_SUGGEST_BYPASS 17
+struct netchannel2_msg_suggest_bypass {
+       struct netchannel2_msg_hdr hdr;
+       unsigned char mac[6];
+       uint16_t pad1;
+       uint32_t pad2;
+};
+
 #endif /* !__NETCHANNEL2_H__ */
-- 
1.6.3.1

