Discussion:
[PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
(too old to reply)
Krishna Kumar
2010-08-03 03:02:56 UTC
Permalink
From: Krishna Kumar <***@in.ibm.com>

Factor out flow calculation code from get_rps_cpu, since macvtap
driver can use the same code.

Revisions:

v2 - Ben: Separate flow calcuation out and use in select queue
v3 - Arnd: Don't re-implement MIN

Signed-off-by: Krishna Kumar <***@in.ibm.com>
---
include/linux/netdevice.h | 1
net/core/dev.c | 94 ++++++++++++++++++++++--------------
2 files changed, 59 insertions(+), 36 deletions(-)

diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.h
--- org/include/linux/netdevice.h 2010-08-03 08:19:57.000000000 +0530
+++ new/include/linux/netdevice.h 2010-08-03 08:19:57.000000000 +0530
@@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
return dev->name;
}

+extern int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb);
extern int netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...)
__attribute__ ((format (printf, 3, 4)));
diff -ruNp org/net/core/dev.c new/net/core/dev.c
--- org/net/core/dev.c 2010-08-03 08:19:57.000000000 +0530
+++ new/net/core/dev.c 2010-08-03 08:19:57.000000000 +0530
@@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
/*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * skb_calculate_flow: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. On success, returns a hash number (> 0),
+ * otherwise -1.
*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow **rflowp)
+int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb)
{
+ int hash = skb->rxhash;
struct ipv6hdr *ip6;
struct iphdr *ip;
- struct netdev_rx_queue *rxqueue;
- struct rps_map *map;
- struct rps_dev_flow_table *flow_table;
- struct rps_sock_flow_table *sock_flow_table;
- int cpu = -1;
u8 ip_proto;
- u16 tcpu;
u32 addr1, addr2, ihl;
union {
u32 v32;
u16 v16[2];
} ports;

- if (skb_rx_queue_recorded(skb)) {
- u16 index = skb_get_rx_queue(skb);
- if (unlikely(index >= dev->num_rx_queues)) {
- WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
- "on queue %u, but number of RX queues is %u\n",
- dev->name, index, dev->num_rx_queues);
- goto done;
- }
- rxqueue = dev->_rx + index;
- } else
- rxqueue = dev->_rx;
-
- if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
- goto done;
-
- if (skb->rxhash)
+ if (hash)
goto got_hash; /* Skip hash computation on packet header */

switch (skb->protocol) {
@@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
default:
goto done;
}
+
switch (ip_proto) {
case IPPROTO_TCP:
case IPPROTO_UDP:
@@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
/* get a consistent hash (same value on both flow directions) */
if (addr2 < addr1)
swap(addr1, addr2);
- skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
- if (!skb->rxhash)
- skb->rxhash = 1;
+
+ hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+ if (!hash)
+ hash = 1;

got_hash:
+ return hash;
+
+done:
+ return -1;
+}
+EXPORT_SYMBOL(skb_calculate_flow);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_sock_flow_table *sock_flow_table;
+ int cpu = -1;
+ u16 tcpu;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+ if (unlikely(index >= dev->num_rx_queues)) {
+ WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
+ "on queue %u, but number of RX queues is %u\n",
+ dev->name, index, dev->num_rx_queues);
+ goto done;
+ }
+ rxqueue = dev->_rx + index;
+ } else
+ rxqueue = dev->_rx;
+
+ if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+ goto done;
+
+ skb->rxhash = skb_calculate_flow(dev, skb);
+ if (skb->rxhash < 0)
+ goto done;
+
flow_table = rcu_dereference(rxqueue->rps_flow_table);
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Krishna Kumar
2010-08-03 03:03:03 UTC
Permalink
From: Krishna Kumar <***@in.ibm.com>

Implement multiqueue facility for macvtap driver. The idea is that
a macvtap device can be opened multiple times and the fd's can be
used to register eg, as backend for vhost.

Signed-off-by: Krishna Kumar <***@in.ibm.com>
---
drivers/net/macvtap.c | 89 ++++++++++++++++++++++++++++-------
include/linux/if_macvlan.h | 9 +++
2 files changed, 80 insertions(+), 18 deletions(-)

diff -ruNp org/include/linux/if_macvlan.h new/include/linux/if_macvlan.h
--- org/include/linux/if_macvlan.h 2010-08-03 08:19:57.000000000 +0530
+++ new/include/linux/if_macvlan.h 2010-08-03 08:20:39.000000000 +0530
@@ -40,6 +40,12 @@ struct macvlan_rx_stats {
unsigned long rx_errors;
};

+/*
+ * Maximum times a macvtap device can be opened. This can be used to
+ * configure the number of receive queue, e.g. for multiqueue virtio.
+ */
+#define MAX_MACVTAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
+
struct macvlan_dev {
struct net_device *dev;
struct list_head list;
@@ -50,7 +56,8 @@ struct macvlan_dev {
enum macvlan_mode mode;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
- struct macvtap_queue *tap;
+ struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
+ int numvtaps;
};

static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
diff -ruNp org/drivers/net/macvtap.c new/drivers/net/macvtap.c
--- org/drivers/net/macvtap.c 2010-08-03 08:19:57.000000000 +0530
+++ new/drivers/net/macvtap.c 2010-08-03 08:19:57.000000000 +0530
@@ -84,26 +84,45 @@ static const struct proto_ops macvtap_so
static DEFINE_SPINLOCK(macvtap_lock);

/*
- * Choose the next free queue, for now there is only one
+ * get_slot: return a [unused/occupied] slot in vlan->taps[]:
+ * - if 'q' is NULL, return the first empty slot;
+ * - otherwise, return the slot this pointer occupies.
*/
+static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
+{
+ int i;
+
+ for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
+ if (rcu_dereference(vlan->taps[i]) == q)
+ return i;
+ }
+
+ /* Should never happen */
+ BUG_ON(1);
+}
+
static int macvtap_set_queue(struct net_device *dev, struct file *file,
struct macvtap_queue *q)
{
struct macvlan_dev *vlan = netdev_priv(dev);
+ int index;
int err = -EBUSY;

spin_lock(&macvtap_lock);
- if (rcu_dereference(vlan->tap))
+ if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
goto out;

err = 0;
+ index = get_slot(vlan, NULL);
rcu_assign_pointer(q->vlan, vlan);
- rcu_assign_pointer(vlan->tap, q);
+ rcu_assign_pointer(vlan->taps[index], q);
sock_hold(&q->sk);

q->file = file;
file->private_data = q;

+ vlan->numvtaps++;
+
out:
spin_unlock(&macvtap_lock);
return err;
@@ -124,9 +143,12 @@ static void macvtap_put_queue(struct mac
spin_lock(&macvtap_lock);
vlan = rcu_dereference(q->vlan);
if (vlan) {
- rcu_assign_pointer(vlan->tap, NULL);
+ int index = get_slot(vlan, q);
+
+ rcu_assign_pointer(vlan->taps[index], NULL);
rcu_assign_pointer(q->vlan, NULL);
sock_put(&q->sk);
+ --vlan->numvtaps;
}

spin_unlock(&macvtap_lock);
@@ -136,39 +158,72 @@ static void macvtap_put_queue(struct mac
}

/*
- * Since we only support one queue, just dereference the pointer.
+ * Select a queue based on the rxq of the device on which this packet
+ * arrived. If the incoming device is not mq, calculate a flow hash to
+ * select a queue. vlan->numvtaps is cached in case it reduces during
+ * the execution of this function.
*/
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
struct sk_buff *skb)
{
struct macvlan_dev *vlan = netdev_priv(dev);
+ struct macvtap_queue *tap = NULL;
+ int numvtaps = vlan->numvtaps;
+ u16 rxq;
+
+ if (!numvtaps)
+ goto out;
+
+ if (likely(skb_rx_queue_recorded(skb))) {
+ rxq = skb_get_rx_queue(skb);
+
+ while (unlikely(rxq >= numvtaps))
+ rxq -= numvtaps;

- return rcu_dereference(vlan->tap);
+ tap = rcu_dereference(vlan->taps[rxq]);
+ if (tap)
+ goto out;
+ }
+
+ rxq = skb_calculate_flow(dev, skb);
+ if (rxq < 0)
+ rxq = smp_processor_id();
+
+ tap = rcu_dereference(vlan->taps[rxq & (numvtaps - 1)]);
+
+out:
+ return tap;
}

/*
* The net_device is going away, give up the reference
- * that it holds on the queue (all the queues one day)
- * and safely set the pointer from the queues to NULL.
+ * that it holds on all queues and safely set the pointer
+ * from the queues to NULL.
*/
static void macvtap_del_queues(struct net_device *dev)
{
struct macvlan_dev *vlan = netdev_priv(dev);
- struct macvtap_queue *q;
+ struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
+ int i, j = 0;

+ /* macvtap_put_queue can free some slots, so go through all slots */
spin_lock(&macvtap_lock);
- q = rcu_dereference(vlan->tap);
- if (!q) {
- spin_unlock(&macvtap_lock);
- return;
+ for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
+ q = rcu_dereference(vlan->taps[i]);
+ if (q) {
+ qlist[j++] = q;
+ rcu_assign_pointer(vlan->taps[i], NULL);
+ rcu_assign_pointer(q->vlan, NULL);
+ vlan->numvtaps--;
+ }
}
-
- rcu_assign_pointer(vlan->tap, NULL);
- rcu_assign_pointer(q->vlan, NULL);
+ BUG_ON(vlan->numvtaps != 0);
spin_unlock(&macvtap_lock);

synchronize_rcu();
- sock_put(&q->sk);
+
+ for (--j; j >= 0; j--)
+ sock_put(&qlist[j]->sk);
}

/*
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Changli Gao
2010-08-03 04:05:34 UTC
Permalink
Post by Krishna Kumar
Factor out flow calculation code from get_rps_cpu, since macvtap
driver can use the same code.
v2 - Ben: Separate flow calcuation out and use in select queue
v3 - Arnd: Don't re-implement MIN
---
=A0include/linux/netdevice.h | =A0 =A01
=A0net/core/dev.c =A0 =A0 =A0 =A0 =A0 =A0| =A0 94 +++++++++++++++++++=
+++--------------
Post by Krishna Kumar
=A02 files changed, 59 insertions(+), 36 deletions(-)
diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevice.=
h
Post by Krishna Kumar
--- org/include/linux/netdevice.h =A0 =A0 =A0 2010-08-03 08:19:57.000=
000000 +0530
Post by Krishna Kumar
+++ new/include/linux/netdevice.h =A0 =A0 =A0 2010-08-03 08:19:57.000=
000000 +0530
Post by Krishna Kumar
@@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
=A0 =A0 =A0 =A0return dev->name;
=A0}
+extern int skb_calculate_flow(struct net_device *dev, struct sk_buff=
*skb);
Post by Krishna Kumar
=A0extern int netdev_printk(const char *level, const struct net_devic=
e *dev,
Post by Krishna Kumar
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 const char *format, .=
=2E.)
Post by Krishna Kumar
=A0 =A0 =A0 =A0__attribute__ ((format (printf, 3, 4)));
diff -ruNp org/net/core/dev.c new/net/core/dev.c
--- org/net/core/dev.c =A02010-08-03 08:19:57.000000000 +0530
+++ new/net/core/dev.c =A02010-08-03 08:19:57.000000000 +0530
@@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
=A0 =A0 =A0 =A0__raise_softirq_irqoff(NET_RX_SOFTIRQ);
=A0}
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
=A0/*
- * get_rps_cpu is called from netif_receive_skb and returns the targ=
et
Post by Krishna Kumar
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * skb_calculate_flow: calculate a flow hash based on src/dst addres=
ses
Post by Krishna Kumar
+ * and src/dst port numbers. On success, returns a hash number (> 0)=
,
Post by Krishna Kumar
+ * otherwise -1.
=A0*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct rps_dev_flow **rf=
lowp)
Post by Krishna Kumar
+int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb)
=A0{
+ =A0 =A0 =A0 int hash =3D skb->rxhash;
=A0 =A0 =A0 =A0struct ipv6hdr *ip6;
=A0 =A0 =A0 =A0struct iphdr *ip;
- =A0 =A0 =A0 struct netdev_rx_queue *rxqueue;
- =A0 =A0 =A0 struct rps_map *map;
- =A0 =A0 =A0 struct rps_dev_flow_table *flow_table;
- =A0 =A0 =A0 struct rps_sock_flow_table *sock_flow_table;
- =A0 =A0 =A0 int cpu =3D -1;
=A0 =A0 =A0 =A0u8 ip_proto;
- =A0 =A0 =A0 u16 tcpu;
=A0 =A0 =A0 =A0u32 addr1, addr2, ihl;
=A0 =A0 =A0 =A0union {
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0u32 v32;
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0u16 v16[2];
=A0 =A0 =A0 =A0} ports;
- =A0 =A0 =A0 if (skb_rx_queue_recorded(skb)) {
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 u16 index =3D skb_get_rx_queue(skb);
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (unlikely(index >=3D dev->num_rx_que=
ues)) {
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 WARN_ONCE(dev->num_rx_q=
ueues > 1, "%s received packet "
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 "on que=
ue %u, but number of RX queues is %u\n",
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dev->na=
me, index, dev->num_rx_queues);
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx + index;
- =A0 =A0 =A0 } else
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx;
-
- =A0 =A0 =A0 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
-
- =A0 =A0 =A0 if (skb->rxhash)
+ =A0 =A0 =A0 if (hash)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto got_hash; /* Skip hash computatio=
n on packet header */
Post by Krishna Kumar
=A0 =A0 =A0 =A0switch (skb->protocol) {
@@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto done;
=A0 =A0 =A0 =A0}
+
=A0 =A0 =A0 =A0switch (ip_proto) {
@@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
=A0 =A0 =A0 =A0/* get a consistent hash (same value on both flow dire=
ctions) */
Post by Krishna Kumar
=A0 =A0 =A0 =A0if (addr2 < addr1)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0swap(addr1, addr2);
- =A0 =A0 =A0 skb->rxhash =3D jhash_3words(addr1, addr2, ports.v32, h=
ashrnd);
Post by Krishna Kumar
- =A0 =A0 =A0 if (!skb->rxhash)
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 skb->rxhash =3D 1;
+
+ =A0 =A0 =A0 hash =3D jhash_3words(addr1, addr2, ports.v32, hashrnd)=
;
Post by Krishna Kumar
+ =A0 =A0 =A0 if (!hash)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 hash =3D 1;
+ =A0 =A0 =A0 return hash;
+
+ =A0 =A0 =A0 return -1;
+}
+EXPORT_SYMBOL(skb_calculate_flow);
I have noticed that you use skb_calculate_flow() in
macvtap_get_queue() where skb->data doesn't point to the network
header but the ethernet header. However, skb_calculate_flow() assume
skb->data points to the network header. There are two choices:
* update skb_calculate_flow to support called in ethernet layer.
* pull skb before skb_calculate_flow, and push skb after
skb_calculate_flow() in macvtap_get_queue().

I prefer the former way.

BTW: the function name skb_calculate_flow isn't good. How about
skb_get_rxhash(). Maybe we can implement two versions: fast path and
slow path. And implement the fast path version as a inline function in
skbuff.h.

static inline u32 skb_get_rxhash(struct sk_buff *skb)
{
u32 rxhash;

rxhash =3D skb->rxhash;
if (!rxhash)
return __skb_get_rxhash(skb);
return rxhash;
}
Post by Krishna Kumar
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the targ=
et
Post by Krishna Kumar
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct rps_dev_flow **rf=
lowp)
Post by Krishna Kumar
+{
+ =A0 =A0 =A0 struct netdev_rx_queue *rxqueue;
+ =A0 =A0 =A0 struct rps_map *map;
+ =A0 =A0 =A0 struct rps_dev_flow_table *flow_table;
+ =A0 =A0 =A0 struct rps_sock_flow_table *sock_flow_table;
+ =A0 =A0 =A0 int cpu =3D -1;
+ =A0 =A0 =A0 u16 tcpu;
+
+ =A0 =A0 =A0 if (skb_rx_queue_recorded(skb)) {
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 u16 index =3D skb_get_rx_queue(skb);
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (unlikely(index >=3D dev->num_rx_que=
ues)) {
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 WARN_ONCE(dev->num_rx_q=
ueues > 1, "%s received packet "
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 "on que=
ue %u, but number of RX queues is %u\n",
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dev->na=
me, index, dev->num_rx_queues);
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx + index;
+ =A0 =A0 =A0 } else
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx;
+
+ =A0 =A0 =A0 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+
+ =A0 =A0 =A0 skb->rxhash =3D skb_calculate_flow(dev, skb);
+ =A0 =A0 =A0 if (skb->rxhash < 0)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+
=A0 =A0 =A0 =A0flow_table =3D rcu_dereference(rxqueue->rps_flow_table=
);
Post by Krishna Kumar
=A0 =A0 =A0 =A0sock_flow_table =3D rcu_dereference(rps_sock_flow_tabl=
e);
Post by Krishna Kumar
=A0 =A0 =A0 =A0if (flow_table && sock_flow_table) {
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
--=20
Regards,
Changli Gao(***@gmail.com)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Krishna Kumar2
2010-08-03 05:57:44 UTC
Permalink
Hi Changli,

Good catch.

Instead of adding support for ethernet header or pull/push,
I could defer the skb_push(ETH_HLEN), something like:

static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
struct macvtap_queue *q =3D macvtap_get_queue(dev, skb);
if (!q)
goto drop;

if (skb_queue_len(&q->sk.sk_receive_queue) >=3D dev->tx_queue_len)
goto drop;

+ skb_push(skb, ETH_HLEN);
...
}

and remove the same in macvtap_receive. Will this be better?

Your other suggestions also looks good.

Thanks,

- KK
08/03/2010 09:35 AM
To
cc
Subject
Re: [PATCH v3 1/2] core: Factor out flow calculation from get_rps_cpu
Post by Krishna Kumar
Factor out flow calculation code from get_rps_cpu, since macvtap
driver can use the same code.
v2 - Ben: Separate flow calcuation out and use in select queue
v3 - Arnd: Don't re-implement MIN
---
=A0include/linux/netdevice.h | =A0 =A01
=A0net/core/dev.c =A0 =A0 =A0 =A0 =A0 =A0| =A0 94 +++++++++++++++++=
+++++--------------
Post by Krishna Kumar
=A02 files changed, 59 insertions(+), 36 deletions(-)
diff -ruNp org/include/linux/netdevice.h new/include/linux/netdevic=
e.h
Post by Krishna Kumar
--- org/include/linux/netdevice.h =A0 =A0 =A0 2010-08-03 08:19:57.0=
00000000
+0530
Post by Krishna Kumar
+++ new/include/linux/netdevice.h =A0 =A0 =A0 2010-08-03 08:19:57.0=
00000000
+0530
Post by Krishna Kumar
@@ -2253,6 +2253,7 @@ static inline const char *netdev_name(co
=A0 =A0 =A0 =A0return dev->name;
=A0}
+extern int skb_calculate_flow(struct net_device *dev, struct sk_bu=
ff
*skb);
Post by Krishna Kumar
=A0extern int netdev_printk(const char *level, const struct net_dev=
ice
*dev,
Post by Krishna Kumar
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 const char *format,=
...)
Post by Krishna Kumar
=A0 =A0 =A0 =A0__attribute__ ((format (printf, 3, 4)));
diff -ruNp org/net/core/dev.c new/net/core/dev.c
--- org/net/core/dev.c =A02010-08-03 08:19:57.000000000 +0530
+++ new/net/core/dev.c =A02010-08-03 08:19:57.000000000 +0530
@@ -2263,51 +2263,24 @@ static inline void ____napi_schedule(str
=A0 =A0 =A0 =A0__raise_softirq_irqoff(NET_RX_SOFTIRQ);
=A0}
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
=A0/*
- * get_rps_cpu is called from netif_receive_skb and returns the ta=
rget
Post by Krishna Kumar
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * skb_calculate_flow: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. On success, returns a hash number (> =
0),
Post by Krishna Kumar
+ * otherwise -1.
=A0*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb=
,
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct rps_dev_flow **=
rflowp)
Post by Krishna Kumar
+int skb_calculate_flow(struct net_device *dev, struct sk_buff *skb=
)
Post by Krishna Kumar
=A0{
+ =A0 =A0 =A0 int hash =3D skb->rxhash;
=A0 =A0 =A0 =A0struct ipv6hdr *ip6;
=A0 =A0 =A0 =A0struct iphdr *ip;
- =A0 =A0 =A0 struct netdev_rx_queue *rxqueue;
- =A0 =A0 =A0 struct rps_map *map;
- =A0 =A0 =A0 struct rps_dev_flow_table *flow_table;
- =A0 =A0 =A0 struct rps_sock_flow_table *sock_flow_table;
- =A0 =A0 =A0 int cpu =3D -1;
=A0 =A0 =A0 =A0u8 ip_proto;
- =A0 =A0 =A0 u16 tcpu;
=A0 =A0 =A0 =A0u32 addr1, addr2, ihl;
=A0 =A0 =A0 =A0union {
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0u32 v32;
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0u16 v16[2];
=A0 =A0 =A0 =A0} ports;
- =A0 =A0 =A0 if (skb_rx_queue_recorded(skb)) {
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 u16 index =3D skb_get_rx_queue(skb);
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (unlikely(index >=3D dev->num_rx_q=
ueues)) {
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 WARN_ONCE(dev->num_rx=
_queues > 1, "%s
received packet "
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 "on q=
ueue %u, but number of RX
queues is %u\n",
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dev->=
name, index, dev->num_rx_queues);
Post by Krishna Kumar
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx + index;
- =A0 =A0 =A0 } else
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx;
-
- =A0 =A0 =A0 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
-
- =A0 =A0 =A0 if (skb->rxhash)
+ =A0 =A0 =A0 if (hash)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto got_hash; /* Skip hash computat=
ion on packet header
*/
Post by Krishna Kumar
=A0 =A0 =A0 =A0switch (skb->protocol) {
@@ -2334,6 +2307,7 @@ static int get_rps_cpu(struct net_device
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto done;
=A0 =A0 =A0 =A0}
+
=A0 =A0 =A0 =A0switch (ip_proto) {
@@ -2356,11 +2330,59 @@ static int get_rps_cpu(struct net_device
=A0 =A0 =A0 =A0/* get a consistent hash (same value on both flow di=
rections) */
Post by Krishna Kumar
=A0 =A0 =A0 =A0if (addr2 < addr1)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0swap(addr1, addr2);
- =A0 =A0 =A0 skb->rxhash =3D jhash_3words(addr1, addr2, ports.v32,=
hashrnd);
Post by Krishna Kumar
- =A0 =A0 =A0 if (!skb->rxhash)
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 skb->rxhash =3D 1;
+
+ =A0 =A0 =A0 hash =3D jhash_3words(addr1, addr2, ports.v32, hashrn=
d);
Post by Krishna Kumar
+ =A0 =A0 =A0 if (!hash)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 hash =3D 1;
+ =A0 =A0 =A0 return hash;
+
+ =A0 =A0 =A0 return -1;
+}
+EXPORT_SYMBOL(skb_calculate_flow);
I have noticed that you use skb_calculate_flow() in
macvtap_get_queue() where skb->data doesn't point to the network
header but the ethernet header. However, skb_calculate_flow() assume
* update skb_calculate_flow to support called in ethernet layer.
* pull skb before skb_calculate_flow, and push skb after
skb_calculate_flow() in macvtap_get_queue().
I prefer the former way.
BTW: the function name skb_calculate_flow isn't good. How about
skb_get_rxhash(). Maybe we can implement two versions: fast path and
slow path. And implement the fast path version as a inline function i=
n
skbuff.h.
static inline u32 skb_get_rxhash(struct sk_buff *skb)
{
u32 rxhash;
rxhash =3D skb->rxhash;
if (!rxhash)
return __skb_get_rxhash(skb);
return rxhash;
}
Post by Krishna Kumar
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the ta=
rget
Post by Krishna Kumar
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb=
,
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct rps_dev_flow **=
rflowp)
Post by Krishna Kumar
+{
+ =A0 =A0 =A0 struct netdev_rx_queue *rxqueue;
+ =A0 =A0 =A0 struct rps_map *map;
+ =A0 =A0 =A0 struct rps_dev_flow_table *flow_table;
+ =A0 =A0 =A0 struct rps_sock_flow_table *sock_flow_table;
+ =A0 =A0 =A0 int cpu =3D -1;
+ =A0 =A0 =A0 u16 tcpu;
+
+ =A0 =A0 =A0 if (skb_rx_queue_recorded(skb)) {
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 u16 index =3D skb_get_rx_queue(skb);
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (unlikely(index >=3D dev->num_rx_q=
ueues)) {
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 WARN_ONCE(dev->num_rx=
_queues > 1, "%s
received packet "
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 "on q=
ueue %u, but number of RX
queues is %u\n",
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dev->=
name, index, dev->num_rx_queues);
Post by Krishna Kumar
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx + index;
+ =A0 =A0 =A0 } else
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 rxqueue =3D dev->_rx;
+
+ =A0 =A0 =A0 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+
+ =A0 =A0 =A0 skb->rxhash =3D skb_calculate_flow(dev, skb);
+ =A0 =A0 =A0 if (skb->rxhash < 0)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto done;
+
=A0 =A0 =A0 =A0flow_table =3D rcu_dereference(rxqueue->rps_flow_tab=
le);
Post by Krishna Kumar
=A0 =A0 =A0 =A0sock_flow_table =3D rcu_dereference(rps_sock_flow_ta=
ble);
Post by Krishna Kumar
=A0 =A0 =A0 =A0if (flow_table && sock_flow_table) {
--
To unsubscribe from this list: send the line "unsubscribe netdev" i=
n
Post by Krishna Kumar
More majordomo info at =A0http://vger.kernel.org/majordomo-info.htm=
l
--
Regards,
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Changli Gao
2010-08-03 06:11:30 UTC
Permalink
Post by Krishna Kumar2
Hi Changli,
Good catch.
Instead of adding support for ethernet header or pull/push,
static int macvtap_forward(struct net_device *dev, struct sk_buff *sk=
b)
Post by Krishna Kumar2
{
=A0 =A0 =A0 =A0struct macvtap_queue *q =3D macvtap_get_queue(dev, skb=
);
Post by Krishna Kumar2
=A0 =A0 =A0 =A0if (!q)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto drop;
=A0 =A0 =A0 =A0if (skb_queue_len(&q->sk.sk_receive_queue) >=3D dev->t=
x_queue_len)
Post by Krishna Kumar2
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto drop;
+ =A0 =A0 =A0 skb_push(skb, ETH_HLEN);
=A0 =A0 =A0 =A0...
}
and remove the same in macvtap_receive. Will this be better?
I am confused by the call sites of macvlan_dev.receive and
macvlan_dev.forward. They both are possible to be called in both
RX(skb->data points to network header) and TX(skb->data points to
ethernet) paths. The current code in macvtap shows that
macvlan_dev.receive should be called in network layer, and
macvlan_dev.forward should be called in dev layer. Am I correct?


--=20
Regards,
Changli Gao(***@gmail.com)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Changli Gao
2010-08-03 07:18:40 UTC
Permalink
Post by Changli Gao
Post by Krishna Kumar2
Hi Changli,
Good catch.
Instead of adding support for ethernet header or pull/push,
static int macvtap_forward(struct net_device *dev, struct sk_buff *s=
kb)
Post by Changli Gao
Post by Krishna Kumar2
{
=A0 =A0 =A0 =A0struct macvtap_queue *q =3D macvtap_get_queue(dev, sk=
b);
Post by Changli Gao
Post by Krishna Kumar2
=A0 =A0 =A0 =A0if (!q)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto drop;
=A0 =A0 =A0 =A0if (skb_queue_len(&q->sk.sk_receive_queue) >=3D dev->=
tx_queue_len)
Post by Changli Gao
Post by Krishna Kumar2
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0goto drop;
+ =A0 =A0 =A0 skb_push(skb, ETH_HLEN);
=A0 =A0 =A0 =A0...
}
and remove the same in macvtap_receive. Will this be better?
I am confused by the call sites of macvlan_dev.receive and
macvlan_dev.forward. They both are possible to be called in both
RX(skb->data points to network header) and TX(skb->data points to
ethernet) paths. The current code in macvtap shows that
macvlan_dev.receive should be called in network layer, and
macvlan_dev.forward should be called in dev layer. Am I correct?
After checking the code carefully, I find macvlan_dev.receive is
called in network layer(RX path), and macvlan_dev.forward is called in
dev layer(TX path). The current code hasn't any issue. Your solution
won't work, as macvtap_forward is called in dev layer, when skb->data
points to ethernet header already.

--=20
Regards,
Changli Gao(***@gmail.com)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2010-08-03 08:32:15 UTC
Permalink
Post by Changli Gao
Post by Changli Gao
I am confused by the call sites of macvlan_dev.receive and
macvlan_dev.forward. They both are possible to be called in both
RX(skb->data points to network header) and TX(skb->data points to
ethernet) paths. The current code in macvtap shows that
macvlan_dev.receive should be called in network layer, and
macvlan_dev.forward should be called in dev layer. Am I correct?
After checking the code carefully, I find macvlan_dev.receive is
called in network layer(RX path), and macvlan_dev.forward is called in
dev layer(TX path). The current code hasn't any issue. Your solution
won't work, as macvtap_forward is called in dev layer, when skb->data
points to ethernet header already.
Yes, that is correct. Forward is used for "bridge" mode, where we
can send data between two macvlan/macvtap endpoints directly, as
opposed to the default "vepa" mode, where all data is always sent
out to the lowerdev and may be returned by an adjacent switch in
hairpin configuration.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala
2010-08-03 22:36:10 UTC
Permalink
Post by Arnd Bergmann
Post by Changli Gao
Post by Changli Gao
I am confused by the call sites of macvlan_dev.receive and
macvlan_dev.forward. They both are possible to be called in both
RX(skb->data points to network header) and TX(skb->data points to
ethernet) paths. The current code in macvtap shows that
macvlan_dev.receive should be called in network layer, and
macvlan_dev.forward should be called in dev layer. Am I correct?
After checking the code carefully, I find macvlan_dev.receive is
called in network layer(RX path), and macvlan_dev.forward is called in
dev layer(TX path). The current code hasn't any issue. Your solution
won't work, as macvtap_forward is called in dev layer, when skb->data
points to ethernet header already.
Yes, that is correct. Forward is used for "bridge" mode, where we
can send data between two macvlan/macvtap endpoints directly, as
opposed to the default "vepa" mode, where all data is always sent
out to the lowerdev and may be returned by an adjacent switch in
hairpin configuration.
macvtap_forward() is also used in the normal receive path as
macvtap_receive() has a direct call to macvtap_forward() after
updating skb->data to point to the ethernet header.

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Continue reading on narkive:
Loading...