Discussion:
[RFC] tcp md5 use of alloc_percpu
Crestez Dan Leonard
2014-10-22 18:55:46 UTC
Hello,

It seems that the TCP MD5 feature allocates a percpu struct tcp_md5sig_pool and uses part of that memory for a scratch buffer to do crypto on. Here is the relevant code:

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;

bp = &hp->md5_blk.ip4;

/*
* 1. the TCP pseudo-header (in the order: source IP address,
* destination IP address, zero-padded protocol number, and
* segment length)
*/
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);

sg_init_one(&sg, bp, sizeof(*bp));
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

sg_init_one ends up calling virt_to_page on the pointer (via sg_set_buf), which assumes the memory is directly accessible, i.e. part of the kernel's linear mapping. But the tcp_md5sig_pool pointer comes from alloc_percpu, which can return memory from the vmalloc area once pcpu_first_chunk is exhausted. This looks wrong to me. I am getting crashes on MIPS and I believe this to be the cause.
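
To make the failure concrete, sg_init_one boils down to roughly the following (a paraphrased sketch of the scatterlist helpers; the exact code varies slightly between kernel versions):

/*
 * sg_init_one() is sg_init_table(sg, 1) followed by sg_set_buf(),
 * and sg_set_buf() translates the virtual address like this:
 */
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
                              unsigned int buflen)
{
        sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
/*
 * virt_to_page() is only meaningful for addresses in the kernel's
 * linear mapping; for a vmalloc'ed address it computes a bogus
 * struct page, which would explain the crashes once the percpu
 * allocator starts handing out vmalloc'ed chunks.
 */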

Allocating a scratch buffer this way is very peculiar. The tcp4_pseudohdr struct is only 12 bytes in length. Similar code in tcp_v6_md5_hash_pseudoheader uses a 40 byte tcp6_pseudohdr. I think it is perfectly reasonable to allocate this kind of stuff on the stack, right? These pseudohdr structs are not used at all outside these two static functions and it would simplify the code.

The whole notion of struct tcp_md5sig_pool seems dubious. This is a very tiny struct already, and after removing the pseudohdr it shrinks to a percpu hash_desc for md5 (8 or 16 bytes). Wouldn't DEFINE_PER_CPU be more appropriate? Before commit 71cea17ed39fdf1c0634f530ddc6a2c2fc601c2b the tcp_md5sig_pool structs were freed when all users were gone, but that functionality seems to have been dropped.

I'm not familiar with the Linux crypto API. Isn't there an easier way to get a temporary md5 hasher?
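
For comparison, here is a minimal sketch of a one-shot hasher built on the same crypto_hash API this code already uses (md5_of_buf is a made-up name for illustration, not an existing kernel helper; the kmemdup is only there because the input may live on the stack):

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/err.h>

static int md5_of_buf(const void *data, unsigned int len, u8 *out)
{
        struct hash_desc desc;
        struct scatterlist sg;
        void *buf;
        int err;

        desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(desc.tfm))
                return PTR_ERR(desc.tfm);
        desc.flags = 0;

        /* kmalloc'ed memory is in the linear mapping, so it is safe to
         * hand to sg_init_one(), unlike stack or vmalloc'ed memory.
         */
        buf = kmemdup(data, len, GFP_KERNEL);
        if (!buf) {
                err = -ENOMEM;
                goto out_free_hash;
        }
        sg_init_one(&sg, buf, len);

        err = crypto_hash_init(&desc);
        if (!err)
                err = crypto_hash_update(&desc, &sg, len);
        if (!err)
                err = crypto_hash_final(&desc, out); /* out: 16 bytes for md5 */

        kfree(buf);
out_free_hash:
        crypto_free_hash(desc.tfm);
        return err;
}

Allocating the tfm on every call would obviously be too expensive in the packet path, which is presumably why the percpu pool exists in the first place.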

Here's what I mean by allocating tcp{4,6}_pseudohdr on the stack:

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4062b4f..beabd7b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1266,33 +1266,9 @@ struct tcp_md5sig_info {
struct rcu_head rcu;
};

-/* - pseudo header */
-struct tcp4_pseudohdr {
- __be32 saddr;
- __be32 daddr;
- __u8 pad;
- __u8 protocol;
- __be16 len;
-};
-
-struct tcp6_pseudohdr {
- struct in6_addr saddr;
- struct in6_addr daddr;
- __be32 len;
- __be32 protocol; /* including padding */
-};
-
-union tcp_md5sum_block {
- struct tcp4_pseudohdr ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct tcp6_pseudohdr ip6;
-#endif
-};
-
/* - pool: digest algorithm, hash description and scratch buffer */
struct tcp_md5sig_pool {
struct hash_desc md5_desc;
- union tcp_md5sum_block md5_blk;
};

/* - functions */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 94d1a77..e716a67 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1041,27 +1041,33 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
GFP_KERNEL);
}

+struct tcp4_pseudohdr {
+ __be32 saddr;
+ __be32 daddr;
+ __u8 pad;
+ __u8 protocol;
+ __be16 len;
+};
+
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
- struct tcp4_pseudohdr *bp;
+ struct tcp4_pseudohdr bp;
struct scatterlist sg;

- bp = &hp->md5_blk.ip4;
-
/*
* 1. the TCP pseudo-header (in the order: source IP address,
* destination IP address, zero-padded protocol number, and
* segment length)
*/
- bp->saddr = saddr;
- bp->daddr = daddr;
- bp->pad = 0;
- bp->protocol = IPPROTO_TCP;
- bp->len = cpu_to_be16(nbytes);
-
- sg_init_one(&sg, bp, sizeof(*bp));
- return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+ bp.saddr = saddr;
+ bp.daddr = daddr;
+ bp.pad = 0;
+ bp.protocol = IPPROTO_TCP;
+ bp.len = cpu_to_be16(nbytes);
+
+ sg_init_one(&sg, &bp, sizeof(bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8314955..87a9126 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -568,22 +568,28 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval,
AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

+struct tcp6_pseudohdr {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ __be32 len;
+ __be32 protocol; /* including padding */
+};
+
static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
const struct in6_addr *daddr,
const struct in6_addr *saddr, int nbytes)
{
- struct tcp6_pseudohdr *bp;
+ struct tcp6_pseudohdr bp;
struct scatterlist sg;

- bp = &hp->md5_blk.ip6;
/* 1. TCP pseudo-header (RFC2460) */
- bp->saddr = *saddr;
- bp->daddr = *daddr;
- bp->protocol = cpu_to_be32(IPPROTO_TCP);
- bp->len = cpu_to_be32(nbytes);
+ bp.saddr = *saddr;
+ bp.daddr = *daddr;
+ bp.protocol = cpu_to_be32(IPPROTO_TCP);
+ bp.len = cpu_to_be32(nbytes);

- sg_init_one(&sg, bp, sizeof(*bp));
- return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+ sg_init_one(&sg, &bp, sizeof(bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(bp));
}

static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
Eric Dumazet
2014-10-22 19:12:38 UTC
Post by Crestez Dan Leonard
Hello,
It seems that the TCP MD5 feature allocates a percpu struct
tcp_md5sig_pool and uses part of that memory for a scratch buffer to
do crypto on:
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
bp = &hp->md5_blk.ip4;
/*
* 1. the TCP pseudo-header (in the order: source IP address,
* destination IP address, zero-padded protocol number, and
* segment length)
*/
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
sg_init_one ends up calling virt_to_page on the pointer, which assumes
it is directly accessible. But the tcp_md5sig_pool pointer comes from
alloc_percpu, which can return memory from the vmalloc area once
pcpu_first_chunk is exhausted. This looks wrong to me. I am getting
crashes on MIPS and I believe this to be the cause.
Then just remove the alloc_percpu() from __tcp_alloc_md5sig_pool() and
make this a static per-cpu definition (not a dynamic allocation).
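
i.e. something along these lines (sketch):

static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
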
Post by Crestez Dan Leonard
Allocating a scratch buffer this way is very peculiar. The
tcp4_pseudohdr struct is only 12 bytes in length. Similar code in
tcp_v6_md5_hash_pseudoheader uses a 40 byte tcp6_pseudohdr. I think it
is perfectly reasonable to allocate this kind of stuff on the stack,
right? These pseudohdr structs are not used at all outside these two
static functions and it would simplify the code.
Yep, but the sg stuff does not allow for stack variables. Because of
possible offloading and DMA, I don't know...
Post by Crestez Dan Leonard
The whole notion of struct tcp_md5sig_pool seems dubious. This is a
very tiny struct already and after removing the pseudohdr it shrinks
to a percpu hash_desc for md5 (8 or 16 bytes). Wouldn't DEFINE_PER_CPU
be more appropriate?
Sure. this would be the more appropriate fix IMO.
Post by Crestez Dan Leonard
Before commit 71cea17ed39fdf1c0634f530ddc6a2c2fc601c2b the
tcp_md5sig_pool structs were freed when all users were gone, but that
functionality seems to have been dropped.
I'm not familiar with the Linux crypto API. Isn't there an easier way
to get a temporary md5 hasher?
You should CC crypto guys maybe ...
Your patch is quite invasive; you should do something simpler to ease
backports.

Thanks
Jonathan Toppins
2014-10-22 21:35:12 UTC
Post by Crestez Dan Leonard
Hello,
It seems that the TCP MD5 feature allocates a percpu struct
tcp_md5sig_pool and uses part of that memory for a scratch buffer to
do crypto on:
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
bp = &hp->md5_blk.ip4;
/*
* 1. the TCP pseudo-header (in the order: source IP address,
* destination IP address, zero-padded protocol number, and
* segment length)
*/
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
sg_init_one ends up calling virt_to_page on the pointer, which assumes
it is directly accessible. But the tcp_md5sig_pool pointer comes from
alloc_percpu, which can return memory from the vmalloc area once
pcpu_first_chunk is exhausted. This looks wrong to me. I am getting
crashes on MIPS and I believe this to be the cause.
I can confirm this created an issue on our powerpc-based switches. My
solution in our 3.2 kernel was to allocate the buffer on the stack. I
like this solution better.
Crestez Dan Leonard
2014-10-22 23:05:30 UTC
Post by Eric Dumazet
Post by Crestez Dan Leonard
Hello,
It seems that the TCP MD5 feature allocates a percpu struct
tcp_md5sig_pool and uses part of that memory for a scratch buffer to
do crypto on:
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
struct tcp4_pseudohdr *bp;
struct scatterlist sg;
bp = &hp->md5_blk.ip4;
/*
* 1. the TCP pseudo-header (in the order: source IP address,
* destination IP address, zero-padded protocol number, and
* segment length)
*/
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
sg_init_one ends up calling virt_to_page on the pointer, which assumes
it is directly accessible. But the tcp_md5sig_pool pointer comes from
alloc_percpu, which can return memory from the vmalloc area once
pcpu_first_chunk is exhausted. This looks wrong to me. I am getting
crashes on MIPS and I believe this to be the cause.
Allocating a scratch buffer this way is very peculiar. The
tcp4_pseudohdr struct is only 12 bytes in length. Similar code in
tcp_v6_md5_hash_pseudoheader uses a 40 byte tcp6_pseudohdr. I think it
is perfectly reasonable to allocate this kind of stuff on the stack,
right? These pseudohdr structs are not used at all outside these two
static functions and it would simplify the code.
Yep, but the sg stuff does not allow for stack variables. Because of
possible offloading and DMA, I don't know...
A stack buffer is used in tcp_md5_hash_header to add a tcphdr to the
hash. A quick grep for sg_init_one finds a couple of additional instances
of what looks like doing crypto on small stack buffers:

net/bluetooth/smp.c:110
net/sunrpc/auth_gss/gss_krb5_crypto.c:194
net/rxrpc/rxkad.c:multiple

But those might also be bugs.
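
A runtime check for such call sites could look something like this (a hypothetical debug helper, not anything in the tree; is_vmalloc_addr and virt_addr_valid are existing helpers):

#include <linux/mm.h>

/* Hypothetical sanity check for buffers handed to sg_init_one():
 * virt_to_page() only yields a meaningful struct page for addresses
 * in the kernel's linear mapping.
 */
static inline void check_sg_buf(const void *buf)
{
        WARN_ONCE(is_vmalloc_addr(buf) || !virt_addr_valid(buf),
                  "sg buffer %p not in linear mapping\n", buf);
}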

If the buffers passed to the crypto API need to be DMA-able then wouldn't
this also exclude DEFINE_PER_CPU? The DMA-API-HOWTO mentions that items
in data/text/bss might not be DMA-able, presumably depending on the
architecture.
Post by Eric Dumazet
Post by Crestez Dan Leonard
I'm not familiar with the Linux crypto API. Isn't there an easier way
to get a temporary md5 hasher?
You should CC crypto guys maybe ...
Added linux-crypto in CC. To summarize the question: what kind of memory
can be passed to crypto API functions like crypto_hash_update?
Post by Eric Dumazet
Post by Crestez Dan Leonard
The whole notion of struct tcp_md5sig_pool seems dubious. This is a
very tiny struct already and after removing the pseudohdr it shrinks
to a percpu hash_desc for md5 (8 or 16 bytes). Wouldn't DEFINE_PER_CPU
be more appropriate?
Sure. this would be the more appropriate fix IMO.
I'll post this as a patch if somebody can confirm that it is correct and
portable.

Doing a temp kmalloc/kfree would also work, but it would hurt
performance. It would be nice to have a generic way to ask for a small
temporary DMA-able buffer.

If DEFINE_PER_CPU is not suitable then the tcp_md5sig_pool structs should
be allocated via individual kmallocs for each cpu.

Regards,
Leonard
David Miller
2014-10-22 21:53:24 UTC
From: Crestez Dan Leonard <***@gmail.com>
Date: Wed, 22 Oct 2014 21:55:46 +0300
Post by Crestez Dan Leonard
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
__be32 daddr, __be32 saddr, int nbytes)
{
- struct tcp4_pseudohdr *bp;
+ struct tcp4_pseudohdr bp;
...
Post by Crestez Dan Leonard
+ sg_init_one(&sg, &bp, sizeof(bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(bp));
As others have mentioned, you cannot do this.

On some architectures the kernel stack comes from vmalloc()
memory too.
Jonathan Toppins
2014-10-22 23:38:31 UTC
Post by Crestez Dan Leonard
sg_init_one ends up calling virt_to_page on the pointer, which assumes it is directly accessible. But the tcp_md5sig_pool pointer comes from alloc_percpu, which can return memory from the vmalloc area once pcpu_first_chunk is exhausted. This looks wrong to me. I am getting crashes on MIPS and I believe this to be the cause.
Thinking about this more: if the issue really is that sg_init_one assumes a
directly accessible memory region, can we just modify the zone
allocation to GFP_DMA using alloc_percpu_gfp()? Does this satisfy the
assumptions made by sg_init_one?

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e7..6924320 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2889,7 +2889,7 @@ static void __tcp_alloc_md5sig_pool(void)
int cpu;
struct tcp_md5sig_pool __percpu *pool;

- pool = alloc_percpu(struct tcp_md5sig_pool);
+ pool = alloc_percpu_gfp(struct tcp_md5sig_pool, GFP_DMA);
if (!pool)
return;
Crestez Dan Leonard
2014-10-23 01:00:39 UTC
Post by Jonathan Toppins
Post by Crestez Dan Leonard
sg_init_one ends up calling virt_to_page on the pointer, which assumes it is directly accessible. But the tcp_md5sig_pool pointer comes from alloc_percpu, which can return memory from the vmalloc area once pcpu_first_chunk is exhausted. This looks wrong to me. I am getting crashes on MIPS and I believe this to be the cause.
Thinking about this more: if the issue really is that sg_init_one assumes a
directly accessible memory region, can we just modify the zone
allocation to GFP_DMA using alloc_percpu_gfp()? Does this satisfy the
assumptions made by sg_init_one?
I don't think alloc_percpu_gfp can be used that way. Looking at the
code, it only checks for GFP_KERNEL and behaves "atomically" if it is not
present. This means that it fails rather than vmalloc-ing a new percpu chunk.

The problem is not that the memory is not allocated with GFP_DMA but
rather that the memory is allocated with vmalloc.

Regards,
Leonard
Eric Dumazet
2014-10-23 01:47:57 UTC
Post by Crestez Dan Leonard
Post by Jonathan Toppins
Post by Crestez Dan Leonard
sg_init_one ends up calling virt_to_page on the pointer, which assumes it is directly accessible. But the tcp_md5sig_pool pointer comes from alloc_percpu, which can return memory from the vmalloc area once pcpu_first_chunk is exhausted. This looks wrong to me. I am getting crashes on MIPS and I believe this to be the cause.
Thinking about this more: if the issue really is that sg_init_one assumes a
directly accessible memory region, can we just modify the zone
allocation to GFP_DMA using alloc_percpu_gfp()? Does this satisfy the
assumptions made by sg_init_one?
I don't think alloc_percpu_gfp can be used that way. Looking at the
code, it only checks for GFP_KERNEL and behaves "atomically" if it is not
present. This means that it fails rather than vmalloc-ing a new percpu chunk.
The problem is not that the memory is not allocated with GFP_DMA but
rather that the memory is allocated with vmalloc.
Could you try the following patch?

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..d253ad8ced64 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,30 +2868,29 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
+static bool tcp_md5sig_pool_populated = false;

-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
+static void tcp_free_md5sig_pool(void)
{
int cpu;

for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
+ struct crypto_hash *hash;
+
+ hash = per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm;

- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
+ if (hash) {
+ crypto_free_hash(hash);
+ per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = NULL;
+ }
}
- free_percpu(pool);
}

static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
- struct tcp_md5sig_pool __percpu *pool;
-
- pool = alloc_percpu(struct tcp_md5sig_pool);
- if (!pool)
- return;

for_each_possible_cpu(cpu) {
struct crypto_hash *hash;
@@ -2900,29 +2899,29 @@ static void __tcp_alloc_md5sig_pool(void)
if (IS_ERR_OR_NULL(hash))
goto out_free;

- per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
+ per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
}
- /* before setting tcp_md5sig_pool, we must commit all writes
- * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool = pool;
+ tcp_md5sig_pool_populated = true;
return;
out_free:
- __tcp_free_md5sig_pool(pool);
+ tcp_free_md5sig_pool();
}

bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool)) {
+ if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);

- if (!tcp_md5sig_pool)
+ if (!tcp_md5sig_pool_populated)
__tcp_alloc_md5sig_pool();

mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool != NULL;
+ return tcp_md5sig_pool_populated;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

@@ -2936,13 +2935,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
*/
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *p;
-
local_bh_disable();
- p = ACCESS_ONCE(tcp_md5sig_pool);
- if (p)
- return raw_cpu_ptr(p);

+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool */
+ smp_rmb();
+ return this_cpu_ptr(&tcp_md5sig_pool);
+ }
local_bh_enable();
return NULL;
}
David Ahern
2014-10-23 04:40:36 UTC
Post by Crestez Dan Leonard
Hello,
This is a forward port of a local change to address the problem (the local
kernel version is 3.4, so perhaps my quick bump to top of tree is off, but
it shows the general idea). It has been on my to-do list to figure out why
this is needed, but it seems related to your problem:

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..833a676bd4b0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2941,7 +2941,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
local_bh_disable();
p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- return raw_cpu_ptr(p);
+ return __va(per_cpu_ptr_to_phys(raw_cpu_ptr(p)));

local_bh_enable();
return NULL;

David
Eric Dumazet
2014-10-23 05:23:53 UTC
Post by David Ahern
Post by Crestez Dan Leonard
Hello,
This is a forward port of a local change to address the problem (the local
kernel version is 3.4, so perhaps my quick bump to top of tree is off, but
it shows the general idea). It has been on my to-do list to figure out why
this is needed:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..833a676bd4b0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2941,7 +2941,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
local_bh_disable();
p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- return raw_cpu_ptr(p);
+ return __va(per_cpu_ptr_to_phys(raw_cpu_ptr(p)));
local_bh_enable();
return NULL;
per_cpu_ptr_to_phys() can be pretty expensive and should not be called
in the fast path.
Eric Dumazet
2014-10-23 05:38:08 UTC
Post by Eric Dumazet
Post by David Ahern
Post by Crestez Dan Leonard
Hello,
This is a forward port of a local change to address the problem (the local
kernel version is 3.4, so perhaps my quick bump to top of tree is off, but
it shows the general idea). It has been on my to-do list to figure out why
this is needed:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..833a676bd4b0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2941,7 +2941,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
local_bh_disable();
p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- return raw_cpu_ptr(p);
+ return __va(per_cpu_ptr_to_phys(raw_cpu_ptr(p)));
local_bh_enable();
return NULL;
per_cpu_ptr_to_phys() can be pretty expensive and should not be called
in the fast path.
My updated patch would be:

net/ipv4/tcp.c | 66 +++++++++++++++++++----------------------------
1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..af4dc16b61f6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,61 +2868,51 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, *tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
-
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- }
- free_percpu(pool);
-}
+static bool tcp_md5sig_pool_populated = false;

static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
- struct tcp_md5sig_pool __percpu *pool;
-
- pool = alloc_percpu(struct tcp_md5sig_pool);
- if (!pool)
- return;

for_each_possible_cpu(cpu) {
+ struct tcp_md5sig_pool *pool;
struct crypto_hash *hash;

- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR_OR_NULL(hash))
- goto out_free;
-
- per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
+ pool = per_cpu(tcp_md5sig_pool, cpu);
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!pool)
+ return;
+ per_cpu(tcp_md5sig_pool, cpu) = pool;
+ }
+ if (!pool->md5_desc.tfm) {
+ hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(hash))
+ return;
+ pool->md5_desc.tfm = hash;
+ }
}
- /* before setting tcp_md5sig_pool, we must commit all writes
- * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool = pool;
- return;
-out_free:
- __tcp_free_md5sig_pool(pool);
+ tcp_md5sig_pool_populated = true;
}

bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool)) {
+ if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);

- if (!tcp_md5sig_pool)
+ if (!tcp_md5sig_pool_populated)
__tcp_alloc_md5sig_pool();

mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool != NULL;
+ return tcp_md5sig_pool_populated;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

@@ -2936,13 +2926,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
*/
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *p;
-
local_bh_disable();
- p = ACCESS_ONCE(tcp_md5sig_pool);
- if (p)
- return raw_cpu_ptr(p);

+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool */
+ smp_rmb();
+ return this_cpu_read(tcp_md5sig_pool);
+ }
local_bh_enable();
return NULL;
}
Jonathan Toppins
2014-10-23 06:58:53 UTC
Post by Eric Dumazet
Post by Eric Dumazet
Post by David Ahern
Post by Crestez Dan Leonard
Hello,
This is a forward port of a local change to address the problem (the local
kernel version is 3.4, so perhaps my quick bump to top of tree is off, but
it shows the general idea). It has been on my to-do list to figure out why
this is needed:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..833a676bd4b0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2941,7 +2941,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
local_bh_disable();
p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- return raw_cpu_ptr(p);
+ return __va(per_cpu_ptr_to_phys(raw_cpu_ptr(p)));
local_bh_enable();
return NULL;
per_cpu_ptr_to_phys() can be pretty expensive and should not be called
in the fast path.
net/ipv4/tcp.c | 66 +++++++++++++++++++----------------------------
1 file changed, 28 insertions(+), 38 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..af4dc16b61f6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,61 +2868,51 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, *tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
-
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- }
- free_percpu(pool);
-}
+static bool tcp_md5sig_pool_populated = false;
static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
- struct tcp_md5sig_pool __percpu *pool;
-
- pool = alloc_percpu(struct tcp_md5sig_pool);
- if (!pool)
- return;
for_each_possible_cpu(cpu) {
+ struct tcp_md5sig_pool *pool;
struct crypto_hash *hash;
- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR_OR_NULL(hash))
- goto out_free;
-
- per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
+ pool = per_cpu(tcp_md5sig_pool, cpu);
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
GFP_DMA | GFP_KERNEL
This memory will possibly be used in a DMA, correct? (thinking crypto
hardware offload)
Post by Eric Dumazet
+ cpu_to_node(cpu));
+ if (!pool)
+ return;
+ per_cpu(tcp_md5sig_pool, cpu) = pool;
+ }
+ if (!pool->md5_desc.tfm) {
+ hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(hash))
+ return;
+ pool->md5_desc.tfm = hash;
+ }
}
- /* before setting tcp_md5sig_pool, we must commit all writes
- * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool = pool;
- return;
- __tcp_free_md5sig_pool(pool);
+ tcp_md5sig_pool_populated = true;
}
bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool)) {
+ if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);
- if (!tcp_md5sig_pool)
+ if (!tcp_md5sig_pool_populated)
__tcp_alloc_md5sig_pool();
mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool != NULL;
+ return tcp_md5sig_pool_populated;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -2936,13 +2926,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
*/
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *p;
-
local_bh_disable();
- p = ACCESS_ONCE(tcp_md5sig_pool);
- if (p)
- return raw_cpu_ptr(p);
+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool */
+ smp_rmb();
+ return this_cpu_read(tcp_md5sig_pool);
+ }
local_bh_enable();
return NULL;
}
Eric Dumazet
2014-10-23 13:21:30 UTC
Post by Jonathan Toppins
Post by Eric Dumazet
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
GFP_DMA | GFP_KERNEL
This memory will possibly be used in a DMA, correct? (thinking crypto
hardware offload)
I am not sure this is the case, but this certainly can be added.
Eric Dumazet
2014-10-23 14:43:25 UTC
Permalink
Post by Eric Dumazet
Post by Jonathan Toppins
Post by Eric Dumazet
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
GFP_DMA | GFP_KERNEL
This memory will possibly be used in a DMA, correct? (thinking crypto
hardware offload)
I am not sure this is the case, but this certainly can be added.
Yes, this is not the case.


The real problem is that sg_set_buf() does the following:


sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));

So it assumes a memory range is not spanning multiple pages.

So maybe a simpler patch would be:


diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c5852d8ba3392b22aa58d6453ab4d..53e355199437b00836635c5919f2f1a1ae237271 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2886,10 +2886,17 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)

static void __tcp_alloc_md5sig_pool(void)
{
- int cpu;
struct tcp_md5sig_pool __percpu *pool;
+ size_t sz;
+ int cpu;

- pool = alloc_percpu(struct tcp_md5sig_pool);
+ /* sg_set_buf() assumes a contiguous memory area, but alloc_percpu()
+ * can use vmalloc(). So make sure we ask an alignment so that
+ * tcp_md5sig_pool is located in a single page.
+ */
+ BUILD_BUG_ON(sizeof(struct tcp_md5sig_pool) > PAGE_SIZE);
+ sz = roundup_pow_of_two(sizeof(struct tcp_md5sig_pool));
+ pool = __alloc_percpu(sz, sz);
if (!pool)
return;
Crestez Dan Leonard
2014-10-23 16:17:47 UTC
Post by Eric Dumazet
Post by Eric Dumazet
Post by Jonathan Toppins
Post by Eric Dumazet
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
GFP_DMA | GFP_KERNEL
This memory will possibly be used in a DMA, correct? (thinking crypto
hardware offload)
I am not sure this is the case, but this certainly can be added.
Yes, this is not the case.
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
So it assumes a memory range is not spanning multiple pages.
Doesn't virt_to_page also assume that the memory is not from vmalloc?

Making this portable would require checking is_vmalloc_addr and doing
vmalloc_to_page instead. It would be easier to just use kmalloc.
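
A portable translation would have to look something like this (just a sketch to illustrate the point, not a proposal):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: resolve either a linear-mapping or a vmalloc address. */
static struct page *any_virt_to_page(const void *buf)
{
        if (is_vmalloc_addr(buf))
                return vmalloc_to_page(buf);
        return virt_to_page(buf);
}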

Regards,
Leonard
Eric Dumazet
2014-10-23 19:22:36 UTC
Post by Crestez Dan Leonard
Doesn't virt_to_page also assume that the memory is not from vmalloc?
Making this portable would require checking is_vmalloc_addr and doing
vmalloc_to_page instead. It would be easier to just use kmalloc.
Oh right, I'll send a v2

Thanks !
Eric Dumazet
2014-10-23 16:33:18 UTC
From: Eric Dumazet <***@google.com>

percpu tcp_md5sig_pool contains memory blobs that ultimately
go through sg_set_buf().

-> sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));

This requires that the whole area is in a physically contiguous portion
of memory.

Given that alloc_percpu() can use vmalloc() areas, we need to make sure
tcp_md5sig_pool is allocated from a single page.


Signed-off-by: Eric Dumazet <***@google.com>
Fixes: 765cf9976e93 ("tcp: md5: remove one indirection level in tcp_md5sig_pool")
Reported-by: Crestez Dan Leonard <***@gmail.com>
---

Jonathan, David and Crestez, please test that this patch fixes
the issue for good. We might avoid the dynamic memory allocations in
future kernels, but a simple fix for stable kernels is better IMO.

Thanks !

net/ipv4/tcp.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c5852d8ba3392b22aa58d6453ab4d..1f59e4130db99f129279c13f89b3c058ed6167e2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2886,10 +2886,17 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)

static void __tcp_alloc_md5sig_pool(void)
{
- int cpu;
struct tcp_md5sig_pool __percpu *pool;
+ size_t align;
+ int cpu;

- pool = alloc_percpu(struct tcp_md5sig_pool);
+ /* sg_set_buf() assumes a contiguous memory area, but alloc_percpu()
+ * can use vmalloc(). So make sure we request an alignment so that
+ * whole tcp_md5sig_pool is contained in a single page.
+ */
+ BUILD_BUG_ON(sizeof(struct tcp_md5sig_pool) > PAGE_SIZE);
+ align = roundup_pow_of_two(sizeof(struct tcp_md5sig_pool));
+ pool = __alloc_percpu(sizeof(struct tcp_md5sig_pool), align);
if (!pool)
return;
Eric Dumazet
2014-10-23 19:34:09 UTC
Post by Eric Dumazet
percpu tcp_md5sig_pool contains memory blobs that ultimately
go through sg_set_buf().
-> sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
This requires that the whole area is in a physically contiguous portion
of memory.
Given that alloc_percpu() can use vmalloc() areas, we need to make sure
tcp_md5sig_pool is allocated from a single page.
Fixes: 765cf9976e93 ("tcp: md5: remove one indirection level in tcp_md5sig_pool")
---
Self Nack, we definitely need to avoid vmalloc() in the first place.
Eric Dumazet
2014-10-23 19:58:58 UTC
From: Eric Dumazet <***@google.com>

percpu tcp_md5sig_pool contains memory blobs that ultimately
go through sg_set_buf().

-> sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));

This requires that the whole area is in a physically contiguous portion
of memory. And that @buf is not backed by vmalloc().

Given that alloc_percpu() can use vmalloc() areas, this does not
fit the requirements.

Replace alloc_percpu() with a static DEFINE_PER_CPU(); as tcp_md5sig_pool
is small anyway, there is no gain in allocating it dynamically.

Signed-off-by: Eric Dumazet <***@google.com>
Fixes: 765cf9976e93 ("tcp: md5: remove one indirection level in tcp_md5sig_pool")
Reported-by: Crestez Dan Leonard <***@gmail.com>
---
net/ipv4/tcp.c | 59 +++++++++++++++--------------------------------
1 file changed, 20 insertions(+), 39 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..39ec0c379545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,61 +2868,42 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
-
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- }
- free_percpu(pool);
-}
+static bool tcp_md5sig_pool_populated = false;

static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
- struct tcp_md5sig_pool __percpu *pool;
-
- pool = alloc_percpu(struct tcp_md5sig_pool);
- if (!pool)
- return;

for_each_possible_cpu(cpu) {
- struct crypto_hash *hash;
-
- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR_OR_NULL(hash))
- goto out_free;
+ if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
+ struct crypto_hash *hash;

- per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
+ hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(hash))
+ return;
+ per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
+ }
}
- /* before setting tcp_md5sig_pool, we must commit all writes
- * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool = pool;
- return;
-out_free:
- __tcp_free_md5sig_pool(pool);
+ tcp_md5sig_pool_populated = true;
}

bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool)) {
+ if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);

- if (!tcp_md5sig_pool)
+ if (!tcp_md5sig_pool_populated)
__tcp_alloc_md5sig_pool();

mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool != NULL;
+ return tcp_md5sig_pool_populated;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

@@ -2936,13 +2917,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
*/
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
- struct tcp_md5sig_pool __percpu *p;
-
local_bh_disable();
- p = ACCESS_ONCE(tcp_md5sig_pool);
- if (p)
- return raw_cpu_ptr(p);

+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+ smp_rmb();
+ return this_cpu_ptr(&tcp_md5sig_pool);
+ }
local_bh_enable();
return NULL;
}
David Ahern
2014-10-23 20:44:11 UTC
Post by Eric Dumazet
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..39ec0c379545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,61 +2868,42 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
-
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- }
- free_percpu(pool);
-}
+static bool tcp_md5sig_pool_populated = false;
Global variables do not need to be initialized to 0.

I'll see how this applies to v3.4 and build an image.

Thanks,
David

Crestez Dan Leonard
2014-10-23 14:46:06 UTC
Post by Eric Dumazet
Post by Jonathan Toppins
+ if (!pool) {
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL,
GFP_DMA | GFP_KERNEL
This memory will possibly be used in a DMA, correct? (thinking crypto
hardware offload)
I am not sure this is the case, but this certainly can be added.
As far as I know, what GFP_DMA actually does is restrict allocation to
low memory addresses representable in 24 bits, for very old devices.
There is also GFP_DMA32, which restricts allocation to addresses under
32 bits (for devices which don't support 64-bit addresses).

This kind of stuff only belongs in device drivers where the exact
hardware limitations are known. I don't think crypto devices with this
kind of limitation can be exposed through the crypto API that the
md5sig feature uses.

--
Regards,
Leonard
Crestez Dan Leonard
2014-10-23 13:03:13 UTC
Post by Eric Dumazet
net/ipv4/tcp.c | 66 +++++++++++++++++++----------------------------
1 file changed, 28 insertions(+), 38 deletions(-)
I can confirm that this patch fixes my original issue.

I am working with a kernel based on 3.10 so I had to integrate 71cea17ed39fdf1c0634f530ddc6a2c2fc601c2b as well.

Regards,
Leonard