Discussion:
[PATCH RFC 1/9] net: rename fp->bpf_func to fp->run_filter
Pablo Neira Ayuso
2014-03-11 09:19:12 UTC
Permalink
This patch is a cleanup that renames the function pointer that actually
performs the filtering from bpf_func to the more generic run_filter.
This change was introduced to generalize the socket filtering
infrastructure.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
arch/arm/net/bpf_jit_32.c | 6 +++---
arch/powerpc/net/bpf_jit_comp.c | 6 +++---
arch/s390/net/bpf_jit_comp.c | 6 +++---
arch/sparc/net/bpf_jit_comp.c | 6 +++---
arch/x86/net/bpf_jit_comp.c | 6 +++---
include/linux/filter.h | 6 +++---
net/core/filter.c | 2 +-
7 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 271b5e9..65bd347 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -924,7 +924,7 @@ void bpf_jit_compile(struct sk_filter *fp)
/* there are 2 passes here */
bpf_jit_dump(fp->len, alloc_size, 2, ctx.target);

- fp->bpf_func = (void *)ctx.target;
+ fp->run_filter = (void *)ctx.target;
out:
kfree(ctx.offsets);
return;
@@ -932,7 +932,7 @@ out:

void bpf_jit_free(struct sk_filter *fp)
{
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);
kfree(fp);
}
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 555034f..6491d72 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -688,7 +688,7 @@ void bpf_jit_compile(struct sk_filter *fp)
/* Function descriptor nastiness: Address + TOC */
((u64 *)image)[0] = (u64)code_base;
((u64 *)image)[1] = local_paca->kernel_toc;
- fp->bpf_func = (void *)image;
+ fp->run_filter = (void *)image;
}
out:
kfree(addrs);
@@ -697,7 +697,7 @@ out:

void bpf_jit_free(struct sk_filter *fp)
{
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);
kfree(fp);
}
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 708d60e..23089df 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -876,7 +876,7 @@ void bpf_jit_compile(struct sk_filter *fp)
}
if (jit.start) {
set_memory_ro((unsigned long)header, header->pages);
- fp->bpf_func = (void *) jit.start;
+ fp->run_filter = (void *) jit.start;
}
out:
kfree(addrs);
@@ -884,10 +884,10 @@ out:

void bpf_jit_free(struct sk_filter *fp)
{
- unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
+ unsigned long addr = (unsigned long)fp->run_filter & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;

- if (fp->bpf_func == sk_run_filter)
+ if (fp->run_filter == sk_run_filter)
goto free_filter;
set_memory_rw(addr, header->pages);
module_free(NULL, header);
diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c
index 01fe994..ee1cd30 100644
--- a/arch/sparc/net/bpf_jit_comp.c
+++ b/arch/sparc/net/bpf_jit_comp.c
@@ -808,7 +808,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf];

if (image) {
bpf_flush_icache(image, image + proglen);
- fp->bpf_func = (void *)image;
+ fp->run_filter = (void *)image;
}
out:
kfree(addrs);
@@ -817,7 +817,7 @@ out:

void bpf_jit_free(struct sk_filter *fp)
{
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);
kfree(fp);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4ed75dd..bfadd14 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -771,7 +771,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
if (image) {
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
- fp->bpf_func = (void *)image;
+ fp->run_filter = (void *)image;
}
out:
kfree(addrs);
@@ -781,7 +781,7 @@ out:
static void bpf_jit_free_deferred(struct work_struct *work)
{
struct sk_filter *fp = container_of(work, struct sk_filter, work);
- unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
+ unsigned long addr = (unsigned long)fp->run_filter & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;

set_memory_rw(addr, header->pages);
@@ -791,7 +791,7 @@ static void bpf_jit_free_deferred(struct work_struct *work)

void bpf_jit_free(struct sk_filter *fp)
{
- if (fp->bpf_func != sk_run_filter) {
+ if (fp->run_filter != sk_run_filter) {
INIT_WORK(&fp->work, bpf_jit_free_deferred);
schedule_work(&fp->work);
} else {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index e568c8e..6c5d597 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -27,8 +27,8 @@ struct sk_filter
atomic_t refcnt;
unsigned int len; /* Number of filter blocks */
struct rcu_head rcu;
- unsigned int (*bpf_func)(const struct sk_buff *skb,
- const struct sock_filter *filter);
+ unsigned int (*run_filter)(const struct sk_buff *skb,
+ const struct sock_filter *filter);
union {
struct sock_filter insns[0];
struct work_struct work;
@@ -70,7 +70,7 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
16, 1, image, proglen, false);
}
-#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
+#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->run_filter)(SKB, FILTER->insns)
#else
#include <linux/slab.h>
static inline void bpf_jit_compile(struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index ad30d62..0f63e67 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -645,7 +645,7 @@ static int __sk_prepare_filter(struct sk_filter *fp)
{
int err;

- fp->bpf_func = sk_run_filter;
+ fp->run_filter = sk_run_filter;

err = sk_chk_filter(fp->insns, fp->len);
if (err)
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-11 09:19:14 UTC
Permalink
This patch adds a new generic callback function to release the bytecode
area, which depends on the socket filter type. This change prepares
for the implementation of new socket filtering approaches.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/linux/filter.h | 1 +
include/net/sock.h | 4 +---
net/core/filter.c | 4 ++--
3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ab37714..7cba4c2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -28,6 +28,7 @@ struct sk_filter {
struct rcu_head rcu;
unsigned int (*run_filter)(const struct sk_buff *skb,
const struct sock_filter *filter);
+ void (*release_rcu)(struct rcu_head *rcu);
union {
struct sock_filter insns[0];
struct work_struct work;
diff --git a/include/net/sock.h b/include/net/sock.h
index 7b9723c..9f9acbf 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1616,8 +1616,6 @@ void sk_common_release(struct sock *sk);
/* Initialise core socket variables */
void sock_init_data(struct socket *sock, struct sock *sk);

-void sk_filter_release_rcu(struct rcu_head *rcu);
-
/**
* sk_filter_release - release a socket filter
* @fp: filter to remove
@@ -1628,7 +1626,7 @@ void sk_filter_release_rcu(struct rcu_head *rcu);
static inline void sk_filter_release(struct sk_filter *fp)
{
if (atomic_dec_and_test(&fp->refcnt))
- call_rcu(&fp->rcu, sk_filter_release_rcu);
+ call_rcu(&fp->rcu, fp->release_rcu);
}

static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index 3ea0e7f..826ca63 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -633,19 +633,19 @@ EXPORT_SYMBOL(sk_chk_filter);
* sk_filter_release_rcu - Release a socket filter by rcu_head
* @rcu: rcu_head that contains the sk_filter to free
*/
-void sk_filter_release_rcu(struct rcu_head *rcu)
+static void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

bpf_jit_free(fp);
}
-EXPORT_SYMBOL(sk_filter_release_rcu);

static int __sk_prepare_filter(struct sk_filter *fp)
{
int err;

fp->run_filter = sk_run_filter;
+ fp->release_rcu = sk_filter_release_rcu;

err = sk_chk_filter(fp->insns, sk_bpf_flen(fp));
if (err)
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-11 09:19:15 UTC
Permalink
These functions will be used from different classifiers, so make
them inline and move them to the nf_tables_core.h header file.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/net/netfilter/nf_tables_core.h | 42 ++++++++++++++++++++++++++++++++
net/netfilter/nf_tables_core.c | 40 ------------------------------
2 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index cf2b7ae..004c2aa 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -39,4 +39,46 @@ extern const struct nft_expr_ops nft_payload_fast_ops;
int nft_payload_module_init(void);
void nft_payload_module_exit(void);

+#include <net/netfilter/nf_tables.h>
+
+static inline void nft_cmp_fast_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1])
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ u32 mask;
+
+ mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len);
+ if ((data[priv->sreg].data[0] & mask) == priv->data)
+ return;
+ data[NFT_REG_VERDICT].verdict = NFT_BREAK;
+}
+
+static inline bool nft_payload_fast_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ struct nft_data *dest = &data[priv->dreg];
+ unsigned char *ptr;
+
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
+ ptr = skb_network_header(skb);
+ else
+ ptr = skb_network_header(skb) + pkt->xt.thoff;
+
+ ptr += priv->offset;
+
+ if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ return false;
+
+ if (priv->len == 2)
+ *(u16 *)dest->data = *(u16 *)ptr;
+ else if (priv->len == 4)
+ *(u32 *)dest->data = *(u32 *)ptr;
+ else
+ *(u8 *)dest->data = *(u8 *)ptr;
+ return true;
+}
+
#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 90998a6..d71a0be 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,46 +21,6 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>

-static void nft_cmp_fast_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1])
-{
- const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- u32 mask;
-
- mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len);
- if ((data[priv->sreg].data[0] & mask) == priv->data)
- return;
- data[NFT_REG_VERDICT].verdict = NFT_BREAK;
-}
-
-static bool nft_payload_fast_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
- const struct nft_pktinfo *pkt)
-{
- const struct nft_payload *priv = nft_expr_priv(expr);
- const struct sk_buff *skb = pkt->skb;
- struct nft_data *dest = &data[priv->dreg];
- unsigned char *ptr;
-
- if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
- ptr = skb_network_header(skb);
- else
- ptr = skb_network_header(skb) + pkt->xt.thoff;
-
- ptr += priv->offset;
-
- if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
- return false;
-
- if (priv->len == 2)
- *(u16 *)dest->data = *(u16 *)ptr;
- else if (priv->len == 4)
- *(u32 *)dest->data = *(u32 *)ptr;
- else
- *(u8 *)dest->data = *(u8 *)ptr;
- return true;
-}
-
struct nft_jumpstack {
const struct nft_chain *chain;
const struct nft_rule *rule;
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-11 09:19:17 UTC
Permalink
We didn't find a better file name candidate. The core file will
be used to store built-in kernel nf_tables infrastructure that
is common to new supported classifiers.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
net/netfilter/Makefile | 2 +-
net/netfilter/nf_tables_core.c | 233 ----------------------------------------
net/netfilter/nf_tables_nf.c | 233 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 234 insertions(+), 234 deletions(-)
delete mode 100644 net/netfilter/nf_tables_core.c
create mode 100644 net/netfilter/nf_tables_nf.c

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index bffdad7..bb9970c 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -65,7 +65,7 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o

# nf_tables
-nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nf_tables_nf.o nf_tables_api.o
nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
deleted file mode 100644
index d71a0be..0000000
--- a/net/netfilter/nf_tables_core.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <***@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/rculist.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_log.h>
-
-struct nft_jumpstack {
- const struct nft_chain *chain;
- const struct nft_rule *rule;
- int rulenum;
-};
-
-static inline void
-nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt,
- struct nft_jumpstack *jumpstack, unsigned int stackptr)
-{
- struct nft_stats __percpu *stats;
- const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this;
-
- rcu_read_lock_bh();
- stats = rcu_dereference(nft_base_chain(chain)->stats);
- __this_cpu_inc(stats->pkts);
- __this_cpu_add(stats->bytes, pkt->skb->len);
- rcu_read_unlock_bh();
-}
-
-enum nft_trace {
- NFT_TRACE_RULE,
- NFT_TRACE_RETURN,
- NFT_TRACE_POLICY,
-};
-
-static const char *const comments[] = {
- [NFT_TRACE_RULE] = "rule",
- [NFT_TRACE_RETURN] = "return",
- [NFT_TRACE_POLICY] = "policy",
-};
-
-static struct nf_loginfo trace_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = 4,
- .logflags = NF_LOG_MASK,
- },
- },
-};
-
-static void nft_trace_packet(const struct nft_pktinfo *pkt,
- const struct nft_chain *chain,
- int rulenum, enum nft_trace type)
-{
- struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
-
- nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
- pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
- chain->table->name, chain->name, comments[type],
- rulenum);
-}
-
-unsigned int
-nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
-{
- const struct nft_chain *chain = ops->priv;
- const struct nft_rule *rule;
- const struct nft_expr *expr, *last;
- struct nft_data data[NFT_REG_MAX + 1];
- unsigned int stackptr = 0;
- struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
- int rulenum = 0;
- /*
- * Cache cursor to avoid problems in case that the cursor is updated
- * while traversing the ruleset.
- */
- unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
-
-do_chain:
- rule = list_entry(&chain->rules, struct nft_rule, list);
-next_rule:
- data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
- list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
-
- /* This rule is not active, skip. */
- if (unlikely(rule->genmask & (1 << gencursor)))
- continue;
-
- rulenum++;
-
- nft_rule_for_each_expr(expr, last, rule) {
- if (expr->ops == &nft_cmp_fast_ops)
- nft_cmp_fast_eval(expr, data);
- else if (expr->ops != &nft_payload_fast_ops ||
- !nft_payload_fast_eval(expr, data, pkt))
- expr->ops->eval(expr, data, pkt);
-
- if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
- break;
- }
-
- switch (data[NFT_REG_VERDICT].verdict) {
- case NFT_BREAK:
- data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
- /* fall through */
- case NFT_CONTINUE:
- continue;
- }
- break;
- }
-
- switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_DROP:
- case NF_QUEUE:
- if (unlikely(pkt->skb->nf_trace))
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
-
- return data[NFT_REG_VERDICT].verdict;
- }
-
- switch (data[NFT_REG_VERDICT].verdict) {
- case NFT_JUMP:
- if (unlikely(pkt->skb->nf_trace))
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
-
- BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
- jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rule = rule;
- jumpstack[stackptr].rulenum = rulenum;
- stackptr++;
- /* fall through */
- case NFT_GOTO:
- chain = data[NFT_REG_VERDICT].chain;
- goto do_chain;
- case NFT_RETURN:
- if (unlikely(pkt->skb->nf_trace))
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
-
- /* fall through */
- case NFT_CONTINUE:
- break;
- default:
- WARN_ON(1);
- }
-
- if (stackptr > 0) {
- if (unlikely(pkt->skb->nf_trace))
- nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN);
-
- stackptr--;
- chain = jumpstack[stackptr].chain;
- rule = jumpstack[stackptr].rule;
- rulenum = jumpstack[stackptr].rulenum;
- goto next_rule;
- }
- nft_chain_stats(chain, pkt, jumpstack, stackptr);
-
- if (unlikely(pkt->skb->nf_trace))
- nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_POLICY);
-
- return nft_base_chain(chain)->policy;
-}
-EXPORT_SYMBOL_GPL(nft_do_chain);
-
-int __init nf_tables_core_module_init(void)
-{
- int err;
-
- err = nft_immediate_module_init();
- if (err < 0)
- goto err1;
-
- err = nft_cmp_module_init();
- if (err < 0)
- goto err2;
-
- err = nft_lookup_module_init();
- if (err < 0)
- goto err3;
-
- err = nft_bitwise_module_init();
- if (err < 0)
- goto err4;
-
- err = nft_byteorder_module_init();
- if (err < 0)
- goto err5;
-
- err = nft_payload_module_init();
- if (err < 0)
- goto err6;
-
- return 0;
-
-err6:
- nft_byteorder_module_exit();
-err5:
- nft_bitwise_module_exit();
-err4:
- nft_lookup_module_exit();
-err3:
- nft_cmp_module_exit();
-err2:
- nft_immediate_module_exit();
-err1:
- return err;
-}
-
-void nf_tables_core_module_exit(void)
-{
- nft_payload_module_exit();
- nft_byteorder_module_exit();
- nft_bitwise_module_exit();
- nft_lookup_module_exit();
- nft_cmp_module_exit();
- nft_immediate_module_exit();
-}
diff --git a/net/netfilter/nf_tables_nf.c b/net/netfilter/nf_tables_nf.c
new file mode 100644
index 0000000..d71a0be
--- /dev/null
+++ b/net/netfilter/nf_tables_nf.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <***@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+
+struct nft_jumpstack {
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ int rulenum;
+};
+
+static inline void
+nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt,
+ struct nft_jumpstack *jumpstack, unsigned int stackptr)
+{
+ struct nft_stats __percpu *stats;
+ const struct nft_chain *chain = stackptr ? jumpstack[0].chain : this;
+
+ rcu_read_lock_bh();
+ stats = rcu_dereference(nft_base_chain(chain)->stats);
+ __this_cpu_inc(stats->pkts);
+ __this_cpu_add(stats->bytes, pkt->skb->len);
+ rcu_read_unlock_bh();
+}
+
+enum nft_trace {
+ NFT_TRACE_RULE,
+ NFT_TRACE_RETURN,
+ NFT_TRACE_POLICY,
+};
+
+static const char *const comments[] = {
+ [NFT_TRACE_RULE] = "rule",
+ [NFT_TRACE_RETURN] = "return",
+ [NFT_TRACE_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+static void nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace type)
+{
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type],
+ rulenum);
+}
+
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+{
+ const struct nft_chain *chain = ops->priv;
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ struct nft_data data[NFT_REG_MAX + 1];
+ unsigned int stackptr = 0;
+ struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+ int rulenum = 0;
+ /*
+ * Cache cursor to avoid problems in case that the cursor is updated
+ * while traversing the ruleset.
+ */
+ unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor);
+
+do_chain:
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+ /* This rule is not active, skip. */
+ if (unlikely(rule->genmask & (1 << gencursor)))
+ continue;
+
+ rulenum++;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ if (expr->ops == &nft_cmp_fast_ops)
+ nft_cmp_fast_eval(expr, data);
+ else if (expr->ops != &nft_payload_fast_ops ||
+ !nft_payload_fast_eval(expr, data, pkt))
+ expr->ops->eval(expr, data, pkt);
+
+ if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
+ break;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_BREAK:
+ data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+ /* fall through */
+ case NFT_CONTINUE:
+ continue;
+ }
+ break;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ return data[NFT_REG_VERDICT].verdict;
+ }
+
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_JUMP:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+ jumpstack[stackptr].chain = chain;
+ jumpstack[stackptr].rule = rule;
+ jumpstack[stackptr].rulenum = rulenum;
+ stackptr++;
+ /* fall through */
+ case NFT_GOTO:
+ chain = data[NFT_REG_VERDICT].chain;
+ goto do_chain;
+ case NFT_RETURN:
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+
+ /* fall through */
+ case NFT_CONTINUE:
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ if (stackptr > 0) {
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN);
+
+ stackptr--;
+ chain = jumpstack[stackptr].chain;
+ rule = jumpstack[stackptr].rule;
+ rulenum = jumpstack[stackptr].rulenum;
+ goto next_rule;
+ }
+ nft_chain_stats(chain, pkt, jumpstack, stackptr);
+
+ if (unlikely(pkt->skb->nf_trace))
+ nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_POLICY);
+
+ return nft_base_chain(chain)->policy;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+ int err;
+
+ err = nft_immediate_module_init();
+ if (err < 0)
+ goto err1;
+
+ err = nft_cmp_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nft_lookup_module_init();
+ if (err < 0)
+ goto err3;
+
+ err = nft_bitwise_module_init();
+ if (err < 0)
+ goto err4;
+
+ err = nft_byteorder_module_init();
+ if (err < 0)
+ goto err5;
+
+ err = nft_payload_module_init();
+ if (err < 0)
+ goto err6;
+
+ return 0;
+
+err6:
+ nft_byteorder_module_exit();
+err5:
+ nft_bitwise_module_exit();
+err4:
+ nft_lookup_module_exit();
+err3:
+ nft_cmp_module_exit();
+err2:
+ nft_immediate_module_exit();
+err1:
+ return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+ nft_payload_module_exit();
+ nft_byteorder_module_exit();
+ nft_bitwise_module_exit();
+ nft_lookup_module_exit();
+ nft_cmp_module_exit();
+ nft_immediate_module_exit();
+}
--
1.7.10.4
Pablo Neira Ayuso
2014-03-11 09:19:20 UTC
Permalink
This patch adds SO_ATTACH_NFT_FILTER, which allows you to attach
a socket filter expressed using the supported expressions via
setsockopt. You can detach the filter through SO_DETACH_NFT_FILTER,
which is actually an alias of SO_DETACH_FILTER that was added for
symmetry.

This patch includes a new NFT_SCOPE_SOCK scope which introduces the
verdict interpretation for the socket filtering. It is mimicking BPF,
thus, the returned value indicates the amount of bytes of the packet
that will be copied to userspace.

I also have registered expression types for this new scope, so you
can currently use the payload, immediate, bitwise, byteorder, meta
and cmp expressions to build filters. Moreover, there is a new
validation callback for the expression types which is used to make
sure you don't select unsupported meta expression types.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/linux/filter.h | 11 ++
include/net/netfilter/nf_tables.h | 6 +
include/net/netfilter/nf_tables_core.h | 3 +-
include/uapi/asm-generic/socket.h | 4 +
net/core/sock.c | 19 ++
net/netfilter/Kconfig | 8 +
net/netfilter/Makefile | 2 +
net/netfilter/nf_tables_api.c | 9 +-
net/netfilter/nf_tables_core.c | 18 +-
net/netfilter/nf_tables_sock.c | 327 ++++++++++++++++++++++++++++++++
net/netfilter/nft_bitwise.c | 24 ++-
net/netfilter/nft_byteorder.c | 25 ++-
net/netfilter/nft_cmp.c | 25 ++-
net/netfilter/nft_immediate.c | 25 ++-
net/netfilter/nft_meta.c | 43 ++++-
net/netfilter/nft_payload.c | 25 ++-
16 files changed, 558 insertions(+), 16 deletions(-)
create mode 100644 net/netfilter/nf_tables_sock.c

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7cba4c2..594cff7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -9,6 +9,14 @@
#include <linux/workqueue.h>
#include <uapi/linux/filter.h>

+#ifdef CONFIG_NF_TABLES_SOCKET
+#include <net/netfilter/nf_tables.h>
+
+int sk_nft_attach_filter(char __user *optval, struct sock *sk);
+int sk_nft_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len);
+#endif
+
#ifdef CONFIG_COMPAT
/*
* A struct sock_filter is architecture independent.
@@ -32,6 +40,9 @@ struct sk_filter {
union {
struct sock_filter insns[0];
struct work_struct work;
+#ifdef CONFIG_NF_TABLES_SOCKET
+ struct nft_expr expr[0];
+#endif
};
};

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index c4c35c4..62280fc 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -70,6 +70,7 @@ static inline void nft_data_debug(const struct nft_data *data)

enum nft_scope {
NFT_SCOPE_NF = 0,
+ NFT_SCOPE_SOCK,
NFT_SCOPE_MAX
};

@@ -258,6 +259,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
/**
* struct nft_expr_type - nf_tables expression type
*
+ * @validate: validate the netlink attributes
* @select_ops: function to select nft_expr_ops
* @ops: default ops, used when no select_ops functions is present
* @list: used internally
@@ -265,9 +267,12 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
* @owner: module reference
* @policy: netlink attribute policy
* @maxattr: highest netlink attribute number
+ * @scope: scope for this expression type
* @family: address family for AF-specific types
*/
struct nft_expr_type {
+ int (*validate)(const struct nft_ctx *,
+ const struct nlattr * const tb[]);
const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *,
const struct nlattr * const tb[]);
const struct nft_expr_ops *ops;
@@ -276,6 +281,7 @@ struct nft_expr_type {
struct module *owner;
const struct nla_policy *policy;
unsigned int maxattr;
+ enum nft_scope scope;
u8 family;
};

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 41ced3e..9ff6ec5 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -98,11 +98,12 @@ int nft_register_expr(struct nft_expr_type *type);
void nft_unregister_expr(struct nft_expr_type *type);
int nf_tables_newexpr(const struct nft_ctx *ctx,
const struct nft_expr_info *info, struct nft_expr *expr);
-int nf_tables_expr_parse(const struct nft_ctx *ctx,
+int nf_tables_expr_parse(const struct nft_ctx *ctx, u8 family,
const struct nlattr *nla, struct nft_expr_info *info,
int (*autoload)(const struct nft_ctx *ctx,
const struct nlattr *nla));
const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ enum nft_scope,
const struct nlattr *nla);
void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr,
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index ea0796b..a3bbe85 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -82,4 +82,8 @@

#define SO_BPF_EXTENSIONS 48

+#define SO_ATTACH_NFT_FILTER 49
+#define SO_DETACH_NFT_FILTER SO_DETACH_FILTER
+#define SO_NFT_GET_FILTER SO_ATTACH_NFT_FILTER
+
#endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 5b6a943..9176700 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -915,6 +915,16 @@ set_rcvbuf:
sk->sk_max_pacing_rate);
break;

+#ifdef CONFIG_NF_TABLES_SOCKET
+ case SO_ATTACH_NFT_FILTER:
+ ret = -EINVAL;
+ if (optlen < sizeof(struct nlattr))
+ return -EFAULT;
+
+ ret = sk_nft_attach_filter(optval, sk);
+ break;
+#endif
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1185,6 +1195,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_max_pacing_rate;
break;

+#ifdef CONFIG_NF_TABLES_SOCKET
+ case SO_NFT_GET_FILTER:
+ len = sk_nft_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+#endif
+
default:
return -ENOPROTOOPT;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 5bd91a8..4eec678 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -433,6 +433,14 @@ config NF_TABLES

To compile it as a module, choose M here.

+config NF_TABLES_SOCKET
+ bool "Enable nftables socket filtering"
+ depends on NF_TABLES
+ select NF_TABLES_CORE
+ ---help---
+ This feature allows you to use nf_tables expressions to define
+ socket filters as an alternative to BPF.
+
config NF_TABLES_INET
depends on NF_TABLES && IPV6
select NF_TABLES_IPV4
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 7564485..9c32cbd 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -86,6 +86,8 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o

+obj-$(CONFIG_NF_TABLES_SOCKET) += nf_tables_sock.o
+
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c616fea..fd40d21 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1399,14 +1399,14 @@ static int nft_expr_autoload(const struct nft_ctx *ctx,
request_module("nft-expr-%u-%.*s", ctx->afi->family,
nla_len(nla), (char *)nla_data(nla));
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (__nft_expr_type_get(ctx->afi->family, nla))
+ if (__nft_expr_type_get(ctx->afi->family, ctx->scope, nla))
return -EAGAIN;

nfnl_unlock(NFNL_SUBSYS_NFTABLES);
request_module("nft-expr-%.*s",
nla_len(nla), (char *)nla_data(nla));
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (__nft_expr_type_get(ctx->afi->family, nla))
+ if (__nft_expr_type_get(ctx->afi->family, ctx->scope, nla))
return -EAGAIN;
#endif
return -ENOENT;
@@ -1484,8 +1484,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
goto err1;
if (n == NFT_RULE_MAXEXPRS)
goto err1;
- err = nf_tables_expr_parse(&ctx, tmp, &info[n],
- nft_expr_autoload);
+ err = nf_tables_expr_parse(&ctx, afi->family, tmp,
+ &info[n],
+ nft_expr_autoload);
if (err < 0)
goto err1;
size += info[n].ops->size;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index fe427d4..7a2b542 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -54,12 +54,14 @@ void nft_unregister_expr(struct nft_expr_type *type)
EXPORT_SYMBOL_GPL(nft_unregister_expr);

const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ enum nft_scope scope,
const struct nlattr *nla)
{
const struct nft_expr_type *type;

list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
if (!nla_strcmp(nla, type->name) &&
+ type->scope == scope &&
(!type->family || type->family == family))
return type;
}
@@ -68,6 +70,7 @@ const struct nft_expr_type *__nft_expr_type_get(u8 family,
EXPORT_SYMBOL(__nft_expr_type_get);

static const struct nft_expr_type *nft_expr_type_get(u8 family,
+ enum nft_scope scope,
const struct nlattr *nla)
{
const struct nft_expr_type *type;
@@ -75,7 +78,7 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family,
if (nla == NULL)
return ERR_PTR(-EINVAL);

- type = __nft_expr_type_get(family, nla);
+ type = __nft_expr_type_get(family, scope, nla);
if (type != NULL && try_module_get(type->owner))
return type;

@@ -87,8 +90,8 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
[NFTA_EXPR_DATA] = { .type = NLA_NESTED },
};

-int nf_tables_expr_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
- struct nft_expr_info *info,
+int nf_tables_expr_parse(const struct nft_ctx *ctx, u8 family,
+ const struct nlattr *nla, struct nft_expr_info *info,
int (*autoload)(const struct nft_ctx *ctx,
const struct nlattr *nla))
{
@@ -101,8 +104,8 @@ int nf_tables_expr_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
if (err < 0)
return err;

- type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
- if (IS_ERR(type) < 0) {
+ type = nft_expr_type_get(family, ctx->scope, tb[NFTA_EXPR_NAME]);
+ if (IS_ERR(type)) {
if (PTR_ERR(type) == -ENOENT)
return autoload(ctx, tb[NFTA_EXPR_NAME]);

@@ -143,6 +146,11 @@ int nf_tables_newexpr(const struct nft_ctx *ctx,
int err;

expr->ops = ops;
+ if (ops->type->validate) {
+ err = ops->type->validate(ctx, (const struct nlattr **)info->tb);
+ if (err < 0)
+ goto err1;
+ }
if (ops->init) {
err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
if (err < 0)
diff --git a/net/netfilter/nf_tables_sock.c b/net/netfilter/nf_tables_sock.c
new file mode 100644
index 0000000..533a285
--- /dev/null
+++ b/net/netfilter/nf_tables_sock.c
@@ -0,0 +1,327 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <***@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/filter.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+/* Walk the expressions packed back-to-back in a filter blob: advance by
+ * each expression's size until the remaining byte count @rem is spent.
+ * NOTE: the caller must point @expr at the first expression and load
+ * @rem with the total size before entering the loop.
+ */
+#define nft_filter_for_each_expr(expr, rem) \
+ for (; rem > 0; rem -= expr->ops->size, expr = nft_expr_next(expr))
+
+/* Evaluate the nft expression chain attached to a socket filter against
+ * @skb.  Plugged into fp->run_filter, so it mirrors sk_run_filter()
+ * semantics: the returned value is the verdict register content, 0 on a
+ * mismatch (NFT_BREAK) meaning "drop/trim the packet".
+ * NOTE(review): pkt.in is taken from skb->dev — confirm this is valid on
+ * every path that runs socket filters.
+ */
+static unsigned int sk_nft_run_filter(const struct sk_buff *skb,
+ const struct sock_filter *filter)
+{
+ /* the filter instructions are really a packed nft expression list */
+ const struct nft_expr *expr = (const struct nft_expr *)filter;
+ struct sk_filter *fp = container_of(filter, struct sk_filter, insns[0]);
+ struct nft_data data[NFT_REG_MAX + 1];
+ const struct nft_pktinfo pkt = {
+ .skb = (struct sk_buff *)skb,
+ .in = skb->dev,
+ .xt.thoff = skb_transport_offset(skb),
+ };
+ int rem = fp->size;
+
+ data[NFT_REG_VERDICT].verdict = 0;
+ nft_filter_for_each_expr(expr, rem) {
+ /* inline fast paths for cmp/payload, fall back to ->eval() */
+ if (expr->ops == &nft_cmp_fast_ops)
+ nft_cmp_fast_eval(expr, data);
+ else if (expr->ops != &nft_payload_fast_ops ||
+ !nft_payload_fast_eval(expr, data, &pkt))
+ expr->ops->eval(expr, data, &pkt);
+ switch (data[NFT_REG_VERDICT].verdict) {
+ case NFT_BREAK:
+ /* we got a mismatch, skip this packet */
+ return 0;
+ }
+ }
+ return data[NFT_REG_VERDICT].verdict;
+}
+
+/* RCU callback releasing a socket nft filter once no reader can still
+ * see it: destroy every expression in the chain, then free the filter.
+ * NOTE(review): fp was allocated with sock_kmalloc() but freed here with
+ * plain kfree() — confirm sk_filter_uncharge() already settles the
+ * socket memory accounting before this callback runs.
+ */
+static void sk_nft_filter_release_rcu(struct rcu_head *rcu)
+{
+ struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+ struct nft_expr *expr = fp->expr;
+ struct nft_ctx ctx = {
+ .scope = NFT_SCOPE_SOCK,
+ };
+ int rem = fp->size;
+
+ nft_filter_for_each_expr(expr, rem)
+ nf_tables_expr_destroy(&ctx, expr);
+
+ kfree(fp);
+}
+
+/* Maximum number of expressions per filter, like BPF */
+#define SK_NFT_MAX_EXPR 4096
+static struct nft_expr_info *info;
+static DEFINE_MUTEX(nft_expr_info_mutex);
+
+/* Try to autoload the module providing an unknown expression type.
+ * Called from sk_nft_attach_filter() with nft_expr_info_mutex held; the
+ * mutex is dropped around request_module() so we do not sleep in modprobe
+ * with the lock held.  Returns -EAGAIN if the type appeared after the
+ * module load (caller must restart parsing from scratch), -ENOENT if the
+ * expression type still does not exist.
+ */
+static int nft_sock_expr_autoload(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
+{
+#ifdef CONFIG_MODULES
+ mutex_unlock(&nft_expr_info_mutex);
+ request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla));
+ mutex_lock(&nft_expr_info_mutex);
+ if (__nft_expr_type_get(AF_UNSPEC, ctx->scope, nla))
+ return -EAGAIN;
+#endif
+ return -ENOENT;
+}
+
+/* Parse a netlink-encoded nft expression list from userspace and attach
+ * the resulting filter to @sk, replacing any previously attached filter.
+ * Called from sock_setsockopt() (SO_ATTACH_NFT_FILTER) with the socket
+ * lock held.  Returns 0 on success or a negative errno.
+ */
+int sk_nft_attach_filter(char __user *optval, struct sock *sk)
+{
+	struct sk_filter *fp, *old_fp;
+	size_t size;
+	struct nlattr _attr;
+	struct nlattr *attr, *tmp;
+	int err, i, j, n;
+	struct nft_expr *expr;
+	int rem;
+	struct nft_ctx ctx = {
+		.scope	= NFT_SCOPE_SOCK,
+	};
+
+	if (sock_flag(sk, SOCK_FILTER_LOCKED))
+		return -EPERM;
+
+	/* copy only the attribute header first to learn the total length */
+	if (copy_from_user((void *)&_attr, optval, sizeof(struct nlattr))) {
+		err = -EFAULT;
+		goto err1;
+	}
+
+	/* malformed attribute, not a fault: -EINVAL (was -EFAULT) */
+	if (_attr.nla_len < sizeof(struct nlattr)) {
+		err = -EINVAL;
+		goto err1;
+	}
+
+	attr = kmalloc(_attr.nla_len, GFP_KERNEL);
+	if (!attr) {
+		/* was "ret = -ENOMEM", returning uninitialized 'err' */
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	if (copy_from_user(attr, optval, _attr.nla_len)) {
+		err = -EFAULT;
+		goto err2;
+	}
+retry:
+	size = 0;
+	n = 0;
+	mutex_lock(&nft_expr_info_mutex);
+	nla_for_each_nested(tmp, attr, rem) {
+		err = -EINVAL;
+		if (nla_type(tmp) != NFTA_LIST_ELEM)
+			goto err3;
+		if (n == SK_NFT_MAX_EXPR)
+			goto err3;
+		err = nf_tables_expr_parse(&ctx, AF_UNSPEC, tmp, &info[n],
+					   nft_sock_expr_autoload);
+		if (err < 0) {
+			if (err == -EAGAIN) {
+				/* a module was loaded behind our back:
+				 * drop module refs and reparse everything
+				 */
+				for (i = 0; i < n; i++) {
+					if (info[i].ops != NULL)
+						module_put(info[i].ops->type->owner);
+				}
+				mutex_unlock(&nft_expr_info_mutex);
+				goto retry;
+			}
+			goto err3;
+		}
+
+		size += info[n].ops->size;
+		n++;
+	}
+
+	fp = sock_kmalloc(sk, sizeof(struct sk_filter) + size, GFP_KERNEL);
+	if (!fp) {
+		err = -ENOMEM;
+		goto err3;
+	}
+
+	expr = fp->expr;
+	for (i = 0; i < n; i++) {
+		err = nf_tables_newexpr(&ctx, &info[i], expr);
+		if (err < 0)
+			goto err4;
+		/* ownership moved into the filter blob */
+		info[i].ops = NULL;
+		expr = nft_expr_next(expr);
+	}
+	mutex_unlock(&nft_expr_info_mutex);
+
+	atomic_set(&fp->refcnt, 1);
+	fp->size = size;
+	fp->run_filter = sk_nft_run_filter;
+	fp->release_rcu = sk_nft_filter_release_rcu;
+
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
+	rcu_assign_pointer(sk->sk_filter, fp);
+
+	if (old_fp)
+		sk_filter_uncharge(sk, old_fp);
+
+	kfree(attr);	/* was leaked on the success path */
+	return 0;
+err4:
+	/* destroy the expressions already initialized (their info[].ops
+	 * were cleared above, so err3 will not double-put their modules)
+	 */
+	expr = fp->expr;
+	for (j = 0; j < i; j++) {
+		nf_tables_expr_destroy(&ctx, expr);
+		expr = nft_expr_next(expr);
+	}
+	/* free with the size actually charged, not just 'size' */
+	sock_kfree_s(sk, fp, sizeof(struct sk_filter) + size);
+err3:
+	for (i = 0; i < n; i++) {
+		if (info[i].ops != NULL)
+			module_put(info[i].ops->type->owner);
+	}
+	mutex_unlock(&nft_expr_info_mutex);
+err2:
+	kfree(attr);
+err1:
+	return err;
+}
+EXPORT_SYMBOL_GPL(sk_nft_attach_filter);
+
+/* Dump the nft filter attached to @sk back to userspace as netlink
+ * attributes (SO_NFT_GET_FILTER).  Returns the number of bytes written,
+ * 0 if no filter is attached, or a negative errno.  @len is the size of
+ * the user buffer @ubuf.
+ */
+int sk_nft_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+		      unsigned int len)
+{
+	struct sk_buff *skb = NULL;
+	struct sk_filter *fp;
+	int ret = 0, rem;
+	struct nlattr *list;
+	struct nft_expr *expr;
+
+	lock_sock(sk);
+	fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk));
+	if (!fp)
+		goto out;
+
+	/* Estimate the size of the bytecode to TLV translation based on
+	 * worst case scenario according to the expressions that we have:
+	 *
+	 * NFTA_RULE_EXPRESSIONS nest	4 bytes
+	 * NFTA_LIST_ELEM nest		4 bytes
+	 * 4 attribute header		4 * 4 bytes	= 16 bytes
+	 * 2 attribute (16 bytes)	2 * 16 bytes	= 32 bytes
+	 * 2 attribute (4 bytes)	2 * 4 bytes	= 8 bytes +
+	 * ---------------------------------------------------------
+	 *				64 bytes
+	 */
+	skb = alloc_skb(fp->size * 64, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rem = fp->size;
+	expr = fp->expr;	/* was missing: expr was used uninitialized */
+	list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+	if (list == NULL)
+		goto nla_put_failure;
+	nft_filter_for_each_expr(expr, rem) {
+		struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
+
+		if (elem == NULL)
+			goto nla_put_failure;
+		if (nf_tables_fill_expr_info(skb, expr, NFT_SCOPE_SOCK) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, elem);
+	}
+	nla_nest_end(skb, list);
+
+	/* never write past the user buffer */
+	if (skb->len > len) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
+	if (copy_to_user(ubuf, skb->data, skb->len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* read the length before the skb is released (was use-after-free) */
+	ret = skb->len;
+out:
+	kfree_skb(skb);		/* also fixes the skb leak on -EFAULT */
+	release_sock(sk);
+	return ret;
+
+nla_put_failure:
+	kfree_skb(skb);
+	release_sock(sk);
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(sk_nft_get_filter);
+
+/* netlink policy for the NFTA_VERDICT_* attributes parsed by
+ * nft_sock_verdict_init()
+ */
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+ [NFTA_VERDICT_CODE] = { .type = NLA_U32 },
+};
+
+/* Parse a verdict for the socket-filter scope from the nested
+ * NFTA_DATA_VERDICT attribute.  Only the raw verdict code is accepted
+ * here (no chain jumps, unlike the netfilter scope).
+ */
+static int nft_sock_verdict_init(const struct nft_ctx *ctx,
+				 struct nft_data *data,
+				 struct nft_data_desc *desc,
+				 const struct nlattr *nla)
+{
+	struct nlattr *attrs[NFTA_VERDICT_MAX + 1];
+	int ret;
+
+	ret = nla_parse_nested(attrs, NFTA_VERDICT_MAX, nla,
+			       nft_verdict_policy);
+	if (ret < 0)
+		return ret;
+	if (attrs[NFTA_VERDICT_CODE] == NULL)
+		return -EINVAL;
+
+	desc->type = NFT_DATA_VERDICT;
+	data->verdict = ntohl(nla_get_be32(attrs[NFTA_VERDICT_CODE]));
+	return 0;
+}
+
+/* Verdict validation for the socket-filter scope.  Intentionally accepts
+ * any code: socket filters have no chains to jump to, and the verdict is
+ * interpreted as a byte count by sk_nft_run_filter().
+ * NOTE(review): consider rejecting netfilter-only verdicts (NFT_JUMP,
+ * NFT_GOTO) here rather than accepting everything — confirm intent.
+ */
+static int nft_sock_validate_verdict(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type)
+{
+ return 0;
+}
+
+/* No-op: socket-scope verdicts hold no references (no chains to put). */
+static void nft_sock_verdict_uninit(const struct nft_data *data)
+{
+}
+
+/* Emit a socket-scope verdict as a nested NFTA_DATA_VERDICT attribute.
+ * Returns 0 on success, -1 if the skb ran out of tailroom.
+ */
+static int nft_sock_verdict_dump(struct sk_buff *skb,
+				 const struct nft_data *data)
+{
+	struct nlattr *nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+
+	if (nest == NULL ||
+	    nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict)))
+		return -1;
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* verdict interpretation registered for NFT_SCOPE_SOCK at init time */
+static const struct nft_verdict_ops nft_data_sock_ops = {
+ .init = nft_sock_verdict_init,
+ .validate = nft_sock_validate_verdict,
+ .uninit = nft_sock_verdict_uninit,
+ .dump = nft_sock_verdict_dump,
+};
+
+/* Allocate the shared expression-parsing scratch area (serialized by
+ * nft_expr_info_mutex) and register the socket-scope verdict handlers.
+ */
+static __init int nft_sock_init(void)
+{
+	/* kmalloc_array checks the count * size multiplication for
+	 * overflow, unlike open-coded kmalloc(n * size)
+	 */
+	info = kmalloc_array(SK_NFT_MAX_EXPR, sizeof(struct nft_expr_info),
+			     GFP_KERNEL);
+	if (info == NULL) {
+		pr_err("Cannot initialize nft socket filtering\n");
+		return -ENOMEM;
+	}
+
+	nft_verdict_ops_register(NFT_SCOPE_SOCK, &nft_data_sock_ops);
+	return 0;
+}
+core_initcall(nft_sock_init);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 9e87b67..d8c0dfd 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -136,9 +136,31 @@ static struct nft_expr_type nft_bitwise_type __read_mostly = {
.owner = THIS_MODULE,
};

+static struct nft_expr_type nft_bitwise_sock_type __read_mostly = {
+ .name = "bitwise",
+ .scope = NFT_SCOPE_SOCK,
+ .ops = &nft_bitwise_ops,
+ .policy = nft_bitwise_policy,
+ .maxattr = NFTA_BITWISE_MAX,
+ .owner = THIS_MODULE,
+};
+
int __init nft_bitwise_module_init(void)
{
- return nft_register_expr(&nft_bitwise_type);
+ int err;
+
+ err = nft_register_expr(&nft_bitwise_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_bitwise_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_bitwise_type);
+ return err;
}

void nft_bitwise_module_exit(void)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index baca196..bcaff60 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -163,12 +163,35 @@ static struct nft_expr_type nft_byteorder_type __read_mostly = {
.owner = THIS_MODULE,
};

+static struct nft_expr_type nft_byteorder_sock_type __read_mostly = {
+ .name = "byteorder",
+ .scope = NFT_SCOPE_SOCK,
+ .ops = &nft_byteorder_ops,
+ .policy = nft_byteorder_policy,
+ .maxattr = NFTA_BYTEORDER_MAX,
+ .owner = THIS_MODULE,
+};
+
int __init nft_byteorder_module_init(void)
{
- return nft_register_expr(&nft_byteorder_type);
+ int err;
+
+ err = nft_register_expr(&nft_byteorder_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_byteorder_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_byteorder_type);
+ return err;
}

void nft_byteorder_module_exit(void)
{
+ nft_unregister_expr(&nft_byteorder_sock_type);
nft_unregister_expr(&nft_byteorder_type);
}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 3117110..f70b883 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -216,12 +216,35 @@ static struct nft_expr_type nft_cmp_type __read_mostly = {
.owner = THIS_MODULE,
};

+static struct nft_expr_type nft_cmp_sock_type __read_mostly = {
+ .name = "cmp",
+ .scope = NFT_SCOPE_SOCK,
+ .select_ops = nft_cmp_select_ops,
+ .policy = nft_cmp_policy,
+ .maxattr = NFTA_CMP_MAX,
+ .owner = THIS_MODULE,
+};
+
int __init nft_cmp_module_init(void)
{
- return nft_register_expr(&nft_cmp_type);
+ int err;
+
+ err = nft_register_expr(&nft_cmp_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_cmp_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_cmp_type);
+ return err;
}

void nft_cmp_module_exit(void)
{
+ nft_unregister_expr(&nft_cmp_sock_type);
nft_unregister_expr(&nft_cmp_type);
}
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index ae359f4..3470346 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -124,12 +124,35 @@ static struct nft_expr_type nft_imm_type __read_mostly = {
.owner = THIS_MODULE,
};

+static struct nft_expr_type nft_imm_sock_type __read_mostly = {
+ .name = "immediate",
+ .scope = NFT_SCOPE_SOCK,
+ .ops = &nft_imm_ops,
+ .policy = nft_immediate_policy,
+ .maxattr = NFTA_IMMEDIATE_MAX,
+ .owner = THIS_MODULE,
+};
+
int __init nft_immediate_module_init(void)
{
- return nft_register_expr(&nft_imm_type);
+ int err;
+
+ err = nft_register_expr(&nft_imm_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_imm_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_imm_type);
+ return err;
}

void nft_immediate_module_exit(void)
{
+ nft_unregister_expr(&nft_imm_sock_type);
nft_unregister_expr(&nft_imm_type);
}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 82c40d8..11df25e 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -321,13 +321,54 @@ static struct nft_expr_type nft_meta_type __read_mostly = {
.owner = THIS_MODULE,
};

+/* Restrict meta expressions in socket-filter scope to keys that make
+ * sense on ingress without netfilter hook state (packet length, input
+ * interface, mark, ...).  Everything else is rejected.
+ */
+static int nft_meta_type_sock_validate(const struct nft_ctx *ctx,
+				       const struct nlattr * const tb[])
+{
+	/* ->validate() runs before ->init()'s own presence checks, so the
+	 * key attribute may legitimately be absent here (was a NULL deref)
+	 */
+	if (tb[NFTA_META_KEY] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_META_KEY]))) {
+	case NFT_META_LEN:
+	case NFT_META_PROTOCOL:
+	case NFT_META_PRIORITY:
+	case NFT_META_MARK:
+	case NFT_META_IIF:
+	case NFT_META_IIFNAME:
+	case NFT_META_IIFTYPE:
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static struct nft_expr_type nft_meta_sock_type __read_mostly = {
+ .name = "meta",
+ .scope = NFT_SCOPE_SOCK,
+ .validate = nft_meta_type_sock_validate,
+ .select_ops = &nft_meta_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_meta_module_init(void)
{
- return nft_register_expr(&nft_meta_type);
+ int err;
+
+ err = nft_register_expr(&nft_meta_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_meta_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_meta_type);
+ return err;
}

static void __exit nft_meta_module_exit(void)
{
+ nft_unregister_expr(&nft_meta_sock_type);
nft_unregister_expr(&nft_meta_type);
}

diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 9deadb6..e40fd67 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -151,12 +151,35 @@ static struct nft_expr_type nft_payload_type __read_mostly = {
.owner = THIS_MODULE,
};

+static struct nft_expr_type nft_payload_sock_type __read_mostly = {
+ .name = "payload",
+ .scope = NFT_SCOPE_SOCK,
+ .select_ops = nft_payload_select_ops,
+ .policy = nft_payload_policy,
+ .maxattr = NFTA_PAYLOAD_MAX,
+ .owner = THIS_MODULE,
+};
+
int __init nft_payload_module_init(void)
{
- return nft_register_expr(&nft_payload_type);
+ int err;
+
+ err = nft_register_expr(&nft_payload_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_payload_sock_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_expr(&nft_payload_type);
+ return err;
}

void nft_payload_module_exit(void)
{
+ nft_unregister_expr(&nft_payload_sock_type);
nft_unregister_expr(&nft_payload_type);
}
--
1.7.10.4
Pablo Neira Ayuso
2014-03-11 09:19:16 UTC
Permalink
This new function replaces several calls to nft_data_init that
rely on the ctx parameter to know whether verdict data is
allowed. This will allow us to pass context during data
initialization, which is required by new classifiers.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/net/netfilter/nf_tables.h | 2 ++
net/netfilter/nf_tables_api.c | 51 ++++++++++++++++++++++++-------------
net/netfilter/nft_bitwise.c | 4 +--
net/netfilter/nft_cmp.c | 6 ++---
4 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e6bc14d..696a916 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -96,6 +96,8 @@ struct nft_data_desc {

int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla);
+int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla);
void nft_data_uninit(const struct nft_data *data, enum nft_data_types type);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
enum nft_data_types type, unsigned int len);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 33045a5..9879f23 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2760,11 +2760,11 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,
return -EINVAL;
}

- err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]);
+ err = nft_value_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
goto err1;
err = -EINVAL;
- if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
+ if (d1.len != set->klen)
goto err2;

err = -EEXIST;
@@ -2854,12 +2854,12 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_KEY] == NULL)
goto err1;

- err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]);
+ err = nft_value_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
goto err1;

err = -EINVAL;
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
+ if (desc.len != set->klen)
goto err2;

err = set->ops->get(set, &elem);
@@ -3256,8 +3256,9 @@ nla_put_failure:
return -1;
}

-static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
- struct nft_data_desc *desc, const struct nlattr *nla)
+static int nft_do_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc,
+ const struct nlattr *nla)
{
unsigned int len;

@@ -3285,6 +3286,24 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
[NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
};

+static int nft_do_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc,
+ const struct nlattr *nla, bool allow_verdict)
+{
+ struct nlattr *tb[NFTA_DATA_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DATA_VALUE])
+ return nft_do_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ if (tb[NFTA_DATA_VERDICT] && allow_verdict)
+ return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ return -EINVAL;
+}
+
/**
* nft_data_init - parse nf_tables data netlink attributes
*
@@ -3302,21 +3321,17 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
- struct nlattr *tb[NFTA_DATA_MAX + 1];
- int err;
-
- err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
- if (err < 0)
- return err;
-
- if (tb[NFTA_DATA_VALUE])
- return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
+ return nft_do_data_init(ctx, data, desc, nla, true);
}
EXPORT_SYMBOL_GPL(nft_data_init);

+int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ return nft_do_data_init(ctx, data, desc, nla, false);
+}
+EXPORT_SYMBOL_GPL(nft_value_init);
+
/**
* nft_data_uninit - release a nft_data item
*
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 4fb6ee2..5f944fb 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -78,13 +78,13 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,

priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));

- err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+ err = nft_value_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
if (d1.len != priv->len)
return -EINVAL;

- err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+ err = nft_value_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
if (err < 0)
return err;
if (d2.len != priv->len)
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 954925d..7758c1c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -78,7 +78,7 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));

- err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
BUG_ON(err < 0);

priv->len = desc.len;
@@ -124,7 +124,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,

priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));

- err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
BUG_ON(err < 0);
desc.len *= BITS_PER_BYTE;

@@ -194,7 +194,7 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}

- err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-11 09:19:19 UTC
Permalink
This patch allows us to add multiple verdict interpretations depending
on the scope that is specified. This allows us to add different verdict
handling for new classifiers, e.g. socket filtering using nf_tables.

The idea is to add a struct nft_verdict_ops that provides the
corresponding interpretation for the given scope. There is an array
of struct nft_verdict_ops for each scope which is accessed only from
user context at configuration stage. I didn't find any good reason
to protect this array with any locking since each position of it
is initialized/reset by the corresponding user (socket filter and
netfilter) in the initialization and module removal path (before
any client can actually access it). This structure is not accessed
from the packet path and the only user is the one that registers it.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/net/netfilter/nf_tables.h | 19 ++-
include/net/netfilter/nf_tables_core.h | 17 ++-
include/net/netfilter/nft_reject.h | 3 +-
net/netfilter/nf_tables_api.c | 238 ++++++--------------------------
net/netfilter/nf_tables_core.c | 224 +++++++++++++++++++++++++++++-
net/netfilter/nft_bitwise.c | 11 +-
net/netfilter/nft_byteorder.c | 3 +-
net/netfilter/nft_cmp.c | 17 ++-
net/netfilter/nft_compat.c | 6 +-
net/netfilter/nft_counter.c | 3 +-
net/netfilter/nft_ct.c | 9 +-
net/netfilter/nft_exthdr.c | 3 +-
net/netfilter/nft_hash.c | 12 +-
net/netfilter/nft_immediate.c | 10 +-
net/netfilter/nft_limit.c | 3 +-
net/netfilter/nft_log.c | 3 +-
net/netfilter/nft_lookup.c | 3 +-
net/netfilter/nft_meta.c | 8 +-
net/netfilter/nft_nat.c | 3 +-
net/netfilter/nft_payload.c | 3 +-
net/netfilter/nft_queue.c | 3 +-
net/netfilter/nft_rbtree.c | 12 +-
net/netfilter/nft_reject.c | 3 +-
23 files changed, 365 insertions(+), 251 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 696a916..c4c35c4 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -68,6 +68,11 @@ static inline void nft_data_debug(const struct nft_data *data)
data->data[2], data->data[3]);
}

+enum nft_scope {
+ NFT_SCOPE_NF = 0,
+ NFT_SCOPE_MAX
+};
+
/**
* struct nft_ctx - nf_tables rule/set context
*
@@ -78,6 +83,7 @@ static inline void nft_data_debug(const struct nft_data *data)
* @table: the table the chain is contained in
* @chain: the chain the rule is contained in
* @nla: netlink attributes
+ * @scope: the scope for this context
*/
struct nft_ctx {
struct net *net;
@@ -87,6 +93,7 @@ struct nft_ctx {
const struct nft_table *table;
const struct nft_chain *chain;
const struct nlattr * const *nla;
+ enum nft_scope scope;
};

struct nft_data_desc {
@@ -98,9 +105,11 @@ int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla);
int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla);
-void nft_data_uninit(const struct nft_data *data, enum nft_data_types type);
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type,
+ enum nft_scope scope);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
- enum nft_data_types type, unsigned int len);
+ enum nft_data_types type, unsigned int len,
+ enum nft_scope scope);

static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
{
@@ -178,7 +187,8 @@ struct nft_set_ops {
unsigned int (*privsize)(const struct nlattr * const nla[]);
int (*init)(const struct nft_set *set,
const struct nlattr * const nla[]);
- void (*destroy)(const struct nft_set *set);
+ void (*destroy)(const struct nft_ctx *ctx,
+ const struct nft_set *set);

struct list_head list;
struct module *owner;
@@ -294,7 +304,8 @@ struct nft_expr_ops {
void (*destroy)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
int (*dump)(struct sk_buff *skb,
- const struct nft_expr *expr);
+ const struct nft_expr *expr,
+ enum nft_scope scope);
int (*validate)(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data);
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 98b9c74..41ced3e 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -105,6 +105,21 @@ int nf_tables_expr_parse(const struct nft_ctx *ctx,
const struct nft_expr_type *__nft_expr_type_get(u8 family,
const struct nlattr *nla);
void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
-int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr);
+int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope);
+
+struct nft_verdict_ops {
+ int (*init)(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla);
+ int (*validate)(const struct nft_ctx *ctx, enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type);
+ void (*uninit)(const struct nft_data *data);
+ int (*dump)(struct sk_buff *skb, const struct nft_data *data);
+};
+
+void nft_verdict_ops_register(enum nft_scope scope,
+ const struct nft_verdict_ops *ops);
+void nft_verdict_ops_unregister(enum nft_scope scope);

#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h
index 36b0da2..2f6d96d 100644
--- a/include/net/netfilter/nft_reject.h
+++ b/include/net/netfilter/nft_reject.h
@@ -12,7 +12,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[]);

-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr);
+int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope);

void nft_reject_ipv4_eval(const struct nft_expr *expr,
struct nft_data data[NFT_REG_MAX + 1],
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 29d6745..c616fea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1075,6 +1075,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->table = table;
ctx->chain = chain;
ctx->nla = nla;
+ ctx->scope = NFT_SCOPE_NF;
}

/*
@@ -1160,7 +1161,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM);
if (elem == NULL)
goto nla_put_failure;
- if (nf_tables_fill_expr_info(skb, expr) < 0)
+ if (nf_tables_fill_expr_info(skb, expr, NFT_SCOPE_NF) < 0)
goto nla_put_failure;
nla_nest_end(skb, elem);
}
@@ -2282,7 +2283,7 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set
list_del(&set->list);
nf_tables_set_notify(ctx, set, NFT_MSG_DELSET);

- set->ops->destroy(set);
+ set->ops->destroy(ctx, set);
module_put(set->ops->owner);
kfree(set);
}
@@ -2425,14 +2426,14 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
goto nla_put_failure;

if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE,
- set->klen) < 0)
+ set->klen, NFT_SCOPE_NF) < 0)
goto nla_put_failure;

if (set->flags & NFT_SET_MAP &&
!(elem->flags & NFT_SET_ELEM_INTERVAL_END) &&
nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data,
set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
- set->dlen) < 0)
+ set->dlen, NFT_SCOPE_NF) < 0)
goto nla_put_failure;

if (elem->flags != 0)
@@ -2624,6 +2625,7 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,
.afi = ctx->afi,
.table = ctx->table,
.chain = binding->chain,
+ .scope = NFT_SCOPE_NF,
};

err = nft_validate_data_load(&bind_ctx, dreg,
@@ -2641,9 +2643,9 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set,

err3:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_uninit(&elem.data, d2.type);
+ nft_data_uninit(&elem.data, d2.type, NFT_SCOPE_NF);
err2:
- nft_data_uninit(&elem.key, d1.type);
+ nft_data_uninit(&elem.key, d1.type, NFT_SCOPE_NF);
err1:
return err;
}
@@ -2706,12 +2708,12 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set,

set->ops->remove(set, &elem);

- nft_data_uninit(&elem.key, NFT_DATA_VALUE);
+ nft_data_uninit(&elem.key, NFT_DATA_VALUE, NFT_SCOPE_NF);
if (set->flags & NFT_SET_MAP)
- nft_data_uninit(&elem.data, set->dtype);
+ nft_data_uninit(&elem.data, set->dtype, NFT_SCOPE_NF);

err2:
- nft_data_uninit(&elem.key, desc.type);
+ nft_data_uninit(&elem.key, desc.type, NFT_SCOPE_NF);
err1:
return err;
}
@@ -2922,42 +2924,6 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
}

/**
- * nft_validate_input_register - validate an expressions' input register
- *
- * @reg: the register number
- *
- * Validate that the input register is one of the general purpose
- * registers.
- */
-int nft_validate_input_register(enum nft_registers reg)
-{
- if (reg <= NFT_REG_VERDICT)
- return -EINVAL;
- if (reg > NFT_REG_MAX)
- return -ERANGE;
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_validate_input_register);
-
-/**
- * nft_validate_output_register - validate an expressions' output register
- *
- * @reg: the register number
- *
- * Validate that the output register is one of the general purpose
- * registers or the verdict register.
- */
-int nft_validate_output_register(enum nft_registers reg)
-{
- if (reg < NFT_REG_VERDICT)
- return -EINVAL;
- if (reg > NFT_REG_MAX)
- return -ERANGE;
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_validate_output_register);
-
-/**
* nft_validate_data_load - validate an expressions' data load
*
* @ctx: context of the expression performing the load
@@ -2970,37 +2936,27 @@ EXPORT_SYMBOL_GPL(nft_validate_output_register);
* that its runtime gathered data, which is always of type
* NFT_DATA_VALUE.
*/
-int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
- const struct nft_data *data,
- enum nft_data_types type)
+static int nft_validate_verdict(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type)
{
int err;

- switch (reg) {
- case NFT_REG_VERDICT:
- if (data == NULL || type != NFT_DATA_VERDICT)
- return -EINVAL;
-
- if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
- err = nf_tables_check_loops(ctx, data->chain);
- if (err < 0)
- return err;
+ if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) {
+ err = nf_tables_check_loops(ctx, data->chain);
+ if (err < 0)
+ return err;

- if (ctx->chain->level + 1 > data->chain->level) {
- if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
- return -EMLINK;
- data->chain->level = ctx->chain->level + 1;
- }
+ if (ctx->chain->level + 1 > data->chain->level) {
+ if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+ return -EMLINK;
+ data->chain->level = ctx->chain->level + 1;
}
-
- return 0;
- default:
- if (data != NULL && type != NFT_DATA_VALUE)
- return -EINVAL;
- return 0;
}
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_data_load);

static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
[NFTA_VERDICT_CODE] = { .type = NLA_U32 },
@@ -3094,131 +3050,6 @@ nla_put_failure:
return -1;
}

-static int nft_do_value_init(const struct nft_ctx *ctx, struct nft_data *data,
- struct nft_data_desc *desc,
- const struct nlattr *nla)
-{
- unsigned int len;
-
- len = nla_len(nla);
- if (len == 0)
- return -EINVAL;
- if (len > sizeof(data->data))
- return -EOVERFLOW;
-
- nla_memcpy(data->data, nla, sizeof(data->data));
- desc->type = NFT_DATA_VALUE;
- desc->len = len;
- return 0;
-}
-
-static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
- unsigned int len)
-{
- return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
-}
-
-static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
- [NFTA_DATA_VALUE] = { .type = NLA_BINARY,
- .len = FIELD_SIZEOF(struct nft_data, data) },
- [NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
-};
-
-static int nft_do_data_init(const struct nft_ctx *ctx, struct nft_data *data,
- struct nft_data_desc *desc,
- const struct nlattr *nla, bool allow_verdict)
-{
- struct nlattr *tb[NFTA_DATA_MAX + 1];
- int err;
-
- err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
- if (err < 0)
- return err;
-
- if (tb[NFTA_DATA_VALUE])
- return nft_do_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && allow_verdict)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
-}
-
-/**
- * nft_data_init - parse nf_tables data netlink attributes
- *
- * @ctx: context of the expression using the data
- * @data: destination struct nft_data
- * @desc: data description
- * @nla: netlink attribute containing data
- *
- * Parse the netlink data attributes and initialize a struct nft_data.
- * The type and length of data are returned in the data description.
- *
- * The caller can indicate that it only wants to accept data of type
- * NFT_DATA_VALUE by passing NULL for the ctx argument.
- */
-int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
- struct nft_data_desc *desc, const struct nlattr *nla)
-{
- return nft_do_data_init(ctx, data, desc, nla, true);
-}
-EXPORT_SYMBOL_GPL(nft_data_init);
-
-int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
- struct nft_data_desc *desc, const struct nlattr *nla)
-{
- return nft_do_data_init(ctx, data, desc, nla, false);
-}
-EXPORT_SYMBOL_GPL(nft_value_init);
-
-/**
- * nft_data_uninit - release a nft_data item
- *
- * @data: struct nft_data to release
- * @type: type of data
- *
- * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
- * all others need to be released by calling this function.
- */
-void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
-{
- switch (type) {
- case NFT_DATA_VALUE:
- return;
- case NFT_DATA_VERDICT:
- return nft_verdict_uninit(data);
- default:
- WARN_ON(1);
- }
-}
-EXPORT_SYMBOL_GPL(nft_data_uninit);
-
-int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
- enum nft_data_types type, unsigned int len)
-{
- struct nlattr *nest;
- int err;
-
- nest = nla_nest_start(skb, attr);
- if (nest == NULL)
- return -1;
-
- switch (type) {
- case NFT_DATA_VALUE:
- err = nft_value_dump(skb, data, len);
- break;
- case NFT_DATA_VERDICT:
- err = nft_verdict_dump(skb, data);
- break;
- default:
- err = -EINVAL;
- WARN_ON(1);
- }
-
- nla_nest_end(skb, nest);
- return err;
-}
-EXPORT_SYMBOL_GPL(nft_data_dump);
-
static int nf_tables_init_net(struct net *net)
{
INIT_LIST_HEAD(&net->nft.af_info);
@@ -3230,6 +3061,13 @@ static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
};

+static struct nft_verdict_ops nft_verdict_ops = {
+ .init = nft_verdict_init,
+ .validate = nft_validate_verdict,
+ .uninit = nft_verdict_uninit,
+ .dump = nft_verdict_dump,
+};
+
static int __init nf_tables_module_init(void)
{
int err;
@@ -3245,13 +3083,22 @@ static int __init nf_tables_module_init(void)
if (err < 0)
goto err2;

+ nft_verdict_ops_register(NFT_SCOPE_NF, &nft_verdict_ops);
+
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
goto err3;

pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <***@trash.net>\n");
- return register_pernet_subsys(&nf_tables_net_ops);
+ err = register_pernet_subsys(&nf_tables_net_ops);
+ if (err < 0)
+ goto err4;
+
+ return err;
+err4:
+ nfnetlink_subsys_unregister(&nf_tables_subsys);
err3:
+ nft_verdict_ops_unregister(NFT_SCOPE_NF);
nf_tables_core_module_exit();
err2:
kfree(info);
@@ -3263,6 +3110,7 @@ static void __exit nf_tables_module_exit(void)
{
unregister_pernet_subsys(&nf_tables_net_ops);
nfnetlink_subsys_unregister(&nf_tables_subsys);
+ nft_verdict_ops_unregister(NFT_SCOPE_NF);
nf_tables_core_module_exit();
kfree(info);
}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 20bd1f0..fe427d4 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -67,7 +67,8 @@ const struct nft_expr_type *__nft_expr_type_get(u8 family,
}
EXPORT_SYMBOL(__nft_expr_type_get);

-const struct nft_expr_type *nft_expr_type_get(u8 family, const struct nlattr *nla)
+static const struct nft_expr_type *nft_expr_type_get(u8 family,
+ const struct nlattr *nla)
{
const struct nft_expr_type *type;

@@ -80,7 +81,6 @@ const struct nft_expr_type *nft_expr_type_get(u8 family, const struct nlattr *nl

return ERR_PTR(-ENOENT);
}
-EXPORT_SYMBOL_GPL(nft_expr_type_get);

static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
[NFTA_EXPR_NAME] = { .type = NLA_STRING },
@@ -165,7 +165,8 @@ void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
}
EXPORT_SYMBOL_GPL(nf_tables_expr_destroy);

-int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr)
+int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
goto nla_put_failure;
@@ -174,7 +175,7 @@ int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr)
struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
if (data == NULL)
goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
+ if (expr->ops->dump(skb, expr, scope) < 0)
goto nla_put_failure;
nla_nest_end(skb, data);
}
@@ -186,6 +187,221 @@ nla_put_failure:
};
EXPORT_SYMBOL_GPL(nf_tables_fill_expr_info);

+/**
+ * nft_validate_input_register - validate an expression's input register
+ *
+ * @reg: the register number
+ *
+ * Validate that the input register is one of the general purpose
+ * registers.
+ */
+int nft_validate_input_register(enum nft_registers reg)
+{
+ if (reg <= NFT_REG_VERDICT)
+ return -EINVAL;
+ if (reg > NFT_REG_MAX)
+ return -ERANGE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_input_register);
+
+/**
+ * nft_validate_output_register - validate an expression's output register
+ *
+ * @reg: the register number
+ *
+ * Validate that the output register is one of the general purpose
+ * registers or the verdict register.
+ */
+int nft_validate_output_register(enum nft_registers reg)
+{
+ if (reg < NFT_REG_VERDICT)
+ return -EINVAL;
+ if (reg > NFT_REG_MAX)
+ return -ERANGE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_output_register);
+
+static const struct nft_verdict_ops *verdict_ops[NFT_SCOPE_MAX];
+
+/**
+ * nft_validate_data_load - validate an expression's data load
+ *
+ * @ctx: context of the expression performing the load
+ * @reg: the destination register number
+ * @data: the data to load
+ * @type: the data type
+ *
+ * Validate that a data load uses the appropriate data type for
+ * the destination register. A value of NULL for the data means
+ * that it is runtime-gathered data, which is always of type
+ * NFT_DATA_VALUE.
+ */
+int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type)
+{
+ switch (reg) {
+ case NFT_REG_VERDICT:
+ if (data == NULL || type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ return verdict_ops[ctx->scope]->validate(ctx, reg, data, type);
+ default:
+ if (data != NULL && type != NFT_DATA_VALUE)
+ return -EINVAL;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_validate_data_load);
+
+/**
+ * nft_data_uninit - release a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ * @scope: scope of this uninitialization
+ *
+ * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ * all others need to be released by calling this function.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type,
+ enum nft_scope scope)
+{
+ switch (type) {
+ case NFT_DATA_VALUE:
+ return;
+ case NFT_DATA_VERDICT:
+ verdict_ops[scope]->uninit(data);
+ break;
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+static int nft_do_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc,
+ const struct nlattr *nla)
+{
+ unsigned int len;
+
+ len = nla_len(nla);
+ if (len == 0)
+ return -EINVAL;
+ if (len > sizeof(data->data))
+ return -EOVERFLOW;
+
+ nla_memcpy(data->data, nla, sizeof(data->data));
+ desc->type = NFT_DATA_VALUE;
+ desc->len = len;
+ return 0;
+}
+
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+ unsigned int len)
+{
+ return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+ [NFTA_DATA_VALUE] = { .type = NLA_BINARY,
+ .len = FIELD_SIZEOF(struct nft_data, data) },
+ [NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
+};
+
+int nft_do_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla,
+ bool allow_verdict)
+{
+ struct nlattr *tb[NFTA_DATA_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DATA_VALUE])
+ return nft_do_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ if (tb[NFTA_DATA_VERDICT] && allow_verdict) {
+ return verdict_ops[ctx->scope]->init(ctx, data, desc,
+ tb[NFTA_DATA_VERDICT]);
+ }
+ return -EINVAL;
+}
+
+/**
+ * nft_data_init - parse nf_tables data netlink attributes
+ *
+ * @ctx: context of the expression using the data
+ * @data: destination struct nft_data
+ * @desc: data description
+ * @nla: netlink attribute containing data
+ *
+ * Parse the netlink data attributes and initialize a struct nft_data.
+ * The type and length of data are returned in the data description.
+ *
+ * The caller can indicate that it only wants to accept data of type
+ * NFT_DATA_VALUE by passing NULL for the ctx argument.
+ */
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ return nft_do_data_init(ctx, data, desc, nla, true);
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ return nft_do_data_init(ctx, data, desc, nla, false);
+}
+EXPORT_SYMBOL_GPL(nft_value_init);
+
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+ enum nft_data_types type, unsigned int len,
+ enum nft_scope scope)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start(skb, attr);
+ if (nest == NULL)
+ return -1;
+
+ switch (type) {
+ case NFT_DATA_VALUE:
+ err = nft_value_dump(skb, data, len);
+ break;
+ case NFT_DATA_VERDICT:
+ err = verdict_ops[scope]->dump(skb, data);
+ break;
+ default:
+ err = -EINVAL;
+ WARN_ON(1);
+ }
+
+ nla_nest_end(skb, nest);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+void nft_verdict_ops_register(enum nft_scope scope,
+ const struct nft_verdict_ops *ops)
+{
+ BUG_ON(scope >= NFT_SCOPE_MAX || verdict_ops[scope]);
+ verdict_ops[scope] = ops;
+}
+EXPORT_SYMBOL_GPL(nft_verdict_ops_register);
+
+void nft_verdict_ops_unregister(enum nft_scope scope)
+{
+ BUG_ON(scope >= NFT_SCOPE_MAX || !verdict_ops[scope]);
+ verdict_ops[scope] = NULL;
+}
+EXPORT_SYMBOL_GPL(nft_verdict_ops_unregister);
+
static __init int nf_tables_core_init(void)
{
int err;
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 5f944fb..9e87b67 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -78,13 +78,13 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,

priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));

- err = nft_value_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
+ err = nft_value_init(ctx, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
if (d1.len != priv->len)
return -EINVAL;

- err = nft_value_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
+ err = nft_value_init(ctx, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]);
if (err < 0)
return err;
if (d2.len != priv->len)
@@ -93,7 +93,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
return 0;
}

-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);

@@ -105,11 +106,11 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;

if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
- NFT_DATA_VALUE, priv->len) < 0)
+ NFT_DATA_VALUE, priv->len, scope) < 0)
goto nla_put_failure;

if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
- NFT_DATA_VALUE, priv->len) < 0)
+ NFT_DATA_VALUE, priv->len, scope) < 0)
goto nla_put_failure;

return 0;
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index c39ed8d..baca196 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -125,7 +125,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return 0;
}

-static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 0252f63..3117110 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -78,14 +78,15 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));

- err = nft_value_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(ctx, &priv->data, &desc, tb[NFTA_CMP_DATA]);
BUG_ON(err < 0);

priv->len = desc.len;
return 0;
}

-static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);

@@ -95,7 +96,7 @@ static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;

if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
- NFT_DATA_VALUE, priv->len) < 0)
+ NFT_DATA_VALUE, priv->len, scope) < 0)
goto nla_put_failure;
return 0;

@@ -124,7 +125,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,

priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));

- err = nft_value_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(ctx, &data, &desc, tb[NFTA_CMP_DATA]);
BUG_ON(err < 0);
desc.len *= BITS_PER_BYTE;

@@ -134,7 +135,8 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
return 0;
}

-static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
struct nft_data data;
@@ -146,7 +148,8 @@ static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)

data.data[0] = priv->data;
if (nft_data_dump(skb, NFTA_CMP_DATA, &data,
- NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0)
+ NFT_DATA_VALUE, priv->len / BITS_PER_BYTE,
+ scope) < 0)
goto nla_put_failure;
return 0;

@@ -195,7 +198,7 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}

- err = nft_value_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
+ err = nft_value_init(ctx, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8a779be..7426692 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -227,7 +227,8 @@ target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
return ret;
}

-static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
@@ -423,7 +424,8 @@ static inline int nft_compat_match_offset(struct xt_match *match)
#endif
}

-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
void *info = nft_expr_priv(expr);
struct xt_match *match = expr->ops->data;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index c89ee48..eb6354c 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -35,7 +35,8 @@ static void nft_counter_eval(const struct nft_expr *expr,
write_sequnlock_bh(&priv->lock);
}

-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
struct nft_counter *priv = nft_expr_priv(expr);
unsigned int seq;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index bd0d41e..09d4f28 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -324,7 +324,8 @@ static void nft_ct_destroy(const struct nft_ctx *ctx,
nft_ct_l3proto_module_put(ctx->afi->family);
}

-static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_ct *priv = nft_expr_priv(expr);

@@ -351,7 +352,8 @@ nla_put_failure:
return -1;
}

-static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_ct *priv = nft_expr_priv(expr);

@@ -385,8 +387,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
};

static const struct nft_expr_ops *
-nft_ct_select_ops(const struct nft_ctx *ctx,
- const struct nlattr * const tb[])
+nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
if (tb[NFTA_CT_KEY] == NULL)
return ERR_PTR(-EINVAL);
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 55c939f..7e55a78 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -80,7 +80,8 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
}

-static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 6a1acde..d0e53f9 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -245,12 +245,13 @@ static int nft_hash_insert(const struct nft_set *set,
return 0;
}

-static void nft_hash_elem_destroy(const struct nft_set *set,
+static void nft_hash_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
struct nft_hash_elem *he)
{
- nft_data_uninit(&he->key, NFT_DATA_VALUE);
+ nft_data_uninit(&he->key, NFT_DATA_VALUE, ctx->scope);
if (set->flags & NFT_SET_MAP)
- nft_data_uninit(he->data, set->dtype);
+ nft_data_uninit(he->data, set->dtype, ctx->scope);
kfree(he);
}

@@ -351,7 +352,8 @@ static int nft_hash_init(const struct nft_set *set,
return 0;
}

-static void nft_hash_destroy(const struct nft_set *set)
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
const struct nft_hash *priv = nft_set_priv(set);
const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
@@ -362,7 +364,7 @@ static void nft_hash_destroy(const struct nft_set *set)
for (he = nft_dereference(tbl->buckets[i]); he != NULL;
he = next) {
next = nft_dereference(he->next);
- nft_hash_elem_destroy(set, he);
+ nft_hash_elem_destroy(ctx, set, he);
}
}
kfree(tbl);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 810385e..ae359f4 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -66,7 +66,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
return 0;

err1:
- nft_data_uninit(&priv->data, desc.type);
+ nft_data_uninit(&priv->data, desc.type, ctx->scope);
return err;
}

@@ -74,10 +74,12 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
- return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+ return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg),
+ ctx->scope);
}

-static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);

@@ -85,7 +87,7 @@ static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;

return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data,
- nft_dreg_to_type(priv->dreg), priv->dlen);
+ nft_dreg_to_type(priv->dreg), priv->dlen, scope);

nla_put_failure:
return -1;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 85da5bd..6c058bd 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -70,7 +70,8 @@ static int nft_limit_init(const struct nft_ctx *ctx,
return 0;
}

-static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_limit *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 10cfb15..2684262 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -83,7 +83,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
kfree(priv->prefix);
}

-static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_log *priv = nft_expr_priv(expr);
const struct nf_loginfo *li = &priv->loginfo;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 7fd2bea..38f6b6d 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -97,7 +97,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
nf_tables_unbind_set(ctx, priv->set, &priv->binding);
}

-static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_lookup *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index e8254ad..82c40d8 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -246,8 +246,8 @@ static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}

-static int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_get_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_meta *priv = nft_expr_priv(expr);

@@ -261,8 +261,8 @@ nla_put_failure:
return -1;
}

-static int nft_meta_set_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_meta *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index a0195d2..d890584 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -152,7 +152,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}

-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_nat *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index acb330e..9deadb6 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -75,7 +75,8 @@ static int nft_payload_init(const struct nft_ctx *ctx,
return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE);
}

-static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_payload *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index e8ae2f6..390107b 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -82,7 +82,8 @@ static int nft_queue_init(const struct nft_ctx *ctx,
return 0;
}

-static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_queue *priv = nft_expr_priv(expr);

diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
index e21d69d..94f2380 100644
--- a/net/netfilter/nft_rbtree.c
+++ b/net/netfilter/nft_rbtree.c
@@ -65,13 +65,14 @@ out:
return false;
}

-static void nft_rbtree_elem_destroy(const struct nft_set *set,
+static void nft_rbtree_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
struct nft_rbtree_elem *rbe)
{
- nft_data_uninit(&rbe->key, NFT_DATA_VALUE);
+ nft_data_uninit(&rbe->key, NFT_DATA_VALUE, ctx->scope);
if (set->flags & NFT_SET_MAP &&
!(rbe->flags & NFT_SET_ELEM_INTERVAL_END))
- nft_data_uninit(rbe->data, set->dtype);
+ nft_data_uninit(rbe->data, set->dtype, ctx->scope);

kfree(rbe);
}
@@ -209,7 +210,8 @@ static int nft_rbtree_init(const struct nft_set *set,
return 0;
}

-static void nft_rbtree_destroy(const struct nft_set *set)
+static void nft_rbtree_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
@@ -218,7 +220,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- nft_rbtree_elem_destroy(set, rbe);
+ nft_rbtree_elem_destroy(ctx, set, rbe);
}
}

diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index f3448c2..09bc4b2 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -49,7 +49,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_reject_init);

-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr,
+ enum nft_scope scope)
{
const struct nft_reject *priv = nft_expr_priv(expr);
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-11 09:19:13 UTC
Permalink
This patch converts the len field in the struct sk_filter structure
from length in filter lines to bytes. I have added the new function
sk_bpf_flen(), which allows you to obtain the filter length in number
of blocks (which is what you usually need to iterate over a BPF filter).

The corresponding BPF jit implementation has been also adjusted.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
arch/arm/net/bpf_jit_32.c | 19 ++++++++++---------
arch/powerpc/net/bpf_jit_comp.c | 4 ++--
arch/s390/net/bpf_jit_comp.c | 10 +++++-----
arch/sparc/net/bpf_jit_comp.c | 2 +-
arch/x86/net/bpf_jit_comp.c | 2 +-
include/linux/filter.h | 10 ++++------
include/net/sock.h | 4 ++--
net/core/filter.c | 22 +++++++++++++++-------
net/core/sock_diag.c | 4 ++--
9 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 65bd347..844e97b 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -135,7 +135,7 @@ static u16 saved_regs(struct jit_ctx *ctx)
{
u16 ret = 0;

- if ((ctx->skf->len > 1) ||
+ if ((sk_bpf_flen(ctx->skf) > 1) ||
(ctx->skf->insns[0].code == BPF_S_RET_A))
ret |= 1 << r_A;

@@ -283,7 +283,7 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx)
ctx->imms[i] = k;

/* constants go just after the epilogue */
- offset = ctx->offsets[ctx->skf->len];
+ offset = ctx->offsets[sk_bpf_flen(ctx->skf)];
offset += ctx->prologue_bytes;
offset += ctx->epilogue_bytes;
offset += i * 4;
@@ -423,7 +423,7 @@ static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx)
emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx);
} else {
_emit(cond, ARM_MOV_I(ARM_R0, 0), ctx);
- _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx);
+ _emit(cond, ARM_B(b_imm(sk_bpf_flen(ctx->skf), ctx)), ctx);
}
}

@@ -476,10 +476,10 @@ static int build_body(struct jit_ctx *ctx)
const struct sk_filter *prog = ctx->skf;
const struct sock_filter *inst;
unsigned i, load_order, off, condt;
- int imm12;
+ int imm12, flen = sk_bpf_flen(ctx->skf);
u32 k;

- for (i = 0; i < prog->len; i++) {
+ for (i = 0; i < flen; i++) {
inst = &(prog->insns[i]);
/* K as an immediate value operand */
k = inst->k;
@@ -773,8 +773,8 @@ cmp_x:
ctx->ret0_fp_idx = i;
emit_mov_i(ARM_R0, k, ctx);
b_epilogue:
- if (i != ctx->skf->len - 1)
- emit(ARM_B(b_imm(prog->len, ctx)), ctx);
+ if (i != flen - 1)
+ emit(ARM_B(b_imm(flen, ctx)), ctx);
break;
case BPF_S_MISC_TAX:
/* X = A */
@@ -867,6 +867,7 @@ void bpf_jit_compile(struct sk_filter *fp)
struct jit_ctx ctx;
unsigned tmp_idx;
unsigned alloc_size;
+ int flen = sk_bpf_flen(fp);

if (!bpf_jit_enable)
return;
@@ -875,7 +876,7 @@ void bpf_jit_compile(struct sk_filter *fp)
ctx.skf = fp;
ctx.ret0_fp_idx = -1;

- ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL);
+ ctx.offsets = kzalloc(4 * (flen + 1), GFP_KERNEL);
if (ctx.offsets == NULL)
return;

@@ -922,7 +923,7 @@ void bpf_jit_compile(struct sk_filter *fp)

if (bpf_jit_enable > 1)
/* there are 2 passes here */
- bpf_jit_dump(fp->len, alloc_size, 2, ctx.target);
+ bpf_jit_dump(flen, alloc_size, 2, ctx.target);

fp->run_filter = (void *)ctx.target;
out:
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 6491d72..a4d8db3 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -134,7 +134,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
unsigned int *addrs)
{
const struct sock_filter *filter = fp->insns;
- int flen = fp->len;
+ int flen = sk_bpf_flen(fp);
u8 *func;
unsigned int true_cond;
int i;
@@ -581,7 +581,7 @@ void bpf_jit_compile(struct sk_filter *fp)
unsigned int *addrs;
struct codegen_context cgctx;
int pass;
- int flen = fp->len;
+ int flen = sk_bpf_flen(fp);

if (!bpf_jit_enable)
return;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 23089df..54906d9 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -826,11 +826,11 @@ void bpf_jit_compile(struct sk_filter *fp)
unsigned long size, prg_len, lit_len;
struct bpf_jit jit, cjit;
unsigned int *addrs;
- int pass, i;
+ int pass, i, flen = sk_bpf_flen(fp);

if (!bpf_jit_enable)
return;
- addrs = kcalloc(fp->len, sizeof(*addrs), GFP_KERNEL);
+ addrs = kcalloc(flen, sizeof(*addrs), GFP_KERNEL);
if (addrs == NULL)
return;
memset(&jit, 0, sizeof(cjit));
@@ -842,9 +842,9 @@ void bpf_jit_compile(struct sk_filter *fp)

bpf_jit_prologue(&jit);
bpf_jit_noleaks(&jit, fp->insns);
- for (i = 0; i < fp->len; i++) {
+ for (i = 0; i < flen; i++) {
if (bpf_jit_insn(&jit, fp->insns + i, addrs, i,
- i == fp->len - 1))
i == flen - 1))
goto out;
}
bpf_jit_epilogue(&jit);
@@ -870,7 +870,7 @@ void bpf_jit_compile(struct sk_filter *fp)
cjit = jit;
}
if (bpf_jit_enable > 1) {
- bpf_jit_dump(fp->len, jit.end - jit.start, pass, jit.start);
+ bpf_jit_dump(flen, jit.end - jit.start, pass, jit.start);
if (jit.start)
print_fn_code(jit.start, jit.mid - jit.start);
}
diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c
index ee1cd30..15e6b00 100644
--- a/arch/sparc/net/bpf_jit_comp.c
+++ b/arch/sparc/net/bpf_jit_comp.c
@@ -359,7 +359,7 @@ void bpf_jit_compile(struct sk_filter *fp)
unsigned int cleanup_addr, proglen, oldproglen = 0;
u32 temp[8], *prog, *func, seen = 0, pass;
const struct sock_filter *filter = fp->insns;
- int i, flen = fp->len, pc_ret0 = -1;
+ int i, flen = sk_bpf_flen(fp), pc_ret0 = -1;
unsigned int *addrs;
void *image;

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bfadd14..81bf69a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -193,7 +193,7 @@ void bpf_jit_compile(struct sk_filter *fp)
unsigned int cleanup_addr; /* epilogue code offset */
unsigned int *addrs;
const struct sock_filter *filter = fp->insns;
- int flen = fp->len;
+ int flen = sk_bpf_flen(fp);

if (!bpf_jit_enable)
return;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6c5d597..ab37714 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -22,10 +22,9 @@ struct compat_sock_fprog {
struct sk_buff;
struct sock;

-struct sk_filter
-{
+struct sk_filter {
atomic_t refcnt;
- unsigned int len; /* Number of filter blocks */
+ unsigned int size; /* filter bytecode size in bytes */
struct rcu_head rcu;
unsigned int (*run_filter)(const struct sk_buff *skb,
const struct sock_filter *filter);
@@ -35,10 +34,9 @@ struct sk_filter
};
};

-static inline unsigned int sk_filter_size(unsigned int proglen)
+static inline unsigned int sk_bpf_flen(struct sk_filter *filter)
{
- return max(sizeof(struct sk_filter),
- offsetof(struct sk_filter, insns[proglen]));
+ return filter->size / sizeof(struct sock_filter);
}

extern int sk_filter(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 5c3f7c3..7b9723c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1633,14 +1633,14 @@ static inline void sk_filter_release(struct sk_filter *fp)

static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
- atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_sub(fp->size, &sk->sk_omem_alloc);
sk_filter_release(fp);
}

static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
atomic_inc(&fp->refcnt);
- atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_add(fp->size, &sk->sk_omem_alloc);
}

/*
diff --git a/net/core/filter.c b/net/core/filter.c
index 0f63e67..3ea0e7f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -647,7 +647,7 @@ static int __sk_prepare_filter(struct sk_filter *fp)

fp->run_filter = sk_run_filter;

- err = sk_chk_filter(fp->insns, fp->len);
+ err = sk_chk_filter(fp->insns, sk_bpf_flen(fp));
if (err)
return err;

@@ -655,6 +655,12 @@ static int __sk_prepare_filter(struct sk_filter *fp)
return 0;
}

+static unsigned int sk_filter_size(unsigned int proglen)
+{
+ return max(sizeof(struct sk_filter),
+ offsetof(struct sk_filter, insns[proglen]));
+}
+
/**
* sk_unattached_filter_create - create an unattached filter
* @fprog: the filter program
@@ -682,7 +688,7 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
memcpy(fp->insns, fprog->filter, fsize);

atomic_set(&fp->refcnt, 1);
- fp->len = fprog->len;
+ fp->size = fsize;

err = __sk_prepare_filter(fp);
if (err)
@@ -735,7 +741,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
}

atomic_set(&fp->refcnt, 1);
- fp->len = fprog->len;
+ fp->size = fsize;

err = __sk_prepare_filter(fp);
if (err) {
@@ -853,6 +859,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len)
{
struct sk_filter *filter;
+ unsigned int flen;
int i, ret;

lock_sock(sk);
@@ -861,15 +868,16 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int
ret = 0;
if (!filter)
goto out;
- ret = filter->len;
+
+ ret = flen = sk_bpf_flen(filter);
if (!len)
goto out;
ret = -EINVAL;
- if (len < filter->len)
+ if (len < flen)
goto out;

ret = -EFAULT;
- for (i = 0; i < filter->len; i++) {
+ for (i = 0; i < flen; i++) {
struct sock_filter fb;

sk_decode_filter(&filter->insns[i], &fb);
@@ -877,7 +885,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int
goto out;
}

- ret = filter->len;
+ ret = flen;
out:
release_sock(sk);
return ret;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a0e9cf6..343bd58 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -65,7 +65,7 @@ int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
rcu_read_lock();

filter = rcu_dereference(sk->sk_filter);
- len = filter ? filter->len * sizeof(struct sock_filter) : 0;
+ len = filter ? filter->size : 0;

attr = nla_reserve(skb, attrtype, len);
if (attr == NULL) {
@@ -77,7 +77,7 @@ int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
struct sock_filter *fb = (struct sock_filter *)nla_data(attr);
int i;

- for (i = 0; i < filter->len; i++, fb++)
+ for (i = 0; i < sk_bpf_flen(filter); i++, fb++)
sk_decode_filter(&filter->insns[i], fb);
}
--
1.7.10.4
Pablo Neira Ayuso
2014-03-11 09:19:18 UTC
Permalink
Move expression infrastructure that will be needed by other
classifiers (eg. socket filtering) to nf_tables_core.c and make
this built-in. The basic expressions are also registered from
the core: payload, cmp, immediate, bitwise and byteorder.

I have added a new lock to protect the list of expression which
is common to both nf_tables and nf_tables_sock and converted it
to use rcu.

Signed-off-by: Pablo Neira Ayuso <***@netfilter.org>
---
include/net/netfilter/nf_tables_core.h | 26 ++++
net/netfilter/Kconfig | 5 +
net/netfilter/Makefile | 7 +-
net/netfilter/nf_tables_api.c | 208 ++++-------------------------
net/netfilter/nf_tables_core.c | 225 ++++++++++++++++++++++++++++++++
net/netfilter/nf_tables_nf.c | 46 +------
net/netfilter/nft_cmp.c | 1 +
net/netfilter/nft_payload.c | 1 +
8 files changed, 286 insertions(+), 233 deletions(-)
create mode 100644 net/netfilter/nf_tables_core.c

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 004c2aa..98b9c74 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -81,4 +81,30 @@ static inline bool nft_payload_fast_eval(const struct nft_expr *expr,
return true;
}

+#include <net/netfilter/nf_tables.h>
+
+/**
+ * struct nft_expr_info - nf_tables expression information
+ *
+ * @ops: Pointer to expression operations
+ * @tb: array of attributes for this expression
+ */
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+int nft_register_expr(struct nft_expr_type *type);
+void nft_unregister_expr(struct nft_expr_type *type);
+int nf_tables_newexpr(const struct nft_ctx *ctx,
+ const struct nft_expr_info *info, struct nft_expr *expr);
+int nf_tables_expr_parse(const struct nft_ctx *ctx,
+ const struct nlattr *nla, struct nft_expr_info *info,
+ int (*autoload)(const struct nft_ctx *ctx,
+ const struct nlattr *nla));
+const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ const struct nlattr *nla);
+void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
+int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr);
+
#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e9410d1..5bd91a8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -413,8 +413,13 @@ config NETFILTER_SYNPROXY

endif # NF_CONNTRACK

+config NF_TABLES_CORE
+ boolean
+ default n
+
config NF_TABLES
select NETFILTER_NETLINK
+ select NF_TABLES_CORE
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index bb9970c..7564485 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -65,9 +65,10 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o

# nf_tables
-nf_tables-objs += nf_tables_nf.o nf_tables_api.o
-nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
-nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+obj-$(CONFIG_NF_TABLES_CORE) += nf_tables_core.o nft_payload.o nft_cmp.o \
+ nft_immediate.o nft_bitwise.o nft_byteorder.o
+
+nf_tables-objs += nf_tables_nf.o nf_tables_api.o nft_lookup.o

obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9879f23..29d6745 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -21,8 +21,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>

-static LIST_HEAD(nf_tables_expressions);
-
/**
* nft_register_afinfo - register nf_tables address family info
*
@@ -1080,188 +1078,6 @@ static void nft_ctx_init(struct nft_ctx *ctx,
}

/*
- * Expressions
- */
-
-/**
- * nft_register_expr - register nf_tables expr type
- * @ops: expr type
- *
- * Registers the expr type for use with nf_tables. Returns zero on
- * success or a negative errno code otherwise.
- */
-int nft_register_expr(struct nft_expr_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail(&type->list, &nf_tables_expressions);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_expr);
-
-/**
- * nft_unregister_expr - unregister nf_tables expr type
- * @ops: expr type
- *
- * Unregisters the expr typefor use with nf_tables.
- */
-void nft_unregister_expr(struct nft_expr_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_del(&type->list);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_expr);
-
-static const struct nft_expr_type *__nft_expr_type_get(u8 family,
- struct nlattr *nla)
-{
- const struct nft_expr_type *type;
-
- list_for_each_entry(type, &nf_tables_expressions, list) {
- if (!nla_strcmp(nla, type->name) &&
- (!type->family || type->family == family))
- return type;
- }
- return NULL;
-}
-
-static const struct nft_expr_type *nft_expr_type_get(u8 family,
- struct nlattr *nla)
-{
- const struct nft_expr_type *type;
-
- if (nla == NULL)
- return ERR_PTR(-EINVAL);
-
- type = __nft_expr_type_get(family, nla);
- if (type != NULL && try_module_get(type->owner))
- return type;
-
-#ifdef CONFIG_MODULES
- if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (__nft_expr_type_get(family, nla))
- return ERR_PTR(-EAGAIN);
-
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%.*s",
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (__nft_expr_type_get(family, nla))
- return ERR_PTR(-EAGAIN);
- }
-#endif
- return ERR_PTR(-ENOENT);
-}
-
-static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
- [NFTA_EXPR_NAME] = { .type = NLA_STRING },
- [NFTA_EXPR_DATA] = { .type = NLA_NESTED },
-};
-
-static int nf_tables_fill_expr_info(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
- goto nla_put_failure;
-
- if (expr->ops->dump) {
- struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
- if (data == NULL)
- goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
- goto nla_put_failure;
- nla_nest_end(skb, data);
- }
-
- return skb->len;
-
-nla_put_failure:
- return -1;
-};
-
-struct nft_expr_info {
- const struct nft_expr_ops *ops;
- struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
-};
-
-static int nf_tables_expr_parse(const struct nft_ctx *ctx,
- const struct nlattr *nla,
- struct nft_expr_info *info)
-{
- const struct nft_expr_type *type;
- const struct nft_expr_ops *ops;
- struct nlattr *tb[NFTA_EXPR_MAX + 1];
- int err;
-
- err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
- if (err < 0)
- return err;
-
- type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
- if (IS_ERR(type))
- return PTR_ERR(type);
-
- if (tb[NFTA_EXPR_DATA]) {
- err = nla_parse_nested(info->tb, type->maxattr,
- tb[NFTA_EXPR_DATA], type->policy);
- if (err < 0)
- goto err1;
- } else
- memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
-
- if (type->select_ops != NULL) {
- ops = type->select_ops(ctx,
- (const struct nlattr * const *)info->tb);
- if (IS_ERR(ops)) {
- err = PTR_ERR(ops);
- goto err1;
- }
- } else
- ops = type->ops;
-
- info->ops = ops;
- return 0;
-
-err1:
- module_put(type->owner);
- return err;
-}
-
-static int nf_tables_newexpr(const struct nft_ctx *ctx,
- const struct nft_expr_info *info,
- struct nft_expr *expr)
-{
- const struct nft_expr_ops *ops = info->ops;
- int err;
-
- expr->ops = ops;
- if (ops->init) {
- err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
- if (err < 0)
- goto err1;
- }
-
- return 0;
-
-err1:
- expr->ops = NULL;
- return err;
-}
-
-static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
- struct nft_expr *expr)
-{
- if (expr->ops->destroy)
- expr->ops->destroy(ctx, expr);
- module_put(expr->ops->type->owner);
-}
-
-/*
* Rules
*/

@@ -1574,6 +1390,27 @@ nf_tables_trans_add(struct nft_ctx *ctx, struct nft_rule *rule)
return rupd;
}

+static int nft_expr_autoload(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
+{
+#ifdef CONFIG_MODULES
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%u-%.*s", ctx->afi->family,
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(ctx->afi->family, nla))
+ return -EAGAIN;
+
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%.*s",
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(ctx->afi->family, nla))
+ return -EAGAIN;
+#endif
+ return -ENOENT;
+}
+
static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -1646,7 +1483,8 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
goto err1;
if (n == NFT_RULE_MAXEXPRS)
goto err1;
- err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
+ err = nf_tables_expr_parse(&ctx, tmp, &info[n],
+ nft_expr_autoload);
if (err < 0)
goto err1;
size += info[n].ops->size;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 0000000..20bd1f0
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2007-2009 Patrick McHardy <***@trash.net>
+ * Copyright (c) 2012-2014 Pablo Neira Ayuso <***@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <net/netlink.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <linux/netfilter/nf_tables.h>
+
+/*
+ * Expressions
+ */
+
+static LIST_HEAD(nf_tables_expressions);
+static DEFINE_MUTEX(nf_tables_expr_mutex);
+
+/**
+ * nft_register_expr - register nf_tables expr type
+ * @type: expr type
+ *
+ * Registers the expr type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+ mutex_lock(&nf_tables_expr_mutex);
+ list_add_tail_rcu(&type->list, &nf_tables_expressions);
+ mutex_unlock(&nf_tables_expr_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ * nft_unregister_expr - unregister nf_tables expr type
+ * @type: expr type
+ *
+ * Unregisters the expr type for use with nf_tables.
+ */
+void nft_unregister_expr(struct nft_expr_type *type)
+{
+ mutex_lock(&nf_tables_expr_mutex);
+ list_del_rcu(&type->list);
+ mutex_unlock(&nf_tables_expr_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ const struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
+ if (!nla_strcmp(nla, type->name) &&
+ (!type->family || type->family == family))
+ return type;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(__nft_expr_type_get);
+
+const struct nft_expr_type *nft_expr_type_get(u8 family, const struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ type = __nft_expr_type_get(family, nla);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nft_expr_type_get);
+
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+ [NFTA_EXPR_NAME] = { .type = NLA_STRING },
+ [NFTA_EXPR_DATA] = { .type = NLA_NESTED },
+};
+
+int nf_tables_expr_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
+ struct nft_expr_info *info,
+ int (*autoload)(const struct nft_ctx *ctx,
+ const struct nlattr *nla))
+{
+ const struct nft_expr_type *type;
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+ if (err < 0)
+ return err;
+
+ type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
+ if (IS_ERR(type)) {
+ if (PTR_ERR(type) == -ENOENT)
+ return autoload(ctx, tb[NFTA_EXPR_NAME]);
+
+ return PTR_ERR(type);
+ }
+
+ if (tb[NFTA_EXPR_DATA]) {
+ err = nla_parse_nested(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA], type->policy);
+ if (err < 0)
+ goto err1;
+ } else
+ memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
+
+ if (type->select_ops != NULL) {
+ ops = type->select_ops(ctx,
+ (const struct nlattr * const *)info->tb);
+ if (IS_ERR(ops)) {
+ err = PTR_ERR(ops);
+ goto err1;
+ }
+ } else
+ ops = type->ops;
+
+ info->ops = ops;
+ return 0;
+
+err1:
+ module_put(type->owner);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_tables_expr_parse);
+
+int nf_tables_newexpr(const struct nft_ctx *ctx,
+ const struct nft_expr_info *info, struct nft_expr *expr)
+{
+ const struct nft_expr_ops *ops = info->ops;
+ int err;
+
+ expr->ops = ops;
+ if (ops->init) {
+ err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ if (err < 0)
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ expr->ops = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_tables_newexpr);
+
+void nf_tables_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
+{
+ if (expr->ops->destroy)
+ expr->ops->destroy(ctx, expr);
+ module_put(expr->ops->type->owner);
+}
+EXPORT_SYMBOL_GPL(nf_tables_expr_destroy);
+
+int nf_tables_fill_expr_info(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
+ goto nla_put_failure;
+
+ if (expr->ops->dump) {
+ struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+ if (data == NULL)
+ goto nla_put_failure;
+ if (expr->ops->dump(skb, expr) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, data);
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+};
+EXPORT_SYMBOL_GPL(nf_tables_fill_expr_info);
+
+static __init int nf_tables_core_init(void)
+{
+ int err;
+
+ err = nft_immediate_module_init();
+ if (err < 0)
+ goto err1;
+
+ err = nft_cmp_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nft_bitwise_module_init();
+ if (err < 0)
+ goto err3;
+
+ err = nft_byteorder_module_init();
+ if (err < 0)
+ goto err4;
+
+ err = nft_payload_module_init();
+ if (err < 0)
+ goto err5;
+
+ return 0;
+err5:
+ nft_byteorder_module_exit();
+err4:
+ nft_bitwise_module_exit();
+err3:
+ nft_cmp_module_exit();
+err2:
+ nft_immediate_module_exit();
+err1:
+ return err;
+}
+core_initcall(nf_tables_core_init);
diff --git a/net/netfilter/nf_tables_nf.c b/net/netfilter/nf_tables_nf.c
index d71a0be..a9d2fa4 100644
--- a/net/netfilter/nf_tables_nf.c
+++ b/net/netfilter/nf_tables_nf.c
@@ -180,54 +180,10 @@ EXPORT_SYMBOL_GPL(nft_do_chain);

int __init nf_tables_core_module_init(void)
{
- int err;
-
- err = nft_immediate_module_init();
- if (err < 0)
- goto err1;
-
- err = nft_cmp_module_init();
- if (err < 0)
- goto err2;
-
- err = nft_lookup_module_init();
- if (err < 0)
- goto err3;
-
- err = nft_bitwise_module_init();
- if (err < 0)
- goto err4;
-
- err = nft_byteorder_module_init();
- if (err < 0)
- goto err5;
-
- err = nft_payload_module_init();
- if (err < 0)
- goto err6;
-
- return 0;
-
-err6:
- nft_byteorder_module_exit();
-err5:
- nft_bitwise_module_exit();
-err4:
- nft_lookup_module_exit();
-err3:
- nft_cmp_module_exit();
-err2:
- nft_immediate_module_exit();
-err1:
- return err;
+ return nft_lookup_module_init();
}

void nf_tables_core_module_exit(void)
{
- nft_payload_module_exit();
- nft_byteorder_module_exit();
- nft_bitwise_module_exit();
nft_lookup_module_exit();
- nft_cmp_module_exit();
- nft_immediate_module_exit();
}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 7758c1c..0252f63 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -161,6 +161,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
};
+EXPORT_SYMBOL_GPL(nft_cmp_fast_ops);

static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index a2aeb31..acb330e 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -106,6 +106,7 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.init = nft_payload_init,
.dump = nft_payload_dump,
};
+EXPORT_SYMBOL_GPL(nft_payload_fast_ops);

static const struct nft_expr_ops *
nft_payload_select_ops(const struct nft_ctx *ctx,
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Daniel Borkmann
2014-03-11 10:29:54 UTC
Permalink
Hi!
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Similarly to BPF, you can attach filters via setsockopt()
SO_ATTACH_NFT_FILTER. The filter that is passed to the kernel is
expression list (nested attribute)
expression element (nested attribute)
expression name (string)
expression data (nested attribute)
... specific attribute for this expression go here
This is similar to the netlink format of the nf_tables rules, so we
can re-use most of the infrastructure that we already have in userspace.
The kernel takes the TLV representation and translates it to the native
nf_tables representation.
The patches 1-3 have helped to generalize the existing socket filtering
infrastructure to allow plugging in new socket filtering frameworks. Then,
patches 4-8 generalize the nf_tables code by moving the necessary nf_tables
expression and data initialization core infrastructure. Then, patch 9
provides the nf_tables socket filtering capabilities.
Patrick and I have been discussing for a while that part of this
generalisation work should also help to add support for providing a
replacement to the tc framework, so with the necessary work, nf_tables
may in the near future provide a single packet classification
framework for Linux.
I'm being curious here ;) as there's currently an ongoing effort on
netdev for Alexei's eBPF engine (part 1 at [1,2,3]), which addresses
shortcomings of current BPF and shall long term entirely replace the
current BPF engine code to let filters entirely run in eBPF resp.
eBPF's JIT engine, as I understand, which is also transparently usable
in cls_bpf for classification in tc w/o rewriting on a different filter
language. Performance figures have been posted/provided in [1] as well.

So the plan on your side would be to have an alternative to eBPF, or
build on top of it to reuse its in-kernel JIT compiler?

[1] http://patchwork.ozlabs.org/patch/328927/
[2] http://patchwork.ozlabs.org/patch/328926/
[3] http://patchwork.ozlabs.org/patch/328928/
http://people.netfilter.org/pablo/nft-sock-filter-test.c
I'm currently reusing the existing libnftnl interfaces; my plan is to
add new interfaces to that library for easier and simpler filter
definition for socket filtering.
Note that the current nf_tables expression-set is also limited with
regards to BPF, but the infrastructure that we have can be easily
extended with new expressions.
Comments welcome!
net: rename fp->bpf_func to fp->run_filter
net: filter: account filter length in bytes
net: filter: generalise sk_filter_release
netfilter: nf_tables: move fast operations to header
netfilter: nf_tables: add nft_value_init
netfilter: nf_tables: rename nf_tables_core.c to nf_tables_nf.c
netfilter: nf_tables: move expression infrastructure to built-in core
netfilter: nf_tables: generalize verdict handling and introduce scopes
netfilter: nf_tables: add support for socket filtering
arch/arm/net/bpf_jit_32.c | 25 +-
arch/powerpc/net/bpf_jit_comp.c | 10 +-
arch/s390/net/bpf_jit_comp.c | 16 +-
arch/sparc/net/bpf_jit_comp.c | 8 +-
arch/x86/net/bpf_jit_comp.c | 8 +-
include/linux/filter.h | 28 +-
include/net/netfilter/nf_tables.h | 27 +-
include/net/netfilter/nf_tables_core.h | 84 +++++
include/net/netfilter/nft_reject.h | 3 +-
include/net/sock.h | 8 +-
include/uapi/asm-generic/socket.h | 4 +
net/core/filter.c | 28 +-
net/core/sock.c | 19 ++
net/core/sock_diag.c | 4 +-
net/netfilter/Kconfig | 13 +
net/netfilter/Makefile | 9 +-
net/netfilter/nf_tables_api.c | 440 ++++---------------------
net/netfilter/nf_tables_core.c | 564 +++++++++++++++++++++-----------
net/netfilter/nf_tables_nf.c | 189 +++++++++++
net/netfilter/nf_tables_sock.c | 327 ++++++++++++++++++
net/netfilter/nft_bitwise.c | 35 +-
net/netfilter/nft_byteorder.c | 28 +-
net/netfilter/nft_cmp.c | 43 ++-
net/netfilter/nft_compat.c | 6 +-
net/netfilter/nft_counter.c | 3 +-
net/netfilter/nft_ct.c | 9 +-
net/netfilter/nft_exthdr.c | 3 +-
net/netfilter/nft_hash.c | 12 +-
net/netfilter/nft_immediate.c | 35 +-
net/netfilter/nft_limit.c | 3 +-
net/netfilter/nft_log.c | 3 +-
net/netfilter/nft_lookup.c | 3 +-
net/netfilter/nft_meta.c | 51 ++-
net/netfilter/nft_nat.c | 3 +-
net/netfilter/nft_payload.c | 29 +-
net/netfilter/nft_queue.c | 3 +-
net/netfilter/nft_rbtree.c | 12 +-
net/netfilter/nft_reject.c | 3 +-
38 files changed, 1416 insertions(+), 682 deletions(-)
create mode 100644 net/netfilter/nf_tables_nf.c
create mode 100644 net/netfilter/nf_tables_sock.c
Alexei Starovoitov
2014-03-11 17:59:42 UTC
Permalink
Post by Daniel Borkmann
Hi!
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Similarly to BPF, you can attach filters via setsockopt()
SO_ATTACH_NFT_FILTER. The filter that is passed to the kernel is
expression list (nested attribute)
expression element (nested attribute)
expression name (string)
expression data (nested attribute)
... specific attribute for this expression go here
This is similar to the netlink format of the nf_tables rules, so we
can re-use most of the infrastructure that we already have in userspace.
The kernel takes the TLV representation and translates it to the native
nf_tables representation.
The patches 1-3 have helped to generalize the existing socket filtering
infrastructure to allow pluging new socket filtering frameworks. Then,
patches 4-8 generalize the nf_tables code by move the neccessary nf_tables
expression and data initialization core infrastructure. Then, patch 9
provides the nf_tables socket filtering capabilities.
Patrick and I have been discussing for a while that part of this
generalisation works should also help to add support for providing a
replacement to the tc framework, so with the necessary work, nf_tables
may provide in the near future packet a single packet classification
framework for Linux.
I'm being curious here ;) as there's currently an ongoing effort on
netdev for Alexei's eBPF engine (part 1 at [1,2,3]), which addresses
shortcomings of current BPF and shall long term entirely replace the
current BPF engine code to let filters entirely run in eBPF resp.
eBPF's JIT engine, as I understand, which is also transparently usable
in cls_bpf for classification in tc w/o rewriting on a different filter
language. Performance figures have been posted/provided in [1] as well.
So the plan on your side would be to have an alternative to eBPF, or
build on top of it to reuse its in-kernel JIT compiler?
[1] http://patchwork.ozlabs.org/patch/328927/
[2] http://patchwork.ozlabs.org/patch/328926/
[3] http://patchwork.ozlabs.org/patch/328928/
http://people.netfilter.org/pablo/nft-sock-filter-test.c
I'm currently reusing the existing libnftnl interfaces, my plan is to
new interfaces in that library for easier and more simple filter
definition for socket filtering.
Note that the current nf_tables expression-set is also limited with
regards to BPF, but the infrastructure that we have can be easily
extended with new expressions.
Comments welcome!
Hi Pablo,

Could you share what performance you're getting when doing nft
filter equivalent to 'tcpdump port 22' ?
Meaning your filter needs to parse eth->proto, ip or ipv6 header and
check both ports. How will it compare with JITed bpf/ebpf ?

I was trying to go the other way: improve nft performance with ebpf.
10/40G links are way too fast for interpreters. IMO JIT is the only way.

here are some comments about patches:
1/9:
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);

David suggested that these comparisons in all jits are ugly.
I've fixed it in my patches. When they're in, you wouldn't need to
mess with this.

2/9:
- atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_sub(fp->size, &sk->sk_omem_alloc);

that's a big change in socket memory accounting.
We used to account for the whole sk_filter... now you're counting
filter size only.
Is it valid?

7/9:
whole nft_expr_autoload() looks scary from security point of view.
If I'm reading it correctly, the code will do request_module() based on
userspace request to attach filter?

9/9:
+ case SO_NFT_GET_FILTER:
+ len = sk_nft_get_filter(sk, (struct sock_filter __user
*)optval, len);
with my patches there was a concern regarding socket checkpoint/restore
and I had to preserve existing filter image to make sure it's not broken.
Could you please coordinate with Pavel and co to test this piece?

What will happen if nft_filter attached, but so_get_filter is called? crash?

+static int nft_sock_expr_autoload(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
+{
+#ifdef CONFIG_MODULES
+ mutex_unlock(&nft_expr_info_mutex);
+ request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla));
+ mutex_lock(&nft_expr_info_mutex);

same security concern here...

+int sk_nft_attach_filter(char __user *optval, struct sock *sk)
+{

what about sk_clone_lock()? since filter program is in nft, do you need to do
special steps during copy of socket?

+ fp = sock_kmalloc(sk, sizeof(struct sk_filter) + size, GFP_KERNEL);

this may allocate more memory than you need.
Currently sk_filter_size() computes it in an accurate way.

Also the same issue of optmem accounting as I mentioned in 2/9

+err4:
+ sock_kfree_s(sk, fp, size);

a small bug: allocated sizeof(sk_filter)+size, but freeing 'size' only...

Overall I think it's very interesting work.
Not sure what's the use case for it though.

I'll cook up a patch for the opposite approach (use ebpf inside nft)
and will send you for review.
I would prefer to work together to satisfy your and our user requests.

Thanks
Alexei
Post by Daniel Borkmann
net: rename fp->bpf_func to fp->run_filter
net: filter: account filter length in bytes
net: filter: generalise sk_filter_release
netfilter: nf_tables: move fast operations to header
netfilter: nf_tables: add nft_value_init
netfilter: nf_tables: rename nf_tables_core.c to nf_tables_nf.c
netfilter: nf_tables: move expression infrastructure to built-in core
netfilter: nf_tables: generalize verdict handling and introduce scopes
netfilter: nf_tables: add support for socket filtering
arch/arm/net/bpf_jit_32.c | 25 +-
arch/powerpc/net/bpf_jit_comp.c | 10 +-
arch/s390/net/bpf_jit_comp.c | 16 +-
arch/sparc/net/bpf_jit_comp.c | 8 +-
arch/x86/net/bpf_jit_comp.c | 8 +-
include/linux/filter.h | 28 +-
include/net/netfilter/nf_tables.h | 27 +-
include/net/netfilter/nf_tables_core.h | 84 +++++
include/net/netfilter/nft_reject.h | 3 +-
include/net/sock.h | 8 +-
include/uapi/asm-generic/socket.h | 4 +
net/core/filter.c | 28 +-
net/core/sock.c | 19 ++
net/core/sock_diag.c | 4 +-
net/netfilter/Kconfig | 13 +
net/netfilter/Makefile | 9 +-
net/netfilter/nf_tables_api.c | 440 ++++---------------------
net/netfilter/nf_tables_core.c | 564
+++++++++++++++++++++-----------
net/netfilter/nf_tables_nf.c | 189 +++++++++++
net/netfilter/nf_tables_sock.c | 327 ++++++++++++++++++
net/netfilter/nft_bitwise.c | 35 +-
net/netfilter/nft_byteorder.c | 28 +-
net/netfilter/nft_cmp.c | 43 ++-
net/netfilter/nft_compat.c | 6 +-
net/netfilter/nft_counter.c | 3 +-
net/netfilter/nft_ct.c | 9 +-
net/netfilter/nft_exthdr.c | 3 +-
net/netfilter/nft_hash.c | 12 +-
net/netfilter/nft_immediate.c | 35 +-
net/netfilter/nft_limit.c | 3 +-
net/netfilter/nft_log.c | 3 +-
net/netfilter/nft_lookup.c | 3 +-
net/netfilter/nft_meta.c | 51 ++-
net/netfilter/nft_nat.c | 3 +-
net/netfilter/nft_payload.c | 29 +-
net/netfilter/nft_queue.c | 3 +-
net/netfilter/nft_rbtree.c | 12 +-
net/netfilter/nft_reject.c | 3 +-
38 files changed, 1416 insertions(+), 682 deletions(-)
create mode 100644 net/netfilter/nf_tables_nf.c
create mode 100644 net/netfilter/nf_tables_sock.c
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-12 09:15:00 UTC
Permalink
Hi!

I'm going to reply to Daniel and you in the same email, see below.
Post by Alexei Starovoitov
Post by Daniel Borkmann
Hi!
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Similarly to BPF, you can attach filters via setsockopt()
SO_ATTACH_NFT_FILTER. The filter that is passed to the kernel is
expression list (nested attribute)
expression element (nested attribute)
expression name (string)
expression data (nested attribute)
... specific attribute for this expression go here
This is similar to the netlink format of the nf_tables rules, so we
can re-use most of the infrastructure that we already have in userspace.
The kernel takes the TLV representation and translates it to the native
nf_tables representation.
The patches 1-3 have helped to generalize the existing socket filtering
infrastructure to allow pluging new socket filtering frameworks. Then,
patches 4-8 generalize the nf_tables code by move the neccessary nf_tables
expression and data initialization core infrastructure. Then, patch 9
provides the nf_tables socket filtering capabilities.
Patrick and I have been discussing for a while that part of this
generalisation works should also help to add support for providing a
replacement to the tc framework, so with the necessary work, nf_tables
may provide in the near future packet a single packet classification
framework for Linux.
I'm being curious here ;) as there's currently an ongoing effort on
netdev for Alexei's eBPF engine (part 1 at [1,2,3]), which addresses
shortcomings of current BPF and shall long term entirely replace the
current BPF engine code to let filters entirely run in eBPF resp.
eBPF's JIT engine, as I understand, which is also transparently usable
in cls_bpf for classification in tc w/o rewriting on a different filter
language. Performance figures have been posted/provided in [1] as well.
So the plan on your side would be to have an alternative to eBPF, or
build on top of it to reuse its in-kernel JIT compiler?
[1] http://patchwork.ozlabs.org/patch/328927/
struct sk_filter
{
atomic_t refcnt;
- unsigned int len; /* Number of filter blocks */
+ /* len - number of insns in sock_filter program
+ * len_ext - number of insns in socket_filter_ext program
+ * jited - true if either original or extended program was
JITed
+ * orig_prog - original sock_filter program if not NULL
+ */
+ unsigned int len;
+ unsigned int len_ext;
+ unsigned int jited:1;
+ struct sock_filter *orig_prog;
struct rcu_head rcu;
- unsigned int (*bpf_func)(const struct sk_buff *skb,
- const struct sock_filter
*filter);
+ union {
+ unsigned int (*bpf_func)(const struct sk_buff *skb,
+ const struct sock_filter *fp);
+ unsigned int (*bpf_func_ext)(const struct sk_buff *skb,
+ const struct sock_filter_ext *fp);
+ };
union {
struct sock_filter insns[0];
+ struct sock_filter_ext insns_ext[0];
struct work_struct work;
};
};

I think we have to generalise this to make it flexible to accomodate
any socket filtering infrastructure. For example, instead of having
bpf_func and bpf_func_ext, I think it would be good to generalise it
that so we pass some void *filter. I also think that other private
information to the filtering approach should be put after the
filtering code, in some variable length area.

This change looks quite ad-hoc. My 1-3 patches were more going to the
direction of making this in some generic way to accomodate any socket
filtering infrastructure.
Post by Alexei Starovoitov
Post by Daniel Borkmann
[2] http://patchwork.ozlabs.org/patch/328926/
[3] http://patchwork.ozlabs.org/patch/328928/
http://people.netfilter.org/pablo/nft-sock-filter-test.c
I'm currently reusing the existing libnftnl interfaces, my plan is to
new interfaces in that library for easier and more simple filter
definition for socket filtering.
Note that the current nf_tables expression-set is also limited with
regards to BPF, but the infrastructure that we have can be easily
extended with new expressions.
Comments welcome!
Hi Pablo,
Could you share what performance you're getting when doing nft
filter equivalent to 'tcpdump port 22' ?
Meaning your filter needs to parse eth->proto, ip or ipv6 header and
check both ports. How will it compare with JITed bpf/ebpf ?
We already have plans to add jit to nf_tables.
Post by Alexei Starovoitov
I was trying to go the other way: improve nft performance with ebpf.
10/40G links are way to fast for interpreters. imo JIT is the only way.
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);
David suggested that these comparisons in all jits are ugly.
I've fixed it in my patches. When they're in, you wouldn't need to
mess with this.
I see you have added fp->jited for this. I think we can make this more
generic if we have some enum so fp->type will tell us what kind of
filter we have, ie. bpf, bpf-jitted, nft, etc.
Post by Alexei Starovoitov
- atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_sub(fp->size, &sk->sk_omem_alloc);
that's a big change in socket memory accounting.
We used to account for the whole sk_filter... now you're counting
filter size only.
Is it valid?
We need this change for nf_tables. We don't use a fixed layout for
each instruction of the filter like you do, I mean:

+struct sock_filter_ext {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register */
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};

If I didn't miss anything from your patchset, that structure is again
exposed to userspace, so we won't be able to change it ever unless we
add a new binary interface.
Post by Alexei Starovoitov
From the user interface perspective, our nft approach is quite
different, from userspace you express the filter using TLVs (like in
the payload of the netlink message). Then, nf_tables core
infrastructure parses this and transforms it to the kernel
representation. It's very flexible since you don't expose the internal
representation to userspace, which means that we can change the
internal layout anytime, thus, highly extensible.

BTW, how do you make when you don't need the imm field? You just leave
it unset I guess. And how does the code to compare an IPv6 address
looks like?
Post by Alexei Starovoitov
whole nft_expr_autoload() looks scary from security point of view.
If I'm reading it correctly, the code will do request_module() based on
userspace request to attach filter?
Only root can invoke that code so far.
Post by Alexei Starovoitov
+ len = sk_nft_get_filter(sk, (struct sock_filter __user
*)optval, len);
with my patches there was a concern regarding socket checkpoint/restore
and I had to preserve existing filter image to make sure it's not broken.
Could you please coordinate with Pavel and co to test this piece?
What will happen if nft_filter attached, but so_get_filter is called? crash?
I was trying to avoid adding an enum to sk_filter to identify
what kind of filter we have there, but this is indeed needed.
Thanks for spotting this.
Post by Alexei Starovoitov
+static int nft_sock_expr_autoload(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
+{
+#ifdef CONFIG_MODULES
+ mutex_unlock(&nft_expr_info_mutex);
+ request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla));
+ mutex_lock(&nft_expr_info_mutex);
same security concern here...
+int sk_nft_attach_filter(char __user *optval, struct sock *sk)
+{
what about sk_clone_lock()? since filter program is in nft, do you need to do
special steps during copy of socket?
+ fp = sock_kmalloc(sk, sizeof(struct sk_filter) + size, GFP_KERNEL);
this may allocate more memory then you need.
Currently sk_filter_size() computes it in an accurate way.
That won't work for nf_tables; we don't necessarily have to stick to a
fixed layout per instruction like you do, so the real filter size is
obtained after parsing the TLVs that were passed from userspace.
Post by Alexei Starovoitov
Also the same issue of optmem accounting as I mentioned in 2/9
+ sock_kfree_s(sk, fp, size);
a small bug: allocated sizeof(sk_filter)+size, but freeing 'size' only...
Good catch, I'll fix it, thanks.
Post by Alexei Starovoitov
Overall I think it's very interesting work.
Not sure what's the use case for it though.
I'll cook up a patch for the opposite approach (use ebpf inside nft)
and will send you for review.
I don't like the idea of sticking to some fixed layout structure to
represent the filter, please hold on with it.
Post by Alexei Starovoitov
I would prefer to work together to satisfy your and our user requests.
That would be nice, but so far I think that we have different
approaches from user interface perspective and, thus, the kernel
design for it.
Pablo Neira Ayuso
2014-03-12 09:27:07 UTC
Permalink
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
whole nft_expr_autoload() looks scary from security point of view.
If I'm reading it correctly, the code will do request_module() based on
userspace request to attach filter?
Only root can invoke that code so far.
Oops, this is obviously wrong. This request_module part needs a fix
indeed for the non-root part.
Alexei Starovoitov
2014-03-13 03:29:07 UTC
Permalink
Post by Pablo Neira Ayuso
Hi!
I'm going to reply to Daniel and you in the same email, see below.
struct sk_filter
{
atomic_t refcnt;
- unsigned int len; /* Number of filter blocks */
+ /* len - number of insns in sock_filter program
+ * len_ext - number of insns in socket_filter_ext program
+ * jited - true if either original or extended program was
JITed
+ * orig_prog - original sock_filter program if not NULL
+ */
+ unsigned int len;
+ unsigned int len_ext;
+ unsigned int jited:1;
+ struct sock_filter *orig_prog;
struct rcu_head rcu;
- unsigned int (*bpf_func)(const struct sk_buff *skb,
- const struct sock_filter
*filter);
+ union {
+ unsigned int (*bpf_func)(const struct sk_buff *skb,
+ const struct sock_filter *fp);
+ unsigned int (*bpf_func_ext)(const struct sk_buff *skb,
+ const struct sock_filter_ext *fp);
+ };
union {
struct sock_filter insns[0];
+ struct sock_filter_ext insns_ext[0];
struct work_struct work;
};
};
I think we have to generalise this to make it flexible to accomodate
any socket filtering infrastructure. For example, instead of having
bpf_func and bpf_func_ext, I think it would be good to generalise it
that so we pass some void *filter. I also think that other private
well, David indicated that using 'void*' for such cases is undesirable,
since we want to rely on compiler to do type verification as much
as we can. My patches are preserving type safety.
'void * filter' would mean - open the door for anything.
I don't think we want that type of 'generality'.
Post by Pablo Neira Ayuso
information to the filtering approach should be put after the
filtering code, in some variable length area.
This change looks quite ad-hoc. My 1-3 patches were more going to the
direction of making this in some generic way to accomodate any socket
filtering infrastructure.
They may look ad-hoc, but they're preserving type checking.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Could you share what performance you're getting when doing nft
filter equivalent to 'tcpdump port 22' ?
Meaning your filter needs to parse eth->proto, ip or ipv6 header and
check both ports. How will it compare with JITed bpf/ebpf ?
We already have plans to add jit to nf_tables.
The patches don't explain the reasons to do nft socket filtering.
I can only guess and my guess that this is to show
that nft sort of can do what bpf can.
tc can be made to do socket filtering too.
The differentiation is speed and ease of use.
Both have big question marks in sock_filter+nft approach.

I think to consider nft to be one and only classifier, some
benchmarking needs to be done first:
nft vs bpf, nft vs tc, nft vs ovs, ...

It can be done the other way too. nft can run on top of tc.
ovs can run on top of tc and so on.
I'm not advocating any of that.

Having one interpreter underneath doesn't mean that all
components will be easier to maintain or have less code around.
Code that converts from one to another counts as well.
Simplicity and performance should be the deciding factor.
imo nft+sock_filter example is not simple.

I've posted patches to compile restricted C into ebpf.
Theoretically I can make 'universal kernel module' out of ebpf.
Like, compile any C code into ebpf and jit it.
Such 'universal kernel module' will be runnable on all architectures.
One compiler to rule them all... one ebpf to run them all... NO!
That may be cool thing for university research, but no good
reason to do it in practice.
Same way nft for socket filtering is a cool research, but what
is the strong reason to have it in kernel and maintain forever?
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
- if (fp->bpf_func != sk_run_filter)
- module_free(NULL, fp->bpf_func);
+ if (fp->run_filter != sk_run_filter)
+ module_free(NULL, fp->run_filter);
David suggested that these comparisons in all jits are ugly.
I've fixed it in my patches. When they're in, you wouldn't need to
mess with this.
I see you have added fp->jited for this. I think we can make this more
generic if we have some enum so fp->type will tell us what kind of
filter we have, ie. bpf, bpf-jitted, nft, etc.
Such enum will have a problem of explosion when flags
start to cross-multiply.
fp->jited flag just says jitted or not. Easier to check.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
- atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_sub(fp->size, &sk->sk_omem_alloc);
that's a big change in socket memory accounting.
We used to account for the whole sk_filter... now you're counting
filter size only.
Is it valid?
We need this change for if nf_tables. We don't use a fixed layout for
I think you missed the point.
Allocated sockopt memory is not counted properly.
It's not fixed vs non-fixed. sizeof(struct sk_filter) needs to
be accounted too, since it was allocated.
Post by Pablo Neira Ayuso
+struct sock_filter_ext {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register */
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
If I didn't miss anything from your patchset, that structure is again
exposed to userspace, so we won't be able to change it ever unless we
add a new binary interface.
From the user interface perspective, our nft approach is quite
different, from userspace you express the filter using TLVs (like in
the payload of the netlink message). Then, nf_tables core
infrastructure parses this and transforms it to the kernel
representation. It's very flexible since you don't expose the internal
representation to userspace, which means that we can change the
internal layout anytime, thus, highly extensible.
bpf/ebpf is an instruction set. Arguing about fixed vs non-fixed
insn size is like arguing x86 variable encoding vs sparc fixed.
Both are infinitely flexible.
ebpf is a low level insn set with defined calling convention, so
ebpf program can call fixed set of kernel functions if ebpf checker
allows that.
In sockfilters+ebpf, seccomp+ebpf, ovs+ebpf and tracing+ebpf
I've already demonstrated that ebpf instruction set, its
interpreter and its JIT are staying the same, while all are doing
very different things. Cannot think of better extensibility.

Extensibility is not with new instructions. We don't add new
instructions to x86 just to do a new feature.
New instructions are added to CPUs to make feature faster.
Like aes crypto can be done with normal x86 insns and it can
be done with aseni intel extensions. Same way aes can be done
with ebpf and ebpf doesn't need new instructions to do that.
Post by Pablo Neira Ayuso
BTW, how do you make when you don't need the imm field? You just leave
it unset I guess. And how does the code to compare an IPv6 address
looks like?
how x86 or sparc instruction set handles ipv6 addresses
without 128-bit registers? ebpf does the same.

nft design picked 128 bit registers because of ipv6 addresses?
That makes it difficult to jit.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
whole nft_expr_autoload() looks scary from security point of view.
If I'm reading it correctly, the code will do request_module() based on
userspace request to attach filter?
Only root can invoke that code so far.
If we want to allow non-root access the whole nft needs to
be scrutinized.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
+ fp = sock_kmalloc(sk, sizeof(struct sk_filter) + size, GFP_KERNEL);
this may allocate more memory then you need.
Currently sk_filter_size() computes it in an accurate way.
That won't work for nf_tables, we don't necessarily have to stick to a
fixed layout per instruction like you do, so the real filter size is
obtained after parsing the TLVs that was passed from userspace.
That sock_kmalloc is allocating sizeof(sk_filter) + size.
Meaning that it allocated unnecessary sizeof(work_struct) bytes
and not accounting for them and for sk_filter itself in filter charge/uncharge
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Also the same issue of optmem accounting as I mentioned in 2/9
+ sock_kfree_s(sk, fp, size);
a small bug: allocated sizeof(sk_filter)+size, but freeing 'size' only...
Good catch, I'll fix it, thanks.
Post by Alexei Starovoitov
Overall I think it's very interesting work.
Not sure what's the use case for it though.
I'll cook up a patch for the opposite approach (use ebpf inside nft)
and will send you for review.
I don't like the idea of sticking to some fixed layout structure to
represent the filter, please hold on with it.
Post by Alexei Starovoitov
I would prefer to work together to satisfy your and our user requests.
That would be nice, but so far I think that we have different
approaches from user interface perspective and, thus, the kernel
design for it.
Since you're planning to do nft-jit, you'd need to generate bits and
bytes for different architectures with their own quirks.
Don't you want to offload this work to somebody who already did this
and who understand quirks of different architectures?
Now imagine that you can generate some intermediate representation
that maps one to one to x86 and other architectures.
Intermediate representation is a pseudo assembler code.
Say you want to translate nft-cmp instruction into sequence of native
comparisons. You'd need to use load from memory, compare and
branch operations. That's ebpf!
Design of ebpf is such that every ebpf insn maps to one native insn.
You can view ebpf as a tool to achieve jiting of nft.
It will save you a lot of time.

Thanks
Alexei
Pablo Neira Ayuso
2014-03-13 12:29:13 UTC
Permalink
[...]
Post by Alexei Starovoitov
The patches don't explain the reasons to do nft socket filtering.
OK, some reasons from the interface point of view:

1) It provides an extensible interface to userspace. We didn't
invent a new wheel in that regard; we just reused the
extensibility of TLVs used in netlink as an intermediate format
between user and kernelspace, also used by many other applications
out there. The TLV parsing and building is not new code; most of
that code has been exposed to userspace already through netlink.

2) It shows that, with little generalisation, we can open the door to
one single *classification interface* for the users. Just make some
little googling, you'll find *lots of people* barfing on the fact that
we have that many interfaces to classify packets in Linux. And I'm
*not* talking about the packet classification approach itself, that's a
different debate of course.

[...]
Post by Alexei Starovoitov
Simplicity and performance should be the deciding factor.
imo nft+sock_filter example is not simple.
OK, some comments in that regard:

1) Simplicity: With the nft approach you can just use a filter
expressed in json, eg.

{"rule":
{"expr":[
{"type":"meta","dreg":1,"key":"protocol"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":2,"data0":"0x00000008"}}},
{"type":"payload","dreg":1,"offset":9,"len":1,"base":"network"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":1,"data0":"0x00000006"}}},
{"type":"immediate","dreg":0,"immediatedata":{"data_reg":{"type":"verdict","verdict":"0xffffffff"}}}]
}
}

Which is still more readable and easier to maintain than a BPF
snippet. So you just pass it to the json parser in the libnftnl
library (or whatever other better minimalistic library we would ever
have) which transforms this to TLV format that you can pass to the
kernel. The kernel will do the job to translate this.

How are users going to integrate the restricted C's eBPF code
infrastructure into their projects? I don't think that will be simple.
They will likely include the BPF snippet to avoid all the integration
burden, as it already happens in many projects with BPF filters.

2) Performance. Patrick has been making a lot of progress with the generic
set infrastructure for nft. In that regard, we aim to achieve
performance by arranging data using performance data structures,
*jit is not the only way to boost performance* (although we also
want to have it).

Some example:

set type IPv4 address = { N thousands of IPv4 addresses }

reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
lookup(reg1, set)
reg0 <- immediate(0xffffffff)

Or even better, using dictionaries:

set type IPv4 address = { 1.1.1.1 : accept, 2.2.2.2 : accept, 3.3.3.3 : drop ...}

reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
reg0 <- lookup(reg1, set)

Where "accept" is an alias of 0xffffffff and "drop" is 0 in the
nft-sock case. The verdict part has been generalised so we can adapt
nft to the corresponding packet classification engine.

This part just needs a follow-up patch to add set-based filtering for
nft-sockets. See this for more information about ongoing efforts on
the nft set infrastructure: http://patchwork.ozlabs.org/patch/326368/
that will also integrate into nft-sock.

[...]
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
+struct sock_filter_ext {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register */
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
If I didn't miss anything from your patchset, that structure is again
exposed to userspace, so we won't be able to change it ever unless we
add a new binary interface.
From the user interface perspective, our nft approach is quite
different, from userspace you express the filter using TLVs (like in
the payload of the netlink message). Then, nf_tables core
infrastructure parses this and transforms it to the kernel
representation. It's very flexible since you don't expose the internal
representation to userspace, which means that we can change the
internal layout anytime, thus, highly extensible.
bpf/ebpf is an instruction set. Arguing about fixed vs non-fixed
insn size is like arguing x86 variable encoding vs sparc fixed.
Both are infinitely flexible.
Right, you can extend interfaces forever with lots of patchwork and
"smart tricks" but that doesn't mean that will look nice...

As I said, I believe that having a nice extensible interface is
extremely important to make it easier for development. If we have to
rearrange the internal representation for some reason, we can do it
indeed without bothering about making translations to avoid breaking
userspace and having to use ugly tricks (just see sk_decode_filter()
or any other translation to support any new way to express a
filter...).

[...]
Post by Alexei Starovoitov
Say you want to translate nft-cmp instruction into sequence of native
comparisons. You'd need to use load from memory, compare and
branch operations. That's ebpf!
Nope sorry, that's not ebpf. That's assembler code.

[...]
Post by Alexei Starovoitov
You can view ebpf as a tool to achieve jiting of nft.
It will save you a lot of time.
nft interface is already well-abstracted from the representation, so I
don't find a good reason to make a step backward that will force us to
represent the instructions using a fixed layout structure that is
exposed to userspace, that we won't be able to change once this
gets into mainstream.

Probably the nft is not the easiest path, I agree, it's been not so
far if you look at the record. But with time and development hours
from everyone, I believe we'll enjoy a nice framework.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov
2014-03-14 15:28:05 UTC
Permalink
Post by Pablo Neira Ayuso
[...]
It seems you're assuming that ebpf inherited all the shortcomings
of bpf and making conclusion based on that. Not your fault.
I didn't explain it well enough.
Technically ebpf is a small evolution of bpf, but applicability made a
giant leap. I cannot compile C into bpf, but I can do that with ebpf.
I cannot do table lookups in bpf, but I can do that in ebpf.
I cannot rewrite packet headers in bpf, but I can do that in ebpf, etc.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
The patches don't explain the reasons to do nft socket filtering.
1) It provides an extensible interface to userspace. We didn't
invented a new wheel in that regard, we just reused the
extensibility of TLVs used in netlink as intermediate format
between user and kernelspace, also used many other applications
outthere. The TLV parsing and building is not new code, most that of
that code has been exposed to userspace already through netlink.
2) It shows that, with little generalisation, we can open the door to
one single *classification interface* for the users. Just make some
little googling, you'll find *lots of people* barfing on the fact that
we have that many interfaces to classify packets in Linux. And I'm
*not* talking about the packet classification approach itself, that's a
different debate of course.
[...]
Post by Alexei Starovoitov
Simplicity and performance should be the deciding factor.
imo nft+sock_filter example is not simple.
1) Simplicity: With the nft approach you can just use a filter
expressed in json, eg.
{"expr":[
{"type":"meta","dreg":1,"key":"protocol"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":2,"data0":"0x00000008"}}},
{"type":"payload","dreg":1,"offset":9,"len":1,"base":"network"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":1,"data0":"0x00000006"}}},
{"type":"immediate","dreg":0,"immediatedata":{"data_reg":{"type":"verdict","verdict":"0xffffffff"}}}]
}
}
sorry. It surely looks simple to you, but I cannot figure out what
the above snippet is supposed to do. Could you please explain.
Post by Pablo Neira Ayuso
Which is still more readable and easier to maintain that a BPF
snippet. So you just pass it to the json parser in the libnftnl
library (or whatever other better minimalistic library we would ever
have) which transforms this to TLV format that you can pass to the
kernel. The kernel will do the job to translate this.
How are users going to integrate the restricted C's eBPF code
infrastructure into their projects? I don't think that will be simple.
They will likely include the BPF snippet to avoid all the integration
burden, as it already happens in many projects with BPF filters.
what you're seeing is the counter argument to your 'bpf not-simple'
statement :) If bpf snippets were hard to understand, people
wouldn't be including them as-is in their programs.
One can always do 'tcpdump expression -d' to generate bpf snippet
or use libpcap in their programs to dynamically generate them.
libseccomp dynamically generates bpf too.
Post by Pablo Neira Ayuso
2) Performance. Patrick has been doing many progress with the generic
set infrastructure for nft. In that regard, we aim to achieve
performance by arranging data using performance data structures,
*jit is not the only way to boost performance* (although we also
want to have it).
set type IPv4 address = { N thousands of IPv4 addresses }
reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
lookup(reg1, set)
reg0 <- immediate(0xffffffff)
set type IPv4 address = { 1.1.1.1 : accept, 2.2.2.2 : accept, 3.3.3.3 : drop ...}
reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
reg0 <- lookup(reg1, set)
Where "accept" is an alias of 0xffffffff and "drop" is 0 in the
nft-sock case. The verdict part has been generalised so we can adapt
nft to the corresponding packet classification engine.
that example actually illustrates that nft needs to be continuously
tweaked to add features like 'set'. We can do better.

Here is the example from V2 series that shows how hash tables can be
used in C that translates to ebpf, without changing ebpf itself:
void dropmon(struct kprobe_args *ctx)
{
void *loc;
uint64_t *drop_cnt;
/* skb:kfree_skb is defined as:
* TRACE_EVENT(kfree_skb,
* TP_PROTO(struct sk_buff *skb, void *location),
* so ctx->arg2 is 'location'
*/
loc = (void *)ctx->arg2;

drop_cnt = bpf_table_lookup(ctx, 0, &loc);
if (drop_cnt) {
__sync_fetch_and_add(drop_cnt, 1);
} else {
uint64_t init = 0;
bpf_table_update(ctx, 0, &loc, &init);
}
}
the above C program compiles into ebpf, attaches to kfree_skb() tracepoint
and counts packet drops at different locations.
userspace can read the table and print it in user friendly way while
the tracing filter is running or when it's stopped.
That's an example of fast drop monitor that is safe to insert into live kernel.

Actually I think it's wrong to even compare nft with ebpf.
ebpf doesn't dictate a new user interface. At all.
There is old bpf to write socket filters. It's good enough.
I'm not planning to hack libpcap just to generate ebpf.

User interface is orthogonal to kernel implementation.
We can argue whether C representation of filter is better than json,
but what kernel runs at the lowest level is independent of that.

Insisting that user interface and kernel representation must be
one-to-one is unnecessary restrictive. User interface can and
should evolve independently of what kernel is doing underneath.

In case of socket filters tcpdump/libpcap syntax is the user interface.
old bpf is a kernel-user api. I don't think there is a strong need
to change either. ebpf is not touching them, but helping to execute
tcpdump filters faster.
In case of tracing filters I propose C-like user interface.
Kernel API for translated C programs is a different matter.
ebpf in the kernel is just the engine to execute it.
1st and 2nd may look completely different after community feedback,
but in kernel ebpf engine can stay unmodified.
Post by Pablo Neira Ayuso
Right, you can extend interfaces forever with lots of patchwork and
"smart tricks" but that doesn't mean that will look nice...
I'm not sure what you mean here.
Post by Pablo Neira Ayuso
As I said, I believe that having a nice extensible interface is
extremely important to make it easier for development. If we have to
rearrange the internal representation for some reason, we can do it
indeed without bothering about making translations to avoid breaking
userspace and having to use ugly tricks (just see sk_decode_filter()
or any other translation to support any new way to express a
filter...).
nice that you brought this up :)
As I mentioned in v4 thread sk_decode_filter() can be removed.
It was introduced to improve old interpreter performance and now
this part is obsolete.
Post by Pablo Neira Ayuso
[...]
Post by Alexei Starovoitov
Say you want to translate nft-cmp instruction into sequence of native
comparisons. You'd need to use load from memory, compare and
branch operations. That's ebpf!
Nope sorry, that's not ebpf. That's assembler code.
Well, in my previous email I tried to explain that assembler == ebpf :)
Please post the x86_64 assembler code that a future nft-jit is supposed
to generate and I can post equivalent ebpf code that will be jited
exactly to your x86_64...
Post by Pablo Neira Ayuso
[...]
Post by Alexei Starovoitov
You can view ebpf as a tool to achieve jiting of nft.
It will save you a lot of time.
nft interface is already well-abstracted from the representation, so I
don't find a good reason to make a step backward that will force us to
represent the instructions using a fixed layout structure that is
exposed to userspace, that we won't be able to change once if this
gets into mainstream.
I don't think we're on the same page still.
To make this more productive, please say what feature you would
want to see supported and I can show how it is done without
changing ebpf insn set.
Post by Pablo Neira Ayuso
Probably the nft is not the easiest path, I agree, it's been not so
far if you look at the record. But with time and development hours
from everyone, I believe we'll enjoy a nice framework.
No doubt that nftables is a nice framework. Let's keep it going
and let's make it faster.

Regards,
Alexei
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-03-14 18:16:00 UTC
Permalink
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
[...]
It seems you're assuming that ebpf inherited all the shortcomings
of bpf and making conclusion based on that. Not your fault.
I didn't explain it well enough.
Frankly, from the *interface point of view* it is indeed inheriting
all of its shortcomings...
Post by Alexei Starovoitov
Technically ebpf is a small evolution of bpf, but applicability made a
giant leap. I cannot compile C into bpf, but I can do that with ebpf.
Good that we can get better userspace tools, but still the layout of
your instructions is exposed to userspace so it cannot be changed
*ever*. The kernel interface is still an array of binary structures of
fixed size just like BPF does.

Why do we have to have a 32 bits immediate that may be zero most of
the time? Just because:

1) You needed to align your new instruction layout to 64 bits / two 32
bits words.
2) You noticed you get better performance with those changes,
including the new "if" statement that works better with branch
prediction.
3) It's easier for you to make the jit translation.

That means your interface is exposing *all of your internal
implementation decisions* and that's very bad since we will always
have to come up with smart tricks not to break backward compatibility
if we want to improve the internal implementation.
Post by Alexei Starovoitov
I cannot do table lookups in bpf, but I can do that in ebpf.
I cannot rewrite packet headers in bpf, but I can do that in ebpf, etc.
Sorry, I don't buy this "we get more features" if in the long run we
have restricted extensibility.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
The patches don't explain the reasons to do nft socket filtering.
1) It provides an extensible interface to userspace. We didn't
invented a new wheel in that regard, we just reused the
extensibility of TLVs used in netlink as intermediate format
between user and kernelspace, also used many other applications
outthere. The TLV parsing and building is not new code, most that of
that code has been exposed to userspace already through netlink.
2) It shows that, with little generalisation, we can open the door to
one single *classification interface* for the users. Just make some
little googling, you'll find *lots of people* barfing on the fact that
we have that many interfaces to classify packets in Linux. And I'm
*not* talking about the packet classification approach itself, that's a
different debate of course.
[...]
Post by Alexei Starovoitov
Simplicity and performance should be the deciding factor.
imo nft+sock_filter example is not simple.
1) Simplicity: With the nft approach you can just use a filter
expressed in json, eg.
{"expr":[
{"type":"meta","dreg":1,"key":"protocol"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":2,"data0":"0x00000008"}}},
{"type":"payload","dreg":1,"offset":9,"len":1,"base":"network"},
{"type":"cmp","sreg":1,"op":"eq","cmpdata":{"data_reg":{"type":"value","len":1,"data0":"0x00000006"}}},
{"type":"immediate","dreg":0,"immediatedata":{"data_reg":{"type":"verdict","verdict":"0xffffffff"}}}]
}
}
sorry. It surely looks simple to you, but I cannot figure out what
the above snippet suppose to do. Could you please explain.
1) Fetch skb->protocol put it into a register.
2) Compare it to ETHERPROTO_IP. If not matching, break.
3) Fetch payload byte offset 9, only 1 byte.
4) Compare it with IPPROTO_TCP. If not matching, break.
5) If matching, pass the packet to userspace.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
Which is still more readable and easier to maintain that a BPF
snippet. So you just pass it to the json parser in the libnftnl
library (or whatever other better minimalistic library we would ever
have) which transforms this to TLV format that you can pass to the
kernel. The kernel will do the job to translate this.
How are users going to integrate the restricted C's eBPF code
infrastructure into their projects? I don't think that will be simple.
They will likely include the BPF snippet to avoid all the integration
burden, as it already happens in many projects with BPF filters.
what you're seeing is the counter argument to your 'bpf not-simple'
statement :) If bpf snippets were hard to understand, people
wouldn't be including them as-is in their programs.
No, we had no other choice, it was the best thing that we had so far.
People have been just including that "magic code", BPF is quite
unreadable, and your extension doesn't make it any better.
Post by Alexei Starovoitov
One can always do 'tcpdump expression -d' to generate bpf snippet
or use libpcap in their programs to dynamically generate them.
libseccomp dynamically generates bpf too.
Autogeneration tools are of course good to have, that can be achieved
with *any* framework. I don't think this is the main point of this
discussion.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
2) Performance. Patrick has been doing many progress with the generic
set infrastructure for nft. In that regard, we aim to achieve
performance by arranging data using performance data structures,
*jit is not the only way to boost performance* (although we also
want to have it).
set type IPv4 address = { N thousands of IPv4 addresses }
reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
lookup(reg1, set)
reg0 <- immediate(0xffffffff)
set type IPv4 address = { 1.1.1.1 : accept, 2.2.2.2 : accept, 3.3.3.3 : drop ...}
reg1 <- payload(network header, offsetof(struct iphdr, daddr), 4)
reg0 <- lookup(reg1, set)
Where "accept" is an alias of 0xffffffff and "drop" is 0 in the
nft-sock case. The verdict part has been generalised so we can adapt
nft to the corresponding packet classification engine.
that example is actually illustrates that nft needs to be continuously
tweaked to add features like 'set'. We can do better.
No, it just proves that our framework is extensible and that we can
improve easily.
Post by Alexei Starovoitov
Here is the example from V2 series that shows how hash tables can be
void dropmon(struct kprobe_args *ctx)
{
void *loc;
uint64_t *drop_cnt;
* TRACE_EVENT(kfree_skb,
* TP_PROTO(struct sk_buff *skb, void *location),
* so ctx->arg2 is 'location'
*/
loc = (void *)ctx->arg2;
drop_cnt = bpf_table_lookup(ctx, 0, &loc);
Is there room to extend your framework with any other data structure
which is *not* a hashtable? What are you plans for that?
Post by Alexei Starovoitov
if (drop_cnt) {
__sync_fetch_and_add(drop_cnt, 1);
} else {
uint64_t init = 0;
bpf_table_update(ctx, 0, &loc, &init);
}
}
the above C program compiles into ebpf, attaches to kfree_skb() tracepoint
and counts packet drops at different locations.
userspace can read the table and print it in user friendly way while
the tracing filter is running or when it's stopped.
That's an example of fast drop monitor that is safe to insert into live kernel.
Actually I think it's wrong to even compare nft with ebpf.
ebpf doesn't dictate a new user interface. At all.
There is old bpf to write socket filters. It's good enough.
I'm not planning to hack libpcap just to generate ebpf.
User interface is orthogonal to kernel implementation.
We can argue whether C representation of filter is better than json,
but what kernel runs at the lowest level is independent of that.
Insisting that user interface and kernel representation must be
one-to-one is unnecessary restrictive. User interface can and
should evolve independently of what kernel is doing underneath.
In case of socket filters tcpdump/libpcap syntax is the user interface.
No, that's not the real interface. That's a wrapper / abstraction over
what the kernel interface is exposing.
Post by Alexei Starovoitov
old bpf is a kernel-user api. I don't think there is a strong need
to change either. ebpf is not touching them, but helping to execute
tcpdump filters faster.
In case of tracing filters I propose C-like user interface.
Kernel API for translated C programs is a different matter.
ebpf in the kernel is just the engine to execute it.
1st and 2nd may look completely different after community feedback,
but in kernel ebpf engine can stay unmodified.
The only difference that I see with ebpf is that you provide nice end
user tools, but the design from the kernel interface has exactly the
same problems.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
Right, you can extend interfaces forever with lots of patchwork and
"smart tricks" but that doesn't mean that will look nice...
I'm not sure what you mean here.
For example, this:

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cf9cd13509a7..e1b979312588 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -273,6 +273,13 @@ static struct ctl_table net_core_table[] = {
},
#endif
{
+ .procname = "bpf_ext_enable",
+ .data = &bpf_ext_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },

This /proc thing seems to me like the last resource we have to avoid
breaking backward compatibility. I have used it as well myself, but
it's like the *pull alarm* when we did a wrong design decision.

What if we need a new ABI breakage for BPF again? Will we need to add
a new /proc interface for that? As far as I can tell from your
patches, the answer is yes.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
As I said, I believe that having a nice extensible interface is
extremely important to make it easier for development. If we have to
rearrange the internal representation for some reason, we can do it
indeed without bothering about making translations to avoid breaking
userspace and having to use ugly tricks (just see sk_decode_filter()
or any other translation to support any new way to express a
filter...).
nice that your brought this up :)
As I mentioned in v4 thread sk_decode_filter() can be removed.
It was introduced to improve old interpreter performance and now
this part is obsolete.
What are your plans then? Will you implement that converter in
userspace? You mention that you don't want to enhance libpcap, which
seems to me like the natural way to extend things.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
[...]
Post by Alexei Starovoitov
Say you want to translate nft-cmp instruction into sequence of native
comparisons. You'd need to use load from memory, compare and
branch operations. That's ebpf!
Nope sorry, that's not ebpf. That's assembler code.
Well, in my previous email I tried to explain that assembler == ebpf :)
I see, so I was right. You want to expose a pseudo-assembler
interface just because that makes it easier for you to provide the jit
translation.
Post by Alexei Starovoitov
Please post x86_64 assembler code that future nft-jit suppose to
generate and I can post equivalent ebpf code that will be jited
exactly to your x86_64...
That's possible of course. There are many ways to implement the same
thing, they can provide the same features, but not the same degree of
extensibility.
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
[...]
Post by Alexei Starovoitov
You can view ebpf as a tool to achieve jiting of nft.
It will save you a lot of time.
nft interface is already well-abstracted from the representation, so I
don't find a good reason to make a step backward that will force us to
represent the instructions using a fixed layout structure that is
exposed to userspace, that we won't be able to change once if this
gets into mainstream.
I don't think we're on the same page still.
To make this more productive, please say what feature you would
want to see supported and I can show how it is done without
changing ebpf insn set.
Post by Pablo Neira Ayuso
Probably the nft is not the easiest path, I agree, it's been not so
far if you look at the record. But with time and development hours
from everyone, I believe we'll enjoy a nice framework.
No doubt that nftables is a nice framework. Let's keep it going
and let's make it faster.
We want to make it faster, but I don't like the idea of converting
it to bpf just to get the jit fast.

We are spending quite a lot of time to provide good performance
through arranging data in performance data structures, the jit
microoptimization will be just the cherry on the top of the cake.
Alexei Starovoitov
2014-03-15 04:04:50 UTC
Permalink
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
[...]
It seems you're assuming that ebpf inherited all the shortcomings
of bpf and making conclusion based on that. Not your fault.
I didn't explain it well enough.
Frankly, from the *interface point of view* it is indeed inheriting
all of its shortcomings...
Hi Pablo, David,

Let's go back to what ebpf is...
ebpf == generalization of assembler instructions across different architectures.

Take x86_64 instructions ld/st/alu/mov/cmp/call/jmp and
then rename them into my_ld, my_st, my_add, my_call, etc
Then do the same for arm64.
Also rename register names into r0,r1,r2
and remember how you did the mapping.
Also analyze x86_64, arm64 call convention, so that callee saved
registers are mapped to the same regs and arguments are passed
in r1, r2, ...

A function call in such assembler will look like:
my_mov r1, 1
my_mov r2, 2
my_call foo

that maps back to x86_64:
mov rdi, 1
mov rsi, 2
call foo

Since calling convention is compatible between 'renamed assembler'
and original x86_64 or arm assembler, the program written in 'renamed
assembler' can call native functions directly.
The opposite is also true.
Functions written in x86 assembler or C can call into functions
written in 'renamed' assembler.
Example:

f1.s:
mov rdi, 1
mov rsi, 2
call f2
ret

f2.s:
my_mov r3, r1
my_mov r2, r1
my_mov r1, r3
my_call f3
my_ret

f3.s:
mov rax, rdi
sub rax, rsi
ret

fyi, in C these assembler blobs roughly do:
u64 f1() { return f2(1,2); }
u64 f2(u64 a, u64 b) { return f3(b, a); }
u64 f3(u64 a, u64 b) { return a - b; }

f1.s and f3.s are written in x86_64 and f2.s is written in 'renamed assembler'.

compile f1.s, f3.s into binary x86 code
compile f2.s into some binary code
(either fixed insn size or variable, that's irrelevant), let's call it format B

Now load all three binary blobs into kernel.
1st and 3rd blob can be loaded as-is.
2nd blob needs to be remapped from format B into x86_64 binary code.

After that CPU can call f1() and receive 1 back.

What programs can be written in x86_64 assembler? Anything.
What programs can be written in renamed assembler? Anything.

How often do we want to extend x86_64 assembler? Rarely.
Only when an algorithm implemented in pure x86_64 needs
mmx/sse acceleration.
Intel does not extend x86 to add a feature, but to accelerate a feature.
Same with 'renamed' assembler.
Any algorithm can be implemented using renamed assembler.

So what is ebpf? It's a format B. It can be fixed size or variable.
That is irrelevant. While loading, the program in format B is
reverse mapped into x86 binary code.

What programs can be written in format B? Anything.
Does format B needs to be extended to support nft? no
to support socket filters? no
to support tracing filters? no
to support crazy idea that will come N years from now? no
Hard to believe? Think back that it is renamed x86 assembler.

Format B was chosen to look like bpf to make an adoption easier
and to make conversion from bpf to ebpf trivial,
but looks like it was a bad idea.
I should have just called it 'simplified x86_64 insn set'.

Now about 'user interface point of view'...
old bpf, netlink, nft format are interfaces.
Until format B is exposed to user space it is not an interface.
nftables can use format B to jit the code.
nftables's user interface doesn't change.

In the patches I sent, ebpf is _not_ exposed to the user.
My patch set immediately helps performance of existing
socket filters and seccomp.
And can do jitting for nft.

Another way of thinking about ebpf:
ebpf is a different way of encoding x86 insns.

I also think we can expose ebpf to the user space,
but that's a different patch and a different discussion.

Thanks!

Hi Pablo,
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Technically ebpf is a small evolution of bpf, but applicability made a
giant leap. I cannot compile C into bpf, but I can do that with ebpf.
Good that we can get better userspace tools, but still the layout of
your instructions is exposed to userspace so it cannot be changed
*ever*. The kernel interface is still an array of binary structures of
fixed size just like BPF does.
that fixed size is irrelevant from an extensibility point of view.
sparc has fixed size instructions too, but we don't change sparc
instruction width.
Let's say we decided to remap all sparc instructions and add new
pseudo instructions. These pseudo sparc insns won't buy us any
performance, because in the end they're remapped into real
instructions that cpu can execute.
These fake new pseudo sparc instructions won't give us
any new features either.

Format B should not be changed.
We can add new instructions if we really really need,
but there will not be a need to change existing insns.
Hard to believe? Think back that it is simplified x86.
We don't have a need to change existing x86 insns.

Example 1:
there are xadd_w and xadd_dw insns in format B to do
atomic increments. They don't have to be in the instruction set.
I've added them for performance and not because it cannot
be done without them.
atomic increments could have been done with function call.
ebpf call insn is #1 instruction that was missing in bpf.
It makes ebpf usable for any job.
ebpf program can always call a function.

Example 2:
In old bpf there are many extensions that fetch skb->protocol,
skb->len, skb->pkt_type and so on.
One extension per skb field. That was bad.
They were done as instruction extensions in old bpf,
because bpf didn't have a generic load operation.
ebpf doesn't need extensions for them. All these old bpf
extensions are converted to generic 'load' insn in ebpf.
and jited to x86 as single load, whereas old bpf jit needs
to have its own 'case' statement for every extension.

We can gradually replace old bpf jits with new ebpf jits.
and take time while doing this without exposing ebpf
to the userspace.

Example 3:
Old bpf_s_anc_nlattr extension cannot be jited with
current bpf jit, because it's too complicated
(requires thinking about calling convention, etc)
After conversion to ebpf it finally can be jited,
since it becomes a function call in x86.

That's the point. We do not need to change ebpf insn set.
Anything can be implemented as generic load/store operations
and function calls because ebpf == x86 assembler.
Post by Pablo Neira Ayuso
Why do we have to have a 32 bits immediate that may be zero most of
1) You needed to align your new instruction layout to 64 bits / two 32
bits words.
wrong guess. It's not alignment.
Format B could be anything.
I just picked it to be similar to old BPF to be easier to understand.
Apparently it's not that easy.
Post by Pablo Neira Ayuso
2) You noticed you get better performance with those changes,
including the new "if" statement that works better with branch
prediction.
nope. that's a side effect of 'simplified x86 assembler'
Neither x86 nor arm nor other CPUs have 'dual branch'
instructions. All CPUs either branch or fall through and
since ebpf is just a simplified x86 here you have such
style of branches.
Post by Pablo Neira Ayuso
3) It's easier for you to make the jit translation.
sorry, but 'easier' was not a factor.
ebpf instructions are 8-byte wide, just because old bpfs are
8-byte wide. I fitted all x86 instructions into 8 bytes.
Could have picked any other size.

Another way of thinking about ebpf:
ebpf is a different way of encoding x86 insns.

Whether instructions are variable length or fixed, it's
the same complexity to map back to x86.

ebpf interpreter is a different matter.
Interpreter obviously works better with fixed insn size.
ebpf interpreter you see only exists to support architectures
that don't have ebpf->native mapper yet.

If majority thinks that variable length insns will work better,
let's re-encode the whole thing into variable length.
I just don't see what it will buy us.
but I'm fine re-encoding ebpf into any other format.
Obviously 'simplified x86/arm insn set' can have any format,
as long as it is convenient to execute by interpreter,
not too complex for remapping into native
and has room to add instructions.
imo proposed ebpf format fits these three attributes just fine.
Post by Pablo Neira Ayuso
That means your interface is exposing *all of your internal
implementation decisions* and that's a very bad since we will always
have to come up with smart tricks not to break backward compatibility
if we want to improve the internal implementation.
We're not going to go back and break compatibility.
We're not breaking compatibility now either.
Did Intel change x86 encoding? no.
Same with ebpf. We don't need to change ebpf encoding.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
I cannot do table lookups in bpf, but I can do that in ebpf.
I cannot rewrite packet headers in bpf, but I can do that in ebpf, etc.
Sorry, I don't buy this "we get more features" if in the long run we
have restricted extensibility.
That's not productive to keep saying 'restricted extensibility'
without providing a specific example.
Please come up with at least one case that ebpf cannot
handle as presented.
What instructions do you think are missing?
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Here is the example from V2 series that shows how hash tables can be
void dropmon(struct kprobe_args *ctx)
{
void *loc;
uint64_t *drop_cnt;
/* TRACE_EVENT(kfree_skb,
* TP_PROTO(struct sk_buff *skb, void *location),
* so ctx->arg2 is 'location'
*/
loc = (void *)ctx->arg2;
drop_cnt = bpf_table_lookup(ctx, 0, &loc);
Is there room to extend your framework with any other data structure
which is *not* a hashtable? What are you plans for that?
Please understand hashtable or xyztable is not an ebpf instruction.
It is a function call.
Generic call.
ebpf can call any function.
ebpf doesn't need to change a single bit to support other tables.
ebpf jits don't need to change either.
How the table is implemented is outside of ebpf's scope.
Type of keys, values are arbitrary.
bpf_table_lookup() is a C function inside kernel that ebpf
program calls.
Post by Pablo Neira Ayuso
The only difference that I see with ebpf is that you provide nice
end-user tools, but the design of the kernel interface has exactly the
same problems.
ok. what problems? Please be specific.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
Right, you can extend interfaces forever with lots of patchwork and
"smart tricks" but that doesn't mean that will look nice...
I'm not sure what you mean here.
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cf9cd13509a7..e1b979312588 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -273,6 +273,13 @@ static struct ctl_table net_core_table[] = {
},
#endif
{
+ .procname = "bpf_ext_enable",
+ .data = &bpf_ext_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
This /proc thing seems to me like the last resource we have to avoid
breaking backward compatibility. I have used it as well myself, but
it's like the *pull alarm* when we did a wrong design decision.
What if we need a new ABI breakage for BPF again? Will we need to add
a new /proc interface for that? As far as I can tell from your
patches, the answer is yes.
Where do you see ABI breakage? It's not broken.
I can remove bpf_ext_enable flag.
On or off it doesn't break any user interface.
Socket filters are still loading old bpf programs.
seccomp is still loading old bpf programs.
They get converted on the fly to new ebpf.
I added the flag only to be able to easily benchmark two interpreters.
We're planning to remove old bpf interpreter. It's obsolete.
Just like old sk_decode_filter().
This /proc flag doesn't need to be there. It can be removed.
Not a single user space app will notice the difference,
other than faster performance.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
As I said, I believe that having a nice extensible interface is
extremely important to make it easier for development. If we have to
rearrange the internal representation for some reason, we can do it
indeed without bothering about making translations to avoid breaking
userspace and having to use ugly tricks (just see sk_decode_filter()
or any other translation to support any new way to express a
filter...).
nice that your brought this up :)
As I mentioned in v4 thread sk_decode_filter() can be removed.
It was introduced to improve old interpreter performance and now
this part is obsolete.
What are your plans then? Will you implement that converter in
userspace? You mention that you don't want to enhance libpcap, which
seems to me like the natural way to extend things.
Please see 1/3 patch. Converter from old bpf to ebpf takes 263 lines
of trivial remapping. It's that simple.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Say you want to translate nft-cmp instruction into sequence of native
comparisons. You'd need to use load from memory, compare and
branch operations. That's ebpf!
Nope sorry, that's not ebpf. That's assembler code.
Well, in my previous email I tried to explain that assembler == ebpf :)
I see, so I was right. You want to expose a pseudo-assembler
interface just because that makes it easier for you to provide the jit
translation.
If you're saying that ebpf == assembler, then yes, you're right.
Post by Pablo Neira Ayuso
Post by Alexei Starovoitov
Please post x86_64 assembler code that future nft-jit suppose to
generate and I can post equivalent ebpf code that will be jited
exactly to your x86_64...
That's possible of course. There are many ways to implement the same
thing, they can provide the same features, but not the same degree of
extensibility.
Totally agree. nft is definitely less extensible than ebpf.
You need to change nft for every new feature, whereas I don't need
to change ebpf. I don't need to change ebpf jits either.

Key point is: ebpf does not _need_ to be changed.
There are still plenty of reserved bits, so new instructions can be
added to improve performance, but so far I don't see a need.
We don't _have_ to add instructions. There is always
a way to do with what it has now.
It is a complete instruction set to support any integer program.
Yeah, floating point is not supported and will not be.

Thanks
Alexei
Pablo Neira Ayuso
2014-03-15 19:03:01 UTC
Permalink
On Fri, Mar 14, 2014 at 09:04:50PM -0700, Alexei Starovoitov wrote:
[...]
Post by Alexei Starovoitov
In the patches I sent, ebpf is _not_ exposed to the user.
From your last patch: http://patchwork.ozlabs.org/patch/329713/
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 8eb9ccaa5b48..4e98fe16ba88 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -1,5 +1,6 @@
/*
* Linux Socket Filter Data Structures
+ * Extended BPF is Copyright (c) 2011-2014, PLUMgrid, http://plumgrid.com
*/

#ifndef _UAPI__LINUX_FILTER_H__
@@ -19,7 +20,7 @@
* Try and keep these values and structures similar to BSD,
* especially
* the BPF code definitions which need to match so you can share
* filters
*/
-
+
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
@@ -27,6 +28,14 @@ struct sock_filter { /* Filter block */
__u32 k; /* Generic multiuse field */
};

+struct sock_filter_ext {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register */
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
+
struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
unsigned short len; /* Number of filter blocks */
struct sock_filter __user *filter;

That sock_filter_ext structure is exposed to userspace as well as many
other new BPF_* macros that you have defined.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov
2014-03-15 19:18:44 UTC
Permalink
Post by Alexei Starovoitov
[...]
Post by Alexei Starovoitov
In the patches I sent, ebpf is _not_ exposed to the user.
From your last patch: http://patchwork.ozlabs.org/patch/329713/
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 8eb9ccaa5b48..4e98fe16ba88 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -1,5 +1,6 @@
/*
* Linux Socket Filter Data Structures
+ * Extended BPF is Copyright (c) 2011-2014, PLUMgrid, http://plumgrid.com
*/
#ifndef _UAPI__LINUX_FILTER_H__
@@ -19,7 +20,7 @@
* Try and keep these values and structures similar to BSD,
* especially
* the BPF code definitions which need to match so you can share
* filters
*/
-
+
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
@@ -27,6 +28,14 @@ struct sock_filter { /* Filter block */
__u32 k; /* Generic multiuse field */
};
+struct sock_filter_ext {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register */
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
+
struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
unsigned short len; /* Number of filter blocks */
struct sock_filter __user *filter;
That sock_filter_ext structure is exposed to userspace as well as many
other new BPF_* macros that you have defined.
For the first few versions of the patchset they were in linux/bpf.h,
but then it was suggested to put them into uapi/linux/filter.h
to make the whole thing consistent with existing sock_filter structure.
So yes, uapi header is changed as:
include/uapi/linux/filter.h | 33 ++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)

but there is no way to use these #define from user space at present.

As I said I think it's safe to expose it, because these defines won't change,
but if there is a concern I can move it back into linux/bpf.h

Thanks
Alexei
Andi Kleen
2014-03-11 12:57:39 UTC
Permalink
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Similarly to BPF, you can attach filters via setsockopt()
SO_ATTACH_NFT_FILTER. The filter that is passed to the kernel is
Could you explain how you validated and tested the nf engine to make it safe for
non-root use without any security problems?

-andi
--
***@linux.intel.com -- Speaking for myself only
David Miller
2014-04-04 15:24:32 UTC
Permalink
From: Pablo Neira Ayuso <***@netfilter.org>
Date: Tue, 11 Mar 2014 10:19:11 +0100
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Generally I like this series, but of course you will need to respin
it against the work that went into net-next recently.

I only wonder about the expression implementation module loading
logic when we add an nft filter to a socket.

It seems that if the module doesn't exist, we return -EAGAIN, drop the
mutex, and retry. I see nothing which breaks this loop, it seems like
it can run forever if a module is simply not present.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso
2014-04-04 15:27:34 UTC
Permalink
Post by David Miller
Date: Tue, 11 Mar 2014 10:19:11 +0100
The following patchset provides a socket filtering alternative to BPF
which allows you to define your filter using the nf_tables expressions.
Generally I like this series, but of course you will need to respin
it against the work that went into net-next recently.
Sure, no problem.
Post by David Miller
I only wonder about the expression implementation module loading
logic when we add an nft filter to a socket.
Yes, that needs to be revisited; some people have already raised concerns
about that.
Post by David Miller
It seems that if the module doesn't exist, we return -EAGAIN, drop the
mutex, and retry. I see nothing which breaks this loop, it seems like
it can run forever if a module is simply not present.
Will recheck this as well. Thanks for the feedback.

Loading...