From c33cb0020ee6dd96cc9976d6085a7d8422f6dbed Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 22 Feb 2021 08:00:00 +0000 Subject: [PATCH 1/9] uapi: nfnetlink_cthelper.h: fix userspace compilation error Apparently, and could not be included into the same compilation unit because of a cut-and-paste typo in the former header. Fixes: 12f7a505331e6 ("netfilter: add user-space connection tracking helper infrastructure") Cc: # v3.6 Signed-off-by: Dmitry V. Levin Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_cthelper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nfnetlink_cthelper.h b/include/uapi/linux/netfilter/nfnetlink_cthelper.h index a13137afc429..70af02092d16 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cthelper.h +++ b/include/uapi/linux/netfilter/nfnetlink_cthelper.h @@ -5,7 +5,7 @@ #define NFCT_HELPER_STATUS_DISABLED 0 #define NFCT_HELPER_STATUS_ENABLED 1 -enum nfnl_acct_msg_types { +enum nfnl_cthelper_msg_types { NFNL_MSG_CTHELPER_NEW, NFNL_MSG_CTHELPER_GET, NFNL_MSG_CTHELPER_DEL, From c57ea2d7d81fbaa72c7d0ffbff61ade1039f4a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klemen=20Ko=C5=A1ir?= Date: Sat, 20 Feb 2021 18:29:26 +0900 Subject: [PATCH 2/9] netfilter: conntrack: Remove a double space in a log message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed an extra space in a log message and an extra blank line in code. Signed-off-by: Klemen Košir Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 118f415928ae..b055187235f8 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -219,7 +219,7 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) return NULL; pr_info("nf_conntrack: default automatic helper assignment " "has been turned off for security reasons and CT-based " - " firewall rule not found. Use the iptables CT target " + "firewall rule not found. Use the iptables CT target " "to attach helpers instead.\n"); net->ct.auto_assign_helper_warned = 1; return NULL; @@ -228,7 +228,6 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); } - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { From 03a3ca37e4c6478e3a84f04c8429dd5889e107fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:19 +0100 Subject: [PATCH 3/9] netfilter: nf_nat: undo erroneous tcp edemux lookup Under extremely rare conditions TCP early demux will retrieve the wrong socket. 1. local machine establishes a connection to a remote server, S, on port p. This gives: laddr:lport -> S:p ... both in tcp and conntrack. 2. local machine establishes a connection to host H, on port p2. 2a. TCP stack choses same laddr:lport, so we have laddr:lport -> H:p2 from TCP point of view. 2b). There is a destination NAT rewrite in place, translating H:p2 to S:p. This results in following conntrack entries: I) laddr:lport -> S:p (origin) S:p -> laddr:lport (reply) II) laddr:lport -> H:p2 (origin) S:p -> laddr:lport2 (reply) NAT engine has rewritten laddr:lport to laddr:lport2 to map the reply packet to the correct origin. When server sends SYN/ACK to laddr:lport2, the PREROUTING hook will undo-the SNAT transformation, rewriting IP header to S:p -> laddr:lport This causes TCP early demux to associate the skb with the TCP socket of the first connection. The INPUT hook will then reverse the DNAT transformation, rewriting the IP header to H:p2 -> laddr:lport. Because packet ends up with the wrong socket, the new connection never completes: originator stays in SYN_SENT and conntrack entry remains in SYN_RECV until timeout, and responder retransmits SYN/ACK until it gives up. To resolve this, orphan the skb after the input rewrite: Because the source IP address changed, the socket must be incorrect. We can't move the DNAT undo to prerouting due to backwards compatibility, doing so will make iptables/nftables rules to no longer match the way they did. After orphan, the packet will be handed to the next protocol layer (tcp, udp, ...) and that will repeat the socket lookup just like as if early demux was disabled. Fixes: 41063e9dd1195 ("ipv4: Early TCP socket demux.") Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1427 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_proto.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index e87b6bd6b3cd..4731d21fc3ad 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -646,8 +646,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, } static unsigned int -nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; @@ -659,6 +659,23 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb, return ret; } +static unsigned int +nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + __be32 saddr = ip_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv4_fn(priv, skb, state); + + if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr && + !inet_sk_transparent(sk)) + skb_orphan(skb); /* TCP edemux obtained wrong socket */ + + return ret; +} + static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -736,7 +753,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -757,7 +774,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, From 07b5a76e18925a595bfef44531dbf2f397bb5507 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:20 +0100 Subject: [PATCH 4/9] netfilter: conntrack: avoid misleading 'invalid' in log message The packet is not flagged as invalid: conntrack will accept it and its associated with the conntrack entry. This happens e.g. when receiving a retransmitted SYN in SYN_RECV state. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 1d7e1c595546..ec23330687a5 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -982,8 +982,10 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " - "state %s ", tcp_conntrack_names[old_state]); + nf_ct_l4proto_log_invalid(skb, ct, + "packet (index %d) in dir %d ignored, state %s", + index, dir, + tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or From c2c16ccba2f55d527dd145a5d8c038694b3b343f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 24 Feb 2021 17:23:21 +0100 Subject: [PATCH 5/9] selftests: netfilter: test nat port clash resolution interaction with tcp early demux Convert Antonio Ojeas bug reproducer to a kselftest. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/nf_nat_edemux.sh | 99 +++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nf_nat_edemux.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3006a8e5b41a..3171069a6b46 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -4,7 +4,7 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh \ + nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..cfee3b65be0f --- /dev/null +++ b/tools/testing/selftests/netfilter/nf_nat_edemux.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +cleanup() +{ + ip netns del $ns1 + ip netns del $ns2 +} + +iperf3 -v > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iperf3" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns1" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns1" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns $ns1 dev veth1 +ip link set netns $ns2 dev veth2 + +ip netns exec $ns1 ip link set up dev lo +ip netns exec $ns1 ip link set up dev veth1 +ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec $ns2 ip link set up dev lo +ip netns exec $ns2 ip link set up dev veth2 +ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec $ns1 iperf3 -s > /dev/null 2>&1 & +iperfs=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... and route it to the other namespace +ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 + +sleep 1 + +# add a persistent connection from the other namespace +ip netns exec $ns2 nc -q 10 -w 10 192.168.1.1 5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec $ns2 nc -w 3 -q 3 10.96.0.1 443 >/dev/null +ret=$? + +kill $iperfs + +# Check nc can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). +if [ $ret -eq 0 ]; then + echo "PASS: nc can connect via NAT'd address" +else + echo "FAIL: nc cannot connect via NAT'd address" + exit 1 +fi + +exit 0 From 8e24edddad152b998b37a7f583175137ed2e04a5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 27 Feb 2021 11:27:45 +0300 Subject: [PATCH 6/9] netfilter: x_tables: gpf inside xt_find_revision() nested target/match_revfn() calls work with xt[NFPROTO_UNSPEC] lists without taking xt[NFPROTO_UNSPEC].mutex. This can race with module unload and cause host to crash: general protection fault: 0000 [#1] Modules linked in: ... [last unloaded: xt_cluster] CPU: 0 PID: 542455 Comm: iptables RIP: 0010:[] [] strcmp+0x18/0x40 RDX: 0000000000000003 RSI: ffff9a5a5d9abe10 RDI: dead000000000111 R13: ffff9a5a5d9abe10 R14: ffff9a5a5d9abd8c R15: dead000000000100 (VvS: %R15 -- &xt_match, %RDI -- &xt_match.name, xt_cluster unregister match in xt[NFPROTO_UNSPEC].match list) Call Trace: [] match_revfn+0x54/0xc0 [] match_revfn+0xaf/0xc0 [] xt_find_revision+0x6e/0xf0 [] do_ipt_get_ctl+0x100/0x420 [ip_tables] [] nf_getsockopt+0x4f/0x70 [] ip_getsockopt+0xde/0x100 [] raw_getsockopt+0x25/0x50 [] sock_common_getsockopt+0x1a/0x20 [] SyS_getsockopt+0x7d/0xf0 [] system_call_fastpath+0x25/0x2a Fixes: 656caff20e1 ("netfilter 04/09: x_tables: fix match/target revision lookup") Signed-off-by: Vasily Averin Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index acce622582e3..bce6ca203d46 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -330,6 +330,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_match *m; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision > *bestp) @@ -338,6 +339,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -350,6 +352,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) const struct xt_target *t; int have_rev = 0; + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision > *bestp) @@ -358,6 +361,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) have_rev = 1; } } + mutex_unlock(&xt[af].mutex); if (af != NFPROTO_UNSPEC && !have_rev) return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); @@ -371,12 +375,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else have_rev = match_revfn(af, name, revision, &best); - mutex_unlock(&xt[af].mutex); /* Nothing at all? Return 0 to try loading module. */ if (best == -1) { From 9cc0001a18b4e5f46ec481201c88ae16f0a69bb0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 27 Feb 2021 22:31:27 +0100 Subject: [PATCH 7/9] netfilter: nftables: disallow updates on table ownership Disallow updating the ownership bit on an existing table: Do not allow to grab ownership on an existing table. Do not allow to drop ownership on an existing table. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1eb5cdb3033..b07703e19108 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -916,6 +916,12 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags == ctx->table->flags) return 0; + if ((nft_table_has_owner(ctx->table) && + !(flags & NFT_TABLE_F_OWNER)) || + (!nft_table_has_owner(ctx->table) && + flags & NFT_TABLE_F_OWNER)) + return -EOPNOTSUPP; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) From 2888b080d05c819205bbfe52c624a639f44c266a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Mar 2021 23:58:27 +0100 Subject: [PATCH 8/9] netfilter: nftables: fix possible double hook unregistration with table owner Skip hook unregistration of owner tables from the netns exit path, nft_rcv_nl_event() unregisters the table hooks before tearing down the table content. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b07703e19108..796ce86ef7eb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9028,8 +9028,12 @@ static void __nft_release_hooks(struct net *net) { struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_has_owner(table)) + continue; + __nft_release_hook(net, table); + } } static void __nft_release_table(struct net *net, struct nft_table *table) From bd1777b3a88f98e223392221b330668458aac7f1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Mar 2021 04:00:09 +0100 Subject: [PATCH 9/9] netfilter: nftables: bogus check for netlink portID with table owner The existing branch checks for 0 != table->nlpid which always evaluates true for tables that have an owner. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 796ce86ef7eb..224c8e537cb3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9083,13 +9083,12 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nf_tables_table_destroy(&ctx); } -static void __nft_release_tables(struct net *net, u32 nlpid) +static void __nft_release_tables(struct net *net) { struct nft_table *table, *nt; list_for_each_entry_safe(table, nt, &net->nft.tables, list) { - if (nft_table_has_owner(table) && - nlpid != table->nlpid) + if (nft_table_has_owner(table)) continue; __nft_release_table(net, table); @@ -9155,7 +9154,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); - __nft_release_tables(net, 0); + __nft_release_tables(net); mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list));