From d410976f561fb05913e5ab2f3b41d50ab8040a2f Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Sun, 4 Aug 2024 10:47:44 -0400 Subject: [PATCH] patch release: v8 Signed-off-by: Christian Hopps --- patches/v8/v8-0000-cover-letter.patch | 138 ++++ ...01-xfrm-config-add-CONFIG_XFRM_IPTFS.patch | 42 ++ ...uapi-add-ip_tfs_-_hdr-packet-formats.patch | 43 ++ ...add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch | 29 + ...m-netlink-add-config-netlink-options.patch | 159 ++++ ...rm-add-mode_cbs-module-functionality.patch | 356 +++++++++ ...eric-iptfs-defines-and-functionality.patch | 265 +++++++ ...m-iptfs-add-new-iptfs-xfrm-mode-impl.patch | 253 +++++++ ...d-user-packet-tunnel-ingress-handlin.patch | 685 ++++++++++++++++++ ...hare-page-fragments-of-inner-packets.patch | 164 +++++ ...d-fragmenting-of-larger-than-MTU-use.patch | 557 ++++++++++++++ ...d-basic-receive-packet-tunnel-egress.patch | 310 ++++++++ ...ndle-received-fragmented-inner-packe.patch | 675 +++++++++++++++++ ...d-reusing-received-skb-for-the-tunne.patch | 194 +++++ ...-iptfs-add-skb-fragment-sharing-code.patch | 390 ++++++++++ ...andle-reordering-of-received-packets.patch | 665 +++++++++++++++++ ...m-iptfs-add-tracepoint-functionality.patch | 458 ++++++++++++ 17 files changed, 5383 insertions(+) create mode 100644 patches/v8/v8-0000-cover-letter.patch create mode 100644 patches/v8/v8-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch create mode 100644 patches/v8/v8-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch create mode 100644 patches/v8/v8-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch create mode 100644 patches/v8/v8-0004-xfrm-netlink-add-config-netlink-options.patch create mode 100644 patches/v8/v8-0005-xfrm-add-mode_cbs-module-functionality.patch create mode 100644 patches/v8/v8-0006-xfrm-add-generic-iptfs-defines-and-functionality.patch create mode 100644 patches/v8/v8-0007-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch create mode 100644 patches/v8/v8-0008-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch create mode 100644 patches/v8/v8-0009-xfrm-iptfs-share-page-fragments-of-inner-packets.patch create mode 100644 patches/v8/v8-0010-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch create mode 100644 patches/v8/v8-0011-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch create mode 100644 patches/v8/v8-0012-xfrm-iptfs-handle-received-fragmented-inner-packe.patch create mode 100644 patches/v8/v8-0013-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch create mode 100644 patches/v8/v8-0014-xfrm-iptfs-add-skb-fragment-sharing-code.patch create mode 100644 patches/v8/v8-0015-xfrm-iptfs-handle-reordering-of-received-packets.patch create mode 100644 patches/v8/v8-0016-xfrm-iptfs-add-tracepoint-functionality.patch diff --git a/patches/v8/v8-0000-cover-letter.patch b/patches/v8/v8-0000-cover-letter.patch new file mode 100644 index 0000000..a582c63 --- /dev/null +++ b/patches/v8/v8-0000-cover-letter.patch @@ -0,0 +1,138 @@ +Subject: [PATCH ipsec-next v8 00/16] Add IP-TFS mode to xfrm + +* Summary of Changes: + +This patchset adds a new xfrm mode implementing on-demand IP-TFS. IP-TFS +(AggFrag encapsulation) has been standardized in RFC9347. + + Link: https://www.rfc-editor.org/rfc/rfc9347.txt + +This feature supports demand driven (i.e., non-constant send rate) +IP-TFS to take advantage of the AGGFRAG ESP payload encapsulation. 
This
+payload type supports aggregation and fragmentation of the inner IP
+packet stream which in turn yields higher small-packet bandwidth as well
+as reducing MTU/PMTU issues. Congestion control is unimplemented as
+the send rate is demand driven rather than constant.
+
+In order to allow loading this functionality as a module, a set of
+callbacks, xfrm_mode_cbs, has been added to xfrm as well.
+
+Patchset Changes:
+-----------------
+
+ include/net/xfrm.h | 44 +
+ include/uapi/linux/in.h | 2 +
+ include/uapi/linux/ip.h | 16 +
+ include/uapi/linux/ipsec.h | 3 +-
+ include/uapi/linux/snmp.h | 3 +
+ include/uapi/linux/xfrm.h | 9 +-
+ net/ipv4/esp4.c | 3 +-
+ net/ipv6/esp6.c | 3 +-
+ net/netfilter/nft_xfrm.c | 3 +-
+ net/xfrm/Kconfig | 16 +
+ net/xfrm/Makefile | 1 +
+ net/xfrm/trace_iptfs.h | 218 ++++
+ net/xfrm/xfrm_compat.c | 10 +-
+ net/xfrm/xfrm_device.c | 4 +-
+ net/xfrm/xfrm_input.c | 18 +-
+ net/xfrm/xfrm_iptfs.c | 2858 ++++++++++++++++++++++++++++++++++++++++++++
+ net/xfrm/xfrm_output.c | 6 +
+ net/xfrm/xfrm_policy.c | 26 +-
+ net/xfrm/xfrm_proc.c | 3 +
+ net/xfrm/xfrm_state.c | 84 ++
+ net/xfrm/xfrm_user.c | 77 ++
+ 21 files changed, 3388 insertions(+), 19 deletions(-)
+
+Patchset Structure:
+-------------------
+
+The first 6 commits are changes to the xfrm infrastructure to support
+the callbacks as well as more generic IP-TFS additions that may be used
+outside the actual IP-TFS implementation.
+
+ - xfrm: config: add CONFIG_XFRM_IPTFS
+ - include: uapi: add ip_tfs_*_hdr packet formats
+ - include: uapi: add IPPROTO_AGGFRAG for AGGFRAG in ESP
+ - xfrm: netlink: add config (netlink) options
+ - xfrm: add mode_cbs module functionality
+ - xfrm: add generic iptfs defines and functionality
+
+The last 10 commits constitute the IP-TFS implementation, constructed in
+layers to make review easier. The first 9 of these commits all apply to a
+single file, `net/xfrm/xfrm_iptfs.c`; the last commit adds a new tracepoint
+header file along with the use of these new tracepoint calls.
+
+ - xfrm: iptfs: add new iptfs xfrm mode impl
+ - xfrm: iptfs: add user packet (tunnel ingress) handling
+ - xfrm: iptfs: share page fragments of inner packets
+ - xfrm: iptfs: add fragmenting of larger than MTU user packets
+ - xfrm: iptfs: add basic receive packet (tunnel egress) handling
+ - xfrm: iptfs: handle received fragmented inner packets
+ - xfrm: iptfs: add reusing received skb for the tunnel egress packet
+ - xfrm: iptfs: add skb-fragment sharing code
+ - xfrm: iptfs: handle reordering of received packets
+ - xfrm: iptfs: add tracepoint functionality
+
+Patchset History:
+-----------------
+
+RFCv1 (11/10/2023)
+
+RFCv1 -> RFCv2 (11/12/2023)
+
+ Updates based on feedback from Simon Horman, Antony,
+ Michael Richardson, and kernel test robot.
+
+RFCv2 -> v1 (2/19/2024)
+
+ Updates based on feedback from Sabrina Dubroca and the kernel test robot.
+
+v1 -> v2 (5/19/2024)
+
+ Updates based on feedback from Sabrina Dubroca, Simon Horman, Antony.
+
+ o Add handling of new netlink SA direction attribute (Antony).
+ o Split single patch/commit of xfrm_iptfs.c (the actual IP-TFS impl)
+   into 9+1 distinct layered functionality commits for aiding review.
+ - xfrm: fix return check on clone() callback
+ - xfrm: add sa_len() callback in xfrm_mode_cbs for copy to user
+ - iptfs: remove unneeded skb free count variable
+ - iptfs: remove unused variable and "breadcrumb" for future code.
+ - iptfs: use do_div() to avoid "__udivd13 missing" link failure.
+ - iptfs: remove some BUG_ON() assertions questioned in review.
+
+v2->v3
+ - Git User Glitch
+
+v2->v4 (6/17/2024)
+
+ - iptfs: copy only the netlink attributes to user based on the
+   direction of the SA.
+
+ - xfrm: stats: in the output path check for skb->dev == NULL prior to
+   setting xfrm statistics on dev_net(skb->dev) as skb->dev may be NULL
+   for locally generated packets.
+
+ - xfrm: stats: fix an input use case where dev_net(skb->dev) is used
+   to inc stats after skb is possibly NULL'd earlier. Switch to using
+   existing saved `net` pointer.
+
+v4->v5 (7/14/2024)
+ - uapi: add units to doc comments
+ - iptfs: add MODULE_DESCRIPTION()
+ - squash nl-direction-update commit
+
+v5->v6 (7/31/2024)
+ * sysctl: removed IPTFS sysctl additions
+ - xfrm: use array of pointers vs structs for mode callbacks
+ - iptfs: eliminate a memleak during state alloc failure
+ - iptfs: free send queue content on SA delete
+ - add some kdoc and comments
+ - cleanup a couple formatting choices per Steffen
+
+v6->v7 (8/1/2024)
+ - Rebased on latest ipsec-next
+
+v7->v8 (8/4/2024)
+ - Use lock and rcu to load iptfs module -- copy existing use pattern
+ - fix 2 warnings from the kernel bot
diff --git a/patches/v8/v8-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch b/patches/v8/v8-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch
new file mode 100644
index 0000000..e44b698
--- /dev/null
+++ b/patches/v8/v8-0001-xfrm-config-add-CONFIG_XFRM_IPTFS.patch
@@ -0,0 +1,42 @@
+From 0822158912c8e8477b2282ecd6ef406cbf5dcb28 Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Sun, 12 Nov 2023 06:28:49 -0500
+Subject: [PATCH ipsec-next v8 01/16] xfrm: config: add CONFIG_XFRM_IPTFS
+
+Add new Kconfig option to enable IP-TFS (RFC9347) functionality.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/Kconfig | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
+index d7b16f2c23e9..f0157702718f 100644
+--- a/net/xfrm/Kconfig
++++ b/net/xfrm/Kconfig
+@@ -135,6 +135,22 @@ config NET_KEY_MIGRATE
+
+ If unsure, say N.
+
++config XFRM_IPTFS
++ tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support"
++ depends on XFRM
++ help
++ Information on the IP-TFS/AGGFRAG encapsulation can be found
++ in RFC 9347. This feature supports demand driven (i.e.,
++ non-constant send rate) IP-TFS to take advantage of the
++ AGGFRAG ESP payload encapsulation. This payload type
++ supports aggregation and fragmentation of the inner IP
++ packet stream which in turn yields higher small-packet
++ bandwidth as well as reducing MTU/PMTU issues. Congestion
++ control is unimplemented as the send rate is demand driven
++ rather than constant.
++
++ If unsure, say N.
++
+ config XFRM_ESPINTCP
+ bool
+
+--
+2.46.0
+
diff --git a/patches/v8/v8-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch b/patches/v8/v8-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch
new file mode 100644
index 0000000..6a868c6
--- /dev/null
+++ b/patches/v8/v8-0002-include-uapi-add-ip_tfs_-_hdr-packet-formats.patch
@@ -0,0 +1,43 @@
+From 8cba80905abbe7af6ca00618afcd13b3c13a91af Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 20 Apr 2022 13:15:20 -0400
+Subject: [PATCH ipsec-next v8 02/16] include: uapi: add ip_tfs_*_hdr packet
+ formats
+
+Add the on-wire basic and congestion-control IP-TFS packet headers.
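+
+As a reading aid only (not part of this change): per RFC 9347, the basic
+header's block_offset gives the number of octets immediately following the
+header that continue an inner packet begun in a preceding tunnel packet; a
+value pointing at or past the end of the payload means no inner packet
+starts in this payload. A hypothetical receiver-side sketch, using made-up
+names `payload` and `plen` for a decrypted AGGFRAG basic payload and its
+length in octets:
+
+	/* Sketch: locate the first inner-packet boundary in an AGGFRAG
+	 * basic (non-CC) payload; NULL if none starts in this payload.
+	 */
+	static __u8 *first_inner_packet(__u8 *payload, __u32 plen)
+	{
+		struct ip_iptfs_hdr *h = (struct ip_iptfs_hdr *)payload;
+		__u32 blkoff;
+
+		if (plen < sizeof(*h))
+			return NULL;
+		blkoff = ntohs(h->block_offset);
+		/* the first blkoff octets complete a prior fragment */
+		if (blkoff >= plen - sizeof(*h))
+			return NULL;
+		return payload + sizeof(*h) + blkoff;
+	}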
+ +Signed-off-by: Christian Hopps +--- + include/uapi/linux/ip.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h +index 283dec7e3645..5bd7ce934d74 100644 +--- a/include/uapi/linux/ip.h ++++ b/include/uapi/linux/ip.h +@@ -137,6 +137,22 @@ struct ip_beet_phdr { + __u8 reserved; + }; + ++struct ip_iptfs_hdr { ++ __u8 subtype; /* 0*: basic, 1: CC */ ++ __u8 flags; ++ __be16 block_offset; ++}; ++ ++struct ip_iptfs_cc_hdr { ++ __u8 subtype; /* 0: basic, 1*: CC */ ++ __u8 flags; ++ __be16 block_offset; ++ __be32 loss_rate; ++ __be64 rtt_adelay_xdelay; ++ __be32 tval; ++ __be32 techo; ++}; ++ + /* index values for the variables in ipv4_devconf */ + enum + { +-- +2.46.0 + diff --git a/patches/v8/v8-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch b/patches/v8/v8-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch new file mode 100644 index 0000000..9561ee2 --- /dev/null +++ b/patches/v8/v8-0003-include-uapi-add-IPPROTO_AGGFRAG-for-AGGFRAG-in-E.patch @@ -0,0 +1,29 @@ +From 93deaa06aefbdf1f581f9956c27e3e765671dea3 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Sat, 27 Aug 2022 02:26:52 +0000 +Subject: [PATCH ipsec-next v8 03/16] include: uapi: add IPPROTO_AGGFRAG for + AGGFRAG in ESP + +Add the RFC assigned IP protocol number for AGGFRAG. + +Signed-off-by: Christian Hopps +--- + include/uapi/linux/in.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h +index d358add1611c..268086e85d04 100644 +--- a/include/uapi/linux/in.h ++++ b/include/uapi/linux/in.h +@@ -79,6 +79,8 @@ enum { + #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ + #define IPPROTO_ETHERNET IPPROTO_ETHERNET ++ IPPROTO_AGGFRAG = 144, /* AGGFRAG in ESP (RFC 9347) */ ++#define IPPROTO_AGGFRAG IPPROTO_AGGFRAG + IPPROTO_RAW = 255, /* Raw IP packets */ + #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_SMC = 256, /* Shared Memory Communications */ +-- +2.46.0 + diff --git a/patches/v8/v8-0004-xfrm-netlink-add-config-netlink-options.patch b/patches/v8/v8-0004-xfrm-netlink-add-config-netlink-options.patch new file mode 100644 index 0000000..b5021e8 --- /dev/null +++ b/patches/v8/v8-0004-xfrm-netlink-add-config-netlink-options.patch @@ -0,0 +1,159 @@ +From 687c7c1a043f788ef10bd1827ab11b7187b078e0 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Sun, 12 Nov 2023 06:02:21 -0500 +Subject: [PATCH ipsec-next v8 04/16] xfrm: netlink: add config (netlink) + options + +Add netlink options for configuring IP-TFS SAs. + +Signed-off-by: Christian Hopps +--- + include/uapi/linux/xfrm.h | 9 ++++++- + net/xfrm/xfrm_compat.c | 10 ++++++-- + net/xfrm/xfrm_user.c | 52 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 68 insertions(+), 3 deletions(-) + +diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h +index f28701500714..042ebf94bb3d 100644 +--- a/include/uapi/linux/xfrm.h ++++ b/include/uapi/linux/xfrm.h +@@ -158,7 +158,8 @@ enum { + #define XFRM_MODE_ROUTEOPTIMIZATION 2 + #define XFRM_MODE_IN_TRIGGER 3 + #define XFRM_MODE_BEET 4 +-#define XFRM_MODE_MAX 5 ++#define XFRM_MODE_IPTFS 5 ++#define XFRM_MODE_MAX 6 + + /* Netlink configuration messages. 
*/ + enum { +@@ -322,6 +323,12 @@ enum xfrm_attr_type_t { + XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ + XFRMA_SA_DIR, /* __u8 */ + XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ ++ XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */ ++ XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */ ++ XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */ ++ XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */ ++ XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size (octets) */ ++ XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */ + __XFRMA_MAX + + #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ +diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c +index 91357ccaf4af..5c55e07f3d10 100644 +--- a/net/xfrm/xfrm_compat.c ++++ b/net/xfrm/xfrm_compat.c +@@ -282,9 +282,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) + case XFRMA_MTIMER_THRESH: + case XFRMA_SA_DIR: + case XFRMA_NAT_KEEPALIVE_INTERVAL: ++ case XFRMA_IPTFS_DROP_TIME: ++ case XFRMA_IPTFS_REORDER_WINDOW: ++ case XFRMA_IPTFS_DONT_FRAG: ++ case XFRMA_IPTFS_INIT_DELAY: ++ case XFRMA_IPTFS_MAX_QSIZE: ++ case XFRMA_IPTFS_PKT_SIZE: + return xfrm_nla_cpy(dst, src, nla_len(src)); + default: +- BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); ++ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); + pr_warn_once("unsupported nla_type %d\n", src->nla_type); + return -EOPNOTSUPP; + } +@@ -439,7 +445,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, + int err; + + if (type > XFRMA_MAX) { +- BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); ++ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); + NL_SET_ERR_MSG(extack, "Bad attribute"); + return -EOPNOTSUPP; + } +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index 55f039ec3d59..f6ed019192f3 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -297,6 +297,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); + goto out; + } ++ if ((attrs[XFRMA_IPTFS_DROP_TIME] || ++ attrs[XFRMA_IPTFS_REORDER_WINDOW] || ++ attrs[XFRMA_IPTFS_DONT_FRAG] || ++ attrs[XFRMA_IPTFS_INIT_DELAY] || ++ attrs[XFRMA_IPTFS_MAX_QSIZE] || ++ attrs[XFRMA_IPTFS_PKT_SIZE]) && ++ p->mode != XFRM_MODE_IPTFS) { ++ NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode"); ++ goto out; ++ } + break; + + case IPPROTO_COMP: +@@ -417,6 +427,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + goto out; + } + ++ if (attrs[XFRMA_IPTFS_DROP_TIME]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ + if (attrs[XFRMA_REPLAY_VAL]) { + struct xfrm_replay_state *replay; + +@@ -454,6 +476,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, + } + + } ++ ++ if (attrs[XFRMA_IPTFS_DONT_FRAG]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_INIT_DELAY]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input 
SA"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (attrs[XFRMA_IPTFS_PKT_SIZE]) { ++ NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA"); ++ err = -EINVAL; ++ goto out; ++ } + } + + out: +@@ -3176,6 +3222,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { + [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), + [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, ++ [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, ++ [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, ++ [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, + }; + EXPORT_SYMBOL_GPL(xfrma_policy); + +-- +2.46.0 + diff --git a/patches/v8/v8-0005-xfrm-add-mode_cbs-module-functionality.patch b/patches/v8/v8-0005-xfrm-add-mode_cbs-module-functionality.patch new file mode 100644 index 0000000..8c74a09 --- /dev/null +++ b/patches/v8/v8-0005-xfrm-add-mode_cbs-module-functionality.patch @@ -0,0 +1,356 @@ +From 765efbca88e8e887005f4a5c6617c7abeb521b68 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 31 Jul 2024 15:15:29 -0400 +Subject: [PATCH ipsec-next v8 05/16] xfrm: add mode_cbs module functionality + +Add a set of callbacks xfrm_mode_cbs to xfrm_state. These callbacks +enable the addition of new xfrm modes, such as IP-TFS to be defined +in modules. + +Signed-off-by: Christian Hopps +--- + include/net/xfrm.h | 43 +++++++++++++++++++++++++ + net/xfrm/xfrm_device.c | 3 +- + net/xfrm/xfrm_input.c | 18 +++++++++-- + net/xfrm/xfrm_output.c | 2 ++ + net/xfrm/xfrm_policy.c | 18 +++++++---- + net/xfrm/xfrm_state.c | 72 ++++++++++++++++++++++++++++++++++++++++++ + net/xfrm/xfrm_user.c | 13 ++++++++ + 7 files changed, 159 insertions(+), 10 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 54cef89f6c1e..323e768ce4f3 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -209,6 +209,7 @@ struct xfrm_state { + u16 family; + xfrm_address_t saddr; + int header_len; ++ int enc_hdr_len; + int trailer_len; + u32 extra_flags; + struct xfrm_mark smark; +@@ -299,6 +300,9 @@ struct xfrm_state { + * interpreted by xfrm_type methods. */ + void *data; + u8 dir; ++ ++ const struct xfrm_mode_cbs *mode_cbs; ++ void *mode_data; + }; + + static inline struct net *xs_net(struct xfrm_state *x) +@@ -451,6 +455,45 @@ struct xfrm_type_offload { + int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); + void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); + ++/** ++ * struct xfrm_mode_cbs - XFRM mode callbacks ++ * @owner: module owner or NULL ++ * @create_state: Add mode specific state to new `xfrm_state *x` ++ * @delete_state: Cleanup mode specific state from `xfrm_state *x` ++ * @user_init: Process mode specific netlink attributes from user ++ * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` ++ * @clone: Copy mode specific values from `orig` to new state `x` ++ * @sa_len: Return space required to store mode specific netlink attributes ++ * @get_inner_mtu: Return avail payload space after removing encap overhead ++ * @input: Process received packet from SA using mode ++ * @output: Output given packet using mode ++ * @prepare_output: Add mode specific encapsulation to packet in skb. 
On return
++ * `transport_header` should point at ESP header, `network_header` should
++ * point at outer IP header and `mac_header` should point at the
++ * protocol/nexthdr field of the outer IP.
++ *
++ * One should examine and understand the specific uses of these callbacks in
++ * xfrm for further detail on how and when these functions are called. RTSL.
++ */
++struct xfrm_mode_cbs {
++ struct module *owner;
++ int (*create_state)(struct xfrm_state *x);
++ void (*delete_state)(struct xfrm_state *x);
++ int (*user_init)(struct net *net, struct xfrm_state *x,
++ struct nlattr **attrs,
++ struct netlink_ext_ack *extack);
++ int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb);
++ int (*clone)(struct xfrm_state *x, struct xfrm_state *orig);
++ unsigned int (*sa_len)(const struct xfrm_state *x);
++ u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu);
++ int (*input)(struct xfrm_state *x, struct sk_buff *skb);
++ int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
++ int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb);
++};
++
++int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs);
++void xfrm_unregister_mode_cbs(u8 mode);
++
+ static inline int xfrm_af2proto(unsigned int family)
+ {
+ switch(family) {
+diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
+index 9a44d363ba62..e412e4afb169 100644
+--- a/net/xfrm/xfrm_device.c
++++ b/net/xfrm/xfrm_device.c
+@@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
+ skb->transport_header = skb->network_header + hsize;
+
+ skb_reset_mac_len(skb);
+- pskb_pull(skb, skb->mac_len + x->props.header_len);
++ pskb_pull(skb,
++ skb->mac_len + x->props.header_len - x->props.enc_hdr_len);
+ }
+
+ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
+diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
+index 749e7eea99e4..b7b5cda986fb 100644
+--- a/net/xfrm/xfrm_input.c
++++ b/net/xfrm/xfrm_input.c
+@@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
+ WARN_ON_ONCE(1);
+ break;
+ default:
++ if (x->mode_cbs && x->mode_cbs->input)
++ return x->mode_cbs->input(x, skb);
++
+ WARN_ON_ONCE(1);
+ break;
+ }
+@@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
+ return -EOPNOTSUPP;
+ }
+
++/* NOTE: encap_type - In addition to the normal (non-negative) values for
++ * encap_type, a negative value of -1 or -2 can be used to resume/restart this
++ * function after a previous invocation early terminated for async operation.
++ */
+ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+ {
+ const struct xfrm_state_afinfo *afinfo;
+@@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+
+ family = x->props.family;
+
++ /* An encap_type of -2 indicates reconstructed inner packet */
++ if (encap_type == -2)
++ goto resume_decapped;
++
+ /* An encap_type of -1 indicates async resumption. 
*/ + if (encap_type == -1) { + async = 1; +@@ -679,11 +690,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) + + XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; + +- if (xfrm_inner_mode_input(x, skb)) { ++ err = xfrm_inner_mode_input(x, skb); ++ if (err == -EINPROGRESS) ++ return 0; ++ else if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } +- ++resume_decapped: + if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { + decaps = 1; + break; +diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c +index e5722c95b8bb..ef81359e4038 100644 +--- a/net/xfrm/xfrm_output.c ++++ b/net/xfrm/xfrm_output.c +@@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) + WARN_ON_ONCE(1); + break; + default: ++ if (x->mode_cbs && x->mode_cbs->prepare_output) ++ return x->mode_cbs->prepare_output(x, skb); + WARN_ON_ONCE(1); + break; + } +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index c56c61b0c12e..f764b1409175 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -2719,13 +2719,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, + + dst1->input = dst_discard; + +- rcu_read_lock(); +- afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); +- if (likely(afinfo)) +- dst1->output = afinfo->output; +- else +- dst1->output = dst_discard_out; +- rcu_read_unlock(); ++ if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { ++ dst1->output = xfrm[i]->mode_cbs->output; ++ } else { ++ rcu_read_lock(); ++ afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); ++ if (likely(afinfo)) ++ dst1->output = afinfo->output; ++ else ++ dst1->output = dst_discard_out; ++ rcu_read_unlock(); ++ } + + xdst_prev = xdst; + +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index 37478d36a8df..e7b656bb6c0d 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -515,6 +515,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) + return NULL; + } + ++static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX]; ++static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock); ++ ++int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs) ++{ ++ if (mode >= XFRM_MODE_MAX) ++ return -EINVAL; ++ ++ spin_lock_bh(&xfrm_mode_cbs_map_lock); ++ rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs); ++ spin_unlock_bh(&xfrm_mode_cbs_map_lock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(xfrm_register_mode_cbs); ++ ++void xfrm_unregister_mode_cbs(u8 mode) ++{ ++ if (mode >= XFRM_MODE_MAX) ++ return; ++ ++ spin_lock_bh(&xfrm_mode_cbs_map_lock); ++ RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL); ++ spin_unlock_bh(&xfrm_mode_cbs_map_lock); ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL(xfrm_unregister_mode_cbs); ++ ++static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode) ++{ ++ const struct xfrm_mode_cbs *cbs; ++ bool try_load = true; ++ ++ if (mode >= XFRM_MODE_MAX) ++ return NULL; ++ ++retry: ++ rcu_read_lock(); ++ ++ cbs = rcu_dereference(xfrm_mode_cbs_map[mode]); ++ if (cbs && !try_module_get(cbs->owner)) ++ cbs = NULL; ++ ++ rcu_read_unlock(); ++ ++ if (mode == XFRM_MODE_IPTFS && !cbs && try_load) { ++ request_module("xfrm-iptfs"); ++ try_load = false; ++ goto retry; ++ } ++ ++ return cbs; ++} ++ + void xfrm_state_free(struct xfrm_state *x) + { + kmem_cache_free(xfrm_state_cache, x); +@@ -523,6 +577,8 @@ EXPORT_SYMBOL(xfrm_state_free); + + static void ___xfrm_state_destroy(struct xfrm_state *x) + { ++ if (x->mode_cbs && 
x->mode_cbs->delete_state)
++ x->mode_cbs->delete_state(x);
+ hrtimer_cancel(&x->mtimer);
+ del_timer_sync(&x->rtimer);
+ kfree(x->aead);
+@@ -680,6 +736,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
+ x->replay_maxage = 0;
+ x->replay_maxdiff = 0;
+ spin_lock_init(&x->lock);
++ x->mode_data = NULL;
+ }
+ return x;
+ }
+@@ -1806,6 +1863,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
+ x->new_mapping_sport = 0;
+ x->dir = orig->dir;
+
++ x->mode_cbs = orig->mode_cbs;
++ if (x->mode_cbs && x->mode_cbs->clone) {
++ if (x->mode_cbs->clone(x, orig))
++ goto error;
++ }
++
+ return x;
+
+ error:
+@@ -2845,6 +2908,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
+ case XFRM_MODE_TUNNEL:
+ break;
+ default:
++ if (x->mode_cbs && x->mode_cbs->get_inner_mtu)
++ return x->mode_cbs->get_inner_mtu(x, mtu);
++
+ WARN_ON_ONCE(1);
+ break;
+ }
+@@ -2945,6 +3011,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
+ }
+ }
+
++ x->mode_cbs = xfrm_get_mode_cbs(x->props.mode);
++ if (x->mode_cbs) {
++ if (x->mode_cbs->create_state)
++ err = x->mode_cbs->create_state(x);
++ module_put(x->mode_cbs->owner);
++ }
+ error:
+ return err;
+ }
+diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
+index f6ed019192f3..419bbeea6b20 100644
+--- a/net/xfrm/xfrm_user.c
++++ b/net/xfrm/xfrm_user.c
+@@ -918,6 +918,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
+ goto error;
+ }
+
++ if (x->mode_cbs && x->mode_cbs->user_init) {
++ err = x->mode_cbs->user_init(net, x, attrs, extack);
++ if (err)
++ goto error;
++ }
++
+ return x;
+
+ error:
+@@ -1331,6 +1337,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
+ if (ret)
+ goto out;
+ }
++ if (x->mode_cbs && x->mode_cbs->copy_to_user)
++ ret = x->mode_cbs->copy_to_user(x, skb);
++ if (ret)
++ goto out;
+ if (x->mapping_maxage) {
+ ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
+ if (ret)
+@@ -3540,6 +3550,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
+ if (x->nat_keepalive_interval)
+ l += nla_total_size(sizeof(x->nat_keepalive_interval));
+
++ if (x->mode_cbs && x->mode_cbs->sa_len)
++ l += x->mode_cbs->sa_len(x);
++
+ return l;
+ }
+
+--
+2.46.0
+
diff --git a/patches/v8/v8-0006-xfrm-add-generic-iptfs-defines-and-functionality.patch b/patches/v8/v8-0006-xfrm-add-generic-iptfs-defines-and-functionality.patch
new file mode 100644
index 0000000..17ba1ef
--- /dev/null
+++ b/patches/v8/v8-0006-xfrm-add-generic-iptfs-defines-and-functionality.patch
@@ -0,0 +1,265 @@
+From 64c53b48d6b88414ff6295aa0ac8756882057ea6 Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 31 Jul 2024 15:20:53 -0400
+Subject: [PATCH ipsec-next v8 06/16] xfrm: add generic iptfs defines and
+ functionality
+
+Define `XFRM_MODE_IPTFS` and `IPSEC_MODE_IPTFS` constants, and add these to
+switch cases and conditionals alongside the existing TUNNEL modes.
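+
+As a reading aid only (not an additional hunk): the recurring pattern in the
+changes below is that IP-TFS is a tunnel-class mode, so each switch or
+conditional that already matches the existing tunnel modes simply grows an
+IPTFS arm, e.g.:
+
+	switch (x->props.mode) {
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+	case XFRM_MODE_IPTFS:
+		/* tunnel-class handling */
+		break;
+	}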
+ +Signed-off-by: Christian Hopps +--- + include/net/xfrm.h | 1 + + include/uapi/linux/ipsec.h | 3 ++- + include/uapi/linux/snmp.h | 3 +++ + net/ipv4/esp4.c | 3 ++- + net/ipv6/esp6.c | 3 ++- + net/netfilter/nft_xfrm.c | 3 ++- + net/xfrm/xfrm_device.c | 1 + + net/xfrm/xfrm_output.c | 4 ++++ + net/xfrm/xfrm_policy.c | 8 ++++++-- + net/xfrm/xfrm_proc.c | 3 +++ + net/xfrm/xfrm_state.c | 12 ++++++++++++ + net/xfrm/xfrm_user.c | 12 ++++++++++++ + 12 files changed, 50 insertions(+), 6 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 323e768ce4f3..f06e1d76e16a 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -37,6 +37,7 @@ + #define XFRM_PROTO_COMP 108 + #define XFRM_PROTO_IPIP 4 + #define XFRM_PROTO_IPV6 41 ++#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG + #define XFRM_PROTO_ROUTING IPPROTO_ROUTING + #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS + +diff --git a/include/uapi/linux/ipsec.h b/include/uapi/linux/ipsec.h +index 50d8ee1791e2..696b790f4346 100644 +--- a/include/uapi/linux/ipsec.h ++++ b/include/uapi/linux/ipsec.h +@@ -14,7 +14,8 @@ enum { + IPSEC_MODE_ANY = 0, /* We do not support this for SA */ + IPSEC_MODE_TRANSPORT = 1, + IPSEC_MODE_TUNNEL = 2, +- IPSEC_MODE_BEET = 3 ++ IPSEC_MODE_BEET = 3, ++ IPSEC_MODE_IPTFS = 4 + }; + + enum { +diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h +index adf5fd78dd50..77eb078f06a6 100644 +--- a/include/uapi/linux/snmp.h ++++ b/include/uapi/linux/snmp.h +@@ -339,6 +339,9 @@ enum + LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ + LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */ + LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */ ++ LINUX_MIB_XFRMNOSKBERROR, /* XfrmNoSkbError */ ++ LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */ ++ LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */ + __LINUX_MIB_XFRMMAX + }; + +diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c +index 47378ca41904..9a0165e3ceba 100644 +--- a/net/ipv4/esp4.c ++++ b/net/ipv4/esp4.c +@@ -815,7 +815,8 @@ int esp_input_done2(struct sk_buff *skb, int err) + } + + skb_pull_rcsum(skb, hlen); +- if (x->props.mode == XFRM_MODE_TUNNEL) ++ if (x->props.mode == XFRM_MODE_TUNNEL || ++ x->props.mode == XFRM_MODE_IPTFS) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -ihl); +diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c +index 3920e8aa1031..5f85435de722 100644 +--- a/net/ipv6/esp6.c ++++ b/net/ipv6/esp6.c +@@ -858,7 +858,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + skb_pull_rcsum(skb, hlen); +- if (x->props.mode == XFRM_MODE_TUNNEL) ++ if (x->props.mode == XFRM_MODE_TUNNEL || ++ x->props.mode == XFRM_MODE_IPTFS) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); +diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c +index 1c866757db55..620238c6ef4c 100644 +--- a/net/netfilter/nft_xfrm.c ++++ b/net/netfilter/nft_xfrm.c +@@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) + return true; + } + +- return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; ++ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || ++ mode == XFRM_MODE_IPTFS; + } + + static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, +diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c +index e412e4afb169..d4905796e9ab 100644 +--- a/net/xfrm/xfrm_device.c ++++ b/net/xfrm/xfrm_device.c +@@ -69,6 +69,7 @@ static void 
__xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, + static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) + { + switch (x->outer_mode.encap) { ++ case XFRM_MODE_IPTFS: + case XFRM_MODE_TUNNEL: + if (x->outer_mode.family == AF_INET) + return __xfrm_mode_tunnel_prep(x, skb, +diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c +index ef81359e4038..b5025cf6136e 100644 +--- a/net/xfrm/xfrm_output.c ++++ b/net/xfrm/xfrm_output.c +@@ -677,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) + + return; + } ++ if (x->outer_mode.encap == XFRM_MODE_IPTFS) { ++ xo->inner_ipproto = IPPROTO_AGGFRAG; ++ return; ++ } + + /* non-Tunnel Mode */ + if (!skb->encapsulation) +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index f764b1409175..928c3ed79d21 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -2473,6 +2473,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, + struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; + + if (tmpl->mode == XFRM_MODE_TUNNEL || ++ tmpl->mode == XFRM_MODE_IPTFS || + tmpl->mode == XFRM_MODE_BEET) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; +@@ -3264,7 +3265,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + ok: + xfrm_pols_put(pols, drop_pols); + if (dst && dst->xfrm && +- dst->xfrm->props.mode == XFRM_MODE_TUNNEL) ++ (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || ++ dst->xfrm->props.mode == XFRM_MODE_IPTFS)) + dst->flags |= DST_XFRM_TUNNEL; + return dst; + +@@ -4509,6 +4511,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm + switch (t->mode) { + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: ++ case XFRM_MODE_IPTFS: + if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, + m->old_family) && + xfrm_addr_equal(&t->saddr, &m->old_saddr, +@@ -4551,7 +4554,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, + continue; + n++; + if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && +- pol->xfrm_vec[i].mode != XFRM_MODE_BEET) ++ pol->xfrm_vec[i].mode != XFRM_MODE_BEET && ++ pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) + continue; + /* update endpoints */ + memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, +diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c +index eeb984be03a7..e851b388995a 100644 +--- a/net/xfrm/xfrm_proc.c ++++ b/net/xfrm/xfrm_proc.c +@@ -43,6 +43,9 @@ static const struct snmp_mib xfrm_mib_list[] = { + SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), + SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), ++ SNMP_MIB_ITEM("XfrmNoSkbError", LINUX_MIB_XFRMNOSKBERROR), ++ SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), ++ SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), + SNMP_MIB_SENTINEL + }; + +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index e7b656bb6c0d..1d4b884f82d0 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -467,6 +467,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, + }, ++ [XFRM_MODE_IPTFS] = { ++ .encap = XFRM_MODE_IPTFS, ++ .flags = XFRM_MODE_FLAG_TUNNEL, ++ .family = AF_INET, ++ }, + }; + + static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { +@@ -488,6 +493,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET6, + }, ++ 
[XFRM_MODE_IPTFS] = {
++ .encap = XFRM_MODE_IPTFS,
++ .flags = XFRM_MODE_FLAG_TUNNEL,
++ .family = AF_INET6,
++ },
+ };
+
+ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+@@ -2194,6 +2204,7 @@ static int __xfrm6_state_sort_cmp(const void *p)
+ #endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
++ case XFRM_MODE_IPTFS:
+ return 4;
+ }
+ return 5;
+@@ -2220,6 +2231,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p)
+ #endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
++ case XFRM_MODE_IPTFS:
+ return 3;
+ }
+ return 4;
+diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
+index 419bbeea6b20..40c79bd14a7e 100644
+--- a/net/xfrm/xfrm_user.c
++++ b/net/xfrm/xfrm_user.c
+@@ -379,6 +379,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_BEET:
+ break;
++ case XFRM_MODE_IPTFS:
++ if (p->id.proto != IPPROTO_ESP) {
++ NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP");
++ goto out;
++ }
++ if (sa_dir == 0) {
++ NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute");
++ goto out;
++ }
++ break;
+
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported mode");
+@@ -1984,6 +1994,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
+ return -EINVAL;
+ }
+ break;
++ case XFRM_MODE_IPTFS:
++ break;
+ default:
+ if (ut[i].family != prev_family) {
+ NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
+--
+2.46.0
+
diff --git a/patches/v8/v8-0007-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch b/patches/v8/v8-0007-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch
new file mode 100644
index 0000000..9d41975
--- /dev/null
+++ b/patches/v8/v8-0007-xfrm-iptfs-add-new-iptfs-xfrm-mode-impl.patch
@@ -0,0 +1,253 @@
+From 0eb6a6e7ba6df17631a4ce1f92e7666f8b60d36c Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 31 Jul 2024 12:20:57 -0400
+Subject: [PATCH ipsec-next v8 07/16] xfrm: iptfs: add new iptfs xfrm mode impl
+
+Add a new xfrm mode implementing AggFrag/IP-TFS from RFC9347.
+
+This utilizes the new xfrm_mode_cbs to implement demand-driven IP-TFS
+functionality. This functionality can be used to increase bandwidth
+utilization through small packet aggregation, as well as help solve PMTU
+issues through its efficient use of fragmentation.
+
+ Link: https://www.rfc-editor.org/rfc/rfc9347.txt
+
+Multiple commits follow to build the functionality into xfrm_iptfs.c
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/Makefile | 1 +
+ net/xfrm/xfrm_iptfs.c | 210 ++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 211 insertions(+)
+ create mode 100644 net/xfrm/xfrm_iptfs.c
+
+diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
+index 512e0b2f8514..5a1787587cb3 100644
+--- a/net/xfrm/Makefile
++++ b/net/xfrm/Makefile
+@@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o
+ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
+ obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
+ obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
++obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o
+ obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
+ obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+new file mode 100644
+index 000000000000..201406175d17
+--- /dev/null
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -0,0 +1,210 @@
++// SPDX-License-Identifier: GPL-2.0
++/* xfrm_iptfs: IPTFS encapsulation support
++ *
++ * April 21 2022, Christian Hopps
++ *
++ * Copyright (c) 2022, LabN Consulting, L.L.C. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "xfrm_inout.h" ++ ++/** ++ * struct xfrm_iptfs_config - configuration for the IPTFS tunnel. ++ * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery, ++ * otherwise the user specified value. ++ */ ++struct xfrm_iptfs_config { ++ u32 pkt_size; /* outer_packet_size or 0 */ ++}; ++ ++/** ++ * struct xfrm_iptfs_data - mode specific xfrm state. ++ * @cfg: IPTFS tunnel config. ++ * @x: owning SA (xfrm_state). ++ * @payload_mtu: max payload size. ++ */ ++struct xfrm_iptfs_data { ++ struct xfrm_iptfs_config cfg; ++ ++ /* Ingress User Input */ ++ struct xfrm_state *x; /* owning state */ ++ u32 payload_mtu; /* max payload size */ ++}; ++ ++/* ========================== */ ++/* State Management Functions */ ++/* ========================== */ ++ ++/** ++ * iptfs_get_inner_mtu() - return inner MTU with no fragmentation. ++ * @x: xfrm state. ++ * @outer_mtu: the outer mtu ++ */ ++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) ++{ ++ struct crypto_aead *aead; ++ u32 blksize; ++ ++ aead = x->data; ++ blksize = ALIGN(crypto_aead_blocksize(aead), 4); ++ return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) & ++ ~(blksize - 1)) - 2; ++} ++ ++/** ++ * iptfs_user_init() - initialize the SA with IPTFS options from netlink. ++ * @net: the net data ++ * @x: xfrm state ++ * @attrs: netlink attributes ++ * @extack: extack return data ++ * ++ * Return: 0 on success or a negative error code on failure ++ */ ++static int iptfs_user_init(struct net *net, struct xfrm_state *x, ++ struct nlattr **attrs, ++ struct netlink_ext_ack *extack) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc; ++ ++ xc = &xtfs->cfg; ++ ++ if (attrs[XFRMA_IPTFS_PKT_SIZE]) { ++ xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); ++ if (!xc->pkt_size) { ++ xtfs->payload_mtu = 0; ++ } else if (xc->pkt_size > x->props.header_len) { ++ xtfs->payload_mtu = xc->pkt_size - x->props.header_len; ++ } else { ++ NL_SET_ERR_MSG(extack, ++ "Packet size must be 0 or greater than IPTFS/ESP header length"); ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++static unsigned int iptfs_sa_len(const struct xfrm_state *x) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc = &xtfs->cfg; ++ unsigned int l = 0; ++ ++ if (x->dir == XFRM_SA_DIR_OUT) ++ l += nla_total_size(sizeof(xc->pkt_size)); ++ ++ return l; ++} ++ ++static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct xfrm_iptfs_config *xc = &xtfs->cfg; ++ int ret = 0; ++ ++ if (x->dir == XFRM_SA_DIR_OUT) ++ ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); ++ ++ return ret; ++} ++ ++static void __iptfs_init_state(struct xfrm_state *x, ++ struct xfrm_iptfs_data *xtfs) ++{ ++ /* Modify type (esp) adjustment values */ ++ ++ if (x->props.family == AF_INET) ++ x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr); ++ else if (x->props.family == AF_INET6) ++ x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); ++ x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); ++ ++ /* Always keep a module reference when x->mode_data is set */ ++ __module_get(x->mode_cbs->owner); ++ ++ x->mode_data = xtfs; ++ xtfs->x = x; ++} ++ ++static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) ++{ ++ struct xfrm_iptfs_data *xtfs; ++ ++ xtfs = 
kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL);
++ if (!xtfs)
++ return -ENOMEM;
++
++ __iptfs_init_state(x, xtfs);
++
++ return 0;
++}
++
++static int iptfs_create_state(struct xfrm_state *x)
++{
++ struct xfrm_iptfs_data *xtfs;
++
++ xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL);
++ if (!xtfs)
++ return -ENOMEM;
++
++ __iptfs_init_state(x, xtfs);
++
++ return 0;
++}
++
++static void iptfs_delete_state(struct xfrm_state *x)
++{
++ struct xfrm_iptfs_data *xtfs = x->mode_data;
++
++ if (!xtfs)
++ return;
++
++ kfree_sensitive(xtfs);
++
++ module_put(x->mode_cbs->owner);
++}
++
++static const struct xfrm_mode_cbs iptfs_mode_cbs = {
++ .owner = THIS_MODULE,
++ .create_state = iptfs_create_state,
++ .delete_state = iptfs_delete_state,
++ .user_init = iptfs_user_init,
++ .copy_to_user = iptfs_copy_to_user,
++ .sa_len = iptfs_sa_len,
++ .clone = iptfs_clone,
++ .get_inner_mtu = iptfs_get_inner_mtu,
++};
++
++static int __init xfrm_iptfs_init(void)
++{
++ int err;
++
++ pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n");
++
++ err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs);
++ if (err < 0)
++ pr_info("%s: can't register IP-TFS\n", __func__);
++
++ return err;
++}
++
++static void __exit xfrm_iptfs_fini(void)
++{
++ xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
++}
++
++module_init(xfrm_iptfs_init);
++module_exit(xfrm_iptfs_fini);
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels");
+--
+2.46.0
+
diff --git a/patches/v8/v8-0008-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch b/patches/v8/v8-0008-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch
new file mode 100644
index 0000000..943b99a
--- /dev/null
+++ b/patches/v8/v8-0008-xfrm-iptfs-add-user-packet-tunnel-ingress-handlin.patch
@@ -0,0 +1,685 @@
+From 878dcaa71186d04c40f1545e29819b2ca8282406 Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Sun, 4 Aug 2024 10:13:06 -0400
+Subject: [PATCH ipsec-next v8 08/16] xfrm: iptfs: add user packet (tunnel
+ ingress) handling
+
+Add tunnel packet output functionality. This code handles
+the ingress side of the tunnel.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 577 +++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 574 insertions(+), 3 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 201406175d17..9c48c15cbed0 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -19,29 +19,553 @@
+
+ #include "xfrm_inout.h"
+
++/* ------------------------------------------------ */
++/* IPTFS default SA values (tunnel ingress/dir-out) */
++/* ------------------------------------------------ */
++
++/**
++ * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay
++ *
++ * The initial output delay is the amount of time prior to servicing the output
++ * queue after queueing the first packet on said queue. This applies anytime the
++ * output queue was previously empty.
++ *
++ * Default 0.
++ */
++#define IPTFS_DEFAULT_INIT_DELAY_USECS 0
++
++/**
++ * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size.
++ *
++ * The default IPTFS max output queue size in octets. The output queue is where
++ * received packets destined for output over an IPTFS tunnel are stored prior to
++ * being output in aggregated/fragmented form over the IPTFS tunnel.
++ *
++ * Default 10M. 
++ */
++#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
++
++#define NSECS_IN_USEC 1000
++
++#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
++
+ /**
+ * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
+ * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
+ * otherwise the user specified value.
++ * @max_queue_size: The maximum number of octets allowed to be queued to be sent
++ * over the IPTFS SA. The queue size is measured as the size of all the
++ * packets enqueued.
+ */
+ struct xfrm_iptfs_config {
+ u32 pkt_size; /* outer_packet_size or 0 */
++ u32 max_queue_size; /* octets */
+ };
+
+ /**
+ * struct xfrm_iptfs_data - mode specific xfrm state.
+ * @cfg: IPTFS tunnel config.
+ * @x: owning SA (xfrm_state).
++ * @queue: queued user packets to send.
++ * @queue_size: number of octets on queue (sum of packet sizes).
++ * @ecn_queue_size: octets above which to ECN mark.
++ * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
++ * @iptfs_timer: output timer.
+ * @payload_mtu: max payload size.
+ */
+ struct xfrm_iptfs_data {
+ struct xfrm_iptfs_config cfg;
+
+ /* Ingress User Input */
+- struct xfrm_state *x; /* owning state */
++ struct xfrm_state *x; /* owning state */
++ struct sk_buff_head queue; /* output queue */
++
++ u32 queue_size; /* octets */
++ u32 ecn_queue_size; /* octets above which ECN mark */
++ u64 init_delay_ns; /* nanoseconds */
++ struct hrtimer iptfs_timer; /* output timer */
+ u32 payload_mtu; /* max payload size */
+ };
+
++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
++static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
++
++/* ================================= */
++/* IPTFS Sending (ingress) Functions */
++/* ================================= */
++
++/* ------------------------- */
++/* Enqueue to send functions */
++/* ------------------------- */
++
++/**
++ * iptfs_enqueue() - enqueue packet if ok to send.
++ * @xtfs: xtfs state
++ * @skb: the packet
++ *
++ * Return: true if packet enqueued.
++ */
++static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
++{
++ u64 newsz = xtfs->queue_size + skb->len;
++ struct iphdr *iph;
++
++ assert_spin_locked(&xtfs->x->lock);
++
++ if (newsz > xtfs->cfg.max_queue_size)
++ return false;
++
++ /* Set ECN CE if we are above our ECN queue threshold */
++ if (newsz > xtfs->ecn_queue_size) {
++ iph = ip_hdr(skb);
++ if (iph->version == 4)
++ IP_ECN_set_ce(iph);
++ else if (iph->version == 6)
++ IP6_ECN_set_ce(skb, ipv6_hdr(skb));
++ }
++
++ __skb_queue_tail(&xtfs->queue, skb);
++ xtfs->queue_size += skb->len;
++ return true;
++}
++
++static int iptfs_get_cur_pmtu(struct xfrm_state *x,
++ struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
++{
++ struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
++ u32 payload_mtu = xtfs->payload_mtu;
++ u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
++
++ if (payload_mtu && payload_mtu < pmtu)
++ pmtu = payload_mtu;
++
++ return pmtu;
++}
++
++static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
++{
++ if (skb->len <= pmtu)
++ return 0;
++
++ /* We only send ICMP too big if the user has configured us as
++ * dont-fragment. 
++ */
++ if (skb->dev)
++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
++
++ if (sk) {
++ xfrm_local_error(skb, pmtu);
++ } else if (ip_hdr(skb)->version == 4) {
++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
++ htonl(pmtu));
++ } else {
++ WARN_ON_ONCE(ip_hdr(skb)->version != 6);
++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
++ }
++ return 1;
++}
++
++/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
++ * (i.e., aggregating or fragmenting as appropriate).
++ * This is set in dst->output for an SA.
++ */
++static int iptfs_output_collect(struct net *net, struct sock *sk,
++ struct sk_buff *skb)
++{
++ struct dst_entry *dst = skb_dst(skb);
++ struct xfrm_state *x = dst->xfrm;
++ struct xfrm_iptfs_data *xtfs = x->mode_data;
++ struct sk_buff *segs, *nskb;
++ u32 pmtu = 0;
++ bool ok = true;
++ bool was_gso;
++
++ /* We have hooked into dst_entry->output which means we have skipped the
++ * protocol specific netfilter (see xfrm4_output, xfrm6_output).
++ * When our timer runs we will end up calling xfrm_output directly on
++ * the encapsulated traffic.
++ *
++ * For both cases this is the NF_INET_POST_ROUTING hook which allows
++ * changing the skb->dst entry which then may not be xfrm based anymore,
++ * in which case a REROUTED flag is set and dst_output is called.
++ *
++ * For IPv6 we are also skipping fragmentation handling for local
++ * sockets, which may or may not be good depending on our tunnel DF
++ * setting. Normally with fragmentation supported we want to skip this
++ * fragmentation.
++ */
++
++ BUG_ON(!xtfs);
++
++ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
++
++ /* Break apart GSO skbs. If the queue is nearing full then we want the
++ * accounting and queuing to be based on the individual packets not on the
++ * aggregate GSO buffer.
++ */
++ was_gso = skb_is_gso(skb);
++ if (!was_gso) {
++ segs = skb;
++ } else {
++ segs = skb_gso_segment(skb, 0);
++ if (IS_ERR_OR_NULL(segs)) {
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
++ kfree_skb(skb);
++ if (IS_ERR(segs))
++ return PTR_ERR(segs);
++ return -EINVAL;
++ }
++ consume_skb(skb);
++ skb = NULL;
++ }
++
++ /* We can be running on multiple cores and from the network softirq or
++ * from user context depending on where the packet is coming from.
++ */
++ spin_lock_bh(&x->lock);
++
++ skb_list_walk_safe(segs, skb, nskb) {
++ skb_mark_not_on_list(skb);
++
++ /* Once we drop due to no queue space we continue to drop the
++ * rest of the packets from that GSO.
++ */
++ if (!ok) {
++nospace:
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE);
++ kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
++ continue;
++ }
++
++ /* Fragmenting handled in following commits. 
*/ ++ if (iptfs_is_too_big(sk, skb, pmtu)) { ++ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); ++ continue; ++ } ++ ++ /* Enqueue to send in tunnel */ ++ ok = iptfs_enqueue(xtfs, skb); ++ if (!ok) ++ goto nospace; ++ } ++ ++ /* Start a delay timer if we don't have one yet */ ++ if (!hrtimer_is_queued(&xtfs->iptfs_timer)) ++ hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, ++ IPTFS_HRTIMER_MODE); ++ ++ spin_unlock_bh(&x->lock); ++ return 0; ++} ++ ++/* -------------------------- */ ++/* Dequeue and send functions */ ++/* -------------------------- */ ++ ++static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) ++{ ++ struct ip_iptfs_hdr *h; ++ size_t hsz = sizeof(*h); ++ ++ /* now reset values to be pointing at the rest of the packets */ ++ h = skb_push(skb, hsz); ++ memset(h, 0, hsz); ++ if (blkoff) ++ h->block_offset = htons(blkoff); ++ ++ /* network_header current points at the inner IP packet ++ * move it to the iptfs header ++ */ ++ skb->transport_header = skb->network_header; ++ skb->network_header -= hsz; ++ ++ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; ++} ++ ++static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, ++ struct sk_buff *child) ++{ ++ u32 fllen = 0; ++ ++ /* It might be possible to account for a frag list in addition to page ++ * fragment if it's a valid state to be in. The page fragments size ++ * should be kept as data_len so only the frag_list size is removed, ++ * this must be done above as well. ++ */ ++ BUG_ON(skb_shinfo(child)->nr_frags); ++ *nextp = skb_shinfo(child)->frag_list; ++ while (*nextp) { ++ fllen += (*nextp)->len; ++ nextp = &(*nextp)->next; ++ } ++ skb_frag_list_init(child); ++ BUG_ON(fllen > child->data_len); ++ child->len -= fllen; ++ child->data_len -= fllen; ++ ++ return nextp; ++} ++ ++static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) ++{ ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct sk_buff *skb, *skb2, **nextp; ++ struct skb_shared_info *shi; ++ ++ while ((skb = __skb_dequeue(list))) { ++ u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ int remaining; ++ ++ /* protocol comes to us cleared sometimes */ ++ skb->protocol = x->outer_mode.family == AF_INET ? ++ htons(ETH_P_IP) : ++ htons(ETH_P_IPV6); ++ ++ if (skb->len > mtu) { ++ /* We handle this case before enqueueing so we are only ++ * here b/c MTU changed after we enqueued before we ++ * dequeued, just drop these. ++ */ ++ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); ++ ++ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); ++ continue; ++ } ++ ++ /* If we don't have a cksum in the packet we need to add one ++ * before encapsulation. ++ */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_help(skb)) { ++ XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), ++ LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb); ++ continue; ++ } ++ } ++ ++ /* Convert first inner packet into an outer IPTFS packet */ ++ iptfs_output_prepare_skb(skb, 0); ++ ++ /* The space remaining to send more inner packet data is `mtu` - ++ * (skb->len - sizeof iptfs header). This is b/c the `mtu` value ++ * has the basic IPTFS header len accounted for, and we added ++ * that header to the skb so it is a part of skb->len, thus we ++ * subtract it from the skb length. ++ */ ++ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); ++ ++ /* Re-home (un-nest) nested fragment lists. We need to do this ++ * b/c we will simply be appending any following aggregated ++ * inner packets to the frag list. 
++ */ ++ shi = skb_shinfo(skb); ++ nextp = &shi->frag_list; ++ while (*nextp) { ++ if (skb_has_frag_list(*nextp)) ++ nextp = iptfs_rehome_fraglist(&(*nextp)->next, ++ *nextp); ++ else ++ nextp = &(*nextp)->next; ++ } ++ ++ /* See if we have enough space to simply append. ++ * ++ * NOTE: Maybe do not append if we will be mis-aligned, ++ * SW-based endpoints will probably have to copy in this ++ * case. ++ */ ++ while ((skb2 = skb_peek(list))) { ++ if (skb2->len > remaining) ++ break; ++ ++ __skb_unlink(skb2, list); ++ ++ /* If we don't have a cksum in the packet we need to add ++ * one before encapsulation. ++ */ ++ if (skb2->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_help(skb2)) { ++ XFRM_INC_STATS(xs_net(x), ++ LINUX_MIB_XFRMOUTERROR); ++ kfree_skb(skb2); ++ continue; ++ } ++ } ++ ++ /* Do accounting */ ++ skb->data_len += skb2->len; ++ skb->len += skb2->len; ++ remaining -= skb2->len; ++ ++ /* Append to the frag_list */ ++ *nextp = skb2; ++ nextp = &skb2->next; ++ BUG_ON(*nextp); ++ if (skb_has_frag_list(skb2)) ++ nextp = iptfs_rehome_fraglist(nextp, skb2); ++ skb->truesize += skb2->truesize; ++ } ++ ++ xfrm_output(NULL, skb); ++ } ++} ++ ++static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) ++{ ++ struct sk_buff_head list; ++ struct xfrm_iptfs_data *xtfs; ++ struct xfrm_state *x; ++ ++ xtfs = container_of(me, typeof(*xtfs), iptfs_timer); ++ x = xtfs->x; ++ ++ /* Process all the queued packets ++ * ++ * softirq execution order: timer > tasklet > hrtimer ++ * ++ * Network rx will have run before us giving one last chance to queue ++ * ingress packets for us to process and transmit. ++ */ ++ ++ spin_lock(&x->lock); ++ __skb_queue_head_init(&list); ++ skb_queue_splice_init(&xtfs->queue, &list); ++ xtfs->queue_size = 0; ++ spin_unlock(&x->lock); ++ ++ /* After the above unlock, packets can begin queuing again, and the ++ * timer can be set again, from another CPU either in softirq or user ++ * context (not from this one since we are running at softirq level ++ * already). ++ */ ++ ++ iptfs_output_queued(x, &list); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/** ++ * iptfs_encap_add_ipv4() - add outer encaps ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the ++ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set ++ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do ++ * anything with inner headers as they aren't pointing into a normal IP ++ * singleton inner packet. ++ * ++ * Return: 0 on success or a negative error code on failure ++ */ ++static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct iphdr *top_iph; ++ ++ skb_reset_inner_network_header(skb); ++ skb_reset_inner_transport_header(skb); ++ ++ skb_set_network_header(skb, ++ -(x->props.header_len - x->props.enc_hdr_len)); ++ skb->mac_header = ++ skb->network_header + offsetof(struct iphdr, protocol); ++ skb->transport_header = skb->network_header + sizeof(*top_iph); ++ ++ top_iph = ip_hdr(skb); ++ top_iph->ihl = 5; ++ top_iph->version = 4; ++ top_iph->protocol = IPPROTO_AGGFRAG; ++ ++ /* As we have 0, fractional, 1 or N inner packets there's no obviously ++ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 ++ * 3.1. 
++ */ ++ top_iph->tos = 0; ++ ++ top_iph->frag_off = htons(IP_DF); ++ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); ++ top_iph->saddr = x->props.saddr.a4; ++ top_iph->daddr = x->id.daddr.a4; ++ ip_select_ident(dev_net(dst->dev), skb, NULL); ++ ++ return 0; ++} ++ ++/** ++ * iptfs_encap_add_ipv6() - add outer encaps ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the ++ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set ++ * the flow label and TOS/DSCP bits. It also sets the protocol to a different ++ * value and doesn't do anything with inner headers as they aren't pointing into ++ * a normal IP singleton inner packet. ++ * ++ * Return: 0 on success or a negative error code on failure ++ */ ++static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct ipv6hdr *top_iph; ++ int dsfield; ++ ++ skb_reset_inner_network_header(skb); ++ skb_reset_inner_transport_header(skb); ++ ++ skb_set_network_header(skb, ++ -x->props.header_len + x->props.enc_hdr_len); ++ skb->mac_header = ++ skb->network_header + offsetof(struct ipv6hdr, nexthdr); ++ skb->transport_header = skb->network_header + sizeof(*top_iph); ++ ++ top_iph = ipv6_hdr(skb); ++ top_iph->version = 6; ++ top_iph->priority = 0; ++ memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); ++ top_iph->nexthdr = IPPROTO_AGGFRAG; ++ ++ /* As we have 0, fractional, 1 or N inner packets there's no obviously ++ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 ++ * 3.1. ++ */ ++ dsfield = 0; ++ ipv6_change_dsfield(top_iph, 0, dsfield); ++ ++ top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); ++ top_iph->saddr = *(struct in6_addr *)&x->props.saddr; ++ top_iph->daddr = *(struct in6_addr *)&x->id.daddr; ++ ++ return 0; ++} ++ ++/** ++ * iptfs_prepare_output() - prepare the skb for output ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * Return: Error value, if 0 then skb values should be as follows: ++ * - transport_header should point at ESP header ++ * - network_header should point at Outer IP header ++ * - mac_header should point at protocol/nexthdr of the outer IP ++ */ ++static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ if (x->outer_mode.family == AF_INET) ++ return iptfs_encap_add_ipv4(x, skb); ++ if (x->outer_mode.family == AF_INET6) { ++#if IS_ENABLED(CONFIG_IPV6) ++ return iptfs_encap_add_ipv6(x, skb); ++#else ++ WARN_ON_ONCE(1); ++ return -EAFNOSUPPORT; ++#endif ++ } ++ WARN_ON_ONCE(1); ++ return -EOPNOTSUPP; ++} ++ + /* ========================== */ + /* State Management Functions */ + /* ========================== */ +@@ -77,8 +601,11 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + { + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc; ++ u64 q; + + xc = &xtfs->cfg; ++ xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; ++ xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); +@@ -92,6 +619,17 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + return -EINVAL; + } + } ++ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) ++ xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); ++ if (attrs[XFRMA_IPTFS_INIT_DELAY]) ++ xtfs->init_delay_ns = ++ (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * ++ NSECS_IN_USEC; ++ ++ q = 
(u64)xc->max_queue_size * 95; ++ (void)do_div(q, 100); ++ xtfs->ecn_queue_size = (u32)q; ++ + return 0; + } + +@@ -101,8 +639,11 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + +- if (x->dir == XFRM_SA_DIR_OUT) ++ if (x->dir == XFRM_SA_DIR_OUT) { ++ l += nla_total_size(sizeof(u32)); /* init delay usec */ ++ l += nla_total_size(sizeof(xc->max_queue_size)); + l += nla_total_size(sizeof(xc->pkt_size)); ++ } + + return l; + } +@@ -112,9 +653,22 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + int ret = 0; ++ u64 q; ++ ++ if (x->dir == XFRM_SA_DIR_OUT) { ++ q = xtfs->init_delay_ns; ++ (void)do_div(q, NSECS_IN_USEC); ++ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); ++ if (ret) ++ return ret; ++ ++ ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, ++ xc->max_queue_size); ++ if (ret) ++ return ret; + +- if (x->dir == XFRM_SA_DIR_OUT) + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); ++ } + + return ret; + } +@@ -122,6 +676,10 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + static void __iptfs_init_state(struct xfrm_state *x, + struct xfrm_iptfs_data *xtfs) + { ++ __skb_queue_head_init(&xtfs->queue); ++ hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); ++ xtfs->iptfs_timer.function = iptfs_delay_timer; ++ + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) +@@ -166,10 +724,21 @@ static int iptfs_create_state(struct xfrm_state *x) + static void iptfs_delete_state(struct xfrm_state *x) + { + struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct sk_buff_head list; ++ struct sk_buff *skb; + + if (!xtfs) + return; + ++ spin_lock_bh(&xtfs->x->lock); ++ hrtimer_cancel(&xtfs->iptfs_timer); ++ __skb_queue_head_init(&list); ++ skb_queue_splice_init(&xtfs->queue, &list); ++ spin_unlock_bh(&xtfs->x->lock); ++ ++ while ((skb = __skb_dequeue(&list))) ++ kfree_skb(skb); ++ + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +@@ -184,6 +753,8 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = { + .sa_len = iptfs_sa_len, + .clone = iptfs_clone, + .get_inner_mtu = iptfs_get_inner_mtu, ++ .output = iptfs_output_collect, ++ .prepare_output = iptfs_prepare_output, + }; + + static int __init xfrm_iptfs_init(void) +-- +2.46.0 + diff --git a/patches/v8/v8-0009-xfrm-iptfs-share-page-fragments-of-inner-packets.patch b/patches/v8/v8-0009-xfrm-iptfs-share-page-fragments-of-inner-packets.patch new file mode 100644 index 0000000..2046bdd --- /dev/null +++ b/patches/v8/v8-0009-xfrm-iptfs-share-page-fragments-of-inner-packets.patch @@ -0,0 +1,164 @@ +From 4f83913ed9bd4176e3213a1c428c59631aff466b Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Thu, 4 Apr 2024 02:33:03 -0400 +Subject: [PATCH ipsec-next v8 09/16] xfrm: iptfs: share page fragments of + inner packets + +When possible rather than appending secondary (aggregated) inner packets +to the fragment list, share their page fragments with the outer IPTFS +packet. This allows for more efficient packet transmission. 
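+
+As a rough sketch of the idea (`to`/`from` are hypothetical names, not
+the exact code added below), sharing amounts to moving the page
+fragment descriptors from the inner skb's shared info into the outer
+skb's and then consuming the donor, so the page references simply move
+with the fragments:
+
+    struct skb_shared_info *fromi = skb_shinfo(from);
+    struct skb_shared_info *toi = skb_shinfo(to);
+
+    /* move the descriptors; their page refs travel with them */
+    memcpy(&toi->frags[toi->nr_frags], fromi->frags,
+           sizeof(fromi->frags[0]) * fromi->nr_frags);
+    toi->nr_frags += fromi->nr_frags;
+    fromi->nr_frags = 0;
+    consume_skb(from);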
+ +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 88 ++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 79 insertions(+), 9 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index 9c48c15cbed0..20c19894720e 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -9,6 +9,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -90,6 +91,24 @@ struct xfrm_iptfs_data { + static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + ++/* ================= */ ++/* SK_BUFF Functions */ ++/* ================= */ ++ ++/** ++ * skb_head_to_frag() - initialize a skb_frag_t based on skb head data ++ * @skb: skb with the head data ++ * @frag: frag to initialize ++ */ ++static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) ++{ ++ struct page *page = virt_to_head_page(skb->data); ++ unsigned char *addr = (unsigned char *)page_address(page); ++ ++ BUG_ON(!skb->head_frag); ++ skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -306,14 +325,44 @@ static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, + return nextp; + } + ++static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) ++{ ++ struct skb_shared_info *fromi = skb_shinfo(from); ++ struct skb_shared_info *toi = skb_shinfo(to); ++ unsigned int new_truesize; ++ ++ /* If we have data in a head page, grab it */ ++ if (!skb_headlen(from)) { ++ new_truesize = SKB_TRUESIZE(skb_end_offset(from)); ++ } else { ++ skb_head_to_frag(from, &toi->frags[toi->nr_frags]); ++ skb_frag_ref(to, toi->nr_frags++); ++ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ } ++ ++ /* Move any other page fragments rather than copy */ ++ memcpy(&toi->frags[toi->nr_frags], fromi->frags, ++ sizeof(fromi->frags[0]) * fromi->nr_frags); ++ toi->nr_frags += fromi->nr_frags; ++ fromi->nr_frags = 0; ++ from->data_len = 0; ++ from->len = 0; ++ to->truesize += from->truesize - new_truesize; ++ from->truesize = new_truesize; ++ ++ /* We are done with this SKB */ ++ consume_skb(from); ++} ++ + static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + { + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *skb, *skb2, **nextp; +- struct skb_shared_info *shi; ++ struct skb_shared_info *shi, *shi2; + + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ bool share_ok = true; + int remaining; + + /* protocol comes to us cleared sometimes */ +@@ -357,7 +406,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + + /* Re-home (un-nest) nested fragment lists. We need to do this + * b/c we will simply be appending any following aggregated +- * inner packets to the frag list. ++ * inner packets using the frag list. + */ + shi = skb_shinfo(skb); + nextp = &shi->frag_list; +@@ -369,6 +418,9 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + nextp = &(*nextp)->next; + } + ++ if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) ++ share_ok = false; ++ + /* See if we have enough space to simply append. 
+ *
+ * NOTE: Maybe do not append if we will be mis-aligned,
+@@ -393,18 +445,36 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+ }
+ }
+
++ /* skb->pp_recycle is passed to __skb_frag_unref for all
++ * frag pages so we can only share pages with skbs that
++ * match ourselves.
++ */
++ shi2 = skb_shinfo(skb2);
++ if (share_ok &&
++ (shi2->frag_list ||
++ (!skb2->head_frag && skb_headlen(skb)) ||
++ skb->pp_recycle != skb2->pp_recycle ||
++ skb_zcopy(skb2) ||
++ (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS)))
++ share_ok = false;
++
+ /* Do accounting */
+ skb->data_len += skb2->len;
+ skb->len += skb2->len;
+ remaining -= skb2->len;
+
+- /* Append to the frag_list */
+- *nextp = skb2;
+- nextp = &skb2->next;
+- BUG_ON(*nextp);
+- if (skb_has_frag_list(skb2))
+- nextp = iptfs_rehome_fraglist(nextp, skb2);
+- skb->truesize += skb2->truesize;
++ if (share_ok) {
++ iptfs_consume_frags(skb, skb2);
++ } else {
++ /* Append to the frag_list */
++ *nextp = skb2;
++ nextp = &skb2->next;
++ BUG_ON(*nextp);
++ if (skb_has_frag_list(skb2))
++ nextp = iptfs_rehome_fraglist(nextp,
++ skb2);
++ skb->truesize += skb2->truesize;
++ }
+ }
+
+ xfrm_output(NULL, skb);
+-- 
+2.46.0
+
diff --git a/patches/v8/v8-0010-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch b/patches/v8/v8-0010-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch
new file mode 100644
index 0000000..d6d8eb3
--- /dev/null
+++ b/patches/v8/v8-0010-xfrm-iptfs-add-fragmenting-of-larger-than-MTU-use.patch
@@ -0,0 +1,557 @@
+From cc57b2837cce6e5db78580eac19f950373ac16ac Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 31 Jul 2024 12:23:28 -0400
+Subject: [PATCH ipsec-next v8 10/16] xfrm: iptfs: add fragmenting of larger
+ than MTU user packets
+
+Add support for tunneling user (inner) packets that are larger than the
+tunnel's path MTU (outer) using IP-TFS fragmentation.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 407 +++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 381 insertions(+), 26 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 20c19894720e..38735e2d64c3 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -46,12 +46,23 @@
+ */
+ #define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
+
++/* 1) skb->head should be cache aligned.
++ * 2) when resv is for L2 headers (i.e., ethernet) we want the cacheline to
++ * start -16 from data.
++ * 3) when resv is for L3+L2 headers IOW skb->data points at the IPTFS payload
++ * we want data to be cache line aligned so all the pushed headers will be in
++ * another cacheline.
++ */
++#define XFRM_IPTFS_MIN_L3HEADROOM 128
++#define XFRM_IPTFS_MIN_L2HEADROOM (64 + 16)
++#define IPTFS_FRAG_COPY_MAX 256 /* max for copying to create iptfs frags */
+ #define NSECS_IN_USEC 1000
+
+ #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
+ /**
+ * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
++ * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
+ * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
+ * otherwise the user specified value.
+ * @max_queue_size: The maximum number of octets allowed to be queued to be sent
+@@ -59,6 +70,7 @@
+ * packets enqueued.
+ */ + struct xfrm_iptfs_config { ++ bool dont_frag : 1; + u32 pkt_size; /* outer_packet_size or 0 */ + u32 max_queue_size; /* octets */ + }; +@@ -88,13 +100,71 @@ struct xfrm_iptfs_data { + u32 payload_mtu; /* max payload size */ + }; + +-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); ++static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + + /* ================= */ + /* SK_BUFF Functions */ + /* ================= */ + ++/** ++ * iptfs_alloc_skb() - Allocate a new `skb` using a meta-data template. ++ * @tpl: the template to copy the new `skb`s meta-data from. ++ * @len: the linear length of the head data, zero is fine. ++ * @l3resv: true if reserve needs to support pushing L3 headers ++ * ++ * A new `skb` is allocated and it's meta-data is initialized from `tpl`, the ++ * head data is sized to `len` + reserved space set according to the @l3resv ++ * boolean. When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which ++ * arranges for `skb->data - 16` (etherhdr space) to be the start of a cacheline. ++ * Otherwise, @l3resv is true and resv is either the size of headroom from `tpl` or ++ * XFRM_IPTFS_MIN_L3HEADROOM whichever is greater, which tries to align ++ * skb->data to a cacheline as all headers will be pushed on the previous ++ * cacheline bytes. ++ * ++ * When copying meta-data from the @tpl, the sk_buff->headers are not copied. ++ * ++ * Zero length skbs are allocated when we only need a head skb to hold new ++ * packet headers (basically the mac header) that sit on top of existing shared ++ * packet data. ++ * ++ * Return: the new skb or NULL. ++ */ ++static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, ++ bool l3resv) ++{ ++ struct sk_buff *skb; ++ u32 resv; ++ ++ if (!l3resv) { ++ resv = XFRM_IPTFS_MIN_L2HEADROOM; ++ } else { ++ resv = skb_headroom(tpl); ++ if (resv < XFRM_IPTFS_MIN_L3HEADROOM) ++ resv = XFRM_IPTFS_MIN_L3HEADROOM; ++ } ++ ++ skb = alloc_skb(len + resv, GFP_ATOMIC); ++ if (!skb) { ++ XFRM_INC_STATS(dev_net(tpl->dev), LINUX_MIB_XFRMNOSKBERROR); ++ return NULL; ++ } ++ ++ skb_reserve(skb, resv); ++ ++ /* We do not want any of the tpl->headers copied over, so we do ++ * not use `skb_copy_header()`. ++ */ ++ skb->tstamp = tpl->tstamp; ++ skb->dev = tpl->dev; ++ memcpy(skb->cb, tpl->cb, sizeof(skb->cb)); ++ skb_dst_copy(skb, tpl); ++ __skb_ext_copy(skb, tpl); ++ __nf_copy(skb, tpl, false); ++ ++ return skb; ++} ++ + /** + * skb_head_to_frag() - initialize a skb_frag_t based on skb head data + * @skb: skb with the head data +@@ -109,6 +179,41 @@ static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); + } + ++/** ++ * skb_copy_bits_seq - copy bits from a skb_seq_state to kernel buffer ++ * @st: source skb_seq_state ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy @len bytes from @offset bytes into the source @st to the destination ++ * buffer @to. `offset` should increase (or be unchanged) with each subsequent ++ * call to this function. If offset needs to decrease from the previous use `st` ++ * should be reset first. 
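++ * For example, a caller may copy bytes [0, 6) and then bytes [6, len)
++ * with the same state; to read from offset 0 again it must first reset
++ * via skb_abort_seq_read() followed by a new skb_prepare_seq_read().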
++ * ++ * Return: 0 on success or a negative error code on failure ++ */ ++static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, ++ int len) ++{ ++ const u8 *data; ++ u32 sqlen; ++ ++ for (;;) { ++ sqlen = skb_seq_read(offset, &data, st); ++ if (sqlen == 0) ++ return -ENOMEM; ++ if (sqlen >= len) { ++ memcpy(to, data, len); ++ return 0; ++ } ++ memcpy(to, data, sqlen); ++ to += sqlen; ++ offset += sqlen; ++ len -= sqlen; ++ } ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -153,7 +258,7 @@ static int iptfs_get_cur_pmtu(struct xfrm_state *x, + { + struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); + u32 payload_mtu = xtfs->payload_mtu; +- u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached); ++ u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached); + + if (payload_mtu && payload_mtu < pmtu) + pmtu = payload_mtu; +@@ -216,7 +321,8 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + + BUG_ON(!xtfs); + +- pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); ++ if (xtfs->cfg.dont_frag) ++ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); + + /* Break apart GSO skbs. If the queue is nearing full then we want the + * accounting and queuing to be based on the individual packets not on the +@@ -256,8 +362,10 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + continue; + } + +- /* Fragmenting handled in following commits. */ +- if (iptfs_is_too_big(sk, skb, pmtu)) { ++ /* If the user indicated no iptfs fragmenting check before ++ * enqueue. ++ */ ++ if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -301,6 +409,219 @@ static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + } + ++/** ++ * iptfs_copy_create_frag() - create an inner fragment skb. ++ * @st: The source packet data. ++ * @offset: offset in @st of the new fragment data. ++ * @copy_len: the amount of data to copy from @st. ++ * ++ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must ++ * not be greater than the max fragment size. ++ * ++ * Return: the new fragment skb or an ERR_PTR(). ++ */ ++static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, ++ u32 offset, u32 copy_len) ++{ ++ struct sk_buff *src = st->root_skb; ++ struct sk_buff *skb; ++ int err; ++ ++ skb = iptfs_alloc_skb(src, copy_len, true); ++ if (!skb) ++ return ERR_PTR(-ENOMEM); ++ ++ /* Now copy `copy_len` data from src */ ++ err = skb_copy_bits_seq(st, offset, skb_put(skb, copy_len), copy_len); ++ if (err) { ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ ++ return skb; ++} ++ ++/** ++ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. ++ * @skbp: the source packet skb (IN), skb holding the last fragment in ++ * the fragment stream (OUT). ++ * @xtfs: IPTFS SA state. ++ * @mtu: the max IPTFS fragment size. ++ * ++ * This function is responsible for fragmenting a larger inner packet into a ++ * sequence of IPTFS payload packets. The last fragment is returned rather than ++ * being sent so that the caller can append more inner packets (aggregation) if ++ * there is room. 
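++ * On success the caller owns the returned last fragment and may append
++ * further inner packets to it before sending; on failure all fragments
++ * (including the original skb) will have been freed.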
++ * ++ * Return: 0 on success or a negative error code on failure ++ */ ++static int iptfs_copy_create_frags(struct sk_buff **skbp, ++ struct xfrm_iptfs_data *xtfs, u32 mtu) ++{ ++ struct skb_seq_state skbseq; ++ struct list_head sublist; ++ struct sk_buff *skb = *skbp; ++ struct sk_buff *nskb = *skbp; ++ u32 copy_len, offset; ++ u32 to_copy = skb->len - mtu; ++ int err = 0; ++ ++ INIT_LIST_HEAD(&sublist); ++ ++ BUG_ON(skb->len <= mtu); ++ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); ++ ++ /* A trimmed `skb` will be sent as the first fragment, later. */ ++ offset = mtu; ++ to_copy = skb->len - offset; ++ while (to_copy) { ++ /* Send all but last fragment to allow agg. append */ ++ list_add_tail(&nskb->list, &sublist); ++ ++ /* FUTURE: if the packet has an odd/non-aligning length we could ++ * send less data in the penultimate fragment so that the last ++ * fragment then ends on an aligned boundary. ++ */ ++ copy_len = min(to_copy, mtu); ++ nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len); ++ if (IS_ERR(nskb)) { ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMOUTERROR); ++ skb_abort_seq_read(&skbseq); ++ err = PTR_ERR(nskb); ++ nskb = NULL; ++ break; ++ } ++ iptfs_output_prepare_skb(nskb, to_copy); ++ offset += copy_len; ++ to_copy -= copy_len; ++ } ++ skb_abort_seq_read(&skbseq); ++ ++ /* return last fragment that will be unsent (or NULL) */ ++ *skbp = nskb; ++ ++ /* trim the original skb to MTU */ ++ if (!err) ++ err = pskb_trim(skb, mtu); ++ ++ if (err) { ++ /* Free all frags. Don't bother sending a partial packet we will ++ * never complete. ++ */ ++ kfree_skb(nskb); ++ list_for_each_entry_safe(skb, nskb, &sublist, list) { ++ skb_list_del_init(skb); ++ kfree_skb(skb); ++ } ++ return err; ++ } ++ ++ /* prepare the initial fragment with an iptfs header */ ++ iptfs_output_prepare_skb(skb, 0); ++ ++ /* Send all but last fragment, if we fail to send a fragment then free ++ * the rest -- no point in sending a packet that can't be reassembled. ++ */ ++ list_for_each_entry_safe(skb, nskb, &sublist, list) { ++ skb_list_del_init(skb); ++ if (!err) ++ err = xfrm_output(NULL, skb); ++ else ++ kfree_skb(skb); ++ } ++ if (err) ++ kfree_skb(*skbp); ++ return err; ++} ++ ++/** ++ * iptfs_first_should_copy() - determine if we should copy packet data. ++ * @first_skb: the first skb in the packet ++ * @mtu: the MTU. ++ * ++ * Determine if we should create subsequent skbs to hold the remaining data from ++ * a large inner packet by copying the packet data, or cloning the original skb ++ * and adjusting the offsets. ++ * ++ * Return: true if we should copy the data out of the skb. ++ */ ++static bool iptfs_first_should_copy(struct sk_buff *first_skb, u32 mtu) ++{ ++ u32 frag_copy_max; ++ ++ /* If we have less than frag_copy_max for remaining packet we copy ++ * those tail bytes as it is more efficient. ++ */ ++ frag_copy_max = min(mtu, IPTFS_FRAG_COPY_MAX); ++ if ((int)first_skb->len - (int)mtu < (int)frag_copy_max) ++ return true; ++ ++ /* If we have non-linear skb just use copy */ ++ if (skb_is_nonlinear(first_skb)) ++ return true; ++ ++ /* So we have a simple linear skb, easy to clone and share */ ++ return false; ++} ++ ++/** ++ * iptfs_first_skb() - handle the first dequeued inner packet for output ++ * @skbp: the source packet skb (IN), skb holding the last fragment in ++ * the fragment stream (OUT). ++ * @xtfs: IPTFS SA state. ++ * @mtu: the max IPTFS fragment size. 
++ *
++ * This function is responsible for fragmenting a larger inner packet into a
++ * sequence of IPTFS payload packets. If it needs to fragment into subsequent
++ * skb's, it will either do so by copying or cloning.
++ *
++ * The last fragment is returned rather than being sent so that the caller can
++ * append more inner packets (aggregation) if there is room.
++ *
++ * Return: 0 on success or a negative error code on failure
++ */
++static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs,
++ u32 mtu)
++{
++ struct sk_buff *skb = *skbp;
++ int err;
++
++ /* Classic ESP skips the don't fragment ICMP error if DF is clear on
++ * the inner packet or ignore_df is set. Otherwise it will send an ICMP
++ * or local error if the inner packet won't fit its MTU.
++ *
++ * With IPTFS we do not care about the inner packet DF bit. If the
++ * tunnel is configured to "don't fragment" we error back if things
++ * don't fit in our max packet size. Otherwise we iptfs-fragment as
++ * normal.
++ */
++
++ /* The opportunity for HW offload has ended */
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ err = skb_checksum_help(skb);
++ if (err)
++ return err;
++ }
++
++ /* We've split these up before queuing */
++ BUG_ON(skb_is_gso(skb));
++
++ /* Simple case -- it fits. `mtu` accounted for all the overhead
++ * including the basic IPTFS header.
++ */
++ if (skb->len <= mtu) {
++ iptfs_output_prepare_skb(skb, 0);
++ return 0;
++ }
++
++ if (iptfs_first_should_copy(skb, mtu))
++ return iptfs_copy_create_frags(skbp, xtfs, mtu);
++
++ /* For now we always copy */
++ return iptfs_copy_create_frags(skbp, xtfs, mtu);
++}
++
+ static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp,
+ struct sk_buff *child)
+ {
+@@ -360,6 +681,15 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+ struct sk_buff *skb, *skb2, **nextp;
+ struct skb_shared_info *shi, *shi2;
+
++ /* If we are fragmenting due to a large inner packet we will output all
++ * the outer IPTFS packets required to contain the fragments of the
++ * single large inner packet. These outer packets need to be sent
++ * consecutively (ESP seq-wise). Since this output function is always
++ * running from a timer we do not need a lock to provide this guarantee.
++ * We will output our packets consecutively before the timer is allowed
++ * to run again on some other CPU.
++ */
++
+ while ((skb = __skb_dequeue(list))) {
+ u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+ bool share_ok = true;
+ int remaining;
+@@ -370,7 +700,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+ htons(ETH_P_IP) :
+ htons(ETH_P_IPV6);
+
+- if (skb->len > mtu) {
++ if (skb->len > mtu && xtfs->cfg.dont_frag) {
+ /* We handle this case before enqueueing so we are only
+ * here b/c MTU changed after we enqueued before we
+ * dequeued, just drop these.
+@@ -381,26 +711,22 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+ continue;
+ }
+
+- /* If we don't have a cksum in the packet we need to add one
+- * before encapsulation.
++ /* Convert first inner packet into an outer IPTFS packet,
++ * dealing with any fragmentation into multiple outer packets
++ * if necessary.
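++ * (When fragmenting, iptfs_first_skb() will already have sent all but
++ * the last fragment by the time it returns.)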
+ */
+- if (skb->ip_summed == CHECKSUM_PARTIAL) {
+- if (skb_checksum_help(skb)) {
+- XFRM_INC_STATS(dev_net(skb_dst(skb)->dev),
+- LINUX_MIB_XFRMOUTERROR);
+- kfree_skb(skb);
+- continue;
+- }
+- }
+-
+- /* Convert first inner packet into an outer IPTFS packet */
+- iptfs_output_prepare_skb(skb, 0);
++ if (iptfs_first_skb(&skb, xtfs, mtu))
++ continue;
+
+- /* The space remaining to send more inner packet data is `mtu` -
+- * (skb->len - sizeof iptfs header). This is b/c the `mtu` value
+- * has the basic IPTFS header len accounted for, and we added
+- * that header to the skb so it is a part of skb->len, thus we
+- * subtract it from the skb length.
++ /* If fragmentation was required the returned skb is the last
++ * IPTFS fragment in the chain, and its IPTFS header blkoff has
++ * been set just past the end of the fragment data.
++ *
++ * In either case the space remaining to send more inner packet
++ * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c
++ * the `mtu` value has the basic IPTFS header len accounted for,
++ * and we added that header to the skb so it is a part of
++ * skb->len, thus we subtract it from the skb length.
+ */
+ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+
+@@ -641,11 +967,13 @@ static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+ /* ========================== */
+
+ /**
+- * iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
++ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
+ * @x: xfrm state.
+ * @outer_mtu: the outer mtu
++ *
++ * Return: Correct MTU taking into account the encap overhead.
+ */
+-static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
++static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+ {
+ struct crypto_aead *aead;
+ u32 blksize;
+@@ -656,6 +984,23 @@ static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+ ~(blksize - 1)) - 2;
+ }
+
++/**
++ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm.
++ * @x: xfrm state.
++ * @outer_mtu: Outer MTU for the encapsulated packet.
++ *
++ * Return: Correct MTU taking into account the encap overhead.
++ */
++static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
++{
++ struct xfrm_iptfs_data *xtfs = x->mode_data;
++
++ /* If not dont-frag we have no MTU */
++ if (!xtfs->cfg.dont_frag)
++ return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU;
++ return __iptfs_get_inner_mtu(x, outer_mtu);
++}
++
+ /**
+ * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
+ * @net: the net data
+@@ -677,6 +1022,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,
+ xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
+ xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
+
++ if (attrs[XFRMA_IPTFS_DONT_FRAG])
++ xc->dont_frag = true;
+ if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
+ xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
+ if (!xc->pkt_size) {
+@@ -710,6 +1057,8 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x)
+ unsigned int l = 0;
+
+ if (x->dir == XFRM_SA_DIR_OUT) {
++ if (xc->dont_frag)
++ l += nla_total_size(0); /* dont-frag flag */
+ l += nla_total_size(sizeof(u32)); /* init delay usec */
+ l += nla_total_size(sizeof(xc->max_queue_size));
+ l += nla_total_size(sizeof(xc->pkt_size));
+@@ -726,6 +1075,12 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
+ u64 q;
+
+ if (x->dir == XFRM_SA_DIR_OUT) {
++ if (xc->dont_frag) {
++ ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
++ if (ret)
++ return ret;
++ }
++
+ q = xtfs->init_delay_ns;
+ (void)do_div(q, NSECS_IN_USEC);
+ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
+-- 
+2.46.0
+
diff --git a/patches/v8/v8-0011-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch b/patches/v8/v8-0011-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch
new file mode 100644
index 0000000..5ee96f9
--- /dev/null
+++ b/patches/v8/v8-0011-xfrm-iptfs-add-basic-receive-packet-tunnel-egress.patch
@@ -0,0 +1,310 @@
+From 08da91e83f8672f453f9d67d8527fac38cd0c5ac Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Sat, 3 Aug 2024 00:43:21 -0400
+Subject: [PATCH ipsec-next v8 11/16] xfrm: iptfs: add basic receive packet
+ (tunnel egress) handling
+
+Add handling of packets received from the tunnel. This implements
+tunnel egress functionality.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 268 ++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 268 insertions(+)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 38735e2d64c3..ea0f47af345d 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -20,6 +20,10 @@
+
+ #include "xfrm_inout.h"
+
++/* IPTFS encap (header) values. */
++#define IPTFS_SUBTYPE_BASIC 0
++#define IPTFS_SUBTYPE_CC 1
++
+ /* ------------------------------------------------ */
+ /* IPTFS default SA values (tunnel ingress/dir-out) */
+ /* ------------------------------------------------ */
+@@ -214,6 +218,269 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to,
+ }
+ }
+
++/* ================================== */
++/* IPTFS Receiving (egress) Functions */
++/* ================================== */
++
++/**
++ * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff.
++ * @skblen: the total data size for `skb`.
++ * @st: The source for the rest of the data to copy into `skb`.
++ * @off: The offset into @st to copy data from.
++ * @len: The length of data to copy from @st into `skb`. This must be <=
++ * @skblen.
++ *
++ * Create a new sk_buff `skb` with @skblen of packet data space. Then, using
++ * seq functions, copy @len bytes from @st into `skb` starting from @off.
++ *
++ * It is an error for @len to be greater than the amount of data left in @st.
++ *
++ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
++ */ ++static struct sk_buff * ++iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) ++{ ++ struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false); ++ ++ if (!skb) ++ return NULL; ++ if (skb_copy_bits_seq(st, off, skb_put(skb, len), len)) { ++ XFRM_INC_STATS(dev_net(st->root_skb->dev), ++ LINUX_MIB_XFRMINERROR); ++ kfree_skb(skb); ++ return NULL; ++ } ++ return skb; ++} ++ ++/** ++ * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. ++ * @x: xfrm state ++ * @skb: the inner packet ++ * ++ * Finish the standard xfrm processing on the inner packet prior to sending back ++ * through gro_cells_receive. We do this separately b/c we are building a list ++ * of packets in the hopes that one day a list will be taken by ++ * xfrm_input. ++ */ ++static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ skb_reset_network_header(skb); ++ ++ /* The packet is going back through gro_cells_receive no need to ++ * set this. ++ */ ++ skb_reset_transport_header(skb); ++ ++ /* Packet already has checksum value set. */ ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ /* Our skb will contain the header data copied when this outer packet ++ * which contained the start of this inner packet. This is true ++ * when we allocate a new skb as well as when we reuse the existing skb. ++ */ ++ if (ip_hdr(skb)->version == 0x4) { ++ struct iphdr *iph = ip_hdr(skb); ++ ++ if (x->props.flags & XFRM_STATE_DECAP_DSCP) ++ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); ++ if (!(x->props.flags & XFRM_STATE_NOECN)) ++ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) ++ IP_ECN_set_ce(iph); ++ ++ skb->protocol = htons(ETH_P_IP); ++ } else { ++ struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ if (x->props.flags & XFRM_STATE_DECAP_DSCP) ++ ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); ++ if (!(x->props.flags & XFRM_STATE_NOECN)) ++ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) ++ IP6_ECN_set_ce(skb, iph); ++ ++ skb->protocol = htons(ETH_P_IPV6); ++ } ++} ++ ++/** ++ * iptfs_input() - handle receipt of iptfs payload ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * Process the IPTFS payload in `skb` and consume it afterwards. ++ * ++ * Returns 0. ++ */ ++static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ u8 hbytes[sizeof(struct ipv6hdr)]; ++ struct ip_iptfs_cc_hdr iptcch; ++ struct skb_seq_state skbseq; ++ struct list_head sublist; /* rename this it's just a list */ ++ struct sk_buff *first_skb, *next; ++ const unsigned char *old_mac; ++ struct ip_iptfs_hdr *ipth; ++ struct iphdr *iph; ++ struct net *net; ++ u32 remaining, iplen, iphlen, data, tail; ++ u32 blkoff; ++ ++ net = xs_net(x); ++ first_skb = NULL; ++ ++ /* Large enough to hold both types of header */ ++ ipth = (struct ip_iptfs_hdr *)&iptcch; ++ ++ /* Save the old mac header if set */ ++ old_mac = skb_mac_header_was_set(skb) ? 
skb_mac_header(skb) : NULL; ++ ++ skb_prepare_seq_read(skb, 0, skb->len, &skbseq); ++ ++ /* Get the IPTFS header and validate it */ ++ ++ if (skb_copy_bits_seq(&skbseq, 0, ipth, sizeof(*ipth))) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ data = sizeof(*ipth); ++ ++ /* Set data past the basic header */ ++ if (ipth->subtype == IPTFS_SUBTYPE_CC) { ++ /* Copy the rest of the CC header */ ++ remaining = sizeof(iptcch) - sizeof(*ipth); ++ if (skb_copy_bits_seq(&skbseq, data, ipth + 1, remaining)) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ data += remaining; ++ } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); ++ goto done; ++ } ++ ++ if (ipth->flags != 0) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); ++ goto done; ++ } ++ ++ INIT_LIST_HEAD(&sublist); ++ ++ /* Fragment handling in following commits */ ++ blkoff = ntohs(ipth->block_offset); ++ data += blkoff; ++ ++ /* New packets */ ++ tail = skb->len; ++ while (data < tail) { ++ __be16 protocol = 0; ++ ++ /* Gather information on the next data block. ++ * `data` points to the start of the data block. ++ */ ++ remaining = tail - data; ++ ++ /* try and copy enough bytes to read length from ipv4/ipv6 */ ++ iphlen = min_t(u32, remaining, 6); ++ if (skb_copy_bits_seq(&skbseq, data, hbytes, iphlen)) { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ ++ iph = (struct iphdr *)hbytes; ++ if (iph->version == 0x4) { ++ /* must have at least tot_len field present */ ++ if (remaining < 4) ++ break; ++ ++ iplen = be16_to_cpu(iph->tot_len); ++ iphlen = iph->ihl << 2; ++ protocol = cpu_to_be16(ETH_P_IP); ++ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; ++ } else if (iph->version == 0x6) { ++ /* must have at least payload_len field present */ ++ if (remaining < 6) ++ break; ++ ++ iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); ++ iplen += sizeof(struct ipv6hdr); ++ iphlen = sizeof(struct ipv6hdr); ++ protocol = cpu_to_be16(ETH_P_IPV6); ++ XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = ++ ipv6_get_dsfield((struct ipv6hdr *)iph); ++ } else if (iph->version == 0x0) { ++ /* pad */ ++ break; ++ } else { ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); ++ goto done; ++ } ++ ++ if (unlikely(skbseq.stepped_offset)) { ++ /* We need to reset our seq read, it can't backup at ++ * this point. ++ */ ++ struct sk_buff *save = skbseq.root_skb; ++ ++ skb_abort_seq_read(&skbseq); ++ skb_prepare_seq_read(save, data, tail, &skbseq); ++ } ++ ++ if (!first_skb) ++ first_skb = skb; ++ ++ /* Fragment handling in following commits */ ++ if (iplen > remaining) ++ break; ++ ++ skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, iplen); ++ if (!skb) { ++ /* skip to next packet or done */ ++ data += iplen; ++ continue; ++ } ++ ++ skb->protocol = protocol; ++ if (old_mac) { ++ /* rebuild the mac header */ ++ skb_set_mac_header(skb, -first_skb->mac_len); ++ memcpy(skb_mac_header(skb), old_mac, ++ first_skb->mac_len); ++ eth_hdr(skb)->h_proto = skb->protocol; ++ } ++ ++ data += iplen; ++ iptfs_complete_inner_skb(x, skb); ++ list_add_tail(&skb->list, &sublist); ++ } ++ ++ /* Send the packets! 
*/
++ list_for_each_entry_safe(skb, next, &sublist, list) {
++ skb_list_del_init(skb);
++ if (xfrm_input(skb, 0, 0, -2))
++ kfree_skb(skb);
++ }
++
++done:
++ skb = skbseq.root_skb;
++ skb_abort_seq_read(&skbseq);
++
++ if (first_skb) {
++ consume_skb(first_skb);
++ } else {
++ /* skb is the original passed in skb, but we didn't get far
++ * enough to process it as the first_skb.
++ */
++ kfree_skb(skb);
++ }
++
++ /* We always have dealt with the input SKB, either we are re-using it,
++ * or we have freed it. Return EINPROGRESS so that xfrm_input stops
++ * processing it.
++ */
++ return -EINPROGRESS;
++}
++
+ /* ================================= */
+ /* IPTFS Sending (ingress) Functions */
+ /* ================================= */
+@@ -1178,6 +1445,7 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = {
+ .sa_len = iptfs_sa_len,
+ .clone = iptfs_clone,
+ .get_inner_mtu = iptfs_get_inner_mtu,
++ .input = iptfs_input,
+ .output = iptfs_output_collect,
+ .prepare_output = iptfs_prepare_output,
+ };
+-- 
+2.46.0
+
diff --git a/patches/v8/v8-0012-xfrm-iptfs-handle-received-fragmented-inner-packe.patch b/patches/v8/v8-0012-xfrm-iptfs-handle-received-fragmented-inner-packe.patch
new file mode 100644
index 0000000..6983d2a
--- /dev/null
+++ b/patches/v8/v8-0012-xfrm-iptfs-handle-received-fragmented-inner-packe.patch
@@ -0,0 +1,675 @@
+From 8ab369b695c4af9ecdc7fe709a0c7a364d85e234 Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Sat, 3 Aug 2024 01:16:53 -0400
+Subject: [PATCH ipsec-next v8 12/16] xfrm: iptfs: handle received fragmented
+ inner packets
+
+Add support for handling receipt of partial inner packets that have
+been fragmented across multiple outer IP-TFS tunnel packets.
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 488 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 474 insertions(+), 14 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index ea0f47af345d..5f9159260319 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -24,6 +24,21 @@
+ #define IPTFS_SUBTYPE_BASIC 0
+ #define IPTFS_SUBTYPE_CC 1
+
++/* ----------------------------------------------- */
++/* IP-TFS default SA values (tunnel egress/dir-in) */
++/* ----------------------------------------------- */
++
++/**
++ * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time
++ *
++ * The default IPTFS drop time in microseconds. The drop time is the amount of
++ * time before a missing out-of-order IPTFS tunnel packet is considered lost.
++ * See also the reorder window.
++ *
++ * Default 1s.
++ */
++#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000
++
+ /* ------------------------------------------------ */
+ /* IPTFS default SA values (tunnel ingress/dir-out) */
+ /* ------------------------------------------------ */
+@@ -89,6 +104,13 @@ struct xfrm_iptfs_config {
+ * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
+ * @iptfs_timer: output timer.
+ * @payload_mtu: max payload size.
++ * @drop_lock: lock to protect reorder queue.
++ * @drop_timer: timer for considering next packet lost.
++ * @drop_time_ns: timer interval in nanoseconds.
++ * @ra_newskb: new pkt being reassembled.
++ * @ra_wantseq: expected next sequence for reassembly.
++ * @ra_runt: last pkt bytes from very end of last skb.
++ * @ra_runtlen: size of ra_runt.
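++ *
++ * A "runt" is the first few bytes of an inner packet which arrived at
++ * the very end of an IPTFS payload, too short to even carry the inner
++ * IP length field; reassembly resumes when the next payload arrives.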
+ */ + struct xfrm_iptfs_data { + struct xfrm_iptfs_config cfg; +@@ -102,10 +124,33 @@ struct xfrm_iptfs_data { + u64 init_delay_ns; /* nanoseconds */ + struct hrtimer iptfs_timer; /* output timer */ + u32 payload_mtu; /* max payload size */ ++ ++ /* Tunnel egress */ ++ spinlock_t drop_lock; ++ struct hrtimer drop_timer; ++ u64 drop_time_ns; ++ ++ /* Tunnel egress reassembly */ ++ struct sk_buff *ra_newskb; /* new pkt being reassembled */ ++ u64 ra_wantseq; /* expected next sequence */ ++ u8 ra_runt[6]; /* last pkt bytes from last skb */ ++ u8 ra_runtlen; /* count of ra_runt */ + }; + + static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); + static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); ++static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); ++ ++/* ================= */ ++/* Utility Functions */ ++/* ================= */ ++ ++static u64 __esp_seq(struct sk_buff *skb) ++{ ++ u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); ++ ++ return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; ++} + + /* ================= */ + /* SK_BUFF Functions */ +@@ -254,6 +299,67 @@ iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) + return skb; + } + ++/** ++ * iptfs_input_save_runt() - save data in xtfs runt space. ++ * @xtfs: xtfs state ++ * @seq: the current sequence ++ * @buf: packet data ++ * @len: length of packet data ++ * ++ * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data ++ * runt space. ++ */ ++static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, ++ u8 *buf, int len) ++{ ++ BUG_ON(xtfs->ra_newskb); /* we won't have a new SKB yet */ ++ ++ memcpy(xtfs->ra_runt, buf, len); ++ ++ xtfs->ra_runtlen = len; ++ xtfs->ra_wantseq = seq + 1; ++} ++ ++/** ++ * __iptfs_iphlen() - return the v4/v6 header length using packet data. ++ * @data: pointer at octet with version nibble ++ * ++ * The version data is expected to be valid (i.e., either 4 or 6). ++ * ++ * Return: the IP header size based on the IP version. ++ */ ++static u32 __iptfs_iphlen(u8 *data) ++{ ++ struct iphdr *iph = (struct iphdr *)data; ++ ++ if (iph->version == 0x4) ++ return sizeof(*iph); ++ BUG_ON(iph->version != 0x6); ++ return sizeof(struct ipv6hdr); ++} ++ ++/** ++ * __iptfs_iplen() - return the v4/v6 length using packet data. ++ * @data: pointer to ip (v4/v6) packet header ++ * ++ * Grab the IPv4 or IPv6 length value in the start of the inner packet header ++ * pointed to by `data`. Assumes data len is enough for the length field only. ++ * ++ * The version data is expected to be valid (i.e., either 4 or 6). ++ * ++ * Return: the length value. ++ */ ++static u32 __iptfs_iplen(u8 *data) ++{ ++ struct iphdr *iph = (struct iphdr *)data; ++ ++ if (iph->version == 0x4) ++ return ntohs(iph->tot_len); ++ BUG_ON(iph->version != 0x6); ++ return ntohs(((struct ipv6hdr *)iph)->payload_len) + ++ sizeof(struct ipv6hdr); ++} ++ + /** + * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. 
+ * @x: xfrm state
+@@ -303,6 +409,239 @@ static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
+ }
+ }
+
++static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
++{
++ assert_spin_locked(&xtfs->drop_lock);
++
++ /* We don't care if it works, locking takes care of things */
++ hrtimer_try_to_cancel(&xtfs->drop_timer);
++ if (free)
++ kfree_skb(xtfs->ra_newskb);
++ xtfs->ra_newskb = NULL;
++}
++
++/**
++ * iptfs_reassem_abort() - In-progress packet is aborted, free the state.
++ * @xtfs: xtfs state
++ */
++static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
++{
++ __iptfs_reassem_done(xtfs, true);
++}
++
++/**
++ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
++ * @xtfs: xtfs state
++ */
++static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
++{
++ __iptfs_reassem_done(xtfs, false);
++}
++
++/**
++ * iptfs_reassem_cont() - Continue the reassembly of an inner packet.
++ * @xtfs: xtfs state
++ * @seq: sequence of current packet
++ * @st: seq read state for current packet
++ * @skb: current packet
++ * @data: offset into sequential packet data
++ * @blkoff: packet blkoff value
++ * @list: list of skbs to enqueue completed packet on
++ *
++ * Process an IPTFS payload that has a non-zero `blkoff` or when we are
++ * expecting the continuation b/c we have a runt or in-progress packet.
++ *
++ * Return: the new data offset to continue processing from.
++ */
++static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
++ struct skb_seq_state *st, struct sk_buff *skb,
++ u32 data, u32 blkoff, struct list_head *list)
++{
++ struct sk_buff *newskb = xtfs->ra_newskb;
++ u32 remaining = skb->len - data;
++ u32 runtlen = xtfs->ra_runtlen;
++ u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem;
++
++ /* Handle packet fragment we aren't expecting */
++ if (!runtlen && !xtfs->ra_newskb)
++ return data + min(blkoff, remaining);
++
++ /* Important to remember that input to this function is an ordered
++ * packet stream (unless the user disabled the reorder window). Thus if
++ * we are waiting for, and expecting the next packet so we can continue
++ * assembly, a newer sequence number indicates older ones are not coming
++ * (or if they do should be ignored). Technically we can receive older
++ * ones when the reorder window is disabled; however, the user should
++ * have disabled fragmentation in this case, and regardless we don't
++ * deal with it.
++ *
++ * blkoff could be zero if the stream is messed up (or it's an all pad
++ * insertion); be careful to handle that case in each of the cases below.
++ */
++
++ /* Too old case: This can happen when the reorder window is disabled so
++ * ordering isn't actually guaranteed.
++ */
++ if (seq < xtfs->ra_wantseq)
++ return data + remaining;
++
++ /* Too new case: We missed what we wanted, clean up. */
++ if (seq > xtfs->ra_wantseq) {
++ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR);
++ goto abandon;
++ }
++
++ if (blkoff == 0) {
++ if ((*skb->data & 0xF0) != 0) {
++ XFRM_INC_STATS(xs_net(xtfs->x),
++ LINUX_MIB_XFRMINIPTFSERROR);
++ goto abandon;
++ }
++ /* Handle all pad case, advance expected sequence number.
++ * (RFC 9347 S2.2.3) ++ */ ++ xtfs->ra_wantseq++; ++ /* will end parsing */ ++ return data + remaining; ++ } ++ ++ if (runtlen) { ++ BUG_ON(xtfs->ra_newskb); ++ ++ /* Regardless of what happens we're done with the runt */ ++ xtfs->ra_runtlen = 0; ++ ++ /* The start of this inner packet was at the very end of the last ++ * iptfs payload which didn't include enough for the ip header ++ * length field. We must have *at least* that now. ++ */ ++ rrem = sizeof(xtfs->ra_runt) - runtlen; ++ if (remaining < rrem || blkoff < rrem) { ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ ++ /* fill in the runt data */ ++ if (skb_copy_bits_seq(st, data, &xtfs->ra_runt[runtlen], ++ rrem)) { ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMINBUFFERERROR); ++ goto abandon; ++ } ++ ++ /* We have enough data to get the ip length value now, ++ * allocate an in progress skb ++ */ ++ ipremain = __iptfs_iplen(xtfs->ra_runt); ++ if (ipremain < sizeof(xtfs->ra_runt)) { ++ /* length has to be at least runtsize large */ ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ ++ /* For the runt case we don't attempt sharing currently. NOTE: ++ * Currently, this IPTFS implementation will not create runts. ++ */ ++ ++ newskb = iptfs_alloc_skb(skb, ipremain, false); ++ if (!newskb) { ++ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR); ++ goto abandon; ++ } ++ xtfs->ra_newskb = newskb; ++ ++ /* Copy the runt data into the buffer, but leave data ++ * pointers the same as normal non-runt case. The extra `rrem` ++ * recopied bytes are basically cacheline free. Allows using ++ * same logic below to complete. ++ */ ++ memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, ++ sizeof(xtfs->ra_runt)); ++ } ++ ++ /* Continue reassembling the packet */ ++ ipremain = __iptfs_iplen(newskb->data); ++ iphlen = __iptfs_iphlen(newskb->data); ++ ++ /* Sanity check, we created the newskb knowing the IP length so the IP ++ * length can't now be shorter. 
++ */ ++ BUG_ON(newskb->len > ipremain); ++ ++ ipremain -= newskb->len; ++ if (blkoff < ipremain) { ++ /* Corrupt data, we don't have enough to complete the packet */ ++ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ ++ /* We want the IP header in linear space */ ++ if (newskb->len < iphlen) { ++ iphremain = iphlen - newskb->len; ++ if (blkoff < iphremain) { ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMINIPTFSERROR); ++ goto abandon; ++ } ++ fraglen = min(blkoff, remaining); ++ copylen = min(fraglen, iphremain); ++ BUG_ON(skb_tailroom(newskb) < copylen); ++ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), ++ copylen)) { ++ XFRM_INC_STATS(xs_net(xtfs->x), ++ LINUX_MIB_XFRMINBUFFERERROR); ++ goto abandon; ++ } ++ /* this is a silly condition that might occur anyway */ ++ if (copylen < iphremain) { ++ xtfs->ra_wantseq++; ++ return data + fraglen; ++ } ++ /* update data and things derived from it */ ++ data += copylen; ++ blkoff -= copylen; ++ remaining -= copylen; ++ ipremain -= copylen; ++ } ++ ++ fraglen = min(blkoff, remaining); ++ copylen = min(fraglen, ipremain); ++ ++ /* We verified this was true in the main receive routine */ ++ BUG_ON(skb_tailroom(newskb) < copylen); ++ ++ /* copy fragment data into newskb */ ++ if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { ++ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); ++ goto abandon; ++ } ++ ++ if (copylen < ipremain) { ++ xtfs->ra_wantseq++; ++ } else { ++ /* We are done with packet reassembly! */ ++ BUG_ON(copylen != ipremain); ++ iptfs_reassem_done(xtfs); ++ iptfs_complete_inner_skb(xtfs->x, newskb); ++ list_add_tail(&newskb->list, list); ++ } ++ ++ /* will continue on to new data block or end */ ++ return data + fraglen; ++ ++abandon: ++ if (xtfs->ra_newskb) { ++ iptfs_reassem_abort(xtfs); ++ } else { ++ xtfs->ra_runtlen = 0; ++ xtfs->ra_wantseq = 0; ++ } ++ /* skip past fragment, maybe to end */ ++ return data + min(blkoff, remaining); ++} ++ + /** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state +@@ -320,15 +659,20 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + struct list_head sublist; /* rename this it's just a list */ + struct sk_buff *first_skb, *next; + const unsigned char *old_mac; ++ struct xfrm_iptfs_data *xtfs; + struct ip_iptfs_hdr *ipth; + struct iphdr *iph; + struct net *net; + u32 remaining, iplen, iphlen, data, tail; +- u32 blkoff; ++ u32 blkoff, capturelen; ++ u64 seq; + ++ xtfs = x->mode_data; + net = xs_net(x); + first_skb = NULL; + ++ seq = __esp_seq(skb); ++ + /* Large enough to hold both types of header */ + ipth = (struct ip_iptfs_hdr *)&iptcch; + +@@ -366,12 +710,27 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + + INIT_LIST_HEAD(&sublist); + +- /* Fragment handling in following commits */ ++ /* Handle fragment at start of payload, and/or waiting reassembly. 
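++ * Checking once before taking the lock avoids the drop_lock cost on
++ * the common path where no reassembly state exists; the condition is
++ * then re-checked under the lock before it is acted on.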
*/ ++ + blkoff = ntohs(ipth->block_offset); +- data += blkoff; ++ /* check before locking i.e., maybe */ ++ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { ++ spin_lock(&xtfs->drop_lock); ++ ++ /* check again after lock */ ++ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { ++ data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, ++ blkoff, &sublist); ++ } ++ ++ spin_unlock(&xtfs->drop_lock); ++ } + + /* New packets */ ++ + tail = skb->len; ++ BUG_ON(xtfs->ra_newskb && data < tail); ++ + while (data < tail) { + __be16 protocol = 0; + +@@ -390,8 +749,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + iph = (struct iphdr *)hbytes; + if (iph->version == 0x4) { + /* must have at least tot_len field present */ +- if (remaining < 4) ++ if (remaining < 4) { ++ /* save the bytes we have, advance data and exit */ ++ iptfs_input_save_runt(xtfs, seq, hbytes, ++ remaining); ++ data += remaining; + break; ++ } + + iplen = be16_to_cpu(iph->tot_len); + iphlen = iph->ihl << 2; +@@ -399,8 +763,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + XFRM_MODE_SKB_CB(skbseq.root_skb)->tos = iph->tos; + } else if (iph->version == 0x6) { + /* must have at least payload_len field present */ +- if (remaining < 6) ++ if (remaining < 6) { ++ /* save the bytes we have, advance data and exit */ ++ iptfs_input_save_runt(xtfs, seq, hbytes, ++ remaining); ++ data += remaining; + break; ++ } + + iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); + iplen += sizeof(struct ipv6hdr); +@@ -410,6 +779,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + ipv6_get_dsfield((struct ipv6hdr *)iph); + } else if (iph->version == 0x0) { + /* pad */ ++ data = tail; + break; + } else { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); +@@ -429,16 +799,14 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + if (!first_skb) + first_skb = skb; + +- /* Fragment handling in following commits */ +- if (iplen > remaining) +- break; +- +- skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, iplen); ++ capturelen = min(iplen, remaining); ++ skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, capturelen); + if (!skb) { + /* skip to next packet or done */ +- data += iplen; ++ data += capturelen; + continue; + } ++ BUG_ON(skb->len != capturelen); + + skb->protocol = protocol; + if (old_mac) { +@@ -449,11 +817,38 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + eth_hdr(skb)->h_proto = skb->protocol; + } + +- data += iplen; ++ data += capturelen; ++ ++ if (skb->len < iplen) { ++ BUG_ON(data != tail); ++ BUG_ON(xtfs->ra_newskb); ++ ++ /* Start reassembly */ ++ spin_lock(&xtfs->drop_lock); ++ ++ xtfs->ra_newskb = skb; ++ xtfs->ra_wantseq = seq + 1; ++ if (!hrtimer_is_queued(&xtfs->drop_timer)) { ++ /* softirq blocked lest the timer fire and interrupt us */ ++ BUG_ON(!in_interrupt()); ++ hrtimer_start(&xtfs->drop_timer, ++ xtfs->drop_time_ns, ++ IPTFS_HRTIMER_MODE); ++ } ++ ++ spin_unlock(&xtfs->drop_lock); ++ ++ break; ++ } ++ + iptfs_complete_inner_skb(x, skb); + list_add_tail(&skb->list, &sublist); + } + ++ if (data != tail) ++ /* this should not happen from the above code */ ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); ++ + /* Send the packets! */ + list_for_each_entry_safe(skb, next, &sublist, list) { + skb_list_del_init(skb); +@@ -481,6 +876,47 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + return -EINPROGRESS; + } + ++/** ++ * iptfs_drop_timer() - Handle drop timer expiry. 
++ * @me: the timer ++ * ++ * This is similar to our input function. ++ * ++ * The drop timer is set when we start an in progress reassembly, and also when ++ * we save a future packet in the window saved array. ++ * ++ * NOTE packets in the save window are always newer WRT drop times as ++ * they get further in the future. i.e. for: ++ * ++ * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`, ++ * then D(n-1) <= D(n). ++ * ++ * So, regardless of why the timer is firing we can always discard any inprogress ++ * fragment; either it's the reassembly timer, or slot 0 is going to be ++ * dropped as S0 must have the most recent drop time, and slot 0 holds the ++ * continuation fragment of the in progress packet. ++ * ++ * Returns HRTIMER_NORESTART. ++ */ ++static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) ++{ ++ struct xfrm_iptfs_data *xtfs; ++ struct xfrm_state *x; ++ ++ xtfs = container_of(me, typeof(*xtfs), drop_timer); ++ x = xtfs->x; ++ ++ /* Drop any in progress packet */ ++ spin_lock(&xtfs->drop_lock); ++ if (xtfs->ra_newskb) { ++ kfree_skb(xtfs->ra_newskb); ++ xtfs->ra_newskb = NULL; ++ } ++ spin_unlock(&xtfs->drop_lock); ++ ++ return HRTIMER_NORESTART; ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -1287,6 +1723,7 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + + xc = &xtfs->cfg; + xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; ++ xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; + xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) +@@ -1305,6 +1742,10 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + } + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) + xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); ++ if (attrs[XFRMA_IPTFS_DROP_TIME]) ++ xtfs->drop_time_ns = ++ (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * ++ NSECS_IN_USEC; + if (attrs[XFRMA_IPTFS_INIT_DELAY]) + xtfs->init_delay_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * +@@ -1323,7 +1764,9 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + +- if (x->dir == XFRM_SA_DIR_OUT) { ++ if (x->dir == XFRM_SA_DIR_IN) { ++ l += nla_total_size(sizeof(u32)); /* drop time usec */ ++ } else { + if (xc->dont_frag) + l += nla_total_size(0); /* dont-frag flag */ + l += nla_total_size(sizeof(u32)); /* init delay usec */ +@@ -1341,7 +1784,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + int ret = 0; + u64 q; + +- if (x->dir == XFRM_SA_DIR_OUT) { ++ if (x->dir == XFRM_SA_DIR_IN) { ++ q = xtfs->drop_time_ns; ++ (void)do_div(q, NSECS_IN_USEC); ++ ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); ++ } else { + if (xc->dont_frag) { + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); + if (ret) +@@ -1372,6 +1819,10 @@ static void __iptfs_init_state(struct xfrm_state *x, + hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->iptfs_timer.function = iptfs_delay_timer; + ++ spin_lock_init(&xtfs->drop_lock); ++ hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); ++ xtfs->drop_timer.function = iptfs_drop_timer; ++ + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) +@@ -1395,6 +1846,8 @@ static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) + if (!xtfs) + return -ENOMEM; + ++ xtfs->ra_newskb = NULL; ++ + 
__iptfs_init_state(x, xtfs); + + return 0; +@@ -1431,6 +1884,13 @@ static void iptfs_delete_state(struct xfrm_state *x) + while ((skb = __skb_dequeue(&list))) + kfree_skb(skb); + ++ spin_lock_bh(&xtfs->drop_lock); ++ hrtimer_cancel(&xtfs->drop_timer); ++ spin_unlock_bh(&xtfs->drop_lock); ++ ++ if (xtfs->ra_newskb) ++ kfree_skb(xtfs->ra_newskb); ++ + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +-- +2.46.0 + diff --git a/patches/v8/v8-0013-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch b/patches/v8/v8-0013-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch new file mode 100644 index 0000000..46b65e4 --- /dev/null +++ b/patches/v8/v8-0013-xfrm-iptfs-add-reusing-received-skb-for-the-tunne.patch @@ -0,0 +1,194 @@ +From 09dbc1b46be133c94e9690a452983d6254d92d37 Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 01:08:37 -0400 +Subject: [PATCH ipsec-next v8 13/16] xfrm: iptfs: add reusing received skb for + the tunnel egress packet + +Add an optimization of re-using the tunnel outer skb re-transmission +of the inner packet to avoid skb allocation and copy. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 126 +++++++++++++++++++++++++++++++++++------- + 1 file changed, 105 insertions(+), 21 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index 5f9159260319..b05b72cd3ba5 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -657,19 +657,20 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + struct ip_iptfs_cc_hdr iptcch; + struct skb_seq_state skbseq; + struct list_head sublist; /* rename this it's just a list */ +- struct sk_buff *first_skb, *next; ++ struct sk_buff *first_skb, *defer, *next; + const unsigned char *old_mac; + struct xfrm_iptfs_data *xtfs; + struct ip_iptfs_hdr *ipth; + struct iphdr *iph; + struct net *net; +- u32 remaining, iplen, iphlen, data, tail; ++ u32 remaining, first_iplen, iplen, iphlen, data, tail; + u32 blkoff, capturelen; + u64 seq; + + xtfs = x->mode_data; + net = xs_net(x); + first_skb = NULL; ++ defer = NULL; + + seq = __esp_seq(skb); + +@@ -796,25 +797,94 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + skb_prepare_seq_read(save, data, tail, &skbseq); + } + +- if (!first_skb) ++ if (first_skb) { ++ skb = NULL; ++ } else { + first_skb = skb; ++ first_iplen = iplen; ++ ++ /* We are going to skip over `data` bytes to reach the ++ * start of the IP header of `iphlen` len for `iplen` ++ * inner packet. ++ */ ++ ++ if (skb_has_frag_list(skb)) { ++ defer = skb; ++ skb = NULL; ++ } else if (data + iphlen <= skb_headlen(skb) && ++ /* make sure our header is 32-bit aligned? */ ++ /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */ ++ skb_tailroom(skb) + tail - data >= iplen) { ++ /* Reuse the received skb. ++ * ++ * We have enough headlen to pull past any ++ * initial fragment data, leaving at least the ++ * IP header in the linear buffer space. ++ * ++ * For linear buffer space we only require that ++ * linear buffer space is large enough to ++ * eventually hold the entire reassembled ++ * packet (by including tailroom in the check). ++ * ++ * For non-linear tailroom is 0 and so we only ++ * re-use if the entire packet is present ++ * already. ++ * ++ * NOTE: there are many more options for ++ * sharing, KISS for now. Also, this can produce ++ * skb's with the IP header unaligned to 32 ++ * bits. 
If that ends up being a problem then a ++ * check should be added to the conditional ++ * above that the header lies on a 32-bit ++ * boundary as well. ++ */ ++ skb_pull(skb, data); ++ ++ /* our range just changed */ ++ data = 0; ++ tail = skb->len; ++ remaining = skb->len; ++ ++ skb->protocol = protocol; ++ skb_mac_header_rebuild(skb); ++ if (skb->mac_len) ++ eth_hdr(skb)->h_proto = skb->protocol; ++ ++ /* all pointers could be changed now reset walk */ ++ skb_abort_seq_read(&skbseq); ++ skb_prepare_seq_read(skb, data, tail, &skbseq); ++ } else { ++ /* We couldn't reuse the input skb so allocate a ++ * new one. ++ */ ++ defer = skb; ++ skb = NULL; ++ } ++ ++ /* Don't trim `first_skb` until the end as we are ++ * walking that data now. ++ */ ++ } + + capturelen = min(iplen, remaining); +- skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, capturelen); + if (!skb) { +- /* skip to next packet or done */ +- data += capturelen; +- continue; +- } +- BUG_ON(skb->len != capturelen); +- +- skb->protocol = protocol; +- if (old_mac) { +- /* rebuild the mac header */ +- skb_set_mac_header(skb, -first_skb->mac_len); +- memcpy(skb_mac_header(skb), old_mac, +- first_skb->mac_len); +- eth_hdr(skb)->h_proto = skb->protocol; ++ skb = iptfs_pskb_extract_seq(iplen, &skbseq, data, ++ capturelen); ++ if (!skb) { ++ /* skip to next packet or done */ ++ data += capturelen; ++ continue; ++ } ++ BUG_ON(skb->len != capturelen); ++ ++ skb->protocol = protocol; ++ if (old_mac) { ++ /* rebuild the mac header */ ++ skb_set_mac_header(skb, -first_skb->mac_len); ++ memcpy(skb_mac_header(skb), old_mac, ++ first_skb->mac_len); ++ eth_hdr(skb)->h_proto = skb->protocol; ++ } + } + + data += capturelen; +@@ -849,8 +919,19 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + /* this should not happen from the above code */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + ++ if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) { ++ /* first_skb is queued b/c !defer and not partial */ ++ if (pskb_trim(first_skb, first_iplen)) { ++ /* error trimming */ ++ list_del(&first_skb->list); ++ defer = first_skb; ++ } ++ first_skb->ip_summed = CHECKSUM_NONE; ++ } ++ + /* Send the packets! */ + list_for_each_entry_safe(skb, next, &sublist, list) { ++ BUG_ON(skb == defer); + skb_list_del_init(skb); + if (xfrm_input(skb, 0, 0, -2)) + kfree_skb(skb); +@@ -860,12 +941,15 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + skb = skbseq.root_skb; + skb_abort_seq_read(&skbseq); + +- if (first_skb) { +- consume_skb(first_skb); +- } else { ++ if (defer) { ++ consume_skb(defer); ++ } else if (!first_skb) { + /* skb is the original passed in skb, but we didn't get far +- * enough to process it as the first_skb. ++ * enough to process it as the first_skb, if we had it would ++ * either be save in ra_newskb, trimmed and sent on as an skb or ++ * placed in defer to be freed. 
+ */ ++ BUG_ON(!skb); + kfree_skb(skb); + } + +-- +2.46.0 + diff --git a/patches/v8/v8-0014-xfrm-iptfs-add-skb-fragment-sharing-code.patch b/patches/v8/v8-0014-xfrm-iptfs-add-skb-fragment-sharing-code.patch new file mode 100644 index 0000000..8ec0d56 --- /dev/null +++ b/patches/v8/v8-0014-xfrm-iptfs-add-skb-fragment-sharing-code.patch @@ -0,0 +1,390 @@ +From a5905e6ab2b17969779c390eba5b0024b1e90b2d Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 12:53:38 -0400 +Subject: [PATCH ipsec-next v8 14/16] xfrm: iptfs: add skb-fragment sharing + code + +Avoid copying the inner packet data by sharing the skb data fragments +from the output packet skb into new inner packet skb. + +Signed-off-by: Christian Hopps +--- + net/xfrm/xfrm_iptfs.c | 305 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 297 insertions(+), 8 deletions(-) + +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c +index b05b72cd3ba5..598dc88b1408 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -75,6 +75,7 @@ + #define XFRM_IPTFS_MIN_L3HEADROOM 128 + #define XFRM_IPTFS_MIN_L2HEADROOM (64 + 16) + #define IPTFS_FRAG_COPY_MAX 256 /* max for copying to create iptfs frags */ ++#define IPTFS_PKT_SHARE_MIN 129 /* min to try to share vs copy pkt data */ + #define NSECS_IN_USEC 1000 + + #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT +@@ -228,6 +229,207 @@ static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); + } + ++/** ++ * struct skb_frag_walk - use to track a walk through fragments ++ * @fragi: current fragment index ++ * @past: length of data in fragments before @fragi ++ * @total: length of data in all fragments ++ * @nr_frags: number of fragments present in array ++ * @initial_offset: the value passed in to skb_prepare_frag_walk() ++ * @pp_recycle: copy of skb->pp_recycle ++ * @frags: the page fragments inc. room for head page ++ */ ++struct skb_frag_walk { ++ u32 fragi; ++ u32 past; ++ u32 total; ++ u32 nr_frags; ++ u32 initial_offset; ++ bool pp_recycle; ++ skb_frag_t frags[MAX_SKB_FRAGS + 1]; ++}; ++ ++/** ++ * skb_prepare_frag_walk() - initialize a frag walk over an skb. ++ * @skb: the skb to walk. ++ * @initial_offset: start the walk @initial_offset into the skb. ++ * @walk: the walk to initialize ++ * ++ * Future calls to skb_add_frags() will expect the @offset value to be at ++ * least @initial_offset large. 
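++ *
++ * A sketch of the intended calling pattern, mirroring the receive and
++ * reassembly paths below (illustration only, not additional API):
++ *
++ *	skb_prepare_frag_walk(skb, data, &walk);
++ *	if (skb_can_add_frags(newskb, &walk, data, copylen))
++ *		skb_add_frags(newskb, &walk, data, copylen);
++ *	else
++ *		fall back to copying, e.g., with skb_copy_bits_seq()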
++ */ ++static void skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, ++ struct skb_frag_walk *walk) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frag, *from; ++ u32 i; ++ ++ walk->initial_offset = initial_offset; ++ walk->fragi = 0; ++ walk->past = 0; ++ walk->total = 0; ++ walk->nr_frags = 0; ++ walk->pp_recycle = skb->pp_recycle; ++ ++ if (skb->head_frag) { ++ if (initial_offset >= skb_headlen(skb)) { ++ initial_offset -= skb_headlen(skb); ++ } else { ++ frag = &walk->frags[walk->nr_frags++]; ++ skb_head_to_frag(skb, frag); ++ frag->offset += initial_offset; ++ frag->len -= initial_offset; ++ walk->total += frag->len; ++ initial_offset = 0; ++ } ++ } else { ++ BUG_ON(skb_headlen(skb) > initial_offset); ++ initial_offset -= skb_headlen(skb); ++ } ++ ++ for (i = 0; i < shinfo->nr_frags; i++) { ++ from = &shinfo->frags[i]; ++ if (initial_offset >= from->len) { ++ initial_offset -= from->len; ++ continue; ++ } ++ frag = &walk->frags[walk->nr_frags++]; ++ *frag = *from; ++ if (initial_offset) { ++ frag->offset += initial_offset; ++ frag->len -= initial_offset; ++ initial_offset = 0; ++ } ++ walk->total += frag->len; ++ } ++ BUG_ON(initial_offset != 0); ++} ++ ++static u32 __skb_reset_frag_walk(struct skb_frag_walk *walk, u32 offset) ++{ ++ /* Adjust offset to refer to internal walk values */ ++ BUG_ON(offset < walk->initial_offset); ++ offset -= walk->initial_offset; ++ ++ /* Get to the correct fragment for offset */ ++ while (offset < walk->past) { ++ walk->past -= walk->frags[--walk->fragi].len; ++ if (offset >= walk->past) ++ break; ++ BUG_ON(walk->fragi == 0); ++ } ++ while (offset >= walk->past + walk->frags[walk->fragi].len) ++ walk->past += walk->frags[walk->fragi++].len; ++ ++ /* offset now relative to this current frag */ ++ offset -= walk->past; ++ return offset; ++} ++ ++/** ++ * skb_can_add_frags() - check if ok to add frags from walk to skb ++ * @skb: skb to check for adding frags to ++ * @walk: the walk that will be used as source for frags. ++ * @offset: offset from beginning of original skb to start from. ++ * @len: amount of data to add frag references to in @skb. ++ * ++ * Return: true if ok to add frags. ++ */ ++static bool skb_can_add_frags(const struct sk_buff *skb, ++ struct skb_frag_walk *walk, u32 offset, u32 len) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ u32 fragi, nr_frags, fraglen; ++ ++ if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) ++ return false; ++ ++ /* Make offset relative to current frag after setting that */ ++ offset = __skb_reset_frag_walk(walk, offset); ++ ++ /* Verify we have array space for the fragments we need to add */ ++ fragi = walk->fragi; ++ nr_frags = shinfo->nr_frags; ++ while (len && fragi < walk->nr_frags) { ++ skb_frag_t *frag = &walk->frags[fragi]; ++ ++ fraglen = frag->len; ++ if (offset) { ++ fraglen -= offset; ++ offset = 0; ++ } ++ if (++nr_frags > MAX_SKB_FRAGS) ++ return false; ++ if (len <= fraglen) ++ return true; ++ len -= fraglen; ++ fragi++; ++ } ++ /* We may not copy all @len but what we have will fit. */ ++ return true; ++} ++ ++/** ++ * skb_add_frags() - add a range of fragment references into an skb ++ * @skb: skb to add references into ++ * @walk: the walk to add referenced fragments from. ++ * @offset: offset from beginning of original skb to start from. ++ * @len: amount of data to add frag references to in @skb. 
++ * ++ * skb_can_add_frags() should be called before this function to verify that the ++ * destination @skb is compatible with the walk and has space in the array for ++ * the to be added frag references. ++ * ++ * Return: The number of bytes not added to @skb b/c we reached the end of the ++ * walk before adding all of @len. ++ */ ++static int skb_add_frags(struct sk_buff *skb, struct skb_frag_walk *walk, ++ u32 offset, u32 len) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ u32 fraglen; ++ ++ BUG_ON(skb->pp_recycle != walk->pp_recycle); ++ if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) ++ return len; ++ ++ /* make offset relative to current frag after setting that */ ++ offset = __skb_reset_frag_walk(walk, offset); ++ BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); ++ ++ while (len && walk->fragi < walk->nr_frags) { ++ skb_frag_t *frag = &walk->frags[walk->fragi]; ++ skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; ++ ++ *tofrag = *frag; ++ if (offset) { ++ tofrag->offset += offset; ++ tofrag->len -= offset; ++ offset = 0; ++ } ++ __skb_frag_ref(tofrag); ++ shinfo->nr_frags++; ++ BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS); ++ ++ /* see if we are done */ ++ fraglen = tofrag->len; ++ if (len < fraglen) { ++ tofrag->len = len; ++ skb->len += len; ++ skb->data_len += len; ++ return 0; ++ } ++ /* advance to next source fragment */ ++ len -= fraglen; /* careful, use dst bv_len */ ++ skb->len += fraglen; /* careful, " " " */ ++ skb->data_len += fraglen; /* careful, " " " */ ++ walk->past += frag->len; /* careful, use src bv_len */ ++ walk->fragi++; ++ } ++ return len; ++} ++ + /** + * skb_copy_bits_seq - copy bits from a skb_seq_state to kernel buffer + * @st: source skb_seq_state +@@ -267,6 +469,53 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, + /* IPTFS Receiving (egress) Functions */ + /* ================================== */ + ++/** ++ * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. ++ * @tpl: template to create new skb from. ++ * @walk: The source for fragments to add. ++ * @off: The offset into @walk to add frags from, also used with @st and ++ * @copy_len. ++ * @len: The length of data to add covering frags from @walk into @skb. ++ * This must be <= @skblen. ++ * @st: The sequence state to copy from into the new head skb. ++ * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb ++ * linear space. ++ * ++ * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from ++ * @st into the new skb linear space, and then add shared fragments from the ++ * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). ++ * ++ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. 
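++ *
++ * Note the non-copied remainder (@len - @copy_len) is only ever shared by
++ * reference; if the walk cannot supply it as fragments the new skb is
++ * freed and NULL is returned rather than falling back to a copy.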
++ */ ++static struct sk_buff *iptfs_pskb_add_frags(struct sk_buff *tpl, ++ struct skb_frag_walk *walk, u32 off, ++ u32 len, struct skb_seq_state *st, ++ u32 copy_len) ++{ ++ struct sk_buff *skb; ++ ++ skb = iptfs_alloc_skb(tpl, copy_len, false); ++ if (!skb) ++ return NULL; ++ ++ /* this should not normally be happening */ ++ if (!skb_can_add_frags(skb, walk, off + copy_len, len - copy_len)) { ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ if (copy_len && ++ skb_copy_bits_seq(st, off, skb_put(skb, copy_len), copy_len)) { ++ XFRM_INC_STATS(dev_net(st->root_skb->dev), ++ LINUX_MIB_XFRMINERROR); ++ kfree_skb(skb); ++ return NULL; ++ } ++ ++ skb_add_frags(skb, walk, off + copy_len, len - copy_len); ++ return skb; ++} ++ + /** + * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. + * @skblen: the total data size for `skb`. +@@ -457,6 +706,8 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, + struct skb_seq_state *st, struct sk_buff *skb, + u32 data, u32 blkoff, struct list_head *list) + { ++ struct skb_frag_walk _fragwalk; ++ struct skb_frag_walk *fragwalk = NULL; + struct sk_buff *newskb = xtfs->ra_newskb; + u32 remaining = skb->len - data; + u32 runtlen = xtfs->ra_runtlen; +@@ -609,13 +860,31 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, + fraglen = min(blkoff, remaining); + copylen = min(fraglen, ipremain); + +- /* We verified this was true in the main receive routine */ +- BUG_ON(skb_tailroom(newskb) < copylen); ++ /* If we may have the opportunity to share prepare a fragwalk. */ ++ if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) && ++ (skb->head_frag || skb->len == skb->data_len) && ++ skb->pp_recycle == newskb->pp_recycle) { ++ fragwalk = &_fragwalk; ++ skb_prepare_frag_walk(skb, data, fragwalk); ++ } + +- /* copy fragment data into newskb */ +- if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen), copylen)) { +- XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); +- goto abandon; ++ /* Try share then copy. 
*/
++	if (fragwalk && skb_can_add_frags(newskb, fragwalk, data, copylen)) {
++		u32 leftover;
++
++		leftover = skb_add_frags(newskb, fragwalk, data, copylen);
++		BUG_ON(leftover != 0);
++	} else {
++		/* We verified this was true in the main receive routine */
++		BUG_ON(skb_tailroom(newskb) < copylen);
++
++		/* copy fragment data into newskb */
++		if (skb_copy_bits_seq(st, data, skb_put(newskb, copylen),
++				      copylen)) {
++			XFRM_INC_STATS(xs_net(xtfs->x),
++				       LINUX_MIB_XFRMINBUFFERERROR);
++			goto abandon;
++		}
+ 	}
+ 
+ 	if (copylen < ipremain) {
+@@ -656,6 +925,8 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 	u8 hbytes[sizeof(struct ipv6hdr)];
+ 	struct ip_iptfs_cc_hdr iptcch;
+ 	struct skb_seq_state skbseq;
++	struct skb_frag_walk _fragwalk;
++	struct skb_frag_walk *fragwalk = NULL;
+ 	struct list_head sublist; /* rename this it's just a list */
+ 	struct sk_buff *first_skb, *defer, *next;
+ 	const unsigned char *old_mac;
+@@ -802,6 +1073,7 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 		} else {
+ 			first_skb = skb;
+ 			first_iplen = iplen;
++			fragwalk = NULL;
+ 
+ 			/* We are going to skip over `data` bytes to reach the
+ 			 * start of the IP header of `iphlen` len for `iplen`
+@@ -853,6 +1125,13 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 				/* all pointers could be changed now reset walk */
+ 				skb_abort_seq_read(&skbseq);
+ 				skb_prepare_seq_read(skb, data, tail, &skbseq);
++			} else if (skb->head_frag &&
++				   /* We have the IP header right now */
++				   remaining >= iphlen) {
++				fragwalk = &_fragwalk;
++				skb_prepare_frag_walk(skb, data, fragwalk);
++				defer = skb;
++				skb = NULL;
+ 			} else {
+ 				/* We couldn't reuse the input skb so allocate a
+ 				 * new one.
+@@ -868,8 +1147,18 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+ 
+ 		capturelen = min(iplen, remaining);
+ 		if (!skb) {
+-			skb = iptfs_pskb_extract_seq(iplen, &skbseq, data,
+-						     capturelen);
++			if (!fragwalk ||
++			    /* Too small to be worth sharing */
++			    iplen < IPTFS_PKT_SHARE_MIN ||
++			    /* No data beyond the IP header to share */
++			    capturelen <= iphlen ||
++			    /* Try creating skb and adding frags */
++			    !(skb = iptfs_pskb_add_frags(first_skb, fragwalk,
++							 data, capturelen,
++							 &skbseq, iphlen))) {
++				skb = iptfs_pskb_extract_seq(iplen, &skbseq,
++							     data, capturelen);
++			}
+ 			if (!skb) {
+ 				/* skip to next packet or done */
+ 				data += capturelen;
+--
+2.46.0
+
diff --git a/patches/v8/v8-0015-xfrm-iptfs-handle-reordering-of-received-packets.patch b/patches/v8/v8-0015-xfrm-iptfs-handle-reordering-of-received-packets.patch
new file mode 100644
index 0000000..075763c
--- /dev/null
+++ b/patches/v8/v8-0015-xfrm-iptfs-handle-reordering-of-received-packets.patch
@@ -0,0 +1,665 @@
+From 1715114e4f050acba9eb7b6af3c38e0838314bcf Mon Sep 17 00:00:00 2001
+From: Christian Hopps
+Date: Wed, 31 Jul 2024 12:26:10 -0400
+Subject: [PATCH ipsec-next v8 15/16] xfrm: iptfs: handle reordering of
+ received packets
+
+Handle the receipt of the outer tunnel packets out-of-order. Pointers to
+the out-of-order packets are saved in a window (array) awaiting needed
+prior packets. When the required prior packets are received the now
+in-order packets are then passed on to the regular packet receive code.
+A timer is used to declare missing earlier packets lost so that the
+algorithm always advances.
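+
+As a rough illustration, the window logic reduces to four cases per
+received sequence number. The following is a simplified userspace model
+(hypothetical names -- struct pkt, deliver(), drop() -- stand in for
+struct sk_buff, the ordered input path, and kfree_skb(); the kernel code
+below additionally tracks per-slot drop times):
+
+	#include <stdint.h>
+	#include <string.h>
+
+	#define WIN 3			/* reorder_win_size */
+
+	struct pkt;
+	void deliver(struct pkt *p);	/* pass on, now in order */
+	void drop(struct pkt *p);	/* duplicate or too late */
+
+	struct rwin {
+		uint64_t want;		/* next sequence to deliver */
+		struct pkt *slot[WIN];	/* slot[i] holds seq want + 1 + i */
+	};
+
+	/* advance the window by one sequence number */
+	static void shift1(struct rwin *w)
+	{
+		memmove(&w->slot[0], &w->slot[1],
+			(WIN - 1) * sizeof(w->slot[0]));
+		w->slot[WIN - 1] = NULL;
+		w->want++;
+	}
+
+	static void rwin_recv(struct rwin *w, uint64_t seq, struct pkt *p)
+	{
+		if (seq < w->want) {		/* past: dup or too late */
+			drop(p);
+		} else if (seq == w->want) {	/* in order: send + flush */
+			deliver(p);
+			w->want++;
+			while (w->slot[0]) {	/* flush consecutive run */
+				deliver(w->slot[0]);
+				shift1(w);
+			}
+		} else if (seq - w->want <= WIN) { /* future: fits */
+			if (w->slot[seq - w->want - 1])
+				drop(p);	/* dup of a saved future */
+			else
+				w->slot[seq - w->want - 1] = p;
+		} else {			/* future: beyond window */
+			while (seq - w->want > WIN) {
+				/* packets shifting out go on in order;
+				 * empty slots are counted as lost
+				 */
+				if (w->slot[0])
+					deliver(w->slot[0]);
+				shift1(w);
+			}
+			w->slot[WIN - 1] = p;	/* newest, at the far edge */
+		}
+	}
+
+The drop timer below plays the role of the "beyond window" shift for the
+case where the missing packets never arrive.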
+
+Signed-off-by: Christian Hopps
+---
+ net/xfrm/xfrm_iptfs.c | 507 +++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 495 insertions(+), 12 deletions(-)
+
+diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
+index 598dc88b1408..6d022935c9dc 100644
+--- a/net/xfrm/xfrm_iptfs.c
++++ b/net/xfrm/xfrm_iptfs.c
+@@ -39,6 +39,17 @@
+  */
+ #define IPTFS_DEFAULT_DROP_TIME_USECS 1000000
+ 
++/**
++ * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size
++ *
++ * The default IPTFS reorder window size. The reorder window size dictates the
++ * maximum number of IPTFS tunnel packets in a sequence that may arrive out of
++ * order.
++ *
++ * Default is 3 (as suggested by the TCP folks).
++ */
++#define IPTFS_DEFAULT_REORDER_WINDOW 3
++
+ /* ------------------------------------------------ */
+ /* IPTFS default SA values (tunnel ingress/dir-out) */
+ /* ------------------------------------------------ */
+@@ -83,6 +94,8 @@
+ /**
+  * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
+  * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
++ * @reorder_win_size: the number of slots in the reorder window, thus the
++ *		      number of packets that may arrive out of order.
+  * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
+  *	      otherwise the user specified value.
+  * @max_queue_size: The maximum number of octets allowed to be queued to be sent
+@@ -91,10 +104,16 @@
+  */
+ struct xfrm_iptfs_config {
+ 	bool dont_frag : 1;
++	u16 reorder_win_size;
+ 	u32 pkt_size; /* outer_packet_size or 0 */
+ 	u32 max_queue_size; /* octets */
+ };
+ 
++struct skb_wseq {
++	struct sk_buff *skb;
++	u64 drop_time;
++};
++
+ /**
+  * struct xfrm_iptfs_data - mode specific xfrm state.
+  * @cfg: IPTFS tunnel config.
+@@ -105,6 +124,10 @@ struct xfrm_iptfs_config {
+  * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
+  * @iptfs_timer: output timer.
+  * @payload_mtu: max payload size.
++ * @w_seq_set: true after first seq received.
++ * @w_wantseq: waiting for this seq number as next to process (in order).
++ * @w_saved: the saved buf array (reorder window).
++ * @w_savedlen: the saved len (not size).
+  * @drop_lock: lock to protect reorder queue.
+  * @drop_timer: timer for considering next packet lost.
+  * @drop_time_ns: timer interval in nanoseconds.
+@@ -126,12 +149,16 @@ struct xfrm_iptfs_data {
+ 	struct hrtimer iptfs_timer; /* output timer */
+ 	u32 payload_mtu; /* max payload size */
+ 
+-	/* Tunnel egress */
++	/* Tunnel input reordering */
++	bool w_seq_set; /* true after first seq received */
++	u64 w_wantseq; /* expected next sequence */
++	struct skb_wseq *w_saved; /* the saved buf array */
++	u32 w_savedlen; /* the saved len (not size) */
+ 	spinlock_t drop_lock;
+ 	struct hrtimer drop_timer;
+ 	u64 drop_time_ns;
+ 
+-	/* Tunnel egress reassembly */
++	/* Tunnel input reassembly */
+ 	struct sk_buff *ra_newskb; /* new pkt being reassembled */
+ 	u64 ra_wantseq; /* expected next sequence */
+ 	u8 ra_runt[6]; /* last pkt bytes from last skb */
+@@ -912,15 +939,13 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+ }
+ 
+ /**
+- * iptfs_input() - handle receipt of iptfs payload
++ * iptfs_input_ordered() - handle next in-order IPTFS payload.
+  * @x: xfrm state
+- * @skb: the packet
++ * @skb: current packet
+  *
+  * Process the IPTFS payload in `skb` and consume it afterwards.
+- *
+- * Returns 0.
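++ *
++ * Ordering of the outer payloads is handled by the caller; by the time
++ * this function runs @skb is the next in-order payload to process (see
++ * iptfs_input() below).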
+ */ +-static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) ++static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) + { + u8 hbytes[sizeof(struct ipv6hdr)]; + struct ip_iptfs_cc_hdr iptcch; +@@ -1241,12 +1266,368 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + BUG_ON(!skb); + kfree_skb(skb); + } ++} + +- /* We always have dealt with the input SKB, either we are re-using it, +- * or we have freed it. Return EINPROGRESS so that xfrm_input stops +- * processing it. ++/* ------------------------------- */ ++/* Input (Egress) Re-ordering Code */ ++/* ------------------------------- */ ++ ++static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift) ++{ ++ u32 savedlen = xtfs->w_savedlen; ++ ++ if (shift > savedlen) ++ shift = savedlen; ++ if (shift != savedlen) ++ memcpy(xtfs->w_saved, xtfs->w_saved + shift, ++ (savedlen - shift) * sizeof(*xtfs->w_saved)); ++ memset(xtfs->w_saved + savedlen - shift, 0, ++ shift * sizeof(*xtfs->w_saved)); ++ xtfs->w_savedlen -= shift; ++} ++ ++static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, ++ struct list_head *freelist) ++{ ++ list_add_tail(&inskb->list, freelist); ++} ++ ++static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list) ++ ++{ ++ struct skb_wseq *s, *se; ++ const u32 savedlen = xtfs->w_savedlen; ++ time64_t now = ktime_get_raw_fast_ns(); ++ u32 count = 0; ++ u32 scount = 0; ++ ++ BUG_ON(!savedlen); ++ if (xtfs->w_saved[0].drop_time > now) ++ goto set_timer; ++ ++ ++xtfs->w_wantseq; ++ ++ /* Keep flushing packets until we reach a drop time greater than now. */ ++ s = xtfs->w_saved; ++ se = s + savedlen; ++ do { ++ /* Walking past empty slots until we reach a packet */ ++ for (; s < se && !s->skb; s++) ++ if (s->drop_time > now) ++ goto outerdone; ++ /* Sending packets until we hit another empty slot. */ ++ for (; s < se && s->skb; scount++, s++) ++ list_add_tail(&s->skb->list, list); ++ } while (s < se); ++outerdone: ++ ++ count = s - xtfs->w_saved; ++ if (count) { ++ xtfs->w_wantseq += count; ++ ++ /* Shift handled slots plus final empty slot into slot 0. */ ++ __vec_shift(xtfs, count); ++ } ++ ++ if (xtfs->w_savedlen) { ++set_timer: ++ /* Drifting is OK */ ++ hrtimer_start(&xtfs->drop_timer, ++ xtfs->w_saved[0].drop_time - now, ++ IPTFS_HRTIMER_MODE); ++ } ++ return scount; ++} ++ ++static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, ++ struct list_head *list) ++{ ++ struct skb_wseq *s, *se; ++ const u32 savedlen = xtfs->w_savedlen; ++ u32 count = 0; ++ ++ /* Got what we wanted. */ ++ list_add_tail(&inskb->list, list); ++ ++xtfs->w_wantseq; ++ if (!savedlen) ++ return; ++ ++ /* Flush remaining consecutive packets. */ ++ ++ /* Keep sending until we hit another missed pkt. */ ++ for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) ++ list_add_tail(&s->skb->list, list); ++ count = s - xtfs->w_saved; ++ if (count) ++ xtfs->w_wantseq += count; ++ ++ /* Shift handled slots plus final empty slot into slot 0. */ ++ __vec_shift(xtfs, count + 1); ++} ++ ++/* Set the slot's drop time and all the empty slots below it until reaching a ++ * filled slot which will already be set. 
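++ *
++ * e.g., if slot 3 is being filled while slots 2 and 1 are empty and slot
++ * 0 holds a packet, the new drop time is written to slots 3, 2 and 1,
++ * while slot 0 keeps its existing (no later) drop time.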
++ */ ++static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) ++{ ++ const u32 savedlen = xtfs->w_savedlen; ++ struct skb_wseq *s = xtfs->w_saved; ++ time64_t drop_time; ++ ++ assert_spin_locked(&xtfs->drop_lock); ++ ++ if (savedlen > index + 1) { ++ /* we are below another, our drop time and the timer are already set */ ++ BUG_ON(xtfs->w_saved[index + 1].drop_time != ++ xtfs->w_saved[index].drop_time); ++ return; ++ } ++ /* we are the most future so get a new drop time. */ ++ drop_time = ktime_get_raw_fast_ns(); ++ drop_time += xtfs->drop_time_ns; ++ ++ /* Walk back through the array setting drop times as we go */ ++ s[index].drop_time = drop_time; ++ while (index-- > 0 && !s[index].skb) ++ s[index].drop_time = drop_time; ++ ++ /* If we walked all the way back, schedule the drop timer if needed */ ++ if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer)) ++ hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns, ++ IPTFS_HRTIMER_MODE); ++} ++ ++static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs, ++ struct sk_buff *inskb, ++ struct list_head *freelist) ++{ ++ const u32 nslots = xtfs->cfg.reorder_win_size + 1; ++ const u64 inseq = __esp_seq(inskb); ++ const u64 wantseq = xtfs->w_wantseq; ++ const u64 distance = inseq - wantseq; ++ const u32 savedlen = xtfs->w_savedlen; ++ const u32 index = distance - 1; ++ ++ BUG_ON(distance >= nslots); ++ ++ /* Handle future sequence number received which fits in the window. ++ * ++ * We know we don't have the seq we want so we won't be able to flush ++ * anything. + */ +- return -EINPROGRESS; ++ ++ /* slot count is 4, saved size is 3 savedlen is 2 ++ * ++ * "window boundary" is based on the fixed window size ++ * distance is also slot number ++ * index is an array index (i.e., - 1 of slot) ++ * : : - implicit NULL after array len ++ * ++ * +--------- used length (savedlen == 2) ++ * | +----- array size (nslots - 1 == 3) ++ * | | + window boundary (nslots == 4) ++ * V V | V ++ * | ++ * 0 1 2 3 | slot number ++ * --- 0 1 2 | array index ++ * [-] [b] : :| array ++ * ++ * "2" "3" "4" *5*| seq numbers ++ * ++ * We receive seq number 5 ++ * distance == 3 [inseq(5) - w_wantseq(2)] ++ * index == 2 [distance(6) - 1] ++ */ ++ ++ if (xtfs->w_saved[index].skb) { ++ /* a dup of a future */ ++ list_add_tail(&inskb->list, freelist); ++ return; ++ } ++ ++ xtfs->w_saved[index].skb = inskb; ++ xtfs->w_savedlen = max(savedlen, index + 1); ++ iptfs_set_window_drop_times(xtfs, index); ++} ++ ++static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs, ++ struct sk_buff *inskb, ++ struct list_head *list, ++ struct list_head *freelist) ++{ ++ const u32 nslots = xtfs->cfg.reorder_win_size + 1; ++ const u64 inseq = __esp_seq(inskb); ++ u32 savedlen = xtfs->w_savedlen; ++ u64 wantseq = xtfs->w_wantseq; ++ struct sk_buff *slot0 = NULL; ++ struct skb_wseq *wnext; ++ u32 beyond, shifting, slot; ++ u64 distance; ++ ++ BUG_ON(inseq <= wantseq); ++ distance = inseq - wantseq; ++ BUG_ON(distance <= nslots - 1); ++ beyond = distance - (nslots - 1); ++ ++ /* Handle future sequence number received. ++ * ++ * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1 ++ * b/c we are beyond the window boundary. ++ * ++ * We know we don't have the wantseq so that counts as a drop. ++ */ ++ ++ /* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the ++ * missing sequence number. ++ * ++ * the final slot at savedlen (index savedlen - 1) is always occupied. ++ * ++ * beyond is "beyond array size" not savedlen. 
++ * ++ * +--------- array length (savedlen == 2) ++ * | +----- array size (nslots - 1 == 3) ++ * | | +- window boundary (nslots == 4) ++ * V V | V ++ * | ++ * 0 1 2 3 | slot number ++ * --- 0 1 2 | array index ++ * [b] [c] : :| array ++ * | ++ * "2" "3" "4" "5"|*6* seq numbers ++ * ++ * We receive seq number 6 ++ * distance == 4 [inseq(6) - w_wantseq(2)] ++ * newslot == distance ++ * index == 3 [distance(4) - 1] ++ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] ++ * shifting == 1 [min(savedlen(2), beyond(1)] ++ * slot0_skb == [b], and should match w_wantseq ++ * ++ * +--- window boundary (nslots == 4) ++ * 0 1 2 3 | 4 slot number ++ * --- 0 1 2 | 3 array index ++ * [b] : : : :| array ++ * "2" "3" "4" "5" *6* seq numbers ++ * ++ * We receive seq number 6 ++ * distance == 4 [inseq(6) - w_wantseq(2)] ++ * newslot == distance ++ * index == 3 [distance(4) - 1] ++ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] ++ * shifting == 1 [min(savedlen(1), beyond(1)] ++ * slot0_skb == [b] and should match w_wantseq ++ * ++ * +-- window boundary (nslots == 4) ++ * 0 1 2 3 | 4 5 6 slot number ++ * --- 0 1 2 | 3 4 5 array index ++ * [-] [c] : :| array ++ * "2" "3" "4" "5" "6" "7" *8* seq numbers ++ * ++ * savedlen = 2, beyond = 3 ++ * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-] ++ * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c] ++ * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1) ++ * ++ * We receive seq number 8 ++ * distance == 6 [inseq(8) - w_wantseq(2)] ++ * newslot == distance ++ * index == 5 [distance(6) - 1] ++ * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))] ++ * shifting == 2 [min(savedlen(2), beyond(3)] ++ * ++ * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true. ++ */ ++ ++ /* Now send any packets that are being shifted out of saved, and account ++ * for missing packets that are exiting the window as we shift it. ++ */ ++ ++ /* If savedlen > beyond we are shifting some, else all. */ ++ shifting = min(savedlen, beyond); ++ ++ /* slot0 is the buf that just shifted out and into slot0 */ ++ slot0 = NULL; ++ wnext = xtfs->w_saved; ++ for (slot = 1; slot <= shifting; slot++, wnext++) { ++ /* handle what was in slot0 before we occupy it */ ++ if (slot0) ++ list_add_tail(&slot0->list, list); ++ slot0 = wnext->skb; ++ wnext->skb = NULL; ++ } ++ ++ /* slot0 is now either NULL (in which case it's what we now are waiting ++ * for, or a buf in which case we need to handle it like we received it; ++ * however, we may be advancing past that buffer as well.. ++ */ ++ ++ /* Handle case where we need to shift more than we had saved, slot0 will ++ * be NULL iff savedlen is 0, otherwise slot0 will always be ++ * non-NULL b/c we shifted the final element, which is always set if ++ * there is any saved, into slot0. ++ */ ++ if (savedlen < beyond) { ++ if (savedlen == 0) { ++ BUG_ON(slot0); ++ } else { ++ BUG_ON(!slot0); ++ list_add_tail(&slot0->list, list); ++ } ++ slot0 = NULL; ++ /* slot0 has had an empty slot pushed into it */ ++ } ++ ++ /* Remove the entries */ ++ __vec_shift(xtfs, beyond); ++ ++ /* Advance want seq */ ++ xtfs->w_wantseq += beyond; ++ ++ /* Process drops here when implementing congestion control */ ++ ++ /* We've shifted. plug the packet in at the end. 
*/ ++ xtfs->w_savedlen = nslots - 1; ++ xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb; ++ iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1); ++ ++ /* if we don't have a slot0 then we must wait for it */ ++ if (!slot0) ++ return; ++ ++ /* If slot0, seq must match new want seq */ ++ BUG_ON(xtfs->w_wantseq != __esp_seq(slot0)); ++ ++ /* slot0 is valid, treat like we received expected. */ ++ __reorder_this(xtfs, slot0, list); ++} ++ ++/* Receive a new packet into the reorder window. Return a list of ordered ++ * packets from the window. ++ */ ++static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs, ++ struct sk_buff *inskb, struct list_head *list, ++ struct list_head *freelist) ++{ ++ const u32 nslots = xtfs->cfg.reorder_win_size + 1; ++ u64 inseq = __esp_seq(inskb); ++ u64 wantseq; ++ ++ assert_spin_locked(&xtfs->drop_lock); ++ ++ if (unlikely(!xtfs->w_seq_set)) { ++ xtfs->w_seq_set = true; ++ xtfs->w_wantseq = inseq; ++ } ++ wantseq = xtfs->w_wantseq; ++ ++ if (likely(inseq == wantseq)) ++ __reorder_this(xtfs, inskb, list); ++ else if (inseq < wantseq) ++ __reorder_past(xtfs, inskb, freelist); ++ else if ((inseq - wantseq) < nslots) ++ __reorder_future_fits(xtfs, inskb, freelist); ++ else ++ __reorder_future_shifts(xtfs, inskb, list, freelist); + } + + /** +@@ -1273,23 +1654,92 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) + */ + static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) + { ++ struct sk_buff *skb, *next; ++ struct list_head freelist, list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; ++ u32 count; + + xtfs = container_of(me, typeof(*xtfs), drop_timer); + x = xtfs->x; + +- /* Drop any in progress packet */ + spin_lock(&xtfs->drop_lock); ++ ++ INIT_LIST_HEAD(&list); ++ INIT_LIST_HEAD(&freelist); ++ ++ /* Drop any in progress packet */ ++ + if (xtfs->ra_newskb) { + kfree_skb(xtfs->ra_newskb); + xtfs->ra_newskb = NULL; + } ++ ++ /* Now drop as many packets as we should from the reordering window ++ * saved array ++ */ ++ count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0; ++ + spin_unlock(&xtfs->drop_lock); + ++ if (count) { ++ list_for_each_entry_safe(skb, next, &list, list) { ++ skb_list_del_init(skb); ++ iptfs_input_ordered(x, skb); ++ } ++ } + return HRTIMER_NORESTART; + } + ++/** ++ * iptfs_input() - handle receipt of iptfs payload ++ * @x: xfrm state ++ * @skb: the packet ++ * ++ * We have an IPTFS payload order it if needed, then process newly in order ++ * packets. ++ * ++ * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb. ++ */ ++static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) ++{ ++ struct list_head freelist, list; ++ struct xfrm_iptfs_data *xtfs = x->mode_data; ++ struct sk_buff *next; ++ ++ /* Fast path for no reorder window. */ ++ if (xtfs->cfg.reorder_win_size == 0) { ++ iptfs_input_ordered(x, skb); ++ goto done; ++ } ++ ++ /* Fetch list of in-order packets from the reordering window as well as ++ * a list of buffers we need to now free. ++ */ ++ INIT_LIST_HEAD(&list); ++ INIT_LIST_HEAD(&freelist); ++ ++ spin_lock(&xtfs->drop_lock); ++ iptfs_input_reorder(xtfs, skb, &list, &freelist); ++ spin_unlock(&xtfs->drop_lock); ++ ++ list_for_each_entry_safe(skb, next, &list, list) { ++ skb_list_del_init(skb); ++ iptfs_input_ordered(x, skb); ++ } ++ ++ list_for_each_entry_safe(skb, next, &freelist, list) { ++ skb_list_del_init(skb); ++ kfree_skb(skb); ++ } ++done: ++ /* We always have dealt with the input SKB, either we are re-using it, ++ * or we have freed it. 
Return EINPROGRESS so that xfrm_input stops ++ * processing it. ++ */ ++ return -EINPROGRESS; ++} ++ + /* ================================= */ + /* IPTFS Sending (ingress) Functions */ + /* ================================= */ +@@ -2096,11 +2546,24 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, + + xc = &xtfs->cfg; + xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; ++ xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW; + xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; + xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) + xc->dont_frag = true; ++ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) ++ xc->reorder_win_size = ++ nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); ++ /* saved array is for saving 1..N seq nums from wantseq */ ++ if (xc->reorder_win_size) { ++ xtfs->w_saved = kcalloc(xc->reorder_win_size, ++ sizeof(*xtfs->w_saved), GFP_KERNEL); ++ if (!xtfs->w_saved) { ++ NL_SET_ERR_MSG(extack, "Cannot alloc reorder window"); ++ return -ENOMEM; ++ } ++ } + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { +@@ -2139,6 +2602,7 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) + + if (x->dir == XFRM_SA_DIR_IN) { + l += nla_total_size(sizeof(u32)); /* drop time usec */ ++ l += nla_total_size(sizeof(xc->reorder_win_size)); + } else { + if (xc->dont_frag) + l += nla_total_size(0); /* dont-frag flag */ +@@ -2161,6 +2625,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) + q = xtfs->drop_time_ns; + (void)do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); ++ if (ret) ++ return ret; ++ ++ ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, ++ xc->reorder_win_size); + } else { + if (xc->dont_frag) { + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); +@@ -2220,6 +2689,14 @@ static int iptfs_clone(struct xfrm_state *x, struct xfrm_state *orig) + return -ENOMEM; + + xtfs->ra_newskb = NULL; ++ if (xtfs->cfg.reorder_win_size) { ++ xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, ++ sizeof(*xtfs->w_saved), GFP_KERNEL); ++ if (!xtfs->w_saved) { ++ kfree_sensitive(xtfs); ++ return -ENOMEM; ++ } ++ } + + __iptfs_init_state(x, xtfs); + +@@ -2243,6 +2720,7 @@ static void iptfs_delete_state(struct xfrm_state *x) + { + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff_head list; ++ struct skb_wseq *s, *se; + struct sk_buff *skb; + + if (!xtfs) +@@ -2264,6 +2742,11 @@ static void iptfs_delete_state(struct xfrm_state *x) + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + ++ for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) ++ if (s->skb) ++ kfree_skb(s->skb); ++ ++ kfree_sensitive(xtfs->w_saved); + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +-- +2.46.0 + diff --git a/patches/v8/v8-0016-xfrm-iptfs-add-tracepoint-functionality.patch b/patches/v8/v8-0016-xfrm-iptfs-add-tracepoint-functionality.patch new file mode 100644 index 0000000..f4dee7c --- /dev/null +++ b/patches/v8/v8-0016-xfrm-iptfs-add-tracepoint-functionality.patch @@ -0,0 +1,458 @@ +From 5536eb4659be81268940fcadb67e054aec40a38b Mon Sep 17 00:00:00 2001 +From: Christian Hopps +Date: Wed, 3 Apr 2024 01:11:30 -0400 +Subject: [PATCH ipsec-next v8 16/16] xfrm: iptfs: add tracepoint functionality + +Add tracepoints to the IP-TFS code. 
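+
+The tracepoints follow the usual ftrace pattern: the events are declared
+in trace_iptfs.h, emitted once from xfrm_iptfs.c with CREATE_TRACE_POINTS
+defined, and then invoked via trace_<event>(). A minimal sketch of a call
+site (example_call_site() is hypothetical; the event and its arguments
+are from this patch):
+
+	#define CREATE_TRACE_POINTS
+	#include "trace_iptfs.h"
+
+	static void example_call_site(struct sk_buff *skb,
+				      struct xfrm_iptfs_data *xtfs,
+				      u16 blkoff)
+	{
+		/* A static-branch no-op unless the event is enabled,
+		 * e.g., through tracefs: events/iptfs/iptfs_egress_recv
+		 */
+		trace_iptfs_egress_recv(skb, xtfs, blkoff);
+	}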
+ +Signed-off-by: Christian Hopps +--- + net/xfrm/trace_iptfs.h | 218 +++++++++++++++++++++++++++++++++++++++++ + net/xfrm/xfrm_iptfs.c | 70 ++++++++++++- + 2 files changed, 287 insertions(+), 1 deletion(-) + create mode 100644 net/xfrm/trace_iptfs.h + +diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h +new file mode 100644 +index 000000000000..74391ba24445 +--- /dev/null ++++ b/net/xfrm/trace_iptfs.h +@@ -0,0 +1,218 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* xfrm_trace_iptfs.h ++ * ++ * August 12 2023, Christian Hopps ++ * ++ * Copyright (c) 2023, LabN Consulting, L.L.C. ++ */ ++ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM iptfs ++ ++#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_IPTFS_H ++ ++#include ++#include ++#include ++#include ++ ++struct xfrm_iptfs_data; ++ ++TRACE_EVENT(iptfs_egress_recv, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff), ++ TP_ARGS(skb, xtfs, blkoff), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(void *, head) ++ __field(void *, head_pg_addr) ++ __field(void *, pg0addr) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u32, headroom) ++ __field(u32, tailroom) ++ __field(u32, tail) ++ __field(u32, end) ++ __field(u32, pg0off) ++ __field(u8, head_frag) ++ __field(u8, frag_list) ++ __field(u8, nr_frags) ++ __field(u16, blkoff)), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->head = skb->head; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->headroom = skb_headroom(skb); ++ __entry->tailroom = skb_tailroom(skb); ++ __entry->tail = (u32)skb->tail; ++ __entry->end = (u32)skb->end; ++ __entry->head_frag = skb->head_frag; ++ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; ++ __entry->nr_frags = skb_shinfo(skb)->nr_frags; ++ __entry->blkoff = blkoff; ++ __entry->head_pg_addr = page_address(virt_to_head_page(skb->head)); ++ __entry->pg0addr = (__entry->nr_frags ++ ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem)) ++ : NULL); ++ __entry->pg0off = (__entry->nr_frags ++ ? 
skb_shinfo(skb)->frags[0].offset ++ : 0); ++ ), ++ TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, ++ __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, ++ __entry->tailroom, __entry->tail, __entry->end, __entry->head, ++ __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, ++ __entry->pg0off) ++ ) ++ ++DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, ++ u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u32, pmtu) ++ __field(u32, queue_size) ++ __field(u32, proto_seq) ++ __field(u8, proto) ++ __field(u8, was_gso) ++ ), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->queue_size = ++ xtfs->cfg.max_queue_size - xtfs->queue_size; ++ __entry->proto = __trace_ip_proto(ip_hdr(skb)); ++ __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); ++ __entry->pmtu = pmtu; ++ __entry->was_gso = was_gso; ++ ), ++ TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, ++ __entry->queue_size, __entry->proto, __entry->proto_seq, ++ __entry->pmtu, __entry->was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, ++ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), ++ TP_ARGS(skb, xtfs, pmtu, was_gso)); ++ ++DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u32, mtu) ++ __field(u32, proto_seq) ++ __field(u16, blkoff) ++ __field(u8, proto)), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->mtu = mtu; ++ __entry->blkoff = blkoff; ++ __entry->proto = iph ? __trace_ip_proto(iph) : 0; ++ __entry->proto_seq = iph ? 
__trace_ip_proto_seq(iph) : 0; ++ ), ++ TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu, ++ __entry->blkoff, __entry->proto, __entry->proto_seq)); ++ ++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, ++ struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph)); ++ ++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, ++ struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph)); ++ ++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, ++ struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph)); ++ ++DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig, ++ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, ++ struct iphdr *iph), ++ TP_ARGS(skb, mtu, blkoff, iph)); ++ ++TRACE_EVENT(iptfs_ingress_nth_peek, ++ TP_PROTO(struct sk_buff *skb, u32 remaining), ++ TP_ARGS(skb, remaining), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, remaining)), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->remaining = remaining; ++ ), ++ TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u", ++ __entry->skb, __entry->skb_len, __entry->remaining)); ++ ++TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok), ++ TP_ARGS(skb, share_ok), ++ TP_STRUCT__entry(__field(struct sk_buff *, skb) ++ __field(u32, skb_len) ++ __field(u32, data_len) ++ __field(u8, share_ok) ++ __field(u8, head_frag) ++ __field(u8, pp_recycle) ++ __field(u8, cloned) ++ __field(u8, shared) ++ __field(u8, nr_frags) ++ __field(u8, frag_list) ++ ), ++ TP_fast_assign(__entry->skb = skb; ++ __entry->skb_len = skb->len; ++ __entry->data_len = skb->data_len; ++ __entry->share_ok = share_ok; ++ __entry->head_frag = skb->head_frag; ++ __entry->pp_recycle = skb->pp_recycle; ++ __entry->cloned = skb_cloned(skb); ++ __entry->shared = skb_shared(skb); ++ __entry->nr_frags = skb_shinfo(skb)->nr_frags; ++ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; ++ ), ++ TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u", ++ __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok, ++ __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared, ++ __entry->nr_frags, __entry->frag_list)); ++ ++DECLARE_EVENT_CLASS(iptfs_timer_event, ++ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), ++ TP_ARGS(xtfs, time_val), ++ TP_STRUCT__entry(__field(u64, time_val) ++ __field(u64, set_time)), ++ TP_fast_assign(__entry->time_val = time_val; ++ __entry->set_time = xtfs->iptfs_settime; ++ ), ++ TP_printk("TIMER: set_time=%llu time_val=%llu", ++ __entry->set_time, __entry->time_val)); ++ ++DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start, ++ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), ++ TP_ARGS(xtfs, time_val)); ++ ++DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire, ++ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), ++ TP_ARGS(xtfs, time_val)); ++ ++#endif /* _TRACE_IPTFS_H */ ++ ++/* This part must be outside protection */ ++#undef TRACE_INCLUDE_PATH ++#define TRACE_INCLUDE_PATH ../../net/xfrm ++#undef TRACE_INCLUDE_FILE ++#define TRACE_INCLUDE_FILE trace_iptfs ++#include +diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c 
+index 6d022935c9dc..c2fd8e1fc79d 100644 +--- a/net/xfrm/xfrm_iptfs.c ++++ b/net/xfrm/xfrm_iptfs.c +@@ -19,6 +19,7 @@ + #include + + #include "xfrm_inout.h" ++#include "trace_iptfs.h" + + /* IPTFS encap (header) values. */ + #define IPTFS_SUBTYPE_BASIC 0 +@@ -123,6 +124,7 @@ struct skb_wseq { + * @ecn_queue_size: octets above with ECN mark. + * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. + * @iptfs_timer: output timer. ++ * @iptfs_settime: time the output timer was set. + * @payload_mtu: max payload size. + * @w_seq_set: true after first seq received. + * @w_wantseq: waiting for this seq number as next to process (in order). +@@ -147,6 +149,7 @@ struct xfrm_iptfs_data { + u32 ecn_queue_size; /* octets above which ECN mark */ + u64 init_delay_ns; /* nanoseconds */ + struct hrtimer iptfs_timer; /* output timer */ ++ time64_t iptfs_settime; /* time timer was set */ + u32 payload_mtu; /* max payload size */ + + /* Tunnel input reordering */ +@@ -173,6 +176,39 @@ static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + /* Utility Functions */ + /* ================= */ + ++static u32 __trace_ip_proto(struct iphdr *iph) ++{ ++ if (iph->version == 4) ++ return iph->protocol; ++ return ((struct ipv6hdr *)iph)->nexthdr; ++} ++ ++static u32 __trace_ip_proto_seq(struct iphdr *iph) ++{ ++ void *nexthdr; ++ u32 protocol = 0; ++ ++ if (iph->version == 4) { ++ nexthdr = (void *)(iph + 1); ++ protocol = iph->protocol; ++ } else if (iph->version == 6) { ++ nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); ++ protocol = ((struct ipv6hdr *)(iph))->nexthdr; ++ } ++ switch (protocol) { ++ case IPPROTO_ICMP: ++ return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); ++ case IPPROTO_ICMPV6: ++ return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); ++ case IPPROTO_TCP: ++ return ntohl(((struct tcphdr *)nexthdr)->seq); ++ case IPPROTO_UDP: ++ return ntohs(((struct udphdr *)nexthdr)->source); ++ default: ++ return 0; ++ } ++} ++ + static u64 __esp_seq(struct sk_buff *skb) + { + u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); +@@ -492,6 +528,13 @@ static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, + } + } + ++/* ================================== */ ++/* IPTFS Trace Event Definitions */ ++/* ================================== */ ++ ++#define CREATE_TRACE_POINTS ++#include "trace_iptfs.h" ++ + /* ================================== */ + /* IPTFS Receiving (egress) Functions */ + /* ================================== */ +@@ -986,6 +1029,8 @@ static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) + } + data = sizeof(*ipth); + ++ trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset)); ++ + /* Set data past the basic header */ + if (ipth->subtype == IPTFS_SUBTYPE_CC) { + /* Copy the rest of the CC header */ +@@ -1883,6 +1928,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + */ + if (!ok) { + nospace: ++ trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE); + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); + continue; +@@ -1892,6 +1938,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + * enqueue. 
+ */ + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { ++ trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -1900,12 +1947,17 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, + ok = iptfs_enqueue(xtfs, skb); + if (!ok) + goto nospace; ++ ++ trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); + } + + /* Start a delay timer if we don't have one yet */ +- if (!hrtimer_is_queued(&xtfs->iptfs_timer)) ++ if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, + IPTFS_HRTIMER_MODE); ++ xtfs->iptfs_settime = ktime_get_raw_fast_ns(); ++ trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); ++ } + + spin_unlock_bh(&x->lock); + return 0; +@@ -1990,6 +2042,7 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, + struct sk_buff *nskb = *skbp; + u32 copy_len, offset; + u32 to_copy = skb->len - mtu; ++ u32 blkoff = 0; + int err = 0; + + INIT_LIST_HEAD(&sublist); +@@ -2002,6 +2055,7 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, + to_copy = skb->len - offset; + while (to_copy) { + /* Send all but last fragment to allow agg. append */ ++ trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL); + list_add_tail(&nskb->list, &sublist); + + /* FUTURE: if the packet has an odd/non-aligning length we could +@@ -2021,11 +2075,14 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, + iptfs_output_prepare_skb(nskb, to_copy); + offset += copy_len; + to_copy -= copy_len; ++ blkoff = to_copy; + } + skb_abort_seq_read(&skbseq); + + /* return last fragment that will be unsent (or NULL) */ + *skbp = nskb; ++ if (nskb) ++ trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL); + + /* trim the original skb to MTU */ + if (!err) +@@ -2133,6 +2190,8 @@ static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, + /* We've split these up before queuing */ + BUG_ON(skb_is_gso(skb)); + ++ trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb)); ++ + /* Simple case -- it fits. `mtu` accounted for all the overhead + * including the basic IPTFS header. + */ +@@ -2233,6 +2292,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + */ + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + ++ trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } +@@ -2280,6 +2340,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + * case. 
+ */ + while ((skb2 = skb_peek(list))) { ++ trace_iptfs_ingress_nth_peek(skb2, remaining); + if (skb2->len > remaining) + break; + +@@ -2315,6 +2376,8 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) + skb->len += skb2->len; + remaining -= skb2->len; + ++ trace_iptfs_ingress_nth_add(skb2, share_ok); ++ + if (share_ok) { + iptfs_consume_frags(skb, skb2); + } else { +@@ -2338,6 +2401,7 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) + struct sk_buff_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; ++ time64_t settime; + + xtfs = container_of(me, typeof(*xtfs), iptfs_timer); + x = xtfs->x; +@@ -2354,6 +2418,7 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + xtfs->queue_size = 0; ++ settime = xtfs->iptfs_settime; + spin_unlock(&x->lock); + + /* After the above unlock, packets can begin queuing again, and the +@@ -2362,6 +2427,9 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) + * already). + */ + ++ trace_iptfs_timer_expire( ++ xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime)); ++ + iptfs_output_queued(x, &list); + + return HRTIMER_NORESTART; +-- +2.46.0 +