Revise manual pages. [SA-18:08.tcp]

Fix L1 Terminal Fault (L1TF) kernel information disclosure.
[SA-18:09.l1tf]

Fix resource exhaustion in IP fragment reassembly. [SA-18:10.ip]

Fix unauthenticated EAPOL-Key decryption vulnerability.
[SA-18:11.hostapd]

Approved by:	so
delphij 2018-08-15 02:30:11 +00:00 committed by Franco Fichtner
parent d039d53b5a
commit aad223b461
16 changed files with 573 additions and 163 deletions


@ -16,6 +16,21 @@ from older versions of FreeBSD, try WITHOUT_CLANG and WITH_GCC to bootstrap to
the tip of head, and then rebuild without this option. The bootstrap process
from older versions of current across the gcc/clang cutover is a bit fragile.
20180814 p2 FreeBSD-SA-18:08.tcp [revised]
FreeBSD-SA-18:09.l1tf
FreeBSD-SA-18:10.ip
FreeBSD-SA-18:11.hostapd
Revise manual pages. [SA-18:08.tcp]
Fix L1 Terminal Fault (L1TF) kernel information disclosure.
[SA-18:09.l1tf]
Fix resource exhaustion in IP fragment reassembly. [SA-18:10.ip]
Fix unauthenticated EAPOL-Key decryption vulnerability.
[SA-18:11.hostapd]
20180806 p1 FreeBSD-SA-18:08.tcp
Fix resource exhaustion in TCP reassembly.


@ -2027,6 +2027,17 @@ int wpa_sm_rx_eapol(struct wpa_sm *sm, const u8 *src_addr,
if ((sm->proto == WPA_PROTO_RSN || sm->proto == WPA_PROTO_OSEN) &&
(key_info & WPA_KEY_INFO_ENCR_KEY_DATA)) {
/*
* Only decrypt the Key Data field if the frame's authenticity
* was verified. When using AES-SIV (FILS), the MIC flag is not
* set, so this check should only be performed if mic_len != 0
* which is the case in this code branch.
*/
if (!(key_info & WPA_KEY_INFO_MIC)) {
wpa_msg(sm->ctx->msg_ctx, MSG_WARNING,
"WPA: Ignore EAPOL-Key with encrypted but unauthenticated data");
goto out;
}
if (wpa_supplicant_decrypt_key_data(sm, key, ver, key_data,
&key_data_len))
goto out;
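
The change above can be read as a single predicate: an EAPOL-Key frame carrying encrypted Key Data must also carry a verifiable MIC before the supplicant attempts decryption. A minimal C sketch, not part of the commit or of the hostapd API (the helper name is hypothetical; the bit values are the IEEE 802.11 Key Information bits as defined in hostapd's wpa_common.h):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define WPA_KEY_INFO_MIC           (1 << 8)   /* Key MIC flag */
#define WPA_KEY_INFO_ENCR_KEY_DATA (1 << 12)  /* Encrypted Key Data flag */

/* Hypothetical helper: decrypt only what can be authenticated first.
 * With AES-SIV (FILS) the MIC flag is legitimately absent, but that
 * case has mic_len == 0 and is handled in a different branch than the
 * one patched above. */
static bool
may_decrypt_key_data(uint16_t key_info, size_t mic_len)
{
	if (!(key_info & WPA_KEY_INFO_ENCR_KEY_DATA))
		return false;	/* nothing encrypted to decrypt */
	if (mic_len == 0)
		return true;	/* AEAD cipher authenticates itself */
	return (key_info & WPA_KEY_INFO_MIC) != 0;
}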


@ -28,7 +28,7 @@
.\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
.Dd February 4, 2016
.Dd August 14, 2018
.Dt INET 4
.Os
.Sh NAME
@ -229,15 +229,38 @@ At the same time, on high-speed links, it can decrease the ID reuse
cycle greatly.
Default is 0 (sequential IP IDs).
IPv6 flow IDs and fragment IDs are always random.
.It Va ip.maxfrags
Integer: maximum number of fragments the host will accept and simultaneously
hold across all reassembly queues in all VNETs.
If set to 0, reassembly is disabled.
If set to -1, this limit is not applied.
This limit is recalculated when the number of mbuf clusters is changed.
This is a global limit.
.It Va ip.maxfragpackets
Integer: maximum number of fragmented packets the host will accept and hold
in the reassembling queue simultaneously.
0 means that the host will not accept any fragmented packets.
\-1 means that the host will accept as many fragmented packets as it receives.
Integer: maximum number of fragmented packets the host will accept and
simultaneously hold in the reassembly queue for a particular VNET.
0 means that the host will not accept any fragmented packets for that VNET.
\-1 means that the host will not apply this limit for that VNET.
This limit is recalculated when the number of mbuf clusters is changed.
This is a per-VNET limit.
.It Va ip.maxfragbucketsize
Integer: maximum number of reassembly queues per bucket.
Fragmented packets are hashed to buckets.
Each bucket has a list of reassembly queues.
The system must compare the incoming packets to the existing reassembly queues
in the bucket to find a matching reassembly queue.
To preserve system resources, the system limits the number of reassembly
queues allowed in each bucket.
This limit is recalculated when the number of mbuf clusters is changed or
when the value of
.Va ip.maxfragpackets
changes.
This is a per-VNET limit.
.It Va ip.maxfragsperpacket
Integer: maximum number of fragments the host will accept and hold
in the reassembling queue for a packet.
0 means that the host will not accept any fragmented packets.
in the reassembly queue for a packet.
0 means that the host will not accept any fragmented packets for the VNET.
This is a per-VNET limit.
.El
.Sh SEE ALSO
.Xr ioctl 2 ,
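
The knobs documented above are plain integer sysctls and can be exercised from userland. A minimal sketch, assuming a system with this errata applied (writing requires root; the value 64 is purely illustrative):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	/* Read the global (all-VNET) fragment limit. */
	if (sysctlbyname("net.inet.ip.maxfrags", &val, &len, NULL, 0) == 0)
		printf("net.inet.ip.maxfrags: %d\n", val);

	/* Lower the per-bucket reassembly queue limit. */
	val = 64;
	if (sysctlbyname("net.inet.ip.maxfragbucketsize", NULL, NULL,
	    &val, sizeof(val)) == -1)
		perror("net.inet.ip.maxfragbucketsize");
	return (0);
}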


@ -29,7 +29,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd September 2, 2009
.Dd August 14, 2018
.Dt INET6 4
.Os
.Sh NAME
@ -219,12 +219,41 @@ packets.
This value applies to all the transport protocols on top of
.Tn IPv6 .
There are APIs to override the value.
.It Dv IPV6CTL_MAXFRAGS
.Pq ip6.maxfrags
Integer: maximum number of fragments the host will accept and simultaneously
hold across all reassembly queues in all VNETs.
If set to 0, fragment reassembly is disabled.
If set to -1, this limit is not applied.
This limit is recalculated when the number of mbuf clusters is changed.
This is a global limit.
.It Dv IPV6CTL_MAXFRAGPACKETS
.Pq ip6.maxfragpackets
Integer: default maximum number of fragmented packets the node will accept.
0 means that the node will not accept any fragmented packets.
-1 means that the node will accept as many fragmented packets as it receives.
The flag is provided basically for avoiding possible DoS attacks.
Integer: maximum number of fragmented packets the node will accept and
simultaneously hold in the reassembly queue for a particular VNET.
0 means that the node will not accept any fragmented packets for that VNET.
-1 means that the node will not apply this limit for that VNET.
This limit is recalculated when the number of mbuf clusters is changed.
This is a per-VNET limit.
.It Dv IPV6CTL_MAXFRAGBUCKETSIZE
.Pq ip6.maxfragbucketsize
Integer: maximum number of reassembly queues per bucket.
Fragmented packets are hashed to buckets.
Each bucket has a list of reassembly queues.
The system must compare the incoming packets to the existing reassembly queues
in the bucket to find a matching reassembly queue.
To preserve system resources, the system limits the number of reassembly
queues allowed in each bucket.
This limit is recalculated when the number of mbuf clusters is changed or
when the value of
.Va ip6.maxfragpackets
changes.
This is a per-VNET limit.
.It Dv IPV6CTL_MAXFRAGSPERPACKET
.Pq ip6.maxfragsperpacket
Integer: maximum number of fragments the host will accept and hold in the
reassembly queue for a packet.
This is a per-VNET limit.
.It Dv IPV6CTL_ACCEPT_RTADV
.Pq ip6.accept_rtadv
Boolean: the default value of a per-interface flag to

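The IPv6 knobs mirror their IPv4 counterparts with the same -1/0/N semantics; the one distinction worth underlining is that ip6.maxfrags is global across all VNETs while ip6.maxfragpackets is per-VNET. A minimal sketch along the same lines as the IPv4 example:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int frags, pkts;
	size_t len;

	len = sizeof(frags);	/* global limit, shared by every VNET */
	sysctlbyname("net.inet6.ip6.maxfrags", &frags, &len, NULL, 0);
	len = sizeof(pkts);	/* per-VNET reassembly queue limit */
	sysctlbyname("net.inet6.ip6.maxfragpackets", &pkts, &len, NULL, 0);
	printf("ip6.maxfrags=%d ip6.maxfragpackets=%d\n", frags, pkts);
	return (0);
}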

@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
.Dd February 6, 2017
.Dd August 6, 2018
.Dt TCP 4
.Os
.Sh NAME


@ -1305,6 +1305,9 @@ pmap_init(void)
vm_size_t s;
int error, i, pv_npg, ret, skz63;
/* L1TF, reserve page @0 unconditionally */
vm_page_blacklist_add(0, bootverbose);
/* Detect bare-metal Skylake Server and Skylake-X. */
if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {


@ -185,6 +185,12 @@ static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
&vpid_alloc_failed, 0, NULL);
static int guest_l1d_flush;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
&guest_l1d_flush, 0, NULL);
uint64_t vmx_msr_flush_cmd;
/*
* Use the last page below 4GB as the APIC access address. This address is
* occupied by the boot firmware so it is guaranteed that it will not conflict
@ -720,6 +726,12 @@ vmx_init(int ipinum)
return (error);
}
guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
if (guest_l1d_flush &&
(cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D;
/*
* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
*/


@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
@ -86,3 +87,6 @@ ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));
ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));
ASSYM(PAGE_SIZE, PAGE_SIZE);
ASSYM(KERNBASE, KERNBASE);


@ -28,6 +28,7 @@
*/
#include <machine/asmacros.h>
#include <machine/specialreg.h>
#include "vmx_assym.h"
@ -173,9 +174,47 @@ ENTRY(vmx_enter_guest)
jbe invept_error /* Check invept instruction error */
guest_restore:
cmpl $0, %edx
je do_launch
/*
* Flush L1D cache if requested. Use IA32_FLUSH_CMD MSR if available,
* otherwise load enough of the data from the zero_region to flush
* existing L1D content.
*/
#define L1D_FLUSH_SIZE (64 * 1024)
movl %edx, %r8d
cmpb $0, guest_l1d_flush(%rip)
je after_l1d
movq vmx_msr_flush_cmd(%rip), %rax
testq %rax, %rax
jz 1f
movq %rax, %rdx
shrq $32, %rdx
movl $MSR_IA32_FLUSH_CMD, %ecx
wrmsr
jmp after_l1d
1: movq $KERNBASE, %r9
movq $-L1D_FLUSH_SIZE, %rcx
/*
* pass 1: Preload TLB.
* Kernel text is mapped using superpages, TLB preload is
* done for the benefit of older CPUs which split 2M page
* into 4k TLB entries.
*/
2: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
addq $PAGE_SIZE, %rcx
jne 2b
xorl %eax, %eax
cpuid
movq $-L1D_FLUSH_SIZE, %rcx
/* pass 2: Read each cache line */
3: movb L1D_FLUSH_SIZE(%r9, %rcx), %al
addq $64, %rcx
jne 3b
lfence
#undef L1D_FLUSH_SIZE
after_l1d:
cmpl $0, %r8d
je do_launch
VMX_GUEST_RESTORE
vmresume
/*

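For readers not fluent in the assembly above, a hypothetical C rendering of the software path: when the IA32_FLUSH_CMD MSR is unavailable, the L1D is displaced by reading a 64KB region in two passes. The function and buffer are illustrative only; note that the assembly also serializes with cpuid between the two passes, which plain C cannot express.

#include <stddef.h>
#include <stdint.h>

#define L1D_FLUSH_SIZE	(64 * 1024)
#define PAGE_4K		4096
#define CACHE_LINE	64

static void
l1d_flush_sketch(const volatile uint8_t *buf, int have_flush_msr)
{
	size_t i;

	if (have_flush_msr) {
		/* wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); */
		return;
	}
	/* Pass 1: touch one byte per 4K page to preload the TLB, for the
	 * benefit of CPUs that split the 2M kernel mapping into 4K TLB
	 * entries. */
	for (i = 0; i < L1D_FLUSH_SIZE; i += PAGE_4K)
		(void)buf[i];
	/* Pass 2: read one byte per cache line to evict prior L1D
	 * contents. */
	for (i = 0; i < L1D_FLUSH_SIZE; i += CACHE_LINE)
		(void)buf[i];
}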

@ -44,7 +44,7 @@
TYPE="FreeBSD"
REVISION="11.2"
BRANCH="RELEASE-p1"
BRANCH="RELEASE-p2"
if [ -n "${BRANCH_OVERRIDE}" ]; then
BRANCH=${BRANCH_OVERRIDE}
fi


@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
@ -63,13 +64,14 @@ SYSCTL_DECL(_net_inet_ip);
/*
* Reassembly headers are stored in hash buckets.
*/
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH_LOG2 10
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
struct ipqbucket {
TAILQ_HEAD(ipqhead, ipq) head;
struct mtx lock;
int count;
};
static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]);
@ -82,6 +84,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed);
#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock)
#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED)
static VNET_DEFINE(int, ipreass_maxbucketsize);
#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
void ipreass_init(void);
void ipreass_drain(void);
void ipreass_slowtimo(void);
@ -89,27 +94,53 @@ void ipreass_slowtimo(void);
void ipreass_destroy(void);
#endif
static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static void ipreass_zone_change(void *);
static void ipreass_drain_tomax(void);
static void ipq_free(struct ipqhead *, struct ipq *);
static void ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static inline void
ipq_timeout(struct ipqhead *head, struct ipq *fp)
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
ipq_free(head, fp);
ipq_free(bucket, fp);
}
static inline void
ipq_drop(struct ipqhead *head, struct ipq *fp)
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{
IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
ipq_free(head, fp);
ipq_free(bucket, fp);
}
/*
* By default, limit the number of IP fragments across all reassembly
* queues to 1/32 of the total number of mbuf clusters.
*
* Limit the total number of reassembly queues per VNET to the
* IP fragment limit, but ensure the limit will not allow any bucket
* to grow above 100 items. (The bucket limit is
* IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
* multiplier to reach a 100-item limit.)
* The 100-item limit was chosen as brief testing seems to show that
* this produces "reasonable" performance on some subset of systems
* under DoS attack.
*/
#define IP_MAXFRAGS (nmbclusters / 32)
#define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
static int maxfrags;
static volatile u_int nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
&maxfrags, 0,
"Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
__DEVOLATILE(u_int *, &nfrags), 0,
"Current number of IPv4 fragments across all reassembly queues");
static VNET_DEFINE(uma_zone_t, ipq_zone);
#define V_ipq_zone VNET(ipq_zone)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
@ -127,6 +158,10 @@ static VNET_DEFINE(int, maxfragsperpacket);
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(maxfragsperpacket), 0,
"Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
sysctl_maxfragbucketsize, "I",
"Maximum number of IPv4 fragment reassembly queue entries per bucket");
/*
* Take incoming datagram fragment and try to reassemble it into
@ -146,9 +181,9 @@ ip_reass(struct mbuf *m)
struct mbuf *p, *q, *nq, *t;
struct ipq *fp;
struct ipqhead *head;
int i, hlen, next;
int i, hlen, next, tmpmax;
u_int8_t ecn, ecn0;
uint32_t hash;
uint32_t hash, hashkey[3];
#ifdef RSS
uint32_t rss_hash, rss_type;
#endif
@ -156,8 +191,12 @@ ip_reass(struct mbuf *m)
/*
* If no reassembling or maxfragsperpacket are 0,
* never accept fragments.
* Also, drop packet if it would exceed the maximum
* number of fragments.
*/
if (V_noreass == 1 || V_maxfragsperpacket == 0) {
tmpmax = maxfrags;
if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
(tmpmax >= 0 && nfrags >= (u_int)tmpmax)) {
IPSTAT_INC(ips_fragments);
IPSTAT_INC(ips_fragdropped);
m_freem(m);
@ -202,8 +241,12 @@ ip_reass(struct mbuf *m)
m->m_data += hlen;
m->m_len -= hlen;
hash = ip->ip_src.s_addr ^ ip->ip_id;
hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
hashkey[0] = ip->ip_src.s_addr;
hashkey[1] = ip->ip_dst.s_addr;
hashkey[2] = (uint32_t)ip->ip_p << 16;
hashkey[2] += ip->ip_id;
hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
hash &= IPREASS_HMASK;
head = &V_ipq[hash].head;
IPQ_LOCK(hash);
@ -224,9 +267,12 @@ ip_reass(struct mbuf *m)
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
if (V_ipq[hash].count < V_ipreass_maxbucketsize)
fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
if (fp == NULL)
fp = ipq_reuse(hash);
if (fp == NULL)
goto dropfrag;
#ifdef MAC
if (mac_ipq_init(fp, M_NOWAIT) != 0) {
uma_zfree(V_ipq_zone, fp);
@ -236,7 +282,9 @@ ip_reass(struct mbuf *m)
mac_ipq_create(m, fp);
#endif
TAILQ_INSERT_HEAD(head, fp, ipq_list);
V_ipq[hash].count++;
fp->ipq_nfrags = 1;
atomic_add_int(&nfrags, 1);
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ip->ip_p;
fp->ipq_id = ip->ip_id;
@ -247,6 +295,7 @@ ip_reass(struct mbuf *m)
goto done;
} else {
fp->ipq_nfrags++;
atomic_add_int(&nfrags, 1);
#ifdef MAC
mac_ipq_update(m, fp);
#endif
@ -323,6 +372,7 @@ ip_reass(struct mbuf *m)
m->m_nextpkt = nq;
IPSTAT_INC(ips_fragdropped);
fp->ipq_nfrags--;
atomic_subtract_int(&nfrags, 1);
m_freem(q);
}
@ -340,7 +390,7 @@ ip_reass(struct mbuf *m)
for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
if (ntohs(GETIP(q)->ip_off) != next) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
ipq_drop(head, fp);
ipq_drop(&V_ipq[hash], fp);
goto done;
}
next += ntohs(GETIP(q)->ip_len);
@ -348,7 +398,7 @@ ip_reass(struct mbuf *m)
/* Make sure the last packet didn't have the IP_MF flag */
if (p->m_flags & M_IP_FRAG) {
if (fp->ipq_nfrags > V_maxfragsperpacket)
ipq_drop(head, fp);
ipq_drop(&V_ipq[hash], fp);
goto done;
}
@ -359,7 +409,7 @@ ip_reass(struct mbuf *m)
ip = GETIP(q);
if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
IPSTAT_INC(ips_toolong);
ipq_drop(head, fp);
ipq_drop(&V_ipq[hash], fp);
goto done;
}
@ -388,6 +438,7 @@ ip_reass(struct mbuf *m)
while (m->m_pkthdr.csum_data & 0xffff0000)
m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
(m->m_pkthdr.csum_data >> 16);
atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
mac_ipq_reassemble(fp, m);
mac_ipq_destroy(fp);
@ -402,6 +453,7 @@ ip_reass(struct mbuf *m)
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
TAILQ_REMOVE(head, fp, ipq_list);
V_ipq[hash].count--;
uma_zfree(V_ipq_zone, fp);
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
@ -447,8 +499,10 @@ ip_reass(struct mbuf *m)
dropfrag:
IPSTAT_INC(ips_fragdropped);
if (fp != NULL)
if (fp != NULL) {
fp->ipq_nfrags--;
atomic_subtract_int(&nfrags, 1);
}
m_freem(m);
done:
IPQ_UNLOCK(hash);
@ -463,21 +517,27 @@ done:
void
ipreass_init(void)
{
int max;
for (int i = 0; i < IPREASS_NHASH; i++) {
TAILQ_INIT(&V_ipq[i].head);
mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
MTX_DEF | MTX_DUPOK);
V_ipq[i].count = 0;
}
V_ipq_hashseed = arc4random();
V_maxfragsperpacket = 16;
V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
max = IP_MAXFRAGPACKETS;
max = uma_zone_set_max(V_ipq_zone, max);
V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
if (IS_DEFAULT_VNET(curvnet))
if (IS_DEFAULT_VNET(curvnet)) {
maxfrags = IP_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
}
}
/*
@ -492,7 +552,7 @@ ipreass_slowtimo(void)
IPQ_LOCK(i);
TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
if (--fp->ipq_ttl == 0)
ipq_timeout(&V_ipq[i].head, fp);
ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@ -507,7 +567,10 @@ ipreass_drain(void)
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
while(!TAILQ_EMPTY(&V_ipq[i].head))
ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
KASSERT(V_ipq[i].count == 0,
("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
V_ipq[i].count, V_ipq));
IPQ_UNLOCK(i);
}
}
@ -535,8 +598,22 @@ ipreass_destroy(void)
static void
ipreass_drain_tomax(void)
{
struct ipq *fp;
int target;
/*
* Make sure each bucket is under the new limit. If
* necessary, drop enough of the oldest elements from
* each bucket to get under the new limit.
*/
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
while (V_ipq[i].count > V_ipreass_maxbucketsize &&
(fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
/*
* If we are over the maximum number of fragments,
* drain off enough to get down to the new limit,
@ -545,13 +622,11 @@ ipreass_drain_tomax(void)
*/
target = uma_zone_get_max(V_ipq_zone);
while (uma_zone_get_cur(V_ipq_zone) > target) {
struct ipq *fp;
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
if (fp != NULL)
ipq_timeout(&V_ipq[i].head, fp);
ipq_timeout(&V_ipq[i], fp);
IPQ_UNLOCK(i);
}
}
@ -560,9 +635,20 @@ ipreass_drain_tomax(void)
static void
ipreass_zone_change(void *tag)
{
VNET_ITERATOR_DECL(vnet_iter);
int max;
uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
ipreass_drain_tomax();
maxfrags = IP_MAXFRAGS;
max = IP_MAXFRAGPACKETS;
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
max = uma_zone_set_max(V_ipq_zone, max);
V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
ipreass_drain_tomax();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
/*
@ -590,6 +676,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
* and place an extreme upper bound.
*/
max = uma_zone_set_max(V_ipq_zone, max);
V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
ipreass_drain_tomax();
V_noreass = 0;
} else if (max == 0) {
@ -598,6 +685,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
} else if (max == -1) {
V_noreass = 0;
uma_zone_set_max(V_ipq_zone, 0);
V_ipreass_maxbucketsize = INT_MAX;
} else
return (EINVAL);
return (0);
@ -611,49 +699,72 @@ static struct ipq *
ipq_reuse(int start)
{
struct ipq *fp;
int i;
int bucket, i;
IPQ_LOCK_ASSERT(start);
for (i = start;; i++) {
if (i == IPREASS_NHASH)
i = 0;
if (i != start && IPQ_TRYLOCK(i) == 0)
for (i = 0; i < IPREASS_NHASH; i++) {
bucket = (start + i) % IPREASS_NHASH;
if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
continue;
fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
if (fp) {
struct mbuf *m;
IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
m = fp->ipq_frags;
fp->ipq_frags = m->m_nextpkt;
m_freem(m);
}
TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
if (i != start)
IPQ_UNLOCK(i);
IPQ_LOCK_ASSERT(start);
return (fp);
TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
V_ipq[bucket].count--;
if (bucket != start)
IPQ_UNLOCK(bucket);
break;
}
if (i != start)
IPQ_UNLOCK(i);
if (bucket != start)
IPQ_UNLOCK(bucket);
}
IPQ_LOCK_ASSERT(start);
return (fp);
}
/*
* Free a fragment reassembly header and all associated datagrams.
*/
static void
ipq_free(struct ipqhead *fhp, struct ipq *fp)
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
struct mbuf *q;
atomic_subtract_int(&nfrags, fp->ipq_nfrags);
while (fp->ipq_frags) {
q = fp->ipq_frags;
fp->ipq_frags = q->m_nextpkt;
m_freem(q);
}
TAILQ_REMOVE(fhp, fp, ipq_list);
TAILQ_REMOVE(&bucket->head, fp, ipq_list);
bucket->count--;
uma_zfree(V_ipq_zone, fp);
}
/*
* Get or set the maximum number of reassembly queues per bucket.
*/
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
int error, max;
max = V_ipreass_maxbucketsize;
error = sysctl_handle_int(oidp, &max, 0, req);
if (error || !req->newptr)
return (error);
if (max <= 0)
return (EINVAL);
V_ipreass_maxbucketsize = max;
ipreass_drain_tomax();
return (0);
}

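The "100-item limit" mentioned in the comment falls out of the macros above. A small worked example, with nmbclusters chosen purely for illustration:

#include <stdio.h>

int
main(void)
{
	int nmbclusters = 2 * 1024 * 1024;	/* illustrative value */
	int nhash = 1 << 10;			/* IPREASS_NHASH */
	int maxfrags = nmbclusters / 32;	/* IP_MAXFRAGS = 65536 */
	int maxfragpackets = maxfrags < nhash * 50 ?
	    maxfrags : nhash * 50;		/* capped at 51200 */
	int bucketsize = maxfragpackets / (nhash / 2);

	if (bucketsize < 1)
		bucketsize = 1;
	/* Prints: maxfrags=65536 maxfragpackets=51200 bucketsize=100 */
	printf("maxfrags=%d maxfragpackets=%d bucketsize=%d\n",
	    maxfrags, maxfragpackets, bucketsize);
	return (0);
}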

@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <machine/atomic.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
@ -63,58 +66,110 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
static void frag6_deq(struct ip6asfrag *);
static void frag6_insque(struct ip6q *, struct ip6q *);
static void frag6_remque(struct ip6q *);
static void frag6_freef(struct ip6q *);
static struct mtx ip6qlock;
/*
* These fields all protected by ip6qlock.
* Reassembly headers are stored in hash buckets.
*/
static VNET_DEFINE(u_int, frag6_nfragpackets);
static VNET_DEFINE(u_int, frag6_nfrags);
static VNET_DEFINE(struct ip6q, ip6q); /* ip6 reassemble queue */
#define IP6REASS_NHASH_LOG2 10
#define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2)
#define IP6REASS_HMASK (IP6REASS_NHASH - 1)
static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
uint32_t bucket __unused);
static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
static void frag6_insque_head(struct ip6q *, struct ip6q *,
uint32_t bucket);
static void frag6_remque(struct ip6q *, uint32_t bucket);
static void frag6_freef(struct ip6q *, uint32_t bucket);
struct ip6qbucket {
struct ip6q ip6q;
struct mtx lock;
int count;
};
static VNET_DEFINE(volatile u_int, frag6_nfragpackets);
volatile u_int frag6_nfrags = 0;
static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
static VNET_DEFINE(uint32_t, ip6q_hashseed);
#define V_frag6_nfragpackets VNET(frag6_nfragpackets)
#define V_frag6_nfrags VNET(frag6_nfrags)
#define V_ip6q VNET(ip6q)
#define V_ip6q_hashseed VNET(ip6q_hashseed)
#define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
#define IP6Q_LOCK() mtx_lock(&ip6qlock)
#define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock)
#define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED)
#define IP6Q_UNLOCK() mtx_unlock(&ip6qlock)
#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock)
#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock)
#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock)
#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q)
static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
/*
* By default, limit the number of IP6 fragments across all reassembly
* queues to 1/32 of the total number of mbuf clusters.
*
* Limit the total number of reassembly queues per VNET to the
* IP6 fragment limit, but ensure the limit will not allow any bucket
* to grow above 100 items. (The bucket limit is
IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
* multiplier to reach a 100-item limit.)
* The 100-item limit was chosen as brief testing seems to show that
* this produces "reasonable" performance on some subset of systems
* under DoS attack.
*/
#define IP6_MAXFRAGS (nmbclusters / 32)
#define IP6_MAXFRAGPACKETS (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
/*
* Initialise reassembly queue and fragment identifier.
*/
void
frag6_set_bucketsize()
{
int i;
if ((i = V_ip6_maxfragpackets) > 0)
V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
}
static void
frag6_change(void *tag)
{
VNET_ITERATOR_DECL(vnet_iter);
V_ip6_maxfragpackets = nmbclusters / 4;
V_ip6_maxfrags = nmbclusters / 4;
ip6_maxfrags = IP6_MAXFRAGS;
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
frag6_set_bucketsize();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
}
void
frag6_init(void)
{
struct ip6q *q6;
int i;
V_ip6_maxfragpackets = nmbclusters / 4;
V_ip6_maxfrags = nmbclusters / 4;
V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
frag6_set_bucketsize();
for (i = 0; i < IP6REASS_NHASH; i++) {
q6 = IP6Q_HEAD(i);
q6->ip6q_next = q6->ip6q_prev = q6;
mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
V_ip6q[i].count = 0;
}
V_ip6q_hashseed = arc4random();
V_ip6_maxfragsperpacket = 64;
if (!IS_DEFAULT_VNET(curvnet))
return;
ip6_maxfrags = IP6_MAXFRAGS;
EVENTHANDLER_REGISTER(nmbclusters_change,
frag6_change, NULL, EVENTHANDLER_PRI_ANY);
IP6Q_LOCK_INIT();
}
/*
@ -155,12 +210,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
struct mbuf *m = *mp, *t;
struct ip6_hdr *ip6;
struct ip6_frag *ip6f;
struct ip6q *q6;
struct ip6q *head, *q6;
struct ip6asfrag *af6, *ip6af, *af6dwn;
struct in6_ifaddr *ia;
int offset = *offp, nxt, i, next;
int first_frag = 0;
int fragoff, frgpartlen; /* must be larger than u_int16_t */
uint32_t hash, hashkey[(sizeof(struct in6_addr) * 2) / sizeof(uint32_t) + 1], *hashkeyp;
struct ifnet *dstifp;
u_int8_t ecn, ecn0;
#ifdef RSS
@ -229,19 +285,38 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
return (ip6f->ip6f_nxt);
}
IP6Q_LOCK();
/* Get fragment length and discard 0-byte fragments. */
frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
if (frgpartlen == 0) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offsetof(struct ip6_hdr, ip6_plen));
in6_ifstat_inc(dstifp, ifs6_reass_fail);
IP6STAT_INC(ip6s_fragdropped);
return IPPROTO_DONE;
}
hashkeyp = hashkey;
memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
*hashkeyp = ip6f->ip6f_ident;
hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
hash &= IP6REASS_HMASK;
head = IP6Q_HEAD(hash);
IP6Q_LOCK(hash);
/*
* Enforce upper bound on number of fragments.
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
if (V_ip6_maxfrags < 0)
if (ip6_maxfrags < 0)
;
else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
else if (frag6_nfrags >= (u_int)ip6_maxfrags)
goto dropfrag;
for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
if (ip6f->ip6f_ident == q6->ip6q_ident &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
@ -251,7 +326,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
)
break;
if (q6 == &V_ip6q) {
if (q6 == head) {
/*
* the first fragment to arrive, create a reassembly queue.
*/
@ -266,9 +341,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
*/
if (V_ip6_maxfragpackets < 0)
;
else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize ||
V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
goto dropfrag;
V_frag6_nfragpackets++;
atomic_add_int(&V_frag6_nfragpackets, 1);
q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
M_NOWAIT);
if (q6 == NULL)
@ -281,7 +357,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
}
mac_ip6q_create(m, q6);
#endif
frag6_insque(q6, &V_ip6q);
frag6_insque_head(q6, head, hash);
/* ip6q_nxt will be filled afterwards, from 1st fragment */
q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
@ -315,21 +391,20 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
* in size.
* If it would exceed, discard the fragment and return an ICMP error.
*/
frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
if (q6->ip6q_unfrglen >= 0) {
/* The 1st fragment has already arrived. */
if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
IP6Q_UNLOCK();
IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
IP6Q_UNLOCK();
IP6Q_UNLOCK(hash);
return (IPPROTO_DONE);
}
/*
@ -348,7 +423,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
int erroff = af6->ip6af_offset;
/* dequeue the fragment. */
frag6_deq(af6);
frag6_deq(af6, hash);
free(af6, M_FTABLE);
/* adjust pointer. */
@ -446,7 +521,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
}
af6 = af6->ip6af_down;
m_freem(IP6_REASS_MBUF(af6->ip6af_up));
frag6_deq(af6->ip6af_up);
frag6_deq(af6->ip6af_up, hash);
}
#else
/*
@ -495,29 +570,38 @@ insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
* If not complete, check fragment limit.
* Move to front of packet queue, as we are
* the most recently active fragmented packet.
*/
frag6_enq(ip6af, af6->ip6af_up);
V_frag6_nfrags++;
frag6_enq(ip6af, af6->ip6af_up, hash);
atomic_add_int(&frag6_nfrags, 1);
q6->ip6q_nfrag++;
#if 0 /* xxx */
if (q6 != V_ip6q.ip6q_next) {
frag6_remque(q6);
frag6_insque(q6, &V_ip6q);
if (q6 != head->ip6q_next) {
frag6_remque(q6, hash);
frag6_insque_head(q6, head, hash);
}
#endif
next = 0;
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6->ip6af_down) {
if (af6->ip6af_off != next) {
IP6Q_UNLOCK();
if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
IP6STAT_INC(ip6s_fragdropped);
frag6_freef(q6, hash);
}
IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
next += af6->ip6af_frglen;
}
if (af6->ip6af_up->ip6af_mff) {
IP6Q_UNLOCK();
if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
IP6STAT_INC(ip6s_fragdropped);
frag6_freef(q6, hash);
}
IP6Q_UNLOCK(hash);
return IPPROTO_DONE;
}
@ -527,7 +611,7 @@ insert:
ip6af = q6->ip6q_down;
t = m = IP6_REASS_MBUF(ip6af);
af6 = ip6af->ip6af_down;
frag6_deq(ip6af);
frag6_deq(ip6af, hash);
while (af6 != (struct ip6asfrag *)q6) {
m->m_pkthdr.csum_flags &=
IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
@ -535,7 +619,7 @@ insert:
IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;
af6dwn = af6->ip6af_down;
frag6_deq(af6);
frag6_deq(af6, hash);
while (t->m_next)
t = t->m_next;
m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@ -562,13 +646,13 @@ insert:
#endif
if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
frag6_remque(q6, hash);
atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
atomic_subtract_int(&V_frag6_nfragpackets, 1);
goto dropfrag;
}
@ -579,14 +663,14 @@ insert:
m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
(caddr_t)&nxt);
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
frag6_remque(q6, hash);
atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_reassemble(q6, m);
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
atomic_subtract_int(&V_frag6_nfragpackets, 1);
if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
int plen = 0;
@ -608,7 +692,7 @@ insert:
m_tag_prepend(m, mtag);
#endif
IP6Q_UNLOCK();
IP6Q_UNLOCK(hash);
IP6STAT_INC(ip6s_reassembled);
in6_ifstat_inc(dstifp, ifs6_reass_ok);
@ -630,7 +714,7 @@ insert:
return nxt;
dropfrag:
IP6Q_UNLOCK();
IP6Q_UNLOCK(hash);
in6_ifstat_inc(dstifp, ifs6_reass_fail);
IP6STAT_INC(ip6s_fragdropped);
m_freem(m);
@ -641,19 +725,19 @@ insert:
* Free a fragment reassembly header and all
* associated datagrams.
*/
void
frag6_freef(struct ip6q *q6)
static void
frag6_freef(struct ip6q *q6, uint32_t bucket)
{
struct ip6asfrag *af6, *down6;
IP6Q_LOCK_ASSERT();
IP6Q_LOCK_ASSERT(bucket);
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = down6) {
struct mbuf *m = IP6_REASS_MBUF(af6);
down6 = af6->ip6af_down;
frag6_deq(af6);
frag6_deq(af6, bucket);
/*
* Return ICMP time exceeded error for the 1st fragment.
@ -675,24 +759,25 @@ frag6_freef(struct ip6q *q6)
m_freem(m);
free(af6, M_FTABLE);
}
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
frag6_remque(q6, bucket);
atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
#ifdef MAC
mac_ip6q_destroy(q6);
#endif
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
atomic_subtract_int(&V_frag6_nfragpackets, 1);
}
/*
* Put an ip fragment on a reassembly chain.
* Like insque, but pointers in middle of structure.
*/
void
frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
static void
frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
uint32_t bucket __unused)
{
IP6Q_LOCK_ASSERT();
IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up = up6;
af6->ip6af_down = up6->ip6af_down;
@ -703,36 +788,41 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
/*
* To frag6_enq as remque is to insque.
*/
void
frag6_deq(struct ip6asfrag *af6)
static void
frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
{
IP6Q_LOCK_ASSERT();
IP6Q_LOCK_ASSERT(bucket);
af6->ip6af_up->ip6af_down = af6->ip6af_down;
af6->ip6af_down->ip6af_up = af6->ip6af_up;
}
void
frag6_insque(struct ip6q *new, struct ip6q *old)
static void
frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket)
{
IP6Q_LOCK_ASSERT();
IP6Q_LOCK_ASSERT(bucket);
KASSERT(IP6Q_HEAD(bucket) == old,
("%s: attempt to insert at head of wrong bucket"
" (bucket=%u, old=%p)", __func__, bucket, old));
new->ip6q_prev = old;
new->ip6q_next = old->ip6q_next;
old->ip6q_next->ip6q_prev= new;
old->ip6q_next = new;
V_ip6q[bucket].count++;
}
void
frag6_remque(struct ip6q *p6)
static void
frag6_remque(struct ip6q *p6, uint32_t bucket)
{
IP6Q_LOCK_ASSERT();
IP6Q_LOCK_ASSERT(bucket);
p6->ip6q_prev->ip6q_next = p6->ip6q_next;
p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
V_ip6q[bucket].count--;
}
/*
@ -744,37 +834,71 @@ void
frag6_slowtimo(void)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ip6q *q6;
struct ip6q *head, *q6;
int i;
VNET_LIST_RLOCK_NOSLEEP();
IP6Q_LOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
q6 = V_ip6q.ip6q_next;
if (q6)
while (q6 != &V_ip6q) {
for (i = 0; i < IP6REASS_NHASH; i++) {
IP6Q_LOCK(i);
head = IP6Q_HEAD(i);
q6 = head->ip6q_next;
if (q6 == NULL) {
/*
* XXXJTL: This should never happen. This
* should turn into an assertion.
*/
IP6Q_UNLOCK(i);
continue;
}
while (q6 != head) {
--q6->ip6q_ttl;
q6 = q6->ip6q_next;
if (q6->ip6q_prev->ip6q_ttl == 0) {
IP6STAT_INC(ip6s_fragtimeout);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(q6->ip6q_prev);
frag6_freef(q6->ip6q_prev, i);
}
}
/*
* If we are over the maximum number of fragments
* (due to the limit being lowered), drain off
* enough to get down to the new limit.
* Note that we drain all reassembly queues if
* maxfragpackets is 0 (fragmentation is disabled),
* and don't enforce a limit when maxfragpackets
* is negative.
*/
while ((V_ip6_maxfragpackets == 0 ||
(V_ip6_maxfragpackets > 0 &&
V_ip6q[i].count > V_ip6_maxfragbucketsize)) &&
head->ip6q_prev != head) {
IP6STAT_INC(ip6s_fragoverflow);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(head->ip6q_prev, i);
}
IP6Q_UNLOCK(i);
}
/*
* If we are over the maximum number of fragments
* (due to the limit being lowered), drain off
* enough to get down to the new limit.
* If we are still over the maximum number of fragmented
* packets, drain off enough to get down to the new limit.
*/
while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
V_ip6q.ip6q_prev) {
IP6STAT_INC(ip6s_fragoverflow);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(V_ip6q.ip6q_prev);
i = 0;
while (V_ip6_maxfragpackets >= 0 &&
V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets) {
IP6Q_LOCK(i);
head = IP6Q_HEAD(i);
if (head->ip6q_prev != head) {
IP6STAT_INC(ip6s_fragoverflow);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(head->ip6q_prev, i);
}
IP6Q_UNLOCK(i);
i = (i + 1) % IP6REASS_NHASH;
}
CURVNET_RESTORE();
}
IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}
@ -785,22 +909,25 @@ void
frag6_drain(void)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ip6q *head;
int i;
VNET_LIST_RLOCK_NOSLEEP();
if (IP6Q_TRYLOCK() == 0) {
VNET_LIST_RUNLOCK_NOSLEEP();
return;
}
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
while (V_ip6q.ip6q_next != &V_ip6q) {
IP6STAT_INC(ip6s_fragdropped);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(V_ip6q.ip6q_next);
for (i = 0; i < IP6REASS_NHASH; i++) {
if (IP6Q_TRYLOCK(i) == 0)
continue;
head = IP6Q_HEAD(i);
while (head->ip6q_next != head) {
IP6STAT_INC(ip6s_fragdropped);
/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
frag6_freef(head->ip6q_next, i);
}
IP6Q_UNLOCK(i);
}
CURVNET_RESTORE();
}
IP6Q_UNLOCK();
VNET_LIST_RUNLOCK_NOSLEEP();
}

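The hash key that frag6_input() now assembles deserves a note: nine 32-bit words covering the source address, destination address, and fragment identifier, mixed by jenkins_hash32() under a per-VNET arc4random() seed, so remote senders cannot steer datagrams into a bucket of their choosing. A hypothetical struct (not in the commit) makes the layout explicit:

#include <stdint.h>

/* Hypothetical view of the hashkey[] words built in frag6_input(). */
struct frag6_hashkey {
	uint32_t src[4];	/* source struct in6_addr, as 32-bit words */
	uint32_t dst[4];	/* destination struct in6_addr */
	uint32_t ident;		/* ip6f_ident from the fragment header */
};	/* nine words total, fed to jenkins_hash32() */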

@ -637,7 +637,9 @@ struct ip6_mtuinfo {
#define IPV6CTL_INTRQMAXLEN 51 /* max length of IPv6 netisr queue */
#define IPV6CTL_INTRDQMAXLEN 52 /* max length of direct IPv6 netisr
* queue */
#define IPV6CTL_MAXID 53
#define IPV6CTL_MAXFRAGSPERPACKET 53 /* Max fragments per packet */
#define IPV6CTL_MAXFRAGBUCKETSIZE 54 /* Max reassembly queues per bucket */
#define IPV6CTL_MAXID 55
#endif /* __BSD_VISIBLE */
/*


@ -386,7 +386,9 @@ VNET_DEFINE(int, ip6_no_radr) = 0;
VNET_DEFINE(int, ip6_norbit_raif) = 0;
VNET_DEFINE(int, ip6_rfc6204w3) = 0;
VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */
int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */
VNET_DEFINE(int, ip6_log_interval) = 5;
VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
* process? */
@ -473,6 +475,20 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
return (0);
}
static int
sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
int error, val;
val = V_ip6_maxfragpackets;
error = sysctl_handle_int(oidp, &val, 0, req);
if (error != 0 || !req->newptr)
return (error);
V_ip6_maxfragpackets = val;
frag6_set_bucketsize();
return (0);
}
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
"Enable forwarding of IPv6 packets between interfaces");
@ -485,8 +501,9 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim,
SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
ip6stat,
"IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0,
SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
sysctl_ip6_maxfragpackets, "I",
"Default maximum number of outstanding fragmented IPv6 packets. "
"A value of 0 means no fragmented packets will be accepted, while a "
"value of -1 means no limit");
@ -560,8 +577,16 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,
"Use the default scope zone when none is specified");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0,
"Maximum allowed number of outstanding IPv6 packet fragments");
CTLFLAG_RW, &ip6_maxfrags, 0,
"Maximum allowed number of outstanding IPv6 packet fragments. "
"A value of 0 means no fragmented packets will be accepted, while a "
"value of -1 means no limit");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
"Maximum number of reassembly queues per hash bucket");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
"Maximum allowed number of fragments per packet");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0,
"Enable path MTU discovery for multicast packets");


@ -296,8 +296,10 @@ VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */
VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */
VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly
* queue */
VNET_DECLARE(int, ip6_maxfrags); /* Maximum fragments in reassembly
extern int ip6_maxfrags; /* Maximum fragments in reassembly
* queue */
VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */
VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */
VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */
VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */
VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA
@ -312,7 +314,8 @@ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */
#define V_ip6_mrouter VNET(ip6_mrouter)
#define V_ip6_sendredirects VNET(ip6_sendredirects)
#define V_ip6_maxfragpackets VNET(ip6_maxfragpackets)
#define V_ip6_maxfrags VNET(ip6_maxfrags)
#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize)
#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket)
#define V_ip6_accept_rtadv VNET(ip6_accept_rtadv)
#define V_ip6_no_radr VNET(ip6_no_radr)
#define V_ip6_norbit_raif VNET(ip6_norbit_raif)
@ -404,6 +407,7 @@ void ip6_flush_fwdtag(struct mbuf *);
int route6_input(struct mbuf **, int *, int);
void frag6_set_bucketsize(void);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
void frag6_slowtimo(void);


@ -387,6 +387,7 @@
*/
#define CPUID_STDEXT3_IBPB 0x04000000
#define CPUID_STDEXT3_STIBP 0x08000000
#define CPUID_STDEXT3_L1D_FLUSH 0x10000000
#define CPUID_STDEXT3_ARCH_CAP 0x20000000
#define CPUID_STDEXT3_SSBD 0x80000000
@ -438,6 +439,7 @@
#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
#define MSR_MTRRcap 0x0fe
#define MSR_IA32_ARCH_CAP 0x10a
#define MSR_IA32_FLUSH_CMD 0x10b
#define MSR_BBL_CR_ADDR 0x116
#define MSR_BBL_CR_DECC 0x118
#define MSR_BBL_CR_CTL 0x119
@ -593,6 +595,9 @@
/* MSR IA32_PRED_CMD */
#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL
/* MSR IA32_FLUSH_CMD */
#define IA32_FLUSH_CMD_L1D 0x00000001
/*
* PAT modes.
*/
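
These constants are what the VMX entry path above consumes. A minimal kernel-context sketch of the MSR path, assuming wrmsr() from machine/cpufunc.h (the wrapper name is hypothetical):

#include <machine/cpufunc.h>
#include <machine/specialreg.h>

/* Hypothetical wrapper: issue the architectural L1D flush, e.g. before
 * VM entry, on CPUs advertising CPUID_STDEXT3_L1D_FLUSH. */
static inline void
l1d_flush_msr(void)
{
	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
}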