Thanks for the patch! Acked-by: Sairam Venugopal <vsai...@vmware.com>
On 7/4/16, 2:53 AM, "Paul Boca" <pb...@cloudbasesolutions.com> wrote: >"If the length of the payload to be encapsulated exceeds 64KB, or if >the offset to the L4 header exceeds 255 bytes, then it will not be >possible to offload the packet to the NIC for segmentation." (STT RFC) >In this case the packet needs to be segmented by us, before sending it. > >Signed-off-by: Paul-Daniel Boca <pb...@cloudbasesolutions.com> >--- > datapath-windows/ovsext/Stt.c | 363 >++++++++++++++++++++++-------------------- > 1 file changed, 194 insertions(+), 169 deletions(-) > >diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c >index e8f33a9..ad322d2 100644 >--- a/datapath-windows/ovsext/Stt.c >+++ b/datapath-windows/ovsext/Stt.c >@@ -36,6 +36,8 @@ > #endif > #define OVS_DBG_MOD OVS_DBG_STT > >+#define OVS_MAX_STT_PACKET_LENGTH 0x10000 >+#define OVS_MAX_STT_L4_OFFSET_LENGTH 0xFF > > KSTART_ROUTINE OvsSttDefragCleaner; > static PLIST_ENTRY OvsSttPktFragHash; >@@ -157,6 +159,10 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > ULONG mss = 0; > NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; > PVOID vlanTagValue; >+ ULONG tcpHeaderOffset = sizeof(EthHdr) + sizeof(IPHdr); >+ UINT32 encapMss = OvsGetExternalMtu(switchContext) >+ - sizeof(IPHdr) >+ - sizeof(TCPHdr); > > curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); > >@@ -166,6 +172,24 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > > if (layers->isTcp) { > mss = OVSGetTcpMSS(curNbl); >+ >+ curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >+ innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >+ >+ /* If the length of the packet exceeds 64K or if the L4 offset is >+ bigger than 255 bytes, then the packet cannot be offloaded to >the >+ network card */ >+ if ((innerFrameLen > OVS_MAX_STT_PACKET_LENGTH) || >+ (layers->l4Offset > OVS_MAX_STT_L4_OFFSET_LENGTH)) { >+ *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers, >+ mss - headRoom, headRoom); >+ if (*newNbl == NULL) { >+ OVS_LOG_ERROR("Unable to segment NBL"); >+ return NDIS_STATUS_FAILURE; >+ } >+ /* Clear out LSO flags after this point */ >+ NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) >= 0; >+ } > } > > vportStt = (POVS_STT_VPORT) GetOvsVportPriv(vport); >@@ -175,164 +199,195 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, > >TcpIpChecksumNetBufferListInfo); > vlanTagValue = NET_BUFFER_LIST_INFO(curNbl, >Ieee8021QNetBufferListInfo); >- *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom, >- FALSE /*copy NblInfo*/); > if (*newNbl == NULL) { >- OVS_LOG_ERROR("Unable to copy NBL"); >- return NDIS_STATUS_FAILURE; >+ *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom, >+ FALSE /*copy NblInfo*/); >+ if (*newNbl == NULL) { >+ OVS_LOG_ERROR("Unable to copy NBL"); >+ return NDIS_STATUS_FAILURE; >+ } > } >- > curNbl = *newNbl; >- curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >- curMdl = NET_BUFFER_CURRENT_MDL(curNb); >- /* NB Chain should be split before */ >- ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >- innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >- >- bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, >- LowPagePriority); >- if (bufferStart == NULL) { >- status = NDIS_STATUS_RESOURCES; >- goto ret_error; >- } >- bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); >- >- if (layers->isIPv4) { >- IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); >- if (!ip->tot_len) { >- ip->tot_len = htons(innerFrameLen - layers->l3Offset); >+ for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL; >+ curNb = curNb->Next) { >+ curMdl = NET_BUFFER_CURRENT_MDL(curNb); >+ innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >+ bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, >+ >LowPagePriority); >+ if (bufferStart == NULL) { >+ status = NDIS_STATUS_RESOURCES; >+ goto ret_error; > } >- if (!ip->check) { >- ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >+ bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); >+ >+ if (layers->isIPv4) { >+ IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); >+ if (!ip->tot_len) { >+ ip->tot_len = htons(innerFrameLen - layers->l3Offset); >+ } >+ if (!ip->check) { >+ ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >+ } > } >- } > >- if (layers->isTcp) { >- if (mss) { >- innerPartialChecksum = TRUE; >- } else { >- if (!csumInfo.Transmit.TcpChecksum) { >+ if (layers->isTcp) { >+ if (mss) { >+ innerPartialChecksum = TRUE; >+ } else { >+ if (!csumInfo.Transmit.TcpChecksum) { >+ innerChecksumVerified = TRUE; >+ } else { >+ innerPartialChecksum = TRUE; >+ } >+ } >+ } else if (layers->isUdp) { >+ if(!csumInfo.Transmit.UdpChecksum) { > innerChecksumVerified = TRUE; > } else { > innerPartialChecksum = TRUE; > } > } >- } else if (layers->isUdp) { >- if(!csumInfo.Transmit.UdpChecksum) { >- innerChecksumVerified = TRUE; >- } else { >- innerPartialChecksum = TRUE; >- } >- } > >- status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL); >- if (status != NDIS_STATUS_SUCCESS) { >- ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)"); >- OVS_LOG_ERROR("Unable to >NdisRetreatNetBufferDataStart(headroom)"); >- goto ret_error; >- } >+ status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL); >+ if (status != NDIS_STATUS_SUCCESS) { >+ ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)"); >+ OVS_LOG_ERROR("Unable to >NdisRetreatNetBufferDataStart(headroom)"); >+ goto ret_error; >+ } > >- /* >- * Make sure that the headroom for the tunnel header is continguous >in >- * memory. >- */ >- curMdl = NET_BUFFER_CURRENT_MDL(curNb); >- ASSERT((int) (MmGetMdlByteCount(curMdl) - >- NET_BUFFER_CURRENT_MDL_OFFSET(curNb)) >= (int) headRoom); >- >- buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority); >- if (!buf) { >- ASSERT(!"MmGetSystemAddressForMdlSafe failed"); >- OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed"); >- status = NDIS_STATUS_RESOURCES; >- goto ret_error; >- } >+ /* >+ * Make sure that the headroom for the tunnel header is >continguous in >+ * memory. >+ */ >+ curMdl = NET_BUFFER_CURRENT_MDL(curNb); >+ ASSERT((int) (MmGetMdlByteCount(curMdl) - >+ NET_BUFFER_CURRENT_MDL_OFFSET(curNb)) >= (int) >headRoom); >+ >+ buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, >LowPagePriority); >+ if (!buf) { >+ ASSERT(!"MmGetSystemAddressForMdlSafe failed"); >+ OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed"); >+ status = NDIS_STATUS_RESOURCES; >+ goto ret_error; >+ } > >- buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); >- outerEthHdr = (EthHdr *)buf; >- outerIpHdr = (IPHdr *) (outerEthHdr + 1); >- outerTcpHdr = (TCPHdr *) (outerIpHdr + 1); >- sttHdr = (SttHdr *) (outerTcpHdr + 1); >- >- /* L2 header */ >- ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) == >- (PCHAR)&fwdInfo->srcMacAddr); >- NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr, >- sizeof outerEthHdr->Destination + sizeof >outerEthHdr->Source); >- outerEthHdr->Type = htons(ETH_TYPE_IPV4); >- >- /* L3 header */ >- outerIpHdr->ihl = sizeof(IPHdr) >> 2; >- outerIpHdr->version = IPPROTO_IPV4; >- outerIpHdr->tos = tunKey->tos; >- >- ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + >innerFrameLen; >- outerIpHdr->tot_len = htons(ipTotalLen); >- ASSERT(ipTotalLen < 65536); >- >- outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, >innerFrameLen); >- outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ? >- IP_DF_NBO : 0; >- outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64; >- outerIpHdr->protocol = IPPROTO_TCP; >- outerIpHdr->check = 0; >- outerIpHdr->saddr = fwdInfo->srcIpAddr; >- outerIpHdr->daddr = tunKey->dst; >- >- /* L4 header */ >- RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr); >- outerTcpHdr->source = htons(tunKey->flow_hash | 32768); >- outerTcpHdr->dest = htons(vportStt->dstPort); >- outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) << >- STT_SEQ_LEN_SHIFT); >- outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo)); >- outerTcpHdr->doff = sizeof(TCPHdr) >> 2; >- outerTcpHdr->psh = 1; >- outerTcpHdr->ack = 1; >- outerTcpHdr->window = (uint16) ~0; >- >- /* Calculate pseudo header chksum */ >- tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen; >- ASSERT(tcpChksumLen < 65535); >- sttHdr->version = 0; >- >- /* Set STT Header */ >- sttHdr->flags = 0; >- sttHdr->mss = 0; >- sttHdr->l4Offset = 0; >- if (innerPartialChecksum) { >- sttHdr->flags |= STT_CSUM_PARTIAL; >- if (layers->isIPv4) { >- sttHdr->flags |= STT_PROTO_IPV4; >+ buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); >+ outerEthHdr = (EthHdr *)buf; >+ outerIpHdr = (IPHdr *) (outerEthHdr + 1); >+ outerTcpHdr = (TCPHdr *) (outerIpHdr + 1); >+ sttHdr = (SttHdr *) (outerTcpHdr + 1); >+ >+ /* L2 header */ >+ ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof >fwdInfo->dstMacAddr) == >+ (PCHAR)&fwdInfo->srcMacAddr); >+ NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr, >+ sizeof outerEthHdr->Destination + sizeof >outerEthHdr->Source); >+ outerEthHdr->Type = htons(ETH_TYPE_IPV4); >+ >+ /* L3 header */ >+ outerIpHdr->ihl = sizeof(IPHdr) >> 2; >+ outerIpHdr->version = IPPROTO_IPV4; >+ outerIpHdr->tos = tunKey->tos; >+ >+ ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + >innerFrameLen; >+ outerIpHdr->tot_len = htons(ipTotalLen); >+ ASSERT(ipTotalLen < 65536); >+ >+ outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, >innerFrameLen); >+ outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) >? >+ IP_DF_NBO : 0; >+ outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64; >+ outerIpHdr->protocol = IPPROTO_TCP; >+ outerIpHdr->check = 0; >+ outerIpHdr->saddr = fwdInfo->srcIpAddr; >+ outerIpHdr->daddr = tunKey->dst; >+ >+ /* L4 header */ >+ RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr); >+ outerTcpHdr->source = htons(tunKey->flow_hash | 32768); >+ outerTcpHdr->dest = htons(vportStt->dstPort); >+ outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) << >+ STT_SEQ_LEN_SHIFT); >+ outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo)); >+ outerTcpHdr->doff = sizeof(TCPHdr) >> 2; >+ outerTcpHdr->psh = 1; >+ outerTcpHdr->ack = 1; >+ outerTcpHdr->window = (uint16) ~0; >+ >+ /* Calculate pseudo header chksum */ >+ tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen; >+ ASSERT(tcpChksumLen < 65535); >+ sttHdr->version = 0; >+ >+ /* Set STT Header */ >+ sttHdr->flags = 0; >+ sttHdr->mss = 0; >+ sttHdr->l4Offset = 0; >+ if (innerPartialChecksum) { >+ sttHdr->flags |= STT_CSUM_PARTIAL; >+ if (layers->isIPv4) { >+ sttHdr->flags |= STT_PROTO_IPV4; >+ } >+ if (layers->isTcp) { >+ sttHdr->flags |= STT_PROTO_TCP; >+ } >+ sttHdr->l4Offset = (UINT8) layers->l4Offset; >+ sttHdr->mss = (UINT16) htons(mss); >+ } else if (innerChecksumVerified) { >+ sttHdr->flags = STT_CSUM_VERIFIED; >+ sttHdr->l4Offset = 0; >+ sttHdr->mss = 0; > } >- if (layers->isTcp) { >- sttHdr->flags |= STT_PROTO_TCP; >+ >+ /* Set VLAN tag */ >+ sttHdr->vlanTCI = 0; >+ if (vlanTagValue) { >+ PNDIS_NET_BUFFER_LIST_8021Q_INFO vlanTag = >+ (PNDIS_NET_BUFFER_LIST_8021Q_INFO)(PVOID *)&vlanTagValue; >+ sttHdr->vlanTCI = htons(vlanTag->TagHeader.VlanId | >OVSWIN_VLAN_CFI | >+ (vlanTag->TagHeader.UserPriority << >13)); > } >- sttHdr->l4Offset = (UINT8) layers->l4Offset; >- sttHdr->mss = (UINT16) htons(mss); >- } else if (innerChecksumVerified) { >- sttHdr->flags = STT_CSUM_VERIFIED; >- sttHdr->l4Offset = 0; >- sttHdr->mss = 0; >- } > >- /* Set VLAN tag */ >- sttHdr->vlanTCI = 0; >- if (vlanTagValue) { >- PNDIS_NET_BUFFER_LIST_8021Q_INFO vlanTag = >- (PNDIS_NET_BUFFER_LIST_8021Q_INFO)(PVOID *)&vlanTagValue; >- sttHdr->vlanTCI = htons(vlanTag->TagHeader.VlanId | >OVSWIN_VLAN_CFI | >- (vlanTag->TagHeader.UserPriority << 13)); >+ sttHdr->reserved = 0; >+ sttHdr->key = tunKey->tunnelId; >+ /* Zero out stt padding */ >+ *(uint16 *)(sttHdr + 1) = 0; >+ >+ /* The LSO offloading will be set only if the packet isn't >+ segmented due to the 64K limit for the offloading or 255 bytes >+ limit of L4 offset */ >+ if (ipTotalLen > encapMss) { >+ /* For Windows LSO, the TCP pseudo checksum must contain >Source IP >+ * Address, Destination IP Address, and Protocol; the length >of the >+ * payload is excluded because the underlying miniport >driver and NIC >+ * generate TCP segments from the large packet that is >passed down by >+ * the TCP/IP transport, the transport does not know the >size of the >+ * TCP payload for each TCP segment and therefore cannot >include the >+ * TCP Length in the pseudo-header. >+ */ >+ outerIpHdr->check = IPChecksum((UINT8 *)outerIpHdr, >+ sizeof *outerIpHdr, 0); >+ outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, >+ (uint32 *)&tunKey->dst, >+ IPPROTO_TCP, (uint16)0); >+ >+ lsoInfo.Value = 0; >+ lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; >+ lsoInfo.LsoV2Transmit.MSS = encapMss; >+ lsoInfo.LsoV2Transmit.Type = >NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >+ lsoInfo.LsoV2Transmit.IPVersion = >NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >+ NET_BUFFER_LIST_INFO(curNbl, >+ TcpLargeSendNetBufferListInfo) = lsoInfo.Value; >+ } else { >+ outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, >+ (uint32 *) &tunKey->dst, >+ IPPROTO_TCP, >+ (uint16) tcpChksumLen); >+ } > } > >- sttHdr->reserved = 0; >- sttHdr->key = tunKey->tunnelId; >- /* Zero out stt padding */ >- *(uint16 *)(sttHdr + 1) = 0; >- >- /* Offload IP and TCP checksum */ >- ULONG tcpHeaderOffset = sizeof *outerEthHdr + >- outerIpHdr->ihl * 4; >+ /* Offload IP and TCP checksum. >+ The offsets are the same for all segments if the packet was >segmented */ > csumInfo.Value = 0; > csumInfo.Transmit.IpHeaderChecksum = 1; > csumInfo.Transmit.TcpChecksum = 1; >@@ -341,36 +396,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > NET_BUFFER_LIST_INFO(curNbl, > TcpIpChecksumNetBufferListInfo) = >csumInfo.Value; > >- UINT32 encapMss = OvsGetExternalMtu(switchContext) >- - sizeof(IPHdr) >- - sizeof(TCPHdr); >- if (ipTotalLen > encapMss) { >- /* For Windows LSO, the TCP pseudo checksum must contain Source >IP >- * Address, Destination IP Address, and Protocol; the length of >the >- * payload is excluded because the underlying miniport driver >and NIC >- * generate TCP segments from the large packet that is passed >down by >- * the TCP/IP transport, the transport does not know the size of >the >- * TCP payload for each TCP segment and therefore cannot include >the >- * TCP Length in the pseudo-header. >- */ >- outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, >- (uint32 *) &tunKey->dst, >- IPPROTO_TCP, (uint16) 0); >- >- lsoInfo.Value = 0; >- lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; >- lsoInfo.LsoV2Transmit.MSS = encapMss; >- lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >- lsoInfo.LsoV2Transmit.IPVersion = >NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >- NET_BUFFER_LIST_INFO(curNbl, >- TcpLargeSendNetBufferListInfo) = >lsoInfo.Value; >- } else { >- outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr, >- (uint32 *) &tunKey->dst, >- IPPROTO_TCP, >- (uint16) tcpChksumLen); >- } >- > return STATUS_SUCCESS; > > ret_error: >-- >2.7.2.windows.1 >_______________________________________________ >dev mailing list >dev@openvswitch.org >https://urldefense.proofpoint.com/v2/url?u=http-3A__openvswitch.org_mailma >n_listinfo_dev&d=CwIGaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Dc >ruz40PROJ40ROzSpxyQSLw6fcrOWpJgEcEmNR3JEQ&m=GP_iTs4Uc8b40qSWLjQXq3W8pVIbST >C6AWjiHEHkW1I&s=t2t5TuiH24WV0N7JOn7nKkr0_T29yFc6ex275a4g4-g&e= _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev