Register and use AVX CPU intrinsic instructions, when available, to do 256-bit reads and speed up reading EDC and MC. Otherwise, fall back to 32-bit reads. Also align the destination buffer on a 32-byte boundary.
Signed-off-by: Rahul Lakkireddy <rahul.lakkire...@chelsio.com> Signed-off-by: Ganesh Goudar <ganes...@chelsio.com> --- drivers/net/ethernet/chelsio/cxgb4/Makefile | 1 + drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h | 2 + .../net/ethernet/chelsio/cxgb4/cudbg_intrinsic.c | 7 +- .../net/ethernet/chelsio/cxgb4/cudbg_intrinsic.h | 8 +++ .../ethernet/chelsio/cxgb4/cudbg_intrinsic_avx.c | 78 ++++++++++++++++++++++ drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 5 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 2 + 7 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic_avx.c diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile index 0dbaf1b18bac..a0f5239b19d4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/Makefile +++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile @@ -12,3 +12,4 @@ cxgb4-objs := cxgb4_main.o l2t.o smt.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o cxgb4-$(CONFIG_CHELSIO_T4_DCB) += cxgb4_dcb.o cxgb4-$(CONFIG_CHELSIO_T4_FCOE) += cxgb4_fcoe.o cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o +cxgb4-$(CONFIG_X86) += cudbg_intrinsic_avx.o diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h index b57acb8dc35b..4269d1621e9a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h @@ -25,6 +25,8 @@ #define MC1_FLAG 4 #define HMA_FLAG 5 +#define CUDBG_MEM_ALIGN 32 + #define CUDBG_ENTITY_SIGNATURE 0xCCEDB001 struct cudbg_mbox_log { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.c index 0b80512e5c0c..6ed418d90507 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.c @@ -34,5 +34,10 @@ unsigned int cudbg_mem_read_def(struct cudbg_init *pdbg_init, void cudbg_set_intrinsic_callback(struct 
cudbg_init *pdbg_init) { - pdbg_init->intrinsic_cb = cudbg_mem_read_def; +#ifdef CONFIG_X86 + if (cudbg_intrinsic_avx_supported()) + pdbg_init->intrinsic_cb = cudbg_mem_read_avx; + else +#endif + pdbg_init->intrinsic_cb = cudbg_mem_read_def; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.h index 3af0f07311ec..d878c71ef65d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic.h @@ -21,5 +21,13 @@ unsigned int cudbg_mem_read_def(struct cudbg_init *pdbg_init, u32 start, u32 offset, u32 size, u32 mem_aperture, u8 *outbuf); + +#ifdef CONFIG_X86 +int cudbg_intrinsic_avx_supported(void); +unsigned int cudbg_mem_read_avx(struct cudbg_init *pdbg_init, u32 start, + u32 offset, u32 size, u32 mem_aperture, + u8 *outbuf); +#endif + void cudbg_set_intrinsic_callback(struct cudbg_init *pdbg_init); #endif /* __CUDBG_INTRINSIC_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic_avx.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic_avx.c new file mode 100644 index 000000000000..d5bd4dfef428 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_intrinsic_avx.c @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2018 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include <linux/cpufeature.h> +#include <asm/fpu/api.h> + +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_intrinsic.h" + +int cudbg_intrinsic_avx_supported(void) +{ +#ifdef CONFIG_AS_AVX + return boot_cpu_has(X86_FEATURE_AVX); +#else + return 0; +#endif /* CONFIG_AS_AVX */ +} + +/* Alignment in bytes for AVX aligned instructions */ +#define CUDBG_MEM_ALIGN_AVX 32 + +unsigned int cudbg_mem_read_avx(struct cudbg_init *pdbg_init, u32 start, + u32 offset, u32 size, u32 mem_aperture, + u8 *outbuf) +{ +#ifdef CONFIG_AS_AVX + u32 max_read_len = CUDBG_MEM_ALIGN_AVX; + struct adapter *adap = pdbg_init->adap; + u8 *reg_addr, *src_addr, *dst_addr; + u32 bytes_read, read_len; + + reg_addr = (u8 *)adap->regs + start + offset; + src_addr = PTR_ALIGN(reg_addr, max_read_len); + dst_addr = PTR_ALIGN(outbuf, max_read_len); + read_len = min(size, max_read_len); + + /* Don't use intrinsic for following cases: + * 1. If reading current offset + 256-bits would + * exceed current window aperture. + * 2. Source or Destination address is not aligned + * to 256-bits. + * 3. There are less than 256-bits left to read. 
+ */ + if (offset + max_read_len > mem_aperture || + src_addr != reg_addr || dst_addr != outbuf || + read_len < max_read_len) { + return cudbg_mem_read_def(pdbg_init, start, offset, size, + mem_aperture, outbuf); + } else { + kernel_fpu_begin(); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*reg_addr)); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*outbuf)); + kernel_fpu_end(); + bytes_read = read_len; + } + + return bytes_read; +#else + return cudbg_mem_read_def(pdbg_init, start, offset, size, mem_aperture, + outbuf); +#endif /* CONFIG_AS_AVX */ +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c index db1b57a09887..220ba2f60cf7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c @@ -428,12 +428,15 @@ int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size, buf, &total_size); - if (flag & CXGB4_ETH_DUMP_MEM) + if (flag & CXGB4_ETH_DUMP_MEM) { + dbg_buff.offset = roundup(dbg_buff.offset, CUDBG_MEM_ALIGN); + total_size = roundup(total_size, CUDBG_MEM_ALIGN); cxgb4_cudbg_collect_entity(&cudbg_init, &dbg_buff, cxgb4_collect_mem_dump, ARRAY_SIZE(cxgb4_collect_mem_dump), buf, &total_size); + } cudbg_hdr->data_len = total_size; *buf_size = total_size; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 7852d98bad75..d437e46f6af6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -1362,6 +1362,7 @@ static int set_dump(struct net_device *dev, struct ethtool_dump *eth_dump) len = sizeof(struct cudbg_hdr) + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; len += cxgb4_get_dump_length(adapter, eth_dump->flag); + len = roundup(len, CUDBG_MEM_ALIGN); adapter->eth_dump.flag = eth_dump->flag; adapter->eth_dump.len = len; @@ -1391,6 +1392,7 @@ static int get_dump_data(struct net_device *dev, 
struct ethtool_dump *eth_dump, len = sizeof(struct cudbg_hdr) + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; len += cxgb4_get_dump_length(adapter, adapter->eth_dump.flag); + len = roundup(len, CUDBG_MEM_ALIGN); if (eth_dump->len < len) return -ENOMEM; -- 2.14.1