FreddyYe updated this revision to Diff 468398.
FreddyYe added a comment.

Address comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D135938/new/

https://reviews.llvm.org/D135938

Files:
  clang/docs/ReleaseNotes.rst
  clang/include/clang/Basic/BuiltinsX86.def
  clang/include/clang/Driver/Options.td
  clang/lib/Basic/Targets/X86.cpp
  clang/lib/Basic/Targets/X86.h
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/avxvnniint8intrin.h
  clang/lib/Headers/cpuid.h
  clang/lib/Headers/immintrin.h
  clang/test/CodeGen/attr-target-x86.c
  clang/test/CodeGen/avxvnniint8-builtins.c
  clang/test/Driver/x86-target-features.c
  clang/test/Preprocessor/x86_target_features.c
  llvm/docs/ReleaseNotes.rst
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/include/llvm/Support/X86TargetParser.def
  llvm/lib/Support/Host.cpp
  llvm/lib/Support/X86TargetParser.cpp
  llvm/lib/Target/X86/X86.td
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/lib/Target/X86/X86ISelLowering.h
  llvm/lib/Target/X86/X86InstrFoldTables.cpp
  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
  llvm/lib/Target/X86/X86InstrInfo.cpp
  llvm/lib/Target/X86/X86InstrInfo.td
  llvm/lib/Target/X86/X86InstrSSE.td
  llvm/lib/Target/X86/X86IntrinsicsInfo.h
  llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
  llvm/test/MC/Disassembler/X86/avx-vnni_int8-intel.txt
  llvm/test/MC/Disassembler/X86/x86-64-avx-vnni_int8-att.txt
  llvm/test/MC/X86/avx-vnni-int8-intel.s
  llvm/test/MC/X86/x86-64-avx-vnni-int8-att.s

Index: llvm/test/MC/X86/x86-64-avx-vnni-int8-att.s
===================================================================
--- /dev/null
+++ llvm/test/MC/X86/x86-64-avx-vnni-int8-att.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-encoding < %s  | FileCheck %s
+
+// CHECK: vpdpbssd %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6]
+     vpdpbssd %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssd %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6]
+     vpdpbssd %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbssd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssd  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbssd  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssd  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbssd  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssd  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbssd  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbssd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssd  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbssd  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssd  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbssd  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssd  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbssd  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbssds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6]
+     vpdpbssds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6]
+     vpdpbssds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbssds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbssds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbssds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbssds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbssds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbssds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbssds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbssds  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6]
+     vpdpbsud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6]
+     vpdpbsud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsud  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbsud  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsud  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbsud  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsud  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbsud  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsud  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbsud  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsud  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbsud  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsud  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbsud  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6]
+     vpdpbsuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6]
+     vpdpbsuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbsuds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbsuds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbsuds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbsuds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbsuds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbsuds  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6]
+     vpdpbuud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6]
+     vpdpbuud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuud  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbuud  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuud  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbuud  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuud  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbuud  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuud  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbuud  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuud  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00]
+     vpdpbuud  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuud  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbuud  -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6]
+     vpdpbuuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6]
+     vpdpbuuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds  291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbuuds  291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds  (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbuuds  (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds  -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbuuds  -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+     vpdpbuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds  291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+     vpdpbuuds  291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds  (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00]
+     vpdpbuuds  (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds  -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbuuds  -512(,%rbp,2), %xmm13, %xmm12
+
Index: llvm/test/MC/X86/avx-vnni-int8-intel.s
===================================================================
--- /dev/null
+++ llvm/test/MC/X86/avx-vnni-int8-intel.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpbssd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4]
+     vpdpbssd ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4]
+     vpdpbssd xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10]
+     vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10]
+     vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4]
+     vpdpbssds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4]
+     vpdpbssds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10]
+     vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10]
+     vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+     vpdpbsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+     vpdpbsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10]
+     vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10]
+     vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+     vpdpbsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+     vpdpbsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10]
+     vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10]
+     vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+     vpdpbuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+     vpdpbuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10]
+     vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10]
+     vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+     vpdpbuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+     vpdpbuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10]
+     vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+     vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+     vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+     vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10]
+     vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+     vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
Index: llvm/test/MC/Disassembler/X86/x86-64-avx-vnni_int8-att.txt
===================================================================
--- /dev/null
+++ llvm/test/MC/Disassembler/X86/x86-64-avx-vnni_int8-att.txt
@@ -0,0 +1,182 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s
+
+# CHECK: vpdpbssd %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x17,0x50,0xe6
+
+# CHECK: vpdpbssd %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x13,0x50,0xe6
+
+# CHECK: vpdpbssd  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssd  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssd  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbssd  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbssd  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssd  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssd  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbssd  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbssds %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x17,0x51,0xe6
+
+# CHECK: vpdpbssds %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x13,0x51,0xe6
+
+# CHECK: vpdpbssds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssds  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssds  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbssds  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbssds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssds  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssds  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbssds  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbsud %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x16,0x50,0xe6
+
+# CHECK: vpdpbsud %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x12,0x50,0xe6
+
+# CHECK: vpdpbsud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsud  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsud  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbsud  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbsud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsud  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsud  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbsud  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbsuds %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x16,0x51,0xe6
+
+# CHECK: vpdpbsuds %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x12,0x51,0xe6
+
+# CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsuds  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsuds  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbsuds  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbsuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsuds  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsuds  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbsuds  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbuud %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x14,0x50,0xe6
+
+# CHECK: vpdpbuud %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x10,0x50,0xe6
+
+# CHECK: vpdpbuud  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuud  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuud  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbuud  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbuud  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuud  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuud  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbuud  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbuuds %ymm14, %ymm13, %ymm12
+0xc4,0x42,0x14,0x51,0xe6
+
+# CHECK: vpdpbuuds %xmm14, %xmm13, %xmm12
+0xc4,0x42,0x10,0x51,0xe6
+
+# CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %ymm13, %ymm12
+0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuuds  291(%r8,%rax,4), %ymm13, %ymm12
+0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuuds  (%rip), %ymm13, %ymm12
+0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbuuds  -1024(,%rbp,2), %ymm13, %ymm12
+0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbuuds  268435456(%rbp,%r14,8), %xmm13, %xmm12
+0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuuds  291(%r8,%rax,4), %xmm13, %xmm12
+0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuuds  (%rip), %xmm13, %xmm12
+0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00
+
+# CHECK: vpdpbuuds  -512(,%rbp,2), %xmm13, %xmm12
+0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
Index: llvm/test/MC/Disassembler/X86/avx-vnni_int8-intel.txt
===================================================================
--- /dev/null
+++ llvm/test/MC/Disassembler/X86/avx-vnni_int8-intel.txt
@@ -0,0 +1,182 @@
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+
+# CHECK: vpdpbssd ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x50,0xd4
+
+# CHECK: vpdpbssd xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x50,0xd4
+
+# CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x50,0x10
+
+# CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x50,0x10
+
+# CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbssds ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x51,0xd4
+
+# CHECK: vpdpbssds xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x51,0xd4
+
+# CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x51,0x10
+
+# CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x51,0x10
+
+# CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x50,0xd4
+
+# CHECK: vpdpbsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x50,0xd4
+
+# CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x50,0x10
+
+# CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x50,0x10
+
+# CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x51,0xd4
+
+# CHECK: vpdpbsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x51,0xd4
+
+# CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x51,0x10
+
+# CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x51,0x10
+
+# CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x50,0xd4
+
+# CHECK: vpdpbuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x50,0xd4
+
+# CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x50,0x10
+
+# CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x50,0x10
+
+# CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: vpdpbuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x51,0xd4
+
+# CHECK: vpdpbuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x51,0xd4
+
+# CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x51,0x10
+
+# CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x51,0x10
+
+# CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
Index: llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X64
+
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18]
+; X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT:    vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x1f]
+; X64-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <4 x i32>, <4 x i32>* %x2p
+  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %res = add <4 x i32> %1, %2
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18]
+; X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
+; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT:    vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x1f]
+; X64-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
+; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <8 x i32>, <8 x i32>* %x2p
+  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %res = add <8 x i32> %1, %2
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18]
+; X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT:    vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x1f]
+; X64-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <4 x i32>, <4 x i32>* %x2p
+  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %res = add <4 x i32> %1, %2
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18]
+; X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT:    vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x1f]
+; X64-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <8 x i32>, <8 x i32>* %x2p
+  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %res = add <8 x i32> %1, %2
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18]
+; X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT:    vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x1f]
+; X64-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X64-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <4 x i32>, <4 x i32>* %x2p
+  %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %res = add <4 x i32> %1, %2
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18]
+; X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT:    vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x1f]
+; X64-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X64-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %x2 = load <8 x i32>, <8 x i32>* %x2p
+  %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %res = add <8 x i32> %1, %2
+  ret <8 x i32> %res
+}
Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -415,6 +415,18 @@
   X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbssd_128,  INTR_TYPE_3OP, X86ISD::VPDPBSSD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbssd_256,  INTR_TYPE_3OP, X86ISD::VPDPBSSD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbssds_128, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbssds_256, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbsud_128,  INTR_TYPE_3OP, X86ISD::VPDPBSUD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbsud_256,  INTR_TYPE_3OP, X86ISD::VPDPBSUD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbsuds_128, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbsuds_256, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbuud_128,  INTR_TYPE_3OP, X86ISD::VPDPBUUD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbuud_256,  INTR_TYPE_3OP, X86ISD::VPDPBUUD,  0),
+  X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
+  X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
Index: llvm/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/lib/Target/X86/X86InstrSSE.td
+++ llvm/lib/Target/X86/X86InstrSSE.td
@@ -8115,3 +8115,61 @@
                                              X86GF2P8affineqb>, TAPD;
 }
 
+let Constraints = "$src1 = $dst" in
+multiclass avx_dotprod_rm<bits<8> opc, string OpcodeStr, ValueType OpVT,
+                          RegisterClass RC, PatFrag MemOpFrag,
+                          X86MemOperand x86memop, SDNode OpNode,
+                          X86FoldableSchedWrite sched,
+                          bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr  :  I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2, RC:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
+             VEX_4V, Sched<[sched]>;
+  def rm  :  I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2, x86memop:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+                                   (MemOpFrag addr:$src3))))]>,
+             VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVXVNNIINT8] in {
+  defm VPDPBSSD   : avx_dotprod_rm<0x50,"vpdpbssd",  v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
+                                   1>,  T8XD;
+  defm VPDPBSSDY  : avx_dotprod_rm<0x50,"vpdpbssd",  v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
+                                   1>,  VEX_L, T8XD;
+  defm VPDPBUUD   : avx_dotprod_rm<0x50,"vpdpbuud",  v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
+                                   1>,  T8PS;
+  defm VPDPBUUDY  : avx_dotprod_rm<0x50,"vpdpbuud",  v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
+                                   1>,  VEX_L, T8PS;
+  defm VPDPBSSDS  : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
+                                   1>, T8XD;
+  defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8XD;
+  defm VPDPBUUDS  : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
+                                   1>, T8PS;
+  defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8PS;
+  defm VPDPBSUD   : avx_dotprod_rm<0x50,"vpdpbsud",  v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbsud,  SchedWriteVecIMul.XMM,
+                                   0>,  T8XS;
+  defm VPDPBSUDY  : avx_dotprod_rm<0x50,"vpdpbsud",  v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbsud,  SchedWriteVecIMul.YMM,
+                                   0>,  VEX_L, T8XS;
+  defm VPDPBSUDS  : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
+                                   0>, T8XS;
+  defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8XS;
+}
Index: llvm/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.td
+++ llvm/lib/Target/X86/X86InstrInfo.td
@@ -913,6 +913,7 @@
 def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
 def HasBF16      : Predicate<"Subtarget->hasBF16()">;
 def HasFP16      : Predicate<"Subtarget->hasFP16()">;
+def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">;
 def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
 def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2550,6 +2550,14 @@
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDSYrr:
   case X86::VPDPWSSDSrr:
+  case X86::VPDPBSSDSrr:
+  case X86::VPDPBSSDSYrr:
+  case X86::VPDPBSSDrr:
+  case X86::VPDPBSSDYrr:
+  case X86::VPDPBUUDSrr:
+  case X86::VPDPBUUDSYrr:
+  case X86::VPDPBUUDrr:
+  case X86::VPDPBUUDYrr:
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128rk:
   case X86::VPDPWSSDZ128rkz:
Index: llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -813,6 +813,13 @@
   SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
 ]>;
 
+def X86vpdpbssd  : SDNode<"X86ISD::VPDPBSSD",  SDTVnni>;
+def X86vpdpbssds : SDNode<"X86ISD::VPDPBSSDS", SDTVnni>;
+def X86vpdpbsud  : SDNode<"X86ISD::VPDPBSUD",  SDTVnni>;
+def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>;
+def X86vpdpbuud  : SDNode<"X86ISD::VPDPBUUD",  SDTVnni>;
+def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>;
+
 //===----------------------------------------------------------------------===//
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/X86/X86InstrFoldTables.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -3995,6 +3995,14 @@
   { X86::VPCONFLICTQZ128rrk,         X86::VPCONFLICTQZ128rmk,         0 },
   { X86::VPCONFLICTQZ256rrk,         X86::VPCONFLICTQZ256rmk,         0 },
   { X86::VPCONFLICTQZrrk,            X86::VPCONFLICTQZrmk,            0 },
+  { X86::VPDPBSSDSYrr,               X86::VPDPBSSDSYrm,               0 },
+  { X86::VPDPBSSDSrr,                X86::VPDPBSSDSrm,                0 },
+  { X86::VPDPBSSDYrr,                X86::VPDPBSSDYrm,                0 },
+  { X86::VPDPBSSDrr,                 X86::VPDPBSSDrm,                 0 },
+  { X86::VPDPBSUDSYrr,               X86::VPDPBSUDSYrm,               0 },
+  { X86::VPDPBSUDSrr,                X86::VPDPBSUDSrm,                0 },
+  { X86::VPDPBSUDYrr,                X86::VPDPBSUDYrm,                0 },
+  { X86::VPDPBSUDrr,                 X86::VPDPBSUDrm,                 0 },
   { X86::VPDPBUSDSYrr,               X86::VPDPBUSDSYrm,               0 },
   { X86::VPDPBUSDSZ128r,             X86::VPDPBUSDSZ128m,             0 },
   { X86::VPDPBUSDSZ256r,             X86::VPDPBUSDSZ256m,             0 },
@@ -4005,6 +4013,10 @@
   { X86::VPDPBUSDZ256r,              X86::VPDPBUSDZ256m,              0 },
   { X86::VPDPBUSDZr,                 X86::VPDPBUSDZm,                 0 },
   { X86::VPDPBUSDrr,                 X86::VPDPBUSDrm,                 0 },
+  { X86::VPDPBUUDSYrr,               X86::VPDPBUUDSYrm,               0 },
+  { X86::VPDPBUUDSrr,                X86::VPDPBUUDSrm,                0 },
+  { X86::VPDPBUUDYrr,                X86::VPDPBUUDYrm,                0 },
+  { X86::VPDPBUUDrr,                 X86::VPDPBUUDrm,                 0 },
   { X86::VPDPWSSDSYrr,               X86::VPDPWSSDSYrm,               0 },
   { X86::VPDPWSSDSZ128r,             X86::VPDPWSSDSZ128m,             0 },
   { X86::VPDPWSSDSZ256r,             X86::VPDPWSSDSZ256m,             0 },
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -584,6 +584,13 @@
     VFCMULCSH,
     VFCMULCSH_RND,
 
+    VPDPBSUD,
+    VPDPBSUDS,
+    VPDPBUUD,
+    VPDPBUUDS,
+    VPDPBSSD,
+    VPDPBSSDS,
+
     // Compress and expand.
     COMPRESS,
     EXPAND,
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33997,6 +33997,12 @@
   NODE_NAME_CASE(ENQCMD)
   NODE_NAME_CASE(ENQCMDS)
   NODE_NAME_CASE(VP2INTERSECT)
+  NODE_NAME_CASE(VPDPBSUD)
+  NODE_NAME_CASE(VPDPBSUDS)
+  NODE_NAME_CASE(VPDPBUUD)
+  NODE_NAME_CASE(VPDPBUUDS)
+  NODE_NAME_CASE(VPDPBSSD)
+  NODE_NAME_CASE(VPDPBSSDS)
   NODE_NAME_CASE(AESENC128KL)
   NODE_NAME_CASE(AESDEC128KL)
   NODE_NAME_CASE(AESENC256KL)
Index: llvm/lib/Target/X86/X86.td
===================================================================
--- llvm/lib/Target/X86/X86.td
+++ llvm/lib/Target/X86/X86.td
@@ -181,6 +181,10 @@
 def FeatureFP16    : SubtargetFeature<"avx512fp16", "HasFP16", "true",
                            "Support 16-bit floating point",
                            [FeatureBWI, FeatureVLX, FeatureDQI]>;
+def FeatureAVXVNNIINT8  : SubtargetFeature<"avxvnniint8",
+                             "HasAVXVNNIINT8", "true",
+                             "Enable AVX-VNNI-INT8",
+                             [FeatureAVX2]>;
 def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                                [FeatureSSE2]>;
Index: llvm/lib/Support/X86TargetParser.cpp
===================================================================
--- llvm/lib/Support/X86TargetParser.cpp
+++ llvm/lib/Support/X86TargetParser.cpp
@@ -581,6 +581,7 @@
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 
+static constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
 static constexpr FeatureBitset ImpliedFeaturesAVX512FP16 =
     FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL;
 // Key Locker Features
Index: llvm/lib/Support/Host.cpp
===================================================================
--- llvm/lib/Support/Host.cpp
+++ llvm/lib/Support/Host.cpp
@@ -1808,6 +1808,7 @@
   Features["avxvnni"]    = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
   Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
   Features["hreset"]     = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);
+  Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave;
 
   bool HasLeafD = MaxLevel >= 0xd &&
                   !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
Index: llvm/include/llvm/Support/X86TargetParser.def
===================================================================
--- llvm/include/llvm/Support/X86TargetParser.def
+++ llvm/include/llvm/Support/X86TargetParser.def
@@ -201,6 +201,7 @@
 X86_FEATURE       (XSAVES,          "xsaves")
 X86_FEATURE       (HRESET,          "hreset")
 X86_FEATURE       (AVX512FP16,      "avx512fp16")
+X86_FEATURE       (AVXVNNIINT8,     "avxvnniint8")
 X86_FEATURE       (AVXVNNI,         "avxvnni")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1929,6 +1929,66 @@
               ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
               Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
                          llvm_v16i32_ty], [IntrNoMem]>;
+  def int_x86_avx2_vpdpbssd_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbssd_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbssd256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbssds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbssds128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbssds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbssds256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbsud_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbsud128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbsud_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbsud256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbsuds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbsuds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbuud_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbuud128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbuud_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbuud256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbuuds_128
+      : ClangBuiltin<"__builtin_ia32_vpdpbuuds128">,
+        Intrinsic<[llvm_v4i32_ty],
+                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx2_vpdpbuuds_256
+      : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">,
+        Intrinsic<[llvm_v8i32_ty],
+                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                  [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/docs/ReleaseNotes.rst
===================================================================
--- llvm/docs/ReleaseNotes.rst
+++ llvm/docs/ReleaseNotes.rst
@@ -131,6 +131,8 @@
 Changes to the X86 Backend
 --------------------------
 
+* Support ISA of ``AVX-VNNI-INT8``.
+
 Changes to the OCaml bindings
 -----------------------------
 
Index: clang/test/Preprocessor/x86_target_features.c
===================================================================
--- clang/test/Preprocessor/x86_target_features.c
+++ clang/test/Preprocessor/x86_target_features.c
@@ -581,6 +581,20 @@
 // AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1
 // AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1
 
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8 %s
+
+// AVXVNNIINT8: #define __AVX2__ 1
+// AVXVNNIINT8: #define __AVXVNNIINT8__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNIINT8 %s
+
+// NOAVXVNNIINT8-NOT: #define __AVXVNNIINT8__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8NOAVX2 %s
+
+// AVXVNNIINT8NOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNIINT8NOAVX2-NOT: #define __AVXVNNIINT8__ 1
+
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s
 
 // CRC32: #define __CRC32__ 1
Index: clang/test/Driver/x86-target-features.c
===================================================================
--- clang/test/Driver/x86-target-features.c
+++ clang/test/Driver/x86-target-features.c
@@ -305,6 +305,15 @@
 // AVX512FP16: "-target-feature" "+avx512fp16"
 // NO-AVX512FP16: "-target-feature" "-avx512fp16"
 
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnniint8 %s \
+// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX-VNNIINT8 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnniint8 \
+// RUN: %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX-VNNIINT8 %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -mno-avxvnniint8 \
+// RUN: %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX-VNNIINT8 %s
+// AVX-VNNIINT8: "-target-feature" "+avxvnniint8"
+// NO-AVX-VNNIINT8: "-target-feature" "-avxvnniint8"
+
 // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
 // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
 // CRC32: "-target-feature" "+crc32"
Index: clang/test/CodeGen/avxvnniint8-builtins.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/avxvnniint8-builtins.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-   -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+// CHECK-LABEL: @test_mm_dpbssd_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbssd.128
+__m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbssd_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbssds_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbssds.128
+__m128i test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbssds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbsud_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbsud.128
+__m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbsud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbsuds_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128
+__m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbsuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbuud_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbuud.128
+__m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbuud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbuuds_epi32(
+// CHECK:     call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128
+__m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbuuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbssd_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbssd.256
+__m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbssd_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbssds_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbssds.256
+__m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbssds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbsud_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbsud.256
+__m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbsud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbsuds_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256
+__m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbsuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbuud_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbuud.256
+__m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbuud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbuuds_epi32(
+// CHECK:     call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256
+__m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return _mm256_dpbuuds_epi32(__W, __A, __B);
+}
Index: clang/test/CodeGen/attr-target-x86.c
===================================================================
--- clang/test/CodeGen/attr-target-x86.c
+++ clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@
 // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
 // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
 // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 // CHECK-NOT: tune-cpu
 // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"
Index: clang/lib/Headers/immintrin.h
===================================================================
--- clang/lib/Headers/immintrin.h
+++ clang/lib/Headers/immintrin.h
@@ -254,6 +254,11 @@
 #include <gfniintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVXVNNIINT8__)
+#include <avxvnniint8intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDPID__)
 /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
Index: clang/lib/Headers/cpuid.h
===================================================================
--- clang/lib/Headers/cpuid.h
+++ clang/lib/Headers/cpuid.h
@@ -204,6 +204,9 @@
 #define bit_AVX512BF16    0x00000020
 #define bit_HRESET        0x00400000
 
+/* Features in %edx for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNIINT8   0x00000010
+
 /* Features in %eax for leaf 13 sub-leaf 1 */
 #define bit_XSAVEOPT    0x00000001
 #define bit_XSAVEC      0x00000002
Index: clang/lib/Headers/avxvnniint8intrin.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/avxvnniint8intrin.h
@@ -0,0 +1,471 @@
+/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINT8INTRIN_H
+#define __AVXVNNIINT8INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
+                 __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
+                 __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with signed saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x unsigned char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x unsigned char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with unsigned saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [16 x unsigned char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+///    signed 16-bit results. Sum these 4 results with the corresponding
+///    32-bit integer in \a __W with unsigned saturation, and store the packed
+///    32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [32 x unsigned char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT8INTRIN_H
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -143,6 +143,7 @@
   avx512vpopcntdqvlintrin.h
   avxintrin.h
   avxvnniintrin.h
+  avxvnniint8intrin.h
   bmi2intrin.h
   bmiintrin.h
   cetintrin.h
Index: clang/lib/Basic/Targets/X86.h
===================================================================
--- clang/lib/Basic/Targets/X86.h
+++ clang/lib/Basic/Targets/X86.h
@@ -135,6 +135,7 @@
   bool HasPTWRITE = false;
   bool HasINVPCID = false;
   bool HasENQCMD = false;
+  bool HasAVXVNNIINT8 = false;
   bool HasKL = false;      // For key locker
   bool HasWIDEKL = false; // For wide key locker
   bool HasHRESET = false;
Index: clang/lib/Basic/Targets/X86.cpp
===================================================================
--- clang/lib/Basic/Targets/X86.cpp
+++ clang/lib/Basic/Targets/X86.cpp
@@ -330,6 +330,8 @@
       HasAMXINT8 = true;
     } else if (Feature == "+amx-tile") {
       HasAMXTILE = true;
+    } else if (Feature == "+avxvnniint8") {
+      HasAVXVNNIINT8 = true;
     } else if (Feature == "+avxvnni") {
       HasAVXVNNI = true;
     } else if (Feature == "+serialize") {
@@ -774,6 +776,8 @@
     Builder.defineMacro("__AMXINT8__");
   if (HasAMXBF16)
     Builder.defineMacro("__AMXBF16__");
+  if (HasAVXVNNIINT8)
+    Builder.defineMacro("__AVXVNNIINT8__");
   if (HasAVXVNNI)
     Builder.defineMacro("__AVXVNNI__");
   if (HasSERIALIZE)
@@ -898,6 +902,7 @@
       .Case("avx512ifma", true)
       .Case("avx512vp2intersect", true)
       .Case("avxvnni", true)
+      .Case("avxvnniint8", true)
       .Case("bmi", true)
       .Case("bmi2", true)
       .Case("cldemote", true)
@@ -992,6 +997,7 @@
       .Case("avx512vbmi2", HasAVX512VBMI2)
       .Case("avx512ifma", HasAVX512IFMA)
       .Case("avx512vp2intersect", HasAVX512VP2INTERSECT)
+      .Case("avxvnniint8", HasAVXVNNIINT8)
       .Case("bmi", HasBMI)
       .Case("bmi2", HasBMI2)
       .Case("cldemote", HasCLDEMOTE)
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -4582,6 +4582,8 @@
 def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
 def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
 def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
+def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group<m_x86_Features_Group>;
+def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group<m_x86_Features_Group>;
 def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
 def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
 def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
Index: clang/include/clang/Basic/BuiltinsX86.def
===================================================================
--- clang/include/clang/Basic/BuiltinsX86.def
+++ clang/include/clang/Basic/BuiltinsX86.def
@@ -2091,6 +2091,20 @@
 TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
 TARGET_HEADER_BUILTIN(_InterlockedAnd64,         "WiWiD*Wi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_InterlockedDecrement64,   "WiWiD*",   "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_InterlockedExchange64,    "WiWiD*Wi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -550,6 +550,10 @@
 --------------------
 - Support ``-mindirect-branch-cs-prefix`` for call and jmp to indirect thunk.
 - Fix 32-bit ``__fastcall`` and ``__vectorcall`` ABI mismatch with MSVC.
+- Support ISA of ``AVX-VNNI-INT8``.
+  * Support intrinsics of ``_mm(256)_dpbssd(s)_epi32``.
+  * Support intrinsics of ``_mm(256)_dpbsud(s)_epi32``.
+  * Support intrinsics of ``_mm(256)_dpbuud(s)_epi32``.
 
 DWARF Support in Clang
 ----------------------
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to