From 618e28224775186af3e0d219eebeb8fae1c8acc8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 Jan 2024 11:50:44 +0000 Subject: [PATCH] [X86] Add i8 CTPOP lowering using i32 MUL Fixes #79823 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 28 +- llvm/test/CodeGen/X86/ctpop-combine.ll | 21 +- llvm/test/CodeGen/X86/popcnt.ll | 701 ++++++++++++------------ 3 files changed, 367 insertions(+), 383 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 531e00862558c..de2df5c036f55 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -427,7 +427,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // on the dest that popcntl hasn't had since Cannon Lake. setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32); } else { - setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i8 , Custom); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) @@ -30989,12 +30989,12 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, // Please ensure that any codegen change from LowerVectorCTPOP is reflected in // updated cost models in X86TTIImpl::getIntrinsicInstrCost. -static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, +static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); - SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. 
@@ -31035,9 +31035,27 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Op.getSimpleValueType().isVector() && + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + + // i8 CTPOP: multiply-mask-multiply on i32. MUL by 0x08040201 replicates the byte, SRL by 3 and AND with 0x11111111 keep one source bit per nibble, and MUL by 0x11111111 sums the nibbles into bits 31:28. NOTE(review): no fast-MUL subtarget check is visible in this hunk. + if (VT == MVT::i8) { + SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32); + Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32); + Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, + DAG.getConstant(0x08040201U, DL, MVT::i32)); + Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op, + DAG.getShiftAmountConstant(3, MVT::i32, DL)); + Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11); + Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11); + Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op, + DAG.getShiftAmountConstant(28, MVT::i32, DL)); + return DAG.getZExtOrTrunc(Op, DL, VT); + } + + assert(VT.isVector() && "We only do custom lowering for vector population count."); - return LowerVectorCTPOP(Op, Subtarget, DAG); + return LowerVectorCTPOP(Op, DL, Subtarget, DAG); } static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll index fba44218e0572..73152e9f909cf 100644 --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone { ; ; NO-POPCOUNT-LABEL: test4: ; NO-POPCOUNT: # %bb.0: -; NO-POPCOUNT-NEXT: movl %edi, %ecx -; NO-POPCOUNT-NEXT: andb $127, %cl -; NO-POPCOUNT-NEXT: shrb %dil -; NO-POPCOUNT-NEXT: andb $21, %dil -; NO-POPCOUNT-NEXT: subb %dil, %cl -; NO-POPCOUNT-NEXT: movl %ecx, %eax -; NO-POPCOUNT-NEXT: andb $51, %al -; NO-POPCOUNT-NEXT: shrb $2, %cl -; NO-POPCOUNT-NEXT: andb $51, %cl -; NO-POPCOUNT-NEXT: addb %al, %cl -; NO-POPCOUNT-NEXT: movl %ecx, %eax -; NO-POPCOUNT-NEXT: shrb $4, %al -; NO-POPCOUNT-NEXT: 
addb %cl, %al -; NO-POPCOUNT-NEXT: andb $15, %al +; NO-POPCOUNT-NEXT: andl $127, %edi +; NO-POPCOUNT-NEXT: imull $134480385, %edi, %eax # imm = 0x8040201 +; NO-POPCOUNT-NEXT: shrl $3, %eax +; NO-POPCOUNT-NEXT: andl $286331153, %eax # imm = 0x11111111 +; NO-POPCOUNT-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; NO-POPCOUNT-NEXT: shrl $28, %eax +; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax ; NO-POPCOUNT-NEXT: retq %x2 = and i8 %x, 127 %count = tail call i8 @llvm.ctpop.i8(i8 %x2) diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index a9d77fd2c0a61..37c7b051de7b1 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -1,46 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86,X86-NOSSE -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X86-POPCNT -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd | FileCheck %s --check-prefix=X64-NDD +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=X64,X64-BASE +; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefixes=X86-POPCNT +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefixes=X64-POPCNT +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd | FileCheck %s --check-prefixes=X64,X64-NDD ; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X86,X86-SSSE3 define i8 @cnt8(i8 %x) nounwind readnone { ; X86-LABEL: cnt8: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb %al -; X86-NEXT: 
andb $85, %al -; X86-NEXT: subb %al, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andb $51, %al -; X86-NEXT: shrb $2, %cl -; X86-NEXT: andb $51, %cl -; X86-NEXT: addb %al, %cl -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $4, %al -; X86-NEXT: addb %cl, %al -; X86-NEXT: andb $15, %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X86-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: cnt8: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrb %al -; X64-NEXT: andb $85, %al -; X64-NEXT: subb %al, %dil -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shrb $2, %dil -; X64-NEXT: andb $51, %dil -; X64-NEXT: addb %dil, %cl -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: shrb $4, %al -; X64-NEXT: addb %cl, %al -; X64-NEXT: andb $15, %al +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $134480385, %eax, %eax # imm = 0x8040201 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $286331153, %eax # imm = 0x11111111 +; X64-NEXT: imull $286331153, %eax, %eax # imm = 0x11111111 +; X64-NEXT: shrl $28, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-POPCNT-LABEL: cnt8: @@ -56,20 +43,6 @@ define i8 @cnt8(i8 %x) nounwind readnone { ; X64-POPCNT-NEXT: popcntl %eax, %eax ; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-POPCNT-NEXT: retq -; -; X64-NDD-LABEL: cnt8: -; X64-NDD: # %bb.0: -; X64-NDD-NEXT: shrb %dil, %al -; X64-NDD-NEXT: andb $85, %al -; X64-NDD-NEXT: subb %al, %dil, %al -; X64-NDD-NEXT: andb $51, %al, %cl -; X64-NDD-NEXT: shrb $2, %al -; X64-NDD-NEXT: andb $51, %al -; X64-NDD-NEXT: addb %cl, %al -; X64-NDD-NEXT: shrb $4, %al, %cl -; X64-NDD-NEXT: addb %cl, %al -; X64-NDD-NEXT: andb $15, %al -; X64-NDD-NEXT: retq %cnt = tail call i8 @llvm.ctpop.i8(i8 %x) ret 
i8 %cnt } @@ -98,27 +71,27 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; -; X64-LABEL: cnt16: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl $21845, %eax # imm = 0x5555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $13107, %edi # imm = 0x3333 -; X64-NEXT: addl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $8, %ecx -; X64-NEXT: addl %eax, %ecx -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt16: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $21845, %eax # imm = 0x5555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: andl $13107, %eax # imm = 0x3333 +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl $13107, %edi # imm = 0x3333 +; X64-BASE-NEXT: addl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $3855, %eax # imm = 0xF0F +; X64-BASE-NEXT: movl %eax, %ecx +; X64-BASE-NEXT: shrl $8, %ecx +; X64-BASE-NEXT: addl %eax, %ecx +; X64-BASE-NEXT: movzbl %cl, %eax +; X64-BASE-NEXT: # kill: def $ax killed $ax killed $eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt16: ; X86-POPCNT: # %bb.0: @@ -176,24 +149,24 @@ define i32 @cnt32(i32 %x) nounwind readnone { ; X86-NEXT: shrl $24, %eax ; X86-NEXT: retl ; -; X64-LABEL: cnt32: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: 
shrl $2, %edi -; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X64-NEXT: addl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X64-NEXT: shrl $24, %eax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt32: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-BASE-NEXT: addl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-BASE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-BASE-NEXT: shrl $24, %eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt32: ; X86-POPCNT: # %bb.0: @@ -263,28 +236,28 @@ define i64 @cnt64(i64 %x) nounwind readnone { ; X86-NOSSE-NEXT: xorl %edx, %edx ; X86-NOSSE-NEXT: retl ; -; X64-LABEL: cnt64: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: subq %rcx, %rdi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: retq +; 
X64-BASE-LABEL: cnt64: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: subq %rcx, %rdi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rdi, %rcx +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: movq %rax, %rcx +; X64-BASE-NEXT: shrq $4, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rcx, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %rdx, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt64: ; X86-POPCNT: # %bb.0: @@ -447,45 +420,45 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-NOSSE-NEXT: popl %ebx ; X86-NOSSE-NEXT: retl $4 ; -; X64-LABEL: cnt128: -; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx -; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: 
shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax -; X64-NEXT: imulq %r9, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: retq +; X64-BASE-LABEL: cnt128: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rsi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rsi +; X64-BASE-NEXT: andq %rcx, %rsi +; X64-BASE-NEXT: addq %rsi, %rax +; X64-BASE-NEXT: movq %rax, %rdx +; X64-BASE-NEXT: shrq $4, %rdx +; X64-BASE-NEXT: addq %rax, %rdx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rsi, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %r9, %rdx +; X64-BASE-NEXT: shrq $56, %rdx +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rdi +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movq %rcx, %rax +; X64-BASE-NEXT: shrq $4, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: andq %rsi, %rax +; X64-BASE-NEXT: imulq %r9, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: addq %rdx, %rax +; X64-BASE-NEXT: xorl %edx, %edx +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: @@ -671,28 +644,28 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat { ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; -; X64-LABEL: 
cnt64_noimplicitfloat: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: subq %rcx, %rdi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt64_noimplicitfloat: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: subq %rcx, %rdi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rdi, %rcx +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: movq %rax, %rcx +; X64-BASE-NEXT: shrq $4, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rcx, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %rdx, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt64_noimplicitfloat: ; X86-POPCNT: # %bb.0: @@ -752,25 +725,25 @@ define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize { ; X86-NEXT: shrl $24, %eax ; X86-NEXT: retl ; -; X64-LABEL: cnt32_optsize: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl 
$1431655765, %eax # imm = 0x55555555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: andl %eax, %ecx -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl %eax, %edi -; X64-NEXT: addl %ecx, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X64-NEXT: shrl $24, %eax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt32_optsize: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl $858993459, %eax # imm = 0x33333333 +; X64-BASE-NEXT: movl %edi, %ecx +; X64-BASE-NEXT: andl %eax, %ecx +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl %eax, %edi +; X64-BASE-NEXT: addl %ecx, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-BASE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-BASE-NEXT: shrl $24, %eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt32_optsize: ; X86-POPCNT: # %bb.0: @@ -850,28 +823,28 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize { ; X86-NOSSE-NEXT: popl %ebx ; X86-NOSSE-NEXT: retl ; -; X64-LABEL: cnt64_optsize: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: subq %rcx, %rdi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $1085102592571150095, 
%rdx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt64_optsize: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: subq %rcx, %rdi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rdi, %rcx +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: movq %rax, %rcx +; X64-BASE-NEXT: shrq $4, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rcx, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %rdx, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt64_optsize: ; X86-POPCNT: # %bb.0: @@ -1042,45 +1015,45 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; -; X64-LABEL: cnt128_optsize: -; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx -; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 
0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax -; X64-NEXT: imulq %r9, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: retq +; X64-BASE-LABEL: cnt128_optsize: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rsi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rsi +; X64-BASE-NEXT: andq %rcx, %rsi +; X64-BASE-NEXT: addq %rsi, %rax +; X64-BASE-NEXT: movq %rax, %rdx +; X64-BASE-NEXT: shrq $4, %rdx +; X64-BASE-NEXT: addq %rax, %rdx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rsi, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %r9, %rdx +; X64-BASE-NEXT: shrq $56, %rdx +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rdi +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movq %rcx, %rax +; X64-BASE-NEXT: shrq $4, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: andq %rsi, %rax +; X64-BASE-NEXT: imulq %r9, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: addq %rdx, %rax +; X64-BASE-NEXT: xorl 
%edx, %edx +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: @@ -1251,24 +1224,24 @@ define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { ; X86-NEXT: shrl $24, %eax ; X86-NEXT: retl ; -; X64-LABEL: cnt32_pgso: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X64-NEXT: addl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X64-NEXT: shrl $24, %eax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt32_pgso: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-BASE-NEXT: addl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-BASE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-BASE-NEXT: shrl $24, %eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt32_pgso: ; X86-POPCNT: # %bb.0: @@ -1338,28 +1311,28 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { ; X86-NOSSE-NEXT: xorl %edx, %edx ; X86-NOSSE-NEXT: retl ; -; X64-LABEL: cnt64_pgso: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: subq %rcx, %rdi -; X64-NEXT: movabsq 
$3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: retq +; X64-BASE-LABEL: cnt64_pgso: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: subq %rcx, %rdi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rdi, %rcx +; X64-BASE-NEXT: andq %rax, %rcx +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: andq %rdi, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: movq %rax, %rcx +; X64-BASE-NEXT: shrq $4, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rcx, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %rdx, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt64_pgso: ; X86-POPCNT: # %bb.0: @@ -1523,45 +1496,45 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-NEXT: popl %ebx ; X86-NOSSE-NEXT: retl $4 ; -; X64-LABEL: cnt128_pgso: -; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: 
shrq $2, %rsi -; X64-NEXT: andq %rcx, %rsi -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shrq $4, %rdx -; X64-NEXT: addq %rax, %rdx -; X64-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rsi, %rdx -; X64-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 -; X64-NEXT: imulq %r9, %rdx -; X64-NEXT: shrq $56, %rdx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq %rax -; X64-NEXT: andq %r8, %rax -; X64-NEXT: subq %rax, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rdi, %rcx -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: andq %rsi, %rax -; X64-NEXT: imulq %r9, %rax -; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: retq +; X64-BASE-LABEL: cnt128_pgso: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rsi +; X64-BASE-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-BASE-NEXT: movq %rsi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rsi +; X64-BASE-NEXT: andq %rcx, %rsi +; X64-BASE-NEXT: addq %rsi, %rax +; X64-BASE-NEXT: movq %rax, %rdx +; X64-BASE-NEXT: shrq $4, %rdx +; X64-BASE-NEXT: addq %rax, %rdx +; X64-BASE-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; X64-BASE-NEXT: andq %rsi, %rdx +; X64-BASE-NEXT: movabsq $72340172838076673, %r9 # imm = 0x101010101010101 +; X64-BASE-NEXT: imulq %r9, %rdx +; X64-BASE-NEXT: shrq $56, %rdx +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: shrq %rax +; X64-BASE-NEXT: andq %r8, %rax +; X64-BASE-NEXT: subq %rax, %rdi +; X64-BASE-NEXT: movq %rdi, %rax +; X64-BASE-NEXT: andq %rcx, %rax +; X64-BASE-NEXT: shrq $2, %rdi +; X64-BASE-NEXT: 
andq %rdi, %rcx +; X64-BASE-NEXT: addq %rax, %rcx +; X64-BASE-NEXT: movq %rcx, %rax +; X64-BASE-NEXT: shrq $4, %rax +; X64-BASE-NEXT: addq %rcx, %rax +; X64-BASE-NEXT: andq %rsi, %rax +; X64-BASE-NEXT: imulq %r9, %rax +; X64-BASE-NEXT: shrq $56, %rax +; X64-BASE-NEXT: addq %rdx, %rax +; X64-BASE-NEXT: xorl %edx, %edx +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: @@ -1732,24 +1705,24 @@ define i32 @popcount_zext_i32(i16 zeroext %x) { ; X86-NEXT: shrl $24, %eax ; X86-NEXT: retl ; -; X64-LABEL: popcount_zext_i32: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl $21845, %eax # imm = 0x5555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X64-NEXT: addl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X64-NEXT: shrl $24, %eax -; X64-NEXT: retq +; X64-BASE-LABEL: popcount_zext_i32: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $21845, %eax # imm = 0x5555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-BASE-NEXT: addl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-BASE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-BASE-NEXT: shrl $24, %eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: popcount_zext_i32: ; X86-POPCNT: # %bb.0: @@ -1805,26 +1778,26 @@ define i32 @popcount_i16_zext(i16 zeroext %x) { ; X86-NEXT: movzbl %al, %eax 
; X86-NEXT: retl ; -; X64-LABEL: popcount_i16_zext: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %eax -; X64-NEXT: andl $21845, %eax # imm = 0x5555 -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $13107, %edi # imm = 0x3333 -; X64-NEXT: addl %eax, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $4, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $8, %ecx -; X64-NEXT: addl %eax, %ecx -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: retq +; X64-BASE-LABEL: popcount_i16_zext: +; X64-BASE: # %bb.0: +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl %eax +; X64-BASE-NEXT: andl $21845, %eax # imm = 0x5555 +; X64-BASE-NEXT: subl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: andl $13107, %eax # imm = 0x3333 +; X64-BASE-NEXT: shrl $2, %edi +; X64-BASE-NEXT: andl $13107, %edi # imm = 0x3333 +; X64-BASE-NEXT: addl %eax, %edi +; X64-BASE-NEXT: movl %edi, %eax +; X64-BASE-NEXT: shrl $4, %eax +; X64-BASE-NEXT: addl %edi, %eax +; X64-BASE-NEXT: andl $3855, %eax # imm = 0xF0F +; X64-BASE-NEXT: movl %eax, %ecx +; X64-BASE-NEXT: shrl $8, %ecx +; X64-BASE-NEXT: addl %eax, %ecx +; X64-BASE-NEXT: movzbl %cl, %eax +; X64-BASE-NEXT: retq ; ; X86-POPCNT-LABEL: popcount_i16_zext: ; X86-POPCNT: # %bb.0: