Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit ff9e7aa

Browse files
committed
[SelectionDAG] Deal with POISON for INSERT_VECTOR_ELT/INSERT_SUBVECTOR (part 2)
Add support in isGuaranteedNotToBeUndefOrPoison and SimplifyDemandedVectorElts to avoid regressions seen after a previous commit fixing #141034.
1 parent bf22ac6 commit ff9e7aa
Copy full SHA for ff9e7aa
Expand file tree / Collapse file tree

22 files changed

+1504
-1585
lines changed

‎llvm/include/llvm/CodeGen/SelectionDAGNodes.h

Copy file name to clipboardExpand all lines: llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+5Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,6 +1881,11 @@ LLVM_ABI SDValue peekThroughExtractSubvectors(SDValue V);
18811881
/// If \p V is not a truncation, it is returned as-is.
18821882
LLVM_ABI SDValue peekThroughTruncates(SDValue V);
18831883

1884+
/// Recursively peek through INSERT_VECTOR_ELT nodes, returning the source
1885+
/// vector operand of \p V, as long as \p V is an INSERT_VECTOR_ELT
1886+
/// operation that does not insert into any of the demanded vector elts.
1887+
LLVM_ABI SDValue peekThroughInsertVectorElt(SDValue V, APInt DemandedElts);
1888+
18841889
/// Returns true if \p V is a bitwise not operation. Assumes that an all ones
18851890
/// constant is canonicalized to be operand 1.
18861891
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs = false);

‎llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Copy file name to clipboardExpand all lines: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+70Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5454,6 +5454,59 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
54545454
}
54555455
return true;
54565456

5457+
case ISD::INSERT_SUBVECTOR: {
5458+
if (Op.getValueType().isScalableVector())
5459+
break;
5460+
SDValue Src = Op.getOperand(0);
5461+
SDValue Sub = Op.getOperand(1);
5462+
uint64_t Idx = Op.getConstantOperandVal(2);
5463+
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
5464+
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
5465+
APInt DemandedSrcElts = DemandedElts;
5466+
DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
5467+
5468+
if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison(
5469+
Sub, DemandedSubElts, PoisonOnly, Depth + 1))
5470+
return false;
5471+
if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison(
5472+
Src, DemandedSrcElts, PoisonOnly, Depth + 1))
5473+
return false;
5474+
return true;
5475+
}
5476+
5477+
case ISD::INSERT_VECTOR_ELT: {
5478+
SDValue InVec = Op.getOperand(0);
5479+
SDValue InVal = Op.getOperand(1);
5480+
SDValue EltNo = Op.getOperand(2);
5481+
EVT VT = InVec.getValueType();
5482+
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
5483+
if (IndexC && VT.isFixedLengthVector() &&
5484+
IndexC->getZExtValue() < VT.getVectorNumElements()) {
5485+
if (DemandedElts[IndexC->getZExtValue()] &&
5486+
!isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1))
5487+
return false;
5488+
APInt InVecDemandedElts = DemandedElts;
5489+
InVecDemandedElts.clearBit(IndexC->getZExtValue());
5490+
if (!!InVecDemandedElts &&
5491+
!isGuaranteedNotToBeUndefOrPoison(
5492+
peekThroughInsertVectorElt(InVec, InVecDemandedElts),
5493+
InVecDemandedElts, PoisonOnly, Depth + 1))
5494+
return false;
5495+
return true;
5496+
}
5497+
break;
5498+
}
5499+
5500+
case ISD::SCALAR_TO_VECTOR:
5501+
// If only demanding upper (undef) elements.
5502+
if (DemandedElts.ugt(1))
5503+
return PoisonOnly;
5504+
// If only demanding element 0, or only considering poison.
5505+
if (PoisonOnly || DemandedElts == 0)
5506+
return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
5507+
Depth + 1);
5508+
return false;
5509+
54575510
case ISD::SPLAT_VECTOR:
54585511
return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
54595512
Depth + 1);
@@ -12463,6 +12516,23 @@ SDValue llvm::peekThroughTruncates(SDValue V) {
1246312516
return V;
1246412517
}
1246512518

12519+
SDValue llvm::peekThroughInsertVectorElt(SDValue V, APInt DemandedElts) {
12520+
while (V.getOpcode() == ISD::INSERT_VECTOR_ELT) {
12521+
SDValue InVec = V.getOperand(0);
12522+
SDValue EltNo = V.getOperand(2);
12523+
EVT VT = InVec.getValueType();
12524+
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
12525+
if (IndexC && VT.isFixedLengthVector() &&
12526+
IndexC->getZExtValue() < VT.getVectorNumElements() &&
12527+
!DemandedElts[IndexC->getZExtValue()]) {
12528+
V = InVec;
12529+
continue;
12530+
}
12531+
break;
12532+
}
12533+
return V;
12534+
}
12535+
1246612536
bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) {
1246712537
if (V.getOpcode() != ISD::XOR)
1246812538
return false;

‎llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Copy file name to clipboardExpand all lines: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+4Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3367,6 +3367,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
33673367
APInt DemandedSrcElts = DemandedElts;
33683368
DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
33693369

3370+
// If none of the sub operand elements are demanded, bypass the insert.
3371+
if (!DemandedSubElts)
3372+
return TLO.CombineTo(Op, Src);
3373+
33703374
APInt SubUndef, SubZero;
33713375
if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
33723376
Depth + 1))

‎llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+22-23Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -66,39 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8)
6666
; CHECK-NEXT: s_mov_b32 s6, s4
6767
; CHECK-NEXT: s_mov_b32 s5, s3
6868
; CHECK-NEXT: s_mov_b32 s4, s2
69-
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1
70-
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
71-
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1
72-
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
73-
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 4, v1
69+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1
70+
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1
71+
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
72+
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1
73+
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v1
7474
; CHECK-NEXT: v_mov_b32_e32 v10, s0
75-
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 20, v2
76-
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 16, v2
75+
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
76+
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
7777
; CHECK-NEXT: s_mov_b32 m0, -1
78-
; CHECK-NEXT: ds_read_b32 v8, v0
79-
; CHECK-NEXT: ds_read_b32 v7, v3
80-
; CHECK-NEXT: ds_read_b32 v6, v4
81-
; CHECK-NEXT: ds_read_b32 v5, v5
82-
; CHECK-NEXT: ds_read_b32 v4, v9
78+
; CHECK-NEXT: ds_read_b32 v6, v0
79+
; CHECK-NEXT: ds_read_b32 v5, v3
80+
; CHECK-NEXT: ds_read_b32 v4, v4
81+
; CHECK-NEXT: ds_read_b32 v8, v7
82+
; CHECK-NEXT: ds_read_b32 v7, v9
8383
; CHECK-NEXT: ds_read_b32 v3, v1
84-
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2
85-
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 8, v2
86-
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
84+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2
85+
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2
86+
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 16, v2
8787
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
8888
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
8989
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
90-
; CHECK-NEXT: ds_read_b32 v0, v12
9190
; CHECK-NEXT: s_waitcnt expcnt(1)
92-
; CHECK-NEXT: ds_read_b32 v5, v1
93-
; CHECK-NEXT: ds_read_b32 v4, v9
94-
; CHECK-NEXT: ds_read_b32 v3, v13
91+
; CHECK-NEXT: ds_read_b32 v5, v11
92+
; CHECK-NEXT: ds_read_b32 v4, v12
93+
; CHECK-NEXT: ds_read_b32 v3, v0
94+
; CHECK-NEXT: ds_read_b32 v1, v1
95+
; CHECK-NEXT: ds_read_b32 v0, v9
9596
; CHECK-NEXT: ds_read_b32 v2, v2
96-
; CHECK-NEXT: ds_read_b32 v1, v11
97-
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
98-
; CHECK-NEXT: exp mrt0 off, off, off, off
9997
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
100-
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
98+
; CHECK-NEXT: exp mrt0 off, off, off, off
10199
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
100+
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
102101
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v10, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
103102
; CHECK-NEXT: s_endpgm
104103
%load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4

‎llvm/test/CodeGen/Thumb2/mve-vld3.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/Thumb2/mve-vld3.ll
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -663,8 +663,8 @@ define void @vld3_v2i8(ptr %src, ptr %dst) {
663663
; CHECK: @ %bb.0: @ %entry
664664
; CHECK-NEXT: .pad #8
665665
; CHECK-NEXT: sub sp, #8
666-
; CHECK-NEXT: ldrd r2, r0, [r0]
667-
; CHECK-NEXT: strd r2, r0, [sp]
666+
; CHECK-NEXT: ldrd r0, r2, [r0]
667+
; CHECK-NEXT: strd r0, r2, [sp]
668668
; CHECK-NEXT: mov r0, sp
669669
; CHECK-NEXT: vldrb.u16 q0, [r0]
670670
; CHECK-NEXT: vmov.u16 r0, q0[4]

‎llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+1-3Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4451,8 +4451,8 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
44514451
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
44524452
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
44534453
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
4454-
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
44554454
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4455+
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
44564456
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
44574457
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
44584458
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
@@ -4462,7 +4462,6 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
44624462
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
44634463
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
44644464
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4465-
; AVX-NEXT: vzeroupper
44664465
; AVX-NEXT: retq
44674466
;
44684467
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4717,7 +4716,6 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
47174716
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
47184717
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
47194718
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4720-
; AVX-NEXT: vzeroupper
47214719
; AVX-NEXT: retq
47224720
;
47234721
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:

‎llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+22-56Lines changed: 22 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -262,54 +262,37 @@ define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp
262262
define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp {
263263
; SSE2-LABEL: merge_4f32_f32_012u:
264264
; SSE2: # %bb.0:
265-
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
266265
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
267-
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
268-
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
269-
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
270-
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
266+
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
267+
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
271268
; SSE2-NEXT: retq
272269
;
273270
; SSE41-LABEL: merge_4f32_f32_012u:
274271
; SSE41: # %bb.0:
275-
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
276-
; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
277-
; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
278-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
279-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
280-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
272+
; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
273+
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
281274
; SSE41-NEXT: retq
282275
;
283276
; AVX-LABEL: merge_4f32_f32_012u:
284277
; AVX: # %bb.0:
285-
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
286-
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
287-
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
288-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
289-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
290-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
278+
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
279+
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
291280
; AVX-NEXT: retq
292281
;
293282
; X86-SSE1-LABEL: merge_4f32_f32_012u:
294283
; X86-SSE1: # %bb.0:
295284
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
296-
; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
285+
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
286+
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
297287
; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
298-
; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
299-
; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
300-
; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
301-
; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
288+
; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
302289
; X86-SSE1-NEXT: retl
303290
;
304291
; X86-SSE41-LABEL: merge_4f32_f32_012u:
305292
; X86-SSE41: # %bb.0:
306293
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
307-
; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
308-
; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
309-
; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
310-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
311-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
312-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
294+
; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
295+
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
313296
; X86-SSE41-NEXT: retl
314297
%ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
315298
%ptr2 = getelementptr inbounds float, ptr %ptr, i64 2
@@ -326,54 +309,37 @@ define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp
326309
define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp {
327310
; SSE2-LABEL: merge_4f32_f32_019u:
328311
; SSE2: # %bb.0:
329-
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
330312
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
331-
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
332-
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
333-
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
334-
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
313+
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
314+
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
335315
; SSE2-NEXT: retq
336316
;
337317
; SSE41-LABEL: merge_4f32_f32_019u:
338318
; SSE41: # %bb.0:
339-
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
340-
; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
341-
; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
342-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
343-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
344-
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
319+
; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
320+
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
345321
; SSE41-NEXT: retq
346322
;
347323
; AVX-LABEL: merge_4f32_f32_019u:
348324
; AVX: # %bb.0:
349-
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
350-
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
351-
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
352-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
353-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
354-
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
325+
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
326+
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
355327
; AVX-NEXT: retq
356328
;
357329
; X86-SSE1-LABEL: merge_4f32_f32_019u:
358330
; X86-SSE1: # %bb.0:
359331
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
360-
; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
332+
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
333+
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
361334
; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
362-
; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
363-
; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
364-
; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
365-
; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
335+
; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
366336
; X86-SSE1-NEXT: retl
367337
;
368338
; X86-SSE41-LABEL: merge_4f32_f32_019u:
369339
; X86-SSE41: # %bb.0:
370340
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
371-
; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
372-
; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
373-
; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
374-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
375-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
376-
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0]
341+
; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
342+
; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
377343
; X86-SSE41-NEXT: retl
378344
%ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
379345
%ptr2 = getelementptr inbounds float, ptr %ptr, i64 9

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.