abs/extra/llvm/AMDGPU-Fix-an-interaction-between-WQM-and-polygon-stippling.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

From 25e2616626caafb896517e18cd8aa724fba2b200 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Tue, 29 Nov 2016 03:41:28 +0000
Subject: [PATCH] Merging r280589:

------------------------------------------------------------------------
r280589 | nhaehnle | 2016-09-03 05:26:32 -0700 (Sat, 03 Sep 2016) | 19 lines

AMDGPU: Fix an interaction between WQM and polygon stippling

Summary:
This fixes a rare bug in polygon stippling with non-monolithic pixel shaders.

The underlying problem is as follows: the prolog part contains the polygon
stippling sequence, i.e. a kill. The main part then enables WQM based on the
_reduced_ exec mask, effectively undoing most of the polygon stippling.

Since we cannot know whether polygon stippling will be used, the main part
of a non-monolithic shader must always return to exact mode to fix this
problem.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, llvm-commits, kzhuravl

Differential Revision: https://reviews.llvm.org/D23131

------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@288105 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstructions.td   |  1 +
 lib/Target/AMDGPU/SIWholeQuadMode.cpp |  7 -----
 test/CodeGen/AMDGPU/wqm.ll            | 49 ++++++++++++++++++++++++++++++++---
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 18b7d5d..dde5f2f 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -2029,6 +2029,7 @@ def SI_RETURN : PseudoInstSI <
   let hasSideEffects = 1;
   let SALU = 1;
   let hasNoSchedulingInfo = 1;
+  let DisableWQM = 1;
 }
 
 let Uses = [EXEC], Defs = [EXEC, VCC, M0],
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index b200c15..1534d58 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -219,13 +219,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       markInstruction(MI, Flags, Worklist);
       GlobalFlags |= Flags;
     }
-
-    if (WQMOutputs && MBB.succ_empty()) {
-      // This is a prolog shader. Make sure we go back to exact mode at the end.
-      Blocks[&MBB].OutNeeds = StateExact;
-      Worklist.push_back(&MBB);
-      GlobalFlags |= StateExact;
-    }
   }
 
   return GlobalFlags;
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 809a7ba..41e4264 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -17,17 +17,18 @@ main_body:
 ;CHECK-LABEL: {{^}}test2:
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK-NOT: exec
-;CHECK: _load_dword v0,
-define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
 main_body:
   %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
   %c.3 = extractelement <4 x i32> %c.2, i32 0
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
   %data = load float, float addrspace(1)* %gep
-  ret float %data
+
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
+
+  ret void
 }
 
 ; ... but disabled for stores (and, in this simple case, not re-enabled).
@@ -414,6 +415,46 @@ entry:
   ret void
 }
 
+; Must return to exact at the end of a non-void returning shader,
+; otherwise the EXEC mask exported by the epilog will be wrong. This is true
+; even if the shader has no kills, because a kill could have happened in a
+; previous shader fragment.
+;
+; CHECK-LABEL: {{^}}test_nonvoid_return:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+;
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK-NOT: exec
+define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
+  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.i = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
+}
+
+; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+;
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK-NOT: exec
+define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
+entry:
+  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.i = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %cc = icmp sgt i32 %c, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  store volatile <4 x float> %dtex, <4 x float>* undef
+  unreachable
+
+else:
+  ret <4 x float> %dtex
+}
 
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1