-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathless_slow_amd64.S
353 lines (313 loc) · 12.6 KB
/
less_slow_amd64.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
# ----------------------------------------------------------------------------
# less_slow_amd64.S
# Micro-kernels for building a performance-first mindset for 64-bit x86.
# ----------------------------------------------------------------------------
# Export symbols so linkers can find them:
.section .text
.global i32_add_asm_kernel
# Fused-Multiply-Add (FMA) kernels in AVX-512 and AVX2
.global tops_f64_avx512fma_asm_kernel
.global tops_f32_avx512fma_asm_kernel
.global tops_f16_avx512fma_asm_kernel
.global tops_bf16_avx512fma_asm_kernel
.global tops_i16_avx512fma_asm_kernel
.global tops_i7_avx512fma_asm_kernel
.global tops_f64_avx2fma_asm_kernel
.global tops_f32_avx2fma_asm_kernel
# Specialized 2D matrix multiplication kernels
.global tops_bf16_amx_asm_kernel
.global tops_u8_amx_asm_kernel
.global tops_i8_amx_asm_kernel
# Latency-hiding mixed kernels
.global tops_i7_amx_avx512fma_asm_kernel
# Additional exports for un-fused Multiply-Add variants
.global tops_f64_avx512ma_asm_kernel
.global tops_f32_avx512ma_asm_kernel
.global tops_f16_avx512ma_asm_kernel
.global tops_f64_avx2ma_asm_kernel
.global tops_f32_avx2ma_asm_kernel
# ----------------------------------------------------------------------------
# i32_add_asm_kernel: sum of two 32-bit signed integers (System V AMD64).
#   In:  EDI = a, ESI = b
#   Out: EAX = a + b, wrapping modulo 2^32
# ----------------------------------------------------------------------------
i32_add_asm_kernel:
    leal    (%rdi,%rsi), %eax       # EAX = low 32 bits of RDI + RSI
    ret
# ----------------------------------------------------------------------------
# AVX-512 micro-kernels measuring Tensor Operations Per Second (TOPS)
# without accounting for memory bandwidth or latency. The following kernels
# assume presence of 32x 512-bit long registers (ZMM0-ZMM31).
# Each kernel returns the aggregate number of scalar arithmetic operations -
# the multiplications and additions forming each FMA instruction - performed.
# ----------------------------------------------------------------------------
tops_f64_avx512fma_asm_kernel:
    # Ten VFMADD231PD instructions, each accumulating into its own ZMM register
    # (ZMM0, ZMM3, ..., ZMM27), so none consumes the result of another.
    # AT&T operand order puts the accumulator last: ZMM0 += ZMM2 * ZMM1.
    # Per instruction: 8 double lanes x (1 multiply + 1 add) = 16 FLOPs.
    # Total: 10 x 16 = 160 FLOPs.
    vfmadd231pd     %zmm1,  %zmm2,  %zmm0       # accumulator  1
    vfmadd231pd     %zmm4,  %zmm5,  %zmm3       # accumulator  2
    vfmadd231pd     %zmm7,  %zmm8,  %zmm6       # accumulator  3
    vfmadd231pd     %zmm10, %zmm11, %zmm9       # accumulator  4
    vfmadd231pd     %zmm13, %zmm14, %zmm12      # accumulator  5
    vfmadd231pd     %zmm16, %zmm17, %zmm15      # accumulator  6
    vfmadd231pd     %zmm19, %zmm20, %zmm18      # accumulator  7
    vfmadd231pd     %zmm22, %zmm23, %zmm21      # accumulator  8
    vfmadd231pd     %zmm25, %zmm26, %zmm24      # accumulator  9
    vfmadd231pd     %zmm28, %zmm29, %zmm27      # accumulator 10
    # Report the operation count; the 32-bit write zero-extends into RAX.
    movl    $160, %eax
    ret
# Shorter variant of the f64 AVX-512 FMA kernel: 5 instructions instead of 10.
# Exported here because this label has no `.global` entry in the export list
# at the top of the file; without one the linker cannot resolve it.
.global tops_f64_avx512_x5fma_asm_kernel
tops_f64_avx512_x5fma_asm_kernel:
    # Each VFMADD231PD does DEST = DEST + (SRC1 * SRC2) on 8 double lanes,
    # i.e. 8 multiplies + 8 adds = 16 FLOPs per instruction.
    # Five instructions on five separate accumulators => 5 x 16 = 80 FLOPs.
    vfmadd231pd     %zmm1,  %zmm2,  %zmm0       # accumulator 1
    vfmadd231pd     %zmm4,  %zmm5,  %zmm3       # accumulator 2
    vfmadd231pd     %zmm7,  %zmm8,  %zmm6       # accumulator 3
    vfmadd231pd     %zmm10, %zmm11, %zmm9       # accumulator 4
    vfmadd231pd     %zmm13, %zmm14, %zmm12      # accumulator 5
    # Report the operation count; the 32-bit write zero-extends into RAX.
    movl    $80, %eax
    ret
tops_f32_avx512fma_asm_kernel:
    # Single-precision FMA across ten separate ZMM accumulators.
    # Per instruction: 16 float lanes x (1 multiply + 1 add) = 32 FLOPs.
    # Total: 10 x 32 = 320 FLOPs.
    vfmadd231ps     %zmm1,  %zmm2,  %zmm0       # ZMM0  += ZMM2  * ZMM1
    vfmadd231ps     %zmm4,  %zmm5,  %zmm3       # ZMM3  += ZMM5  * ZMM4
    vfmadd231ps     %zmm7,  %zmm8,  %zmm6       # ZMM6  += ZMM8  * ZMM7
    vfmadd231ps     %zmm10, %zmm11, %zmm9       # ZMM9  += ZMM11 * ZMM10
    vfmadd231ps     %zmm13, %zmm14, %zmm12      # ZMM12 += ZMM14 * ZMM13
    vfmadd231ps     %zmm16, %zmm17, %zmm15      # ZMM15 += ZMM17 * ZMM16
    vfmadd231ps     %zmm19, %zmm20, %zmm18      # ZMM18 += ZMM20 * ZMM19
    vfmadd231ps     %zmm22, %zmm23, %zmm21      # ZMM21 += ZMM23 * ZMM22
    vfmadd231ps     %zmm25, %zmm26, %zmm24      # ZMM24 += ZMM26 * ZMM25
    vfmadd231ps     %zmm28, %zmm29, %zmm27      # ZMM27 += ZMM29 * ZMM28
    movl    $320, %eax                          # zero-extends into RAX
    ret
tops_f16_avx512fma_asm_kernel:
    # Half-precision FMA; VFMADD231PH needs AVX512_FP16 support.
    # Per instruction: 32 half lanes x (1 multiply + 1 add) = 64 FLOPs.
    # Total: 10 x 64 = 640 FLOPs, each on its own ZMM accumulator.
    vfmadd231ph     %zmm1,  %zmm2,  %zmm0       # accumulator  1
    vfmadd231ph     %zmm4,  %zmm5,  %zmm3       # accumulator  2
    vfmadd231ph     %zmm7,  %zmm8,  %zmm6       # accumulator  3
    vfmadd231ph     %zmm10, %zmm11, %zmm9       # accumulator  4
    vfmadd231ph     %zmm13, %zmm14, %zmm12      # accumulator  5
    vfmadd231ph     %zmm16, %zmm17, %zmm15      # accumulator  6
    vfmadd231ph     %zmm19, %zmm20, %zmm18      # accumulator  7
    vfmadd231ph     %zmm22, %zmm23, %zmm21      # accumulator  8
    vfmadd231ph     %zmm25, %zmm26, %zmm24      # accumulator  9
    vfmadd231ph     %zmm28, %zmm29, %zmm27      # accumulator 10
    movl    $640, %eax                          # zero-extends into RAX
    ret
tops_bf16_avx512fma_asm_kernel:
    # BF16 dot-product-accumulate; VDPBF16PS needs AVX512_BF16 support.
    # The last operand is the accumulator (ZMM2, ZMM5, ..., ZMM29); each of
    # the ten instructions uses its own register triple.
    # Returned count: 640 for the ten instructions, i.e. 64 operations each
    # (32 BF16 products plus 32 additions).
    vdpbf16ps       %zmm0,  %zmm1,  %zmm2       # accumulator  1
    vdpbf16ps       %zmm3,  %zmm4,  %zmm5       # accumulator  2
    vdpbf16ps       %zmm6,  %zmm7,  %zmm8       # accumulator  3
    vdpbf16ps       %zmm9,  %zmm10, %zmm11      # accumulator  4
    vdpbf16ps       %zmm12, %zmm13, %zmm14      # accumulator  5
    vdpbf16ps       %zmm15, %zmm16, %zmm17      # accumulator  6
    vdpbf16ps       %zmm18, %zmm19, %zmm20      # accumulator  7
    vdpbf16ps       %zmm21, %zmm22, %zmm23      # accumulator  8
    vdpbf16ps       %zmm24, %zmm25, %zmm26      # accumulator  9
    vdpbf16ps       %zmm27, %zmm28, %zmm29      # accumulator 10
    movl    $640, %eax                          # zero-extends into RAX
    ret
tops_i16_avx512fma_asm_kernel:
    # 16-bit integer dot-product-accumulate with signed saturation;
    # VPDPWSSDS needs AVX512_VNNI support.
    # The last operand is the 32-bit accumulator (ZMM2, ZMM5, ..., ZMM29).
    # Returned count: 640 for the ten instructions, i.e. 64 operations each
    # (32 word products plus 32 additions).
    vpdpwssds       %zmm0,  %zmm1,  %zmm2       # accumulator  1
    vpdpwssds       %zmm3,  %zmm4,  %zmm5       # accumulator  2
    vpdpwssds       %zmm6,  %zmm7,  %zmm8       # accumulator  3
    vpdpwssds       %zmm9,  %zmm10, %zmm11      # accumulator  4
    vpdpwssds       %zmm12, %zmm13, %zmm14      # accumulator  5
    vpdpwssds       %zmm15, %zmm16, %zmm17      # accumulator  6
    vpdpwssds       %zmm18, %zmm19, %zmm20      # accumulator  7
    vpdpwssds       %zmm21, %zmm22, %zmm23      # accumulator  8
    vpdpwssds       %zmm24, %zmm25, %zmm26      # accumulator  9
    vpdpwssds       %zmm27, %zmm28, %zmm29      # accumulator 10
    movl    $640, %eax                          # zero-extends into RAX
    ret
tops_i7_avx512fma_asm_kernel:
    # 8-bit integer dot-product-accumulate; VPDPBUSD needs AVX512_VNNI support.
    # Beware of the operand types: this is not `u8` x `u8` -> `u32`. One input
    # is treated as `u8`, the other as `i8`, and the sum lands in `i32`.
    # The sibling VPDPBUSDS instruction differs only in that it saturates.
    # Returned count: 1280 for the ten instructions, i.e. 128 operations each
    # (64 byte products plus 64 additions).
    vpdpbusd        %zmm0,  %zmm1,  %zmm2       # accumulator  1
    vpdpbusd        %zmm3,  %zmm4,  %zmm5       # accumulator  2
    vpdpbusd        %zmm6,  %zmm7,  %zmm8       # accumulator  3
    vpdpbusd        %zmm9,  %zmm10, %zmm11      # accumulator  4
    vpdpbusd        %zmm12, %zmm13, %zmm14      # accumulator  5
    vpdpbusd        %zmm15, %zmm16, %zmm17      # accumulator  6
    vpdpbusd        %zmm18, %zmm19, %zmm20      # accumulator  7
    vpdpbusd        %zmm21, %zmm22, %zmm23      # accumulator  8
    vpdpbusd        %zmm24, %zmm25, %zmm26      # accumulator  9
    vpdpbusd        %zmm27, %zmm28, %zmm29      # accumulator 10
    movl    $1280, %eax                         # zero-extends into RAX
    ret
# ----------------------------------------------------------------------------
# AVX2 micro-kernels differ from AVX-512 in that they have fewer registers,
# and those registers are narrower. We have 16x 256-bit YMM registers.
# ----------------------------------------------------------------------------
tops_f64_avx2fma_asm_kernel:
    # Double-precision FMA on 256-bit registers: five instructions whose
    # accumulators (last operand) are YMM0, YMM3, YMM6, YMM11 and YMM14.
    # Per instruction: 4 double lanes x (1 multiply + 1 add) = 8 FLOPs.
    # Total: 5 x 8 = 40 FLOPs.
    vfmadd231pd     %ymm1,  %ymm2,  %ymm0       # YMM0  += YMM2  * YMM1
    vfmadd231pd     %ymm4,  %ymm5,  %ymm3       # YMM3  += YMM5  * YMM4
    vfmadd231pd     %ymm7,  %ymm8,  %ymm6       # YMM6  += YMM8  * YMM7
    vfmadd231pd     %ymm9,  %ymm10, %ymm11      # YMM11 += YMM10 * YMM9
    vfmadd231pd     %ymm12, %ymm13, %ymm14      # YMM14 += YMM13 * YMM12
    movl    $40, %eax                           # zero-extends into RAX
    ret
tops_f32_avx2fma_asm_kernel:
    # Single-precision FMA on 256-bit registers: five instructions whose
    # accumulators (last operand) are YMM0, YMM3, YMM6, YMM11 and YMM14.
    # Per instruction: 8 float lanes x (1 multiply + 1 add) = 16 FLOPs.
    # Total: 5 x 16 = 80 FLOPs.
    vfmadd231ps     %ymm1,  %ymm2,  %ymm0       # YMM0  += YMM2  * YMM1
    vfmadd231ps     %ymm4,  %ymm5,  %ymm3       # YMM3  += YMM5  * YMM4
    vfmadd231ps     %ymm7,  %ymm8,  %ymm6       # YMM6  += YMM8  * YMM7
    vfmadd231ps     %ymm9,  %ymm10, %ymm11      # YMM11 += YMM10 * YMM9
    vfmadd231ps     %ymm12, %ymm13, %ymm14      # YMM14 += YMM13 * YMM12
    movl    $80, %eax                           # zero-extends into RAX
    ret
# ----------------------------------------------------------------------------
# AMX is an Intel-specific extension that introduces a new set of registers
# and instructions for matrix multiplication. Proper usage would require
# loading the tile configuration, setting up the tile registers, loading
# a rectangular tile of the bigger matrix, performing the multiplications, and
# storing the results back. We will avoid most of those steps!
#
# https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#amxtechs=AMX_BF16,AMX_INT8,AMX_FP16
# ----------------------------------------------------------------------------
tops_bf16_amx_asm_kernel:
    # Four BF16 tile dot-products. This routine issues no LDTILECFG,
    # TILELOADD or TILESTORED of its own.
    # NOTE(review): presumably the caller must have configured the tiles
    # before entry - confirm against the call site.
    #
    # Accounting used for the returned count: each instruction is treated as a
    # 16x32x16 matrix multiplication, i.e. 16x32x16 = 8,192 scalar multiplies
    # plus 16x31x16 = 7,936 scalar additions = 16,128 operations.
    # Four instructions => 64,512 operations.
    #
    # NOTE(review): in AT&T order the last operand is the accumulator tile, so
    # the 2nd and 4th instructions both accumulate into TMM3 and the 4th reads
    # TMM1 written by the 1st - confirm this dependency pattern is intended.
    tdpbf16ps       %tmm7, %tmm0, %tmm1
    tdpbf16ps       %tmm6, %tmm2, %tmm3
    tdpbf16ps       %tmm5, %tmm0, %tmm2
    tdpbf16ps       %tmm4, %tmm1, %tmm3
    movl    $64512, %eax                        # zero-extends into RAX
    ret
tops_u8_amx_asm_kernel:
    # Four unsigned-by-unsigned 8-bit tile dot-products (TDPBUUD). This routine
    # issues no LDTILECFG, TILELOADD or TILESTORED of its own.
    # NOTE(review): presumably the caller must have configured the tiles
    # before entry - confirm against the call site.
    #
    # Accounting used for the returned count: each instruction is treated as a
    # 16x64x16 matrix multiplication, i.e. 16x64x16 = 16,384 scalar multiplies
    # plus 16x63x16 = 16,128 scalar additions = 32,512 operations.
    # Four instructions => 130,048 operations.
    tdpbuud         %tmm7, %tmm0, %tmm1
    tdpbuud         %tmm6, %tmm2, %tmm3
    tdpbuud         %tmm5, %tmm0, %tmm2
    tdpbuud         %tmm4, %tmm1, %tmm3
    movl    $130048, %eax                       # zero-extends into RAX
    ret
tops_i8_amx_asm_kernel:
    # Four signed-by-signed 8-bit tile dot-products (TDPBSSD), using the same
    # tile operands as the unsigned kernel and returning the same count:
    # 4 x 32,512 = 130,048 operations.
    # NOTE(review): presumably the caller must have configured the tiles
    # before entry - confirm against the call site.
    tdpbssd         %tmm7, %tmm0, %tmm1
    tdpbssd         %tmm6, %tmm2, %tmm3
    tdpbssd         %tmm5, %tmm0, %tmm2
    tdpbssd         %tmm4, %tmm1, %tmm3
    movl    $130048, %eax                       # zero-extends into RAX
    ret
# ----------------------------------------------------------------------------
# Mixed kernels that combine AMX and AVX-512 instructions, hiding the latency
# of some instructions by interleaving them with others.
# ----------------------------------------------------------------------------
tops_i7_amx_avx512fma_asm_kernel:
    # Four rounds, each made of one AMX TDPBSSD followed by ten AVX-512
    # VPDPBUSD instructions on separate ZMM accumulators.
    # Returned count: 4 x 32,512 (tile ops) + 40 x 128 (vector ops)
    #               = 130,048 + 5,120 = 135,168 operations.
    # NOTE(review): presumably the caller must have configured the AMX tiles
    # before entry - confirm against the call site.

    # Round 1
    tdpbssd         %tmm7,  %tmm0,  %tmm1
    vpdpbusd        %zmm0,  %zmm1,  %zmm2
    vpdpbusd        %zmm3,  %zmm4,  %zmm5
    vpdpbusd        %zmm6,  %zmm7,  %zmm8
    vpdpbusd        %zmm9,  %zmm10, %zmm11
    vpdpbusd        %zmm12, %zmm13, %zmm14
    vpdpbusd        %zmm15, %zmm16, %zmm17
    vpdpbusd        %zmm18, %zmm19, %zmm20
    vpdpbusd        %zmm21, %zmm22, %zmm23
    vpdpbusd        %zmm24, %zmm25, %zmm26
    vpdpbusd        %zmm27, %zmm28, %zmm29

    # Round 2
    tdpbssd         %tmm6,  %tmm2,  %tmm3
    vpdpbusd        %zmm0,  %zmm1,  %zmm2
    vpdpbusd        %zmm3,  %zmm4,  %zmm5
    vpdpbusd        %zmm6,  %zmm7,  %zmm8
    vpdpbusd        %zmm9,  %zmm10, %zmm11
    vpdpbusd        %zmm12, %zmm13, %zmm14
    vpdpbusd        %zmm15, %zmm16, %zmm17
    vpdpbusd        %zmm18, %zmm19, %zmm20
    vpdpbusd        %zmm21, %zmm22, %zmm23
    vpdpbusd        %zmm24, %zmm25, %zmm26
    vpdpbusd        %zmm27, %zmm28, %zmm29

    # Round 3
    tdpbssd         %tmm5,  %tmm0,  %tmm2
    vpdpbusd        %zmm0,  %zmm1,  %zmm2
    vpdpbusd        %zmm3,  %zmm4,  %zmm5
    vpdpbusd        %zmm6,  %zmm7,  %zmm8
    vpdpbusd        %zmm9,  %zmm10, %zmm11
    vpdpbusd        %zmm12, %zmm13, %zmm14
    vpdpbusd        %zmm15, %zmm16, %zmm17
    vpdpbusd        %zmm18, %zmm19, %zmm20
    vpdpbusd        %zmm21, %zmm22, %zmm23
    vpdpbusd        %zmm24, %zmm25, %zmm26
    vpdpbusd        %zmm27, %zmm28, %zmm29

    # Round 4
    tdpbssd         %tmm4,  %tmm1,  %tmm3
    vpdpbusd        %zmm0,  %zmm1,  %zmm2
    vpdpbusd        %zmm3,  %zmm4,  %zmm5
    vpdpbusd        %zmm6,  %zmm7,  %zmm8
    vpdpbusd        %zmm9,  %zmm10, %zmm11
    vpdpbusd        %zmm12, %zmm13, %zmm14
    vpdpbusd        %zmm15, %zmm16, %zmm17
    vpdpbusd        %zmm18, %zmm19, %zmm20
    vpdpbusd        %zmm21, %zmm22, %zmm23
    vpdpbusd        %zmm24, %zmm25, %zmm26
    vpdpbusd        %zmm27, %zmm28, %zmm29

    movl    $135168, %eax                       # zero-extends into RAX
    ret
# ----------------------------------------------------------------------------
tops_f64_avx512ma_asm_kernel:
    # Un-fused counterpart of the f64 FMA kernel: separate VMULPD and VADDPD
    # instructions, alternating. Every instruction writes a register no other
    # instruction here reads, so an add never consumes a multiply's result.
    # Per instruction: 8 double lanes = 8 FLOPs; a multiply/add pair = 16.
    # Total: 5 pairs x 16 = 80 FLOPs.
    vmulpd          %zmm1,  %zmm2,  %zmm0       # pair 1: ZMM0  = ZMM2  * ZMM1
    vaddpd          %zmm3,  %zmm4,  %zmm5       # pair 1: ZMM5  = ZMM4  + ZMM3
    vmulpd          %zmm6,  %zmm7,  %zmm8       # pair 2: multiply
    vaddpd          %zmm9,  %zmm10, %zmm11      # pair 2: add
    vmulpd          %zmm12, %zmm13, %zmm14      # pair 3: multiply
    vaddpd          %zmm15, %zmm16, %zmm17      # pair 3: add
    vmulpd          %zmm18, %zmm19, %zmm20      # pair 4: multiply
    vaddpd          %zmm21, %zmm22, %zmm23      # pair 4: add
    vmulpd          %zmm24, %zmm25, %zmm26      # pair 5: multiply
    vaddpd          %zmm27, %zmm28, %zmm29      # pair 5: add
    movl    $80, %eax                           # zero-extends into RAX
    ret
tops_f32_avx512ma_asm_kernel:
    # Un-fused single-precision variant: alternating VMULPS and VADDPS, each
    # writing a register that no other instruction here reads.
    # Per instruction: 16 float lanes = 16 FLOPs; a multiply/add pair = 32.
    # Total: 5 pairs x 32 = 160 FLOPs.
    vmulps          %zmm1,  %zmm2,  %zmm0       # pair 1: multiply
    vaddps          %zmm3,  %zmm4,  %zmm5       # pair 1: add
    vmulps          %zmm6,  %zmm7,  %zmm8       # pair 2: multiply
    vaddps          %zmm9,  %zmm10, %zmm11      # pair 2: add
    vmulps          %zmm12, %zmm13, %zmm14      # pair 3: multiply
    vaddps          %zmm15, %zmm16, %zmm17      # pair 3: add
    vmulps          %zmm18, %zmm19, %zmm20      # pair 4: multiply
    vaddps          %zmm21, %zmm22, %zmm23      # pair 4: add
    vmulps          %zmm24, %zmm25, %zmm26      # pair 5: multiply
    vaddps          %zmm27, %zmm28, %zmm29      # pair 5: add
    movl    $160, %eax                          # zero-extends into RAX
    ret
tops_f16_avx512ma_asm_kernel:
    # Un-fused half-precision variant: alternating VMULPH and VADDPH, each
    # writing a register that no other instruction here reads.
    # Per instruction: 32 half lanes = 32 FLOPs; a multiply/add pair = 64.
    # Total: 5 pairs x 64 = 320 FLOPs.
    vmulph          %zmm1,  %zmm2,  %zmm0       # pair 1: multiply
    vaddph          %zmm3,  %zmm4,  %zmm5       # pair 1: add
    vmulph          %zmm6,  %zmm7,  %zmm8       # pair 2: multiply
    vaddph          %zmm9,  %zmm10, %zmm11      # pair 2: add
    vmulph          %zmm12, %zmm13, %zmm14      # pair 3: multiply
    vaddph          %zmm15, %zmm16, %zmm17      # pair 3: add
    vmulph          %zmm18, %zmm19, %zmm20      # pair 4: multiply
    vaddph          %zmm21, %zmm22, %zmm23      # pair 4: add
    vmulph          %zmm24, %zmm25, %zmm26      # pair 5: multiply
    vaddph          %zmm27, %zmm28, %zmm29      # pair 5: add
    movl    $320, %eax                          # zero-extends into RAX
    ret
tops_f64_avx2ma_asm_kernel:
    # Un-fused double-precision variant on 256-bit registers: three VMULPD and
    # two VADDPD, each writing a register no other instruction here reads.
    # Per instruction: 4 double lanes = 4 FLOPs.
    # Total: 5 x 4 = 20 FLOPs.
    vmulpd          %ymm1,  %ymm2,  %ymm0       # YMM0  = YMM2  * YMM1
    vaddpd          %ymm3,  %ymm4,  %ymm5       # YMM5  = YMM4  + YMM3
    vmulpd          %ymm6,  %ymm7,  %ymm8       # YMM8  = YMM7  * YMM6
    vaddpd          %ymm9,  %ymm10, %ymm11      # YMM11 = YMM10 + YMM9
    vmulpd          %ymm12, %ymm13, %ymm14      # YMM14 = YMM13 * YMM12
    movl    $20, %eax                           # zero-extends into RAX
    ret
tops_f32_avx2ma_asm_kernel:
    # Un-fused single-precision variant on 256-bit registers: three VMULPS and
    # two VADDPS, each writing a register no other instruction here reads.
    # Per instruction: 8 float lanes = 8 FLOPs.
    # Total: 5 x 8 = 40 FLOPs.
    vmulps          %ymm1,  %ymm2,  %ymm0       # YMM0  = YMM2  * YMM1
    vaddps          %ymm3,  %ymm4,  %ymm5       # YMM5  = YMM4  + YMM3
    vmulps          %ymm6,  %ymm7,  %ymm8       # YMM8  = YMM7  * YMM6
    vaddps          %ymm9,  %ymm10, %ymm11      # YMM11 = YMM10 + YMM9
    vmulps          %ymm12, %ymm13, %ymm14      # YMM14 = YMM13 * YMM12
    movl    $40, %eax                           # zero-extends into RAX
    ret
# ----------------------------------------------------------------------------
# Tell the linker/assembler that we do NOT need an executable stack:
.section .note.GNU-stack, "", @progbits
# ----------------------------------------------------------------------------