/*
 * arch/x86/include/asm/xor.h
 */
1 #ifdef CONFIG_KMEMCHECK
2 /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3 # include <asm-generic/xor.h>
4 #elif !defined(_ASM_X86_XOR_H)
5 #define _ASM_X86_XOR_H
6
7 /*
8  * Optimized RAID-5 checksumming functions for SSE.
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2, or (at your option)
13  * any later version.
14  *
15  * You should have received a copy of the GNU General Public License
16  * (for example /usr/src/linux/COPYING); if not, write to the Free
17  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19
20 /*
21  * Cache avoiding checksumming functions utilizing KNI instructions
22  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
23  */
24
25 /*
26  * Based on
27  * High-speed RAID5 checksumming functions utilizing SSE instructions.
28  * Copyright (C) 1998 Ingo Molnar.
29  */
30
31 /*
32  * x86-64 changes / gcc fixes from Andi Kleen.
33  * Copyright 2002 Andi Kleen, SuSE Labs.
34  *
35  * This hasn't been optimized for the hammer yet, but there are likely
36  * no advantages to be gotten from x86-64 here anyways.
37  */
38
39 #include <asm/i387.h>
40
#ifdef CONFIG_X86_32
/*
 * reduce register pressure: on 32-bit, force the 256-byte loop stride
 * to be an immediate ("i") so it never occupies one of the few GPRs.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* 64-bit: a register or a sign-extended 32-bit immediate ("re") is fine. */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
47
/*
 * String-pasting helpers used inside the asm() bodies below.  'x' indexes
 * a 16-byte slot within the current 256-byte group and 'y' names an xmm
 * register (0-3); both must be literal constants so the preprocessor can
 * paste them into the instruction text.
 */
#define OFFS(x)		"16*("#x")"		/* byte offset of slot x */
#define PF_OFFS(x)	"256+16*("#x")"		/* same slot, one 256B group ahead */
/* PFn: non-temporal prefetch one group ahead in source p(n+1). */
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
/* XOn: xorps slot x of source p(n+1) into xmm'y'. */
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)			/* empty slot: no prefetch emitted */

/*
 * BLK64(pf, op, i): one prefetch via pf(), then op() applied to slots
 * i..i+3 (64 bytes) using xmm0-xmm3.  Used by the *_pf64 variants.
 */
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)
69
/*
 * xor_sse_2 - compute p1 ^= p2 over 'bytes' bytes with SSE.
 * @bytes: region length; callers are expected to pass a multiple of 256
 *         (the low 8 bits are discarded by the shift below).
 * @p1:    destination/source buffer, must be 16-byte aligned (movaps).
 * @p2:    second source buffer, must be 16-byte aligned.
 *
 * Each loop iteration processes one 256-byte group as sixteen 16-byte
 * xmm transfers; prefetchnta hints run one group ahead in both streams
 * to keep this streaming RAID-checksum traffic out of the caches.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	/*
	 * BLOCK(i) covers slots i..i+3 (64 bytes): load from p1, XOR in
	 * p2, store back to p1, with prefetches interleaved between the
	 * memory ops so they overlap the ALU work.
	 */
	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		/* prime the p1 prefetch before entering the loop */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
120
/*
 * xor_sse_2_pf64 - as xor_sse_2 (p1 ^= p2), but scheduled around
 * 64-byte prefetch granularity: each 64-byte chunk is fully loaded,
 * XORed and stored before the next chunk starts.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	/* BLOCK(i): prefetch+load, prefetch+xor, then store slots i..i+3. */
	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
154
/*
 * xor_sse_3 - compute p1 ^= p2 ^ p3 over 'bytes' bytes with SSE.
 * Same structure as xor_sse_2 with a third stream: the extra XO2/PF2
 * steps fold p3 into xmm0-xmm3 before the stores.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		/* prime the p1 prefetch before entering the loop */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
213
/*
 * xor_sse_3_pf64 - as xor_sse_3 (p1 ^= p2 ^ p3), scheduled for
 * 64-byte prefetch granularity (see xor_sse_2_pf64).
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
250
/*
 * xor_sse_4 - compute p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes with SSE.
 * Four-stream version of xor_sse_2; XO1..XO3 fold the three extra
 * sources into xmm0-xmm3 before the stores.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		/* prime the p1 prefetch before entering the loop */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
316
/*
 * xor_sse_4_pf64 - as xor_sse_4 (p1 ^= p2 ^ p3 ^ p4), scheduled for
 * 64-byte prefetch granularity (see xor_sse_2_pf64).
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
355
/*
 * xor_sse_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes.
 * Five-stream version of xor_sse_2; XO1..XO4 fold the four extra
 * sources into xmm0-xmm3 before the stores.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		/* prime the p1 prefetch before entering the loop */
		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
428
/*
 * xor_sse_5_pf64 - as xor_sse_5 (p1 ^= p2 ^ p3 ^ p4 ^ p5), scheduled
 * for 64-byte prefetch granularity (see xor_sse_2_pf64).
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();	/* the loop clobbers xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:				;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
469
/*
 * Template grouping the 64-byte-prefetch SSE variants under the name
 * "prefetch64-sse".  NOTE(review): selection/benchmarking of this
 * template presumably happens in the xor_32.h/xor_64.h headers included
 * below — confirm there.
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
477
/* The assembly helper macros are local to this header; tear them down. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the remaining, bitness-specific xor templates. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * Wrap the benchmark winner so an AVX implementation can take precedence.
 * NOTE(review): AVX_SELECT() is expected to be provided by the headers
 * included above — confirm.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */