git.openfabrics.org - ~shefty/rdma-dev.git/blob - arch/x86/include/asm/xor.h
c661571ca0b70a5eafa3ab5bc0eb8ee29ced483e
[~shefty/rdma-dev.git] / arch / x86 / include / asm / xor.h
1 #ifdef CONFIG_KMEMCHECK
2 /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3 # include <asm-generic/xor.h>
4 #elif !defined(_ASM_X86_XOR_H)
5 #define _ASM_X86_XOR_H
6
7 /*
8  * Optimized RAID-5 checksumming functions for SSE.
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2, or (at your option)
13  * any later version.
14  *
15  * You should have received a copy of the GNU General Public License
16  * (for example /usr/src/linux/COPYING); if not, write to the Free
17  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19
20 /*
21  * Cache avoiding checksumming functions utilizing KNI instructions
22  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
23  */
24
25 /*
26  * Based on
27  * High-speed RAID5 checksumming functions utilizing SSE instructions.
28  * Copyright (C) 1998 Ingo Molnar.
29  */
30
31 /*
32  * x86-64 changes / gcc fixes from Andi Kleen.
33  * Copyright 2002 Andi Kleen, SuSE Labs.
34  *
35  * This hasn't been optimized for the hammer yet, but there are likely
36  * no advantages to be gotten from x86-64 here anyways.
37  */
38
39 #include <asm/i387.h>
40
/*
 * Operand constraint for the [inc] input (the constant 256UL) of the asm
 * loops in this file.
 *
 * 32-bit: "i" forces an immediate, so the increment never occupies a
 * general-purpose register.
 * Otherwise: "re" lets the compiler choose between a register ("r") and a
 * 32-bit sign-extended immediate ("e").
 */
41 #ifdef CONFIG_X86_32
42 /* reduce register pressure */
43 # define XOR_CONSTANT_CONSTRAINT "i"
44 #else
45 # define XOR_CONSTANT_CONSTRAINT "re"
46 #endif
47
/*
 * String-pasting helpers for the asm bodies below.  Each one expands to a
 * single instruction as a string literal.  x is an index in 16-byte units
 * (the size of one xmm register) and y is the xmm register number; the
 * pointers are referenced through the named asm operands %[p1]..%[p5].
 *
 * movaps and xorps with a memory operand are the aligned SSE forms, so the
 * buffers are expected to be 16-byte aligned.
 */
/* Byte displacement of 16-byte slot x. */
48 #define OFFS(x)         "16*("#x")"
/*
 * Displacement of slot x plus 256.  256 is also the amount the loops below
 * add to every pointer per iteration, so this addresses the next chunk.
 */
49 #define PF_OFFS(x)      "256+16*("#x")"
/* prefetchnta (non-temporal prefetch hint) for slot x of p1, 256 bytes ahead. */
50 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
/* Load slot x of p1 into %xmm<y>. */
51 #define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
/* Store %xmm<y> back to slot x of p1. */
52 #define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
/* PF1..PF4: same as PF0, but for p2, p3, p4 and p5 respectively. */
53 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
54 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
55 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
56 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
/* XO1..XO4: %xmm<y> ^= slot x of p2, p3, p4 and p5 respectively. */
57 #define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
58 #define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
59 #define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
60 #define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
61
/*
 * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 (p1 ^= p2).
 *
 * bytes: length of the region.  Only bytes >> 8 whole 256-byte chunks are
 *        processed; a remainder below 256 bytes is ignored.
 * p1:    destination, read and written in place.
 * p2:    source, only read.
 *
 * Each loop iteration handles one 256-byte chunk as four BLOCK()s of
 * 64 bytes, with xmm0-xmm3 holding 16 bytes each.
 *
 * NOTE(review): the counter is decremented and tested after the loop body,
 * so bytes < 256 (lines == 0) makes it wrap instead of skipping the loop.
 * Callers presumably always pass a non-zero multiple of 256 -- confirm.
 *
 * NOTE(review): the prefetches address the chunk 256 bytes ahead, so in the
 * last iteration they point past the processed region.  prefetchnta is only
 * a hint and is not expected to fault -- confirm against the ISA manual.
 */
62 static void
63 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
64 {
           /* Number of 256-byte chunks, i.e. loop iterations. */
65         unsigned long lines = bytes >> 8;
66
           /*
            * The asm modifies xmm0-xmm3 without listing them as clobbers.
            * NOTE(review): presumably the kernel_fpu_begin()/kernel_fpu_end()
            * bracket is what makes that safe -- confirm in <asm/i387.h>.
            */
67         kernel_fpu_begin();
68
69         asm volatile(
           /*
            * BLOCK(i): load slots i..i+3 of p1 into xmm0..xmm3, XOR in the
            * matching slots of p2, store the result back to p1.  Prefetches
            * for p2 (slots i, i+2) and p1 (slots i+4, i+6) of the next chunk
            * are interleaved between the loads.
            */
70 #undef BLOCK
71 #define BLOCK(i)                                        \
72                 LD(i, 0)                                \
73                         LD(i + 1, 1)                    \
74                 PF1(i)                                  \
75                                 PF1(i + 2)              \
76                                 LD(i + 2, 2)            \
77                                         LD(i + 3, 3)    \
78                 PF0(i + 4)                              \
79                                 PF0(i + 6)              \
80                 XO1(i, 0)                               \
81                         XO1(i + 1, 1)                   \
82                                 XO1(i + 2, 2)           \
83                                         XO1(i + 3, 3)   \
84                 ST(i, 0)                                \
85                         ST(i + 1, 1)                    \
86                                 ST(i + 2, 2)            \
87                                         ST(i + 3, 3)    \
88
89
           /*
            * Before the loop: prefetch p1 at byte offsets 256 and 288, which
            * the PF0(i + 4)/PF0(i + 6) calls inside BLOCK() never reach.
            */
90                 PF0(0)
91                                 PF0(2)
92
           /* Loop head: local label 1, aligned to 32 bytes. */
93         " .align 32                     ;\n"
94         " 1:                            ;\n"
95
           /* 4 x 64 bytes = one 256-byte chunk. */
96                 BLOCK(0)
97                 BLOCK(4)
98                 BLOCK(8)
99                 BLOCK(12)
100
           /* Advance both pointers by [inc] (256) and repeat while cnt != 0. */
101         "       add %[inc], %[p1]       ;\n"
102         "       add %[inc], %[p2]       ;\n"
103         "       dec %[cnt]              ;\n"
104         "       jnz 1b                  ;\n"
           /*
            * Counter and pointers are read-write register operands because
            * the asm updates them; the "memory" clobber covers the buffer
            * accesses made through those pointers.
            */
105         : [cnt] "+r" (lines),
106           [p1] "+r" (p1), [p2] "+r" (p2)
107         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
108         : "memory");
109
110         kernel_fpu_end();
111 }
112
/*
 * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1
 * (p1 ^= p2 ^ p3).
 *
 * bytes: length of the region.  Only bytes >> 8 whole 256-byte chunks are
 *        processed; a remainder below 256 bytes is ignored.
 * p1:    destination, read and written in place.
 * p2/p3: sources, only read.
 *
 * Each loop iteration handles one 256-byte chunk as four BLOCK()s of
 * 64 bytes, with xmm0-xmm3 holding 16 bytes each.
 *
 * NOTE(review): the counter is decremented and tested after the loop body,
 * so bytes < 256 (lines == 0) makes it wrap instead of skipping the loop.
 * Callers presumably always pass a non-zero multiple of 256 -- confirm.
 */
113 static void
114 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
115           unsigned long *p3)
116 {
            /* Number of 256-byte chunks, i.e. loop iterations. */
117         unsigned long lines = bytes >> 8;
118
            /*
             * The asm modifies xmm0-xmm3 without listing them as clobbers.
             * NOTE(review): presumably the kernel_fpu_begin()/kernel_fpu_end()
             * bracket is what makes that safe -- confirm in <asm/i387.h>.
             */
119         kernel_fpu_begin();
120
121         asm volatile(
            /*
             * BLOCK(i): load slots i..i+3 of p1 into xmm0..xmm3, XOR in the
             * matching slots of p2 and then p3, store the result back to p1.
             * Prefetches for the next chunk of p2, p3 (slots i, i+2) and p1
             * (slots i+4, i+6) are issued around the loads.
             */
122 #undef BLOCK
123 #define BLOCK(i) \
124                 PF1(i)                                  \
125                                 PF1(i + 2)              \
126                 LD(i, 0)                                \
127                         LD(i + 1, 1)                    \
128                                 LD(i + 2, 2)            \
129                                         LD(i + 3, 3)    \
130                 PF2(i)                                  \
131                                 PF2(i + 2)              \
132                 PF0(i + 4)                              \
133                                 PF0(i + 6)              \
134                 XO1(i, 0)                               \
135                         XO1(i + 1, 1)                   \
136                                 XO1(i + 2, 2)           \
137                                         XO1(i + 3, 3)   \
138                 XO2(i, 0)                               \
139                         XO2(i + 1, 1)                   \
140                                 XO2(i + 2, 2)           \
141                                         XO2(i + 3, 3)   \
142                 ST(i, 0)                                \
143                         ST(i + 1, 1)                    \
144                                 ST(i + 2, 2)            \
145                                         ST(i + 3, 3)    \
146
147
            /*
             * Before the loop: prefetch p1 at byte offsets 256 and 288, which
             * the PF0(i + 4)/PF0(i + 6) calls inside BLOCK() never reach.
             */
148                 PF0(0)
149                                 PF0(2)
150
            /* Loop head: local label 1, aligned to 32 bytes. */
151         " .align 32                     ;\n"
152         " 1:                            ;\n"
153
            /* 4 x 64 bytes = one 256-byte chunk. */
154                 BLOCK(0)
155                 BLOCK(4)
156                 BLOCK(8)
157                 BLOCK(12)
158
            /* Advance all pointers by [inc] (256) and repeat while cnt != 0. */
159         "       add %[inc], %[p1]       ;\n"
160         "       add %[inc], %[p2]       ;\n"
161         "       add %[inc], %[p3]       ;\n"
162         "       dec %[cnt]              ;\n"
163         "       jnz 1b                  ;\n"
            /*
             * Counter and pointers are read-write register operands because
             * the asm updates them; the "memory" clobber covers the buffer
             * accesses made through those pointers.
             */
164         : [cnt] "+r" (lines),
165           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
166         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
167         : "memory");
168
169         kernel_fpu_end();
170 }
171
/*
 * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
 * (p1 ^= p2 ^ p3 ^ p4).
 *
 * bytes:    length of the region.  Only bytes >> 8 whole 256-byte chunks
 *           are processed; a remainder below 256 bytes is ignored.
 * p1:       destination, read and written in place.
 * p2/p3/p4: sources, only read.
 *
 * Each loop iteration handles one 256-byte chunk as four BLOCK()s of
 * 64 bytes, with xmm0-xmm3 holding 16 bytes each.
 *
 * NOTE(review): the counter is decremented and tested after the loop body,
 * so bytes < 256 (lines == 0) makes it wrap instead of skipping the loop.
 * Callers presumably always pass a non-zero multiple of 256 -- confirm.
 */
172 static void
173 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
174           unsigned long *p3, unsigned long *p4)
175 {
            /* Number of 256-byte chunks, i.e. loop iterations. */
176         unsigned long lines = bytes >> 8;
177
            /*
             * The asm modifies xmm0-xmm3 without listing them as clobbers.
             * NOTE(review): presumably the kernel_fpu_begin()/kernel_fpu_end()
             * bracket is what makes that safe -- confirm in <asm/i387.h>.
             */
178         kernel_fpu_begin();
179
180         asm volatile(
            /*
             * BLOCK(i): load slots i..i+3 of p1 into xmm0..xmm3, XOR in the
             * matching slots of p2, p3 and p4 in that order, store the result
             * back to p1.  Prefetches for the next chunk of p2, p3, p4
             * (slots i, i+2) and p1 (slots i+4, i+6) are spread between the
             * load and XOR groups.
             */
181 #undef BLOCK
182 #define BLOCK(i) \
183                 PF1(i)                                  \
184                                 PF1(i + 2)              \
185                 LD(i, 0)                                \
186                         LD(i + 1, 1)                    \
187                                 LD(i + 2, 2)            \
188                                         LD(i + 3, 3)    \
189                 PF2(i)                                  \
190                                 PF2(i + 2)              \
191                 XO1(i, 0)                               \
192                         XO1(i + 1, 1)                   \
193                                 XO1(i + 2, 2)           \
194                                         XO1(i + 3, 3)   \
195                 PF3(i)                                  \
196                                 PF3(i + 2)              \
197                 PF0(i + 4)                              \
198                                 PF0(i + 6)              \
199                 XO2(i, 0)                               \
200                         XO2(i + 1, 1)                   \
201                                 XO2(i + 2, 2)           \
202                                         XO2(i + 3, 3)   \
203                 XO3(i, 0)                               \
204                         XO3(i + 1, 1)                   \
205                                 XO3(i + 2, 2)           \
206                                         XO3(i + 3, 3)   \
207                 ST(i, 0)                                \
208                         ST(i + 1, 1)                    \
209                                 ST(i + 2, 2)            \
210                                         ST(i + 3, 3)    \
211
212
            /*
             * Before the loop: prefetch p1 at byte offsets 256 and 288, which
             * the PF0(i + 4)/PF0(i + 6) calls inside BLOCK() never reach.
             */
213                 PF0(0)
214                                 PF0(2)
215
            /* Loop head: local label 1, aligned to 32 bytes. */
216         " .align 32                     ;\n"
217         " 1:                            ;\n"
218
            /* 4 x 64 bytes = one 256-byte chunk. */
219                 BLOCK(0)
220                 BLOCK(4)
221                 BLOCK(8)
222                 BLOCK(12)
223
            /* Advance all pointers by [inc] (256) and repeat while cnt != 0. */
224         "       add %[inc], %[p1]       ;\n"
225         "       add %[inc], %[p2]       ;\n"
226         "       add %[inc], %[p3]       ;\n"
227         "       add %[inc], %[p4]       ;\n"
228         "       dec %[cnt]              ;\n"
229         "       jnz 1b                  ;\n"
            /*
             * Counter and pointers are read-write register operands because
             * the asm updates them; the "memory" clobber covers the buffer
             * accesses made through those pointers.
             */
230         : [cnt] "+r" (lines), [p1] "+r" (p1),
231           [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
232         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
233         : "memory");
234
235         kernel_fpu_end();
236 }
237
/*
 * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer at p1
 * (p1 ^= p2 ^ p3 ^ p4 ^ p5).
 *
 * bytes: length of the region.  Only bytes >> 8 whole 256-byte chunks are
 *        processed; a remainder below 256 bytes is ignored.
 * p1:    destination, read and written in place.
 * p2-p5: sources, only read.
 *
 * Each loop iteration handles one 256-byte chunk as four BLOCK()s of
 * 64 bytes, with xmm0-xmm3 holding 16 bytes each.
 *
 * NOTE(review): the counter is decremented and tested after the loop body,
 * so bytes < 256 (lines == 0) makes it wrap instead of skipping the loop.
 * Callers presumably always pass a non-zero multiple of 256 -- confirm.
 */
238 static void
239 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
240           unsigned long *p3, unsigned long *p4, unsigned long *p5)
241 {
            /* Number of 256-byte chunks, i.e. loop iterations. */
242         unsigned long lines = bytes >> 8;
243
            /*
             * The asm modifies xmm0-xmm3 without listing them as clobbers.
             * NOTE(review): presumably the kernel_fpu_begin()/kernel_fpu_end()
             * bracket is what makes that safe -- confirm in <asm/i387.h>.
             */
244         kernel_fpu_begin();
245
246         asm volatile(
            /*
             * BLOCK(i): load slots i..i+3 of p1 into xmm0..xmm3, XOR in the
             * matching slots of p2, p3, p4 and p5 in that order, store the
             * result back to p1.  Prefetches for the next chunk of p2..p5
             * (slots i, i+2) and p1 (slots i+4, i+6) are spread between the
             * load and XOR groups.
             */
247 #undef BLOCK
248 #define BLOCK(i) \
249                 PF1(i)                                  \
250                                 PF1(i + 2)              \
251                 LD(i, 0)                                \
252                         LD(i + 1, 1)                    \
253                                 LD(i + 2, 2)            \
254                                         LD(i + 3, 3)    \
255                 PF2(i)                                  \
256                                 PF2(i + 2)              \
257                 XO1(i, 0)                               \
258                         XO1(i + 1, 1)                   \
259                                 XO1(i + 2, 2)           \
260                                         XO1(i + 3, 3)   \
261                 PF3(i)                                  \
262                                 PF3(i + 2)              \
263                 XO2(i, 0)                               \
264                         XO2(i + 1, 1)                   \
265                                 XO2(i + 2, 2)           \
266                                         XO2(i + 3, 3)   \
267                 PF4(i)                                  \
268                                 PF4(i + 2)              \
269                 PF0(i + 4)                              \
270                                 PF0(i + 6)              \
271                 XO3(i, 0)                               \
272                         XO3(i + 1, 1)                   \
273                                 XO3(i + 2, 2)           \
274                                         XO3(i + 3, 3)   \
275                 XO4(i, 0)                               \
276                         XO4(i + 1, 1)                   \
277                                 XO4(i + 2, 2)           \
278                                         XO4(i + 3, 3)   \
279                 ST(i, 0)                                \
280                         ST(i + 1, 1)                    \
281                                 ST(i + 2, 2)            \
282                                         ST(i + 3, 3)    \
283
284
            /*
             * Before the loop: prefetch p1 at byte offsets 256 and 288, which
             * the PF0(i + 4)/PF0(i + 6) calls inside BLOCK() never reach.
             */
285                 PF0(0)
286                                 PF0(2)
287
            /* Loop head: local label 1, aligned to 32 bytes. */
288         " .align 32                     ;\n"
289         " 1:                            ;\n"
290
            /* 4 x 64 bytes = one 256-byte chunk. */
291                 BLOCK(0)
292                 BLOCK(4)
293                 BLOCK(8)
294                 BLOCK(12)
295
            /* Advance all pointers by [inc] (256) and repeat while cnt != 0. */
296         "       add %[inc], %[p1]       ;\n"
297         "       add %[inc], %[p2]       ;\n"
298         "       add %[inc], %[p3]       ;\n"
299         "       add %[inc], %[p4]       ;\n"
300         "       add %[inc], %[p5]       ;\n"
301         "       dec %[cnt]              ;\n"
302         "       jnz 1b                  ;\n"
            /*
             * Counter and pointers are read-write register operands because
             * the asm updates them; the "memory" clobber covers the buffer
             * accesses made through those pointers.
             */
303         : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
304           [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
305         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
306         : "memory");
307
308         kernel_fpu_end();
309 }
310
/*
 * Drop the instruction helpers and the constraint macro now that the
 * xor_sse_*() functions are defined.
 *
 * NOTE(review): OFFS, PF_OFFS and PF0-PF4 are not undefined here, so they
 * remain visible to the <asm/xor_32.h> / <asm/xor_64.h> include that
 * follows -- confirm whether that is intended.
 */
311 #undef LD
312 #undef XO1
313 #undef XO2
314 #undef XO3
315 #undef XO4
316 #undef ST
317 #undef BLOCK
318
319 #undef XOR_CONSTANT_CONSTRAINT
320
321 #ifdef CONFIG_X86_32
322 # include <asm/xor_32.h>
323 #else
324 # include <asm/xor_64.h>
325 #endif
326
327 #endif /* _ASM_X86_XOR_H */