Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
impl.h
Go to the documentation of this file.
1#ifndef SSE2NEONTEST_H
2#define SSE2NEONTEST_H
3
4#include "common.h"
5
6#define INTRIN_LIST \
7 /* MMX */ \
8 _(mm_empty) \
9 /* SSE */ \
10 _(mm_add_ps) \
11 _(mm_add_ss) \
12 _(mm_and_ps) \
13 _(mm_andnot_ps) \
14 _(mm_avg_pu16) \
15 _(mm_avg_pu8) \
16 _(mm_cmpeq_ps) \
17 _(mm_cmpeq_ss) \
18 _(mm_cmpge_ps) \
19 _(mm_cmpge_ss) \
20 _(mm_cmpgt_ps) \
21 _(mm_cmpgt_ss) \
22 _(mm_cmple_ps) \
23 _(mm_cmple_ss) \
24 _(mm_cmplt_ps) \
25 _(mm_cmplt_ss) \
26 _(mm_cmpneq_ps) \
27 _(mm_cmpneq_ss) \
28 _(mm_cmpnge_ps) \
29 _(mm_cmpnge_ss) \
30 _(mm_cmpngt_ps) \
31 _(mm_cmpngt_ss) \
32 _(mm_cmpnle_ps) \
33 _(mm_cmpnle_ss) \
34 _(mm_cmpnlt_ps) \
35 _(mm_cmpnlt_ss) \
36 _(mm_cmpord_ps) \
37 _(mm_cmpord_ss) \
38 _(mm_cmpunord_ps) \
39 _(mm_cmpunord_ss) \
40 _(mm_comieq_ss) \
41 _(mm_comige_ss) \
42 _(mm_comigt_ss) \
43 _(mm_comile_ss) \
44 _(mm_comilt_ss) \
45 _(mm_comineq_ss) \
46 _(mm_cvt_pi2ps) \
47 _(mm_cvt_ps2pi) \
48 _(mm_cvt_si2ss) \
49 _(mm_cvt_ss2si) \
50 _(mm_cvtpi16_ps) \
51 _(mm_cvtpi32_ps) \
52 _(mm_cvtpi32x2_ps) \
53 _(mm_cvtpi8_ps) \
54 _(mm_cvtps_pi16) \
55 _(mm_cvtps_pi32) \
56 _(mm_cvtps_pi8) \
57 _(mm_cvtpu16_ps) \
58 _(mm_cvtpu8_ps) \
59 _(mm_cvtsi32_ss) \
60 _(mm_cvtsi64_ss) \
61 _(mm_cvtss_f32) \
62 _(mm_cvtss_si32) \
63 _(mm_cvtss_si64) \
64 _(mm_cvtt_ps2pi) \
65 _(mm_cvtt_ss2si) \
66 _(mm_cvttps_pi32) \
67 _(mm_cvttss_si32) \
68 _(mm_cvttss_si64) \
69 _(mm_div_ps) \
70 _(mm_div_ss) \
71 _(mm_extract_pi16) \
72 _(mm_free) \
73 _(mm_get_flush_zero_mode) \
74 _(mm_get_rounding_mode) \
75 _(mm_getcsr) \
76 _(mm_insert_pi16) \
77 _(mm_load_ps) \
78 _(mm_load_ps1) \
79 _(mm_load_ss) \
80 _(mm_load1_ps) \
81 _(mm_loadh_pi) \
82 _(mm_loadl_pi) \
83 _(mm_loadr_ps) \
84 _(mm_loadu_ps) \
85 _(mm_loadu_si16) \
86 _(mm_loadu_si64) \
87 _(mm_malloc) \
88 _(mm_maskmove_si64) \
89 _(m_maskmovq) \
90 _(mm_max_pi16) \
91 _(mm_max_ps) \
92 _(mm_max_pu8) \
93 _(mm_max_ss) \
94 _(mm_min_pi16) \
95 _(mm_min_ps) \
96 _(mm_min_pu8) \
97 _(mm_min_ss) \
98 _(mm_move_ss) \
99 _(mm_movehl_ps) \
100 _(mm_movelh_ps) \
101 _(mm_movemask_pi8) \
102 _(mm_movemask_ps) \
103 _(mm_mul_ps) \
104 _(mm_mul_ss) \
105 _(mm_mulhi_pu16) \
106 _(mm_or_ps) \
107 _(m_pavgb) \
108 _(m_pavgw) \
109 _(m_pextrw) \
110 _(m_pinsrw) \
111 _(m_pmaxsw) \
112 _(m_pmaxub) \
113 _(m_pminsw) \
114 _(m_pminub) \
115 _(m_pmovmskb) \
116 _(m_pmulhuw) \
117 _(mm_prefetch) \
118 _(m_psadbw) \
119 _(m_pshufw) \
120 _(mm_rcp_ps) \
121 _(mm_rcp_ss) \
122 _(mm_rsqrt_ps) \
123 _(mm_rsqrt_ss) \
124 _(mm_sad_pu8) \
125 _(mm_set_flush_zero_mode) \
126 _(mm_set_ps) \
127 _(mm_set_ps1) \
128 _(mm_set_rounding_mode) \
129 _(mm_set_ss) \
130 _(mm_set1_ps) \
131 _(mm_setcsr) \
132 _(mm_setr_ps) \
133 _(mm_setzero_ps) \
134 _(mm_sfence) \
135 _(mm_shuffle_pi16) \
136 _(mm_shuffle_ps) \
137 _(mm_sqrt_ps) \
138 _(mm_sqrt_ss) \
139 _(mm_store_ps) \
140 _(mm_store_ps1) \
141 _(mm_store_ss) \
142 _(mm_store1_ps) \
143 _(mm_storeh_pi) \
144 _(mm_storel_pi) \
145 _(mm_storer_ps) \
146 _(mm_storeu_ps) \
147 _(mm_storeu_si16) \
148 _(mm_storeu_si64) \
149 _(mm_stream_pi) \
150 _(mm_stream_ps) \
151 _(mm_sub_ps) \
152 _(mm_sub_ss) \
153 _(mm_ucomieq_ss) \
154 _(mm_ucomige_ss) \
155 _(mm_ucomigt_ss) \
156 _(mm_ucomile_ss) \
157 _(mm_ucomilt_ss) \
158 _(mm_ucomineq_ss) \
159 _(mm_undefined_ps) \
160 _(mm_unpackhi_ps) \
161 _(mm_unpacklo_ps) \
162 _(mm_xor_ps) \
163 /* SSE2 */ \
164 _(mm_add_epi16) \
165 _(mm_add_epi32) \
166 _(mm_add_epi64) \
167 _(mm_add_epi8) \
168 _(mm_add_pd) \
169 _(mm_add_sd) \
170 _(mm_add_si64) \
171 _(mm_adds_epi16) \
172 _(mm_adds_epi8) \
173 _(mm_adds_epu16) \
174 _(mm_adds_epu8) \
175 _(mm_and_pd) \
176 _(mm_and_si128) \
177 _(mm_andnot_pd) \
178 _(mm_andnot_si128) \
179 _(mm_avg_epu16) \
180 _(mm_avg_epu8) \
181 _(mm_bslli_si128) \
182 _(mm_bsrli_si128) \
183 _(mm_castpd_ps) \
184 _(mm_castpd_si128) \
185 _(mm_castps_pd) \
186 _(mm_castps_si128) \
187 _(mm_castsi128_pd) \
188 _(mm_castsi128_ps) \
189 _(mm_clflush) \
190 _(mm_cmpeq_epi16) \
191 _(mm_cmpeq_epi32) \
192 _(mm_cmpeq_epi8) \
193 _(mm_cmpeq_pd) \
194 _(mm_cmpeq_sd) \
195 _(mm_cmpge_pd) \
196 _(mm_cmpge_sd) \
197 _(mm_cmpgt_epi16) \
198 _(mm_cmpgt_epi32) \
199 _(mm_cmpgt_epi8) \
200 _(mm_cmpgt_pd) \
201 _(mm_cmpgt_sd) \
202 _(mm_cmple_pd) \
203 _(mm_cmple_sd) \
204 _(mm_cmplt_epi16) \
205 _(mm_cmplt_epi32) \
206 _(mm_cmplt_epi8) \
207 _(mm_cmplt_pd) \
208 _(mm_cmplt_sd) \
209 _(mm_cmpneq_pd) \
210 _(mm_cmpneq_sd) \
211 _(mm_cmpnge_pd) \
212 _(mm_cmpnge_sd) \
213 _(mm_cmpngt_pd) \
214 _(mm_cmpngt_sd) \
215 _(mm_cmpnle_pd) \
216 _(mm_cmpnle_sd) \
217 _(mm_cmpnlt_pd) \
218 _(mm_cmpnlt_sd) \
219 _(mm_cmpord_pd) \
220 _(mm_cmpord_sd) \
221 _(mm_cmpunord_pd) \
222 _(mm_cmpunord_sd) \
223 _(mm_comieq_sd) \
224 _(mm_comige_sd) \
225 _(mm_comigt_sd) \
226 _(mm_comile_sd) \
227 _(mm_comilt_sd) \
228 _(mm_comineq_sd) \
229 _(mm_cvtepi32_pd) \
230 _(mm_cvtepi32_ps) \
231 _(mm_cvtpd_epi32) \
232 _(mm_cvtpd_pi32) \
233 _(mm_cvtpd_ps) \
234 _(mm_cvtpi32_pd) \
235 _(mm_cvtps_epi32) \
236 _(mm_cvtps_pd) \
237 _(mm_cvtsd_f64) \
238 _(mm_cvtsd_si32) \
239 _(mm_cvtsd_si64) \
240 _(mm_cvtsd_si64x) \
241 _(mm_cvtsd_ss) \
242 _(mm_cvtsi128_si32) \
243 _(mm_cvtsi128_si64) \
244 _(mm_cvtsi128_si64x) \
245 _(mm_cvtsi32_sd) \
246 _(mm_cvtsi32_si128) \
247 _(mm_cvtsi64_sd) \
248 _(mm_cvtsi64_si128) \
249 _(mm_cvtsi64x_sd) \
250 _(mm_cvtsi64x_si128) \
251 _(mm_cvtss_sd) \
252 _(mm_cvttpd_epi32) \
253 _(mm_cvttpd_pi32) \
254 _(mm_cvttps_epi32) \
255 _(mm_cvttsd_si32) \
256 _(mm_cvttsd_si64) \
257 _(mm_cvttsd_si64x) \
258 _(mm_div_pd) \
259 _(mm_div_sd) \
260 _(mm_extract_epi16) \
261 _(mm_insert_epi16) \
262 _(mm_lfence) \
263 _(mm_load_pd) \
264 _(mm_load_pd1) \
265 _(mm_load_sd) \
266 _(mm_load_si128) \
267 _(mm_load1_pd) \
268 _(mm_loadh_pd) \
269 _(mm_loadl_epi64) \
270 _(mm_loadl_pd) \
271 _(mm_loadr_pd) \
272 _(mm_loadu_pd) \
273 _(mm_loadu_si128) \
274 _(mm_loadu_si32) \
275 _(mm_madd_epi16) \
276 _(mm_maskmoveu_si128) \
277 _(mm_max_epi16) \
278 _(mm_max_epu8) \
279 _(mm_max_pd) \
280 _(mm_max_sd) \
281 _(mm_mfence) \
282 _(mm_min_epi16) \
283 _(mm_min_epu8) \
284 _(mm_min_pd) \
285 _(mm_min_sd) \
286 _(mm_move_epi64) \
287 _(mm_move_sd) \
288 _(mm_movemask_epi8) \
289 _(mm_movemask_pd) \
290 _(mm_movepi64_pi64) \
291 _(mm_movpi64_epi64) \
292 _(mm_mul_epu32) \
293 _(mm_mul_pd) \
294 _(mm_mul_sd) \
295 _(mm_mul_su32) \
296 _(mm_mulhi_epi16) \
297 _(mm_mulhi_epu16) \
298 _(mm_mullo_epi16) \
299 _(mm_or_pd) \
300 _(mm_or_si128) \
301 _(mm_packs_epi16) \
302 _(mm_packs_epi32) \
303 _(mm_packus_epi16) \
304 _(mm_pause) \
305 _(mm_sad_epu8) \
306 _(mm_set_epi16) \
307 _(mm_set_epi32) \
308 _(mm_set_epi64) \
309 _(mm_set_epi64x) \
310 _(mm_set_epi8) \
311 _(mm_set_pd) \
312 _(mm_set_pd1) \
313 _(mm_set_sd) \
314 _(mm_set1_epi16) \
315 _(mm_set1_epi32) \
316 _(mm_set1_epi64) \
317 _(mm_set1_epi64x) \
318 _(mm_set1_epi8) \
319 _(mm_set1_pd) \
320 _(mm_setr_epi16) \
321 _(mm_setr_epi32) \
322 _(mm_setr_epi64) \
323 _(mm_setr_epi8) \
324 _(mm_setr_pd) \
325 _(mm_setzero_pd) \
326 _(mm_setzero_si128) \
327 _(mm_shuffle_epi32) \
328 _(mm_shuffle_pd) \
329 _(mm_shufflehi_epi16) \
330 _(mm_shufflelo_epi16) \
331 _(mm_sll_epi16) \
332 _(mm_sll_epi32) \
333 _(mm_sll_epi64) \
334 _(mm_slli_epi16) \
335 _(mm_slli_epi32) \
336 _(mm_slli_epi64) \
337 _(mm_slli_si128) \
338 _(mm_sqrt_pd) \
339 _(mm_sqrt_sd) \
340 _(mm_sra_epi16) \
341 _(mm_sra_epi32) \
342 _(mm_srai_epi16) \
343 _(mm_srai_epi32) \
344 _(mm_srl_epi16) \
345 _(mm_srl_epi32) \
346 _(mm_srl_epi64) \
347 _(mm_srli_epi16) \
348 _(mm_srli_epi32) \
349 _(mm_srli_epi64) \
350 _(mm_srli_si128) \
351 _(mm_store_pd) \
352 _(mm_store_pd1) \
353 _(mm_store_sd) \
354 _(mm_store_si128) \
355 _(mm_store1_pd) \
356 _(mm_storeh_pd) \
357 _(mm_storel_epi64) \
358 _(mm_storel_pd) \
359 _(mm_storer_pd) \
360 _(mm_storeu_pd) \
361 _(mm_storeu_si128) \
362 _(mm_storeu_si32) \
363 _(mm_stream_pd) \
364 _(mm_stream_si128) \
365 _(mm_stream_si32) \
366 _(mm_stream_si64) \
367 _(mm_sub_epi16) \
368 _(mm_sub_epi32) \
369 _(mm_sub_epi64) \
370 _(mm_sub_epi8) \
371 _(mm_sub_pd) \
372 _(mm_sub_sd) \
373 _(mm_sub_si64) \
374 _(mm_subs_epi16) \
375 _(mm_subs_epi8) \
376 _(mm_subs_epu16) \
377 _(mm_subs_epu8) \
378 _(mm_ucomieq_sd) \
379 _(mm_ucomige_sd) \
380 _(mm_ucomigt_sd) \
381 _(mm_ucomile_sd) \
382 _(mm_ucomilt_sd) \
383 _(mm_ucomineq_sd) \
384 _(mm_undefined_pd) \
385 _(mm_undefined_si128) \
386 _(mm_unpackhi_epi16) \
387 _(mm_unpackhi_epi32) \
388 _(mm_unpackhi_epi64) \
389 _(mm_unpackhi_epi8) \
390 _(mm_unpackhi_pd) \
391 _(mm_unpacklo_epi16) \
392 _(mm_unpacklo_epi32) \
393 _(mm_unpacklo_epi64) \
394 _(mm_unpacklo_epi8) \
395 _(mm_unpacklo_pd) \
396 _(mm_xor_pd) \
397 _(mm_xor_si128) \
398 /* SSE3 */ \
399 _(mm_addsub_pd) \
400 _(mm_addsub_ps) \
401 _(mm_hadd_pd) \
402 _(mm_hadd_ps) \
403 _(mm_hsub_pd) \
404 _(mm_hsub_ps) \
405 _(mm_lddqu_si128) \
406 _(mm_loaddup_pd) \
407 _(mm_movedup_pd) \
408 _(mm_movehdup_ps) \
409 _(mm_moveldup_ps) \
410 /* SSSE3 */ \
411 _(mm_abs_epi16) \
412 _(mm_abs_epi32) \
413 _(mm_abs_epi8) \
414 _(mm_abs_pi16) \
415 _(mm_abs_pi32) \
416 _(mm_abs_pi8) \
417 _(mm_alignr_epi8) \
418 _(mm_alignr_pi8) \
419 _(mm_hadd_epi16) \
420 _(mm_hadd_epi32) \
421 _(mm_hadd_pi16) \
422 _(mm_hadd_pi32) \
423 _(mm_hadds_epi16) \
424 _(mm_hadds_pi16) \
425 _(mm_hsub_epi16) \
426 _(mm_hsub_epi32) \
427 _(mm_hsub_pi16) \
428 _(mm_hsub_pi32) \
429 _(mm_hsubs_epi16) \
430 _(mm_hsubs_pi16) \
431 _(mm_maddubs_epi16) \
432 _(mm_maddubs_pi16) \
433 _(mm_mulhrs_epi16) \
434 _(mm_mulhrs_pi16) \
435 _(mm_shuffle_epi8) \
436 _(mm_shuffle_pi8) \
437 _(mm_sign_epi16) \
438 _(mm_sign_epi32) \
439 _(mm_sign_epi8) \
440 _(mm_sign_pi16) \
441 _(mm_sign_pi32) \
442 _(mm_sign_pi8) \
443 /* SSE4.1 */ \
444 _(mm_blend_epi16) \
445 _(mm_blend_pd) \
446 _(mm_blend_ps) \
447 _(mm_blendv_epi8) \
448 _(mm_blendv_pd) \
449 _(mm_blendv_ps) \
450 _(mm_ceil_pd) \
451 _(mm_ceil_ps) \
452 _(mm_ceil_sd) \
453 _(mm_ceil_ss) \
454 _(mm_cmpeq_epi64) \
455 _(mm_cvtepi16_epi32) \
456 _(mm_cvtepi16_epi64) \
457 _(mm_cvtepi32_epi64) \
458 _(mm_cvtepi8_epi16) \
459 _(mm_cvtepi8_epi32) \
460 _(mm_cvtepi8_epi64) \
461 _(mm_cvtepu16_epi32) \
462 _(mm_cvtepu16_epi64) \
463 _(mm_cvtepu32_epi64) \
464 _(mm_cvtepu8_epi16) \
465 _(mm_cvtepu8_epi32) \
466 _(mm_cvtepu8_epi64) \
467 _(mm_dp_pd) \
468 _(mm_dp_ps) \
469 _(mm_extract_epi32) \
470 _(mm_extract_epi64) \
471 _(mm_extract_epi8) \
472 _(mm_extract_ps) \
473 _(mm_floor_pd) \
474 _(mm_floor_ps) \
475 _(mm_floor_sd) \
476 _(mm_floor_ss) \
477 _(mm_insert_epi32) \
478 _(mm_insert_epi64) \
479 _(mm_insert_epi8) \
480 _(mm_insert_ps) \
481 _(mm_max_epi32) \
482 _(mm_max_epi8) \
483 _(mm_max_epu16) \
484 _(mm_max_epu32) \
485 _(mm_min_epi32) \
486 _(mm_min_epi8) \
487 _(mm_min_epu16) \
488 _(mm_min_epu32) \
489 _(mm_minpos_epu16) \
490 _(mm_mpsadbw_epu8) \
491 _(mm_mul_epi32) \
492 _(mm_mullo_epi32) \
493 _(mm_packus_epi32) \
494 _(mm_round_pd) \
495 _(mm_round_ps) \
496 _(mm_round_sd) \
497 _(mm_round_ss) \
498 _(mm_stream_load_si128) \
499 _(mm_test_all_ones) \
500 _(mm_test_all_zeros) \
501 _(mm_test_mix_ones_zeros) \
502 _(mm_testc_si128) \
503 _(mm_testnzc_si128) \
504 _(mm_testz_si128) \
505 /* SSE4.2 */ \
506 _(mm_cmpestra) \
507 _(mm_cmpestrc) \
508 _(mm_cmpestri) \
509 _(mm_cmpestrm) \
510 _(mm_cmpestro) \
511 _(mm_cmpestrs) \
512 _(mm_cmpestrz) \
513 _(mm_cmpgt_epi64) \
514 _(mm_cmpistra) \
515 _(mm_cmpistrc) \
516 _(mm_cmpistri) \
517 _(mm_cmpistrm) \
518 _(mm_cmpistro) \
519 _(mm_cmpistrs) \
520 _(mm_cmpistrz) \
521 _(mm_crc32_u16) \
522 _(mm_crc32_u32) \
523 _(mm_crc32_u64) \
524 _(mm_crc32_u8) \
525 /* AES */ \
526 _(mm_aesenc_si128) \
527 _(mm_aesdec_si128) \
528 _(mm_aesenclast_si128) \
529 _(mm_aesdeclast_si128) \
530 _(mm_aesimc_si128) \
531 _(mm_aeskeygenassist_si128) \
532 /* Others */ \
533 _(mm_clmulepi64_si128) \
534 _(mm_get_denormals_zero_mode) \
535 _(mm_popcnt_u32) \
536 _(mm_popcnt_u64) \
537 _(mm_set_denormals_zero_mode) \
538 _(rdtsc) \
539 _(last) /* This indicates the end of macros */
540
541namespace SSE2NEON
542{
543// The unit tests generate 10,000 random floating-point and integer vec4
544// values as sample data.
545//
546// Each intrinsic has a short reference implementation in C, whose output is
547// compared to the actual result of the corresponding SSE intrinsic for all
548// of the 10,000 randomized input vectors. When running on ARM, the results
549// are compared to the NEON approximate version instead.
550extern const char *instructionString[];
552#define _(x) it_##x,
554#undef _
556
558{
559public:
560 static SSE2NEONTest *create(void); // create the test.
561
562 // Run the test for this instruction. Possible results:
563 // Passed: TEST_SUCCESS (1)
564 // Failed: TEST_FAIL (0)
565 // Unimplemented: TEST_UNIMPL (-1)
567 virtual void release(void) = 0;
568};
569
570} // namespace SSE2NEON
571
572#endif
Definition impl.h:558
virtual void release(void)=0
static SSE2NEONTest * create(void)
Definition impl.cpp:11884
virtual result_t runTest(InstructionTest test)=0
Definition binding.cpp:7
InstructionTest
Definition impl.h:551
@ INTRIN_LIST
Definition impl.h:553
const char * instructionString[]
Definition impl.cpp:281
result_t
Definition common.h:60