From b30ce8b72f5949e7f4ff484071ffb8e12ee6becd Mon Sep 17 00:00:00 2001 From: Pere Mato Date: Tue, 21 Jun 2016 16:27:42 +0200 Subject: [PATCH] Removed Vc from the ROOT sources. --- README/ReleaseNotes/v608/index.md | 2 + math/CMakeLists.txt | 8 - math/vc/CMakeLists.txt | 68 - math/vc/Module.mk | 85 - math/vc/cmake/AddCompilerFlag.cmake | 118 -- math/vc/cmake/AddTargetProperty.cmake | 39 - math/vc/cmake/CheckCCompilerFlag.cmake | 69 - math/vc/cmake/CheckCXXCompilerFlag.cmake | 69 - math/vc/cmake/OptimizeForArchitecture.cmake | 466 ----- math/vc/cmake/UserWarning.cmake | 9 - math/vc/cmake/VcMacros.cmake | 543 ------ math/vc/examples/CMakeLists.txt | 71 - math/vc/examples/buddhabrot/CMakeLists.txt | 16 - math/vc/examples/buddhabrot/main.cpp | 643 ------- math/vc/examples/buddhabrot/main.h | 72 - math/vc/examples/finitediff/CMakeLists.txt | 1 - math/vc/examples/finitediff/main.cpp | 249 --- math/vc/examples/fit/GaussFunction.h | 83 - math/vc/examples/fit/Makefile | 121 -- math/vc/examples/fit/MinimizerTypes.h | 51 - math/vc/examples/fit/testFitPerf.cxx | 1105 ------------ math/vc/examples/genvector/Makefile | 109 -- .../vc/examples/genvector/vectorOperation.cxx | 907 ---------- math/vc/examples/mandelbrot/CMakeLists.txt | 56 - math/vc/examples/mandelbrot/main.cpp | 160 -- math/vc/examples/mandelbrot/main.h | 73 - math/vc/examples/mandelbrot/mandel.cpp | 226 --- math/vc/examples/mandelbrot/mandel.h | 71 - math/vc/examples/matrix/CMakeLists.txt | 1 - math/vc/examples/matrix/main.cpp | 80 - math/vc/examples/polarcoord/CMakeLists.txt | 1 - math/vc/examples/polarcoord/main.cpp | 76 - math/vc/examples/smatrix/Makefile | 145 -- math/vc/examples/smatrix/TestTimer.h | 65 - math/vc/examples/smatrix/matrix_op_vec.h | 643 ------- math/vc/examples/smatrix/matrix_util.h | 83 - math/vc/examples/smatrix/testKalman.cxx | 655 ------- math/vc/examples/smatrix/testOperations.cxx | 896 ---------- math/vc/examples/tsc.h | 68 - math/vc/include/Vc/Allocator | 247 --- math/vc/include/Vc/IO | 
195 --- math/vc/include/Vc/Memory | 37 - math/vc/include/Vc/Utils | 33 - math/vc/include/Vc/Vc | 29 - math/vc/include/Vc/avx/casts.h | 190 -- math/vc/include/Vc/avx/const.h | 112 -- math/vc/include/Vc/avx/const_data.h | 74 - math/vc/include/Vc/avx/debug.h | 102 -- math/vc/include/Vc/avx/deinterleave.tcc | 282 --- math/vc/include/Vc/avx/forceToRegisters.tcc | 141 -- math/vc/include/Vc/avx/helperimpl.h | 104 -- math/vc/include/Vc/avx/helperimpl.tcc | 64 - math/vc/include/Vc/avx/interleavedmemory.tcc | 890 ---------- math/vc/include/Vc/avx/intrinsics.h | 611 ------- math/vc/include/Vc/avx/limits.h | 55 - math/vc/include/Vc/avx/macros.h | 26 - math/vc/include/Vc/avx/mask.h | 246 --- math/vc/include/Vc/avx/mask.tcc | 75 - math/vc/include/Vc/avx/math.h | 119 -- math/vc/include/Vc/avx/prefetches.tcc | 58 - math/vc/include/Vc/avx/shuffle.h | 239 --- math/vc/include/Vc/avx/sorthelper.h | 45 - math/vc/include/Vc/avx/types.h | 111 -- math/vc/include/Vc/avx/undomacros.h | 26 - math/vc/include/Vc/avx/vector.h | 466 ----- math/vc/include/Vc/avx/vector.tcc | 1406 --------------- math/vc/include/Vc/avx/vectorhelper.h | 765 -------- math/vc/include/Vc/avx/vectorhelper.tcc | 270 --- math/vc/include/Vc/avx/writemaskedvector.h | 82 - math/vc/include/Vc/avx/writemaskedvector.tcc | 93 - .../include/Vc/common/aliasingentryhelper.h | 126 -- math/vc/include/Vc/common/bitscanintrinsics.h | 62 - math/vc/include/Vc/common/deinterleave.h | 87 - math/vc/include/Vc/common/exponential.h | 145 -- .../include/Vc/common/fix_clang_emmintrin.h | 79 - math/vc/include/Vc/common/iif.h | 62 - math/vc/include/Vc/common/interleavedmemory.h | 268 --- math/vc/include/Vc/common/logarithm.h | 277 --- math/vc/include/Vc/common/macros.h | 384 ---- math/vc/include/Vc/common/memory.h | 642 ------- math/vc/include/Vc/common/memorybase.h | 603 ------- math/vc/include/Vc/common/memoryfwd.h | 30 - math/vc/include/Vc/common/operand.h | 56 - math/vc/include/Vc/common/operators.h | 209 --- 
math/vc/include/Vc/common/storage.h | 130 -- math/vc/include/Vc/common/support.h | 7 - math/vc/include/Vc/common/trigonometric.h | 83 - math/vc/include/Vc/common/types.h | 225 --- math/vc/include/Vc/common/undomacros.h | 110 -- math/vc/include/Vc/common/vectortuple.h | 160 -- .../vc/include/Vc/common/windows_fix_intrin.h | 300 ---- math/vc/include/Vc/cpuid.h | 214 --- math/vc/include/Vc/double_v | 3 - math/vc/include/Vc/float_v | 3 - math/vc/include/Vc/global.h | 509 ------ math/vc/include/Vc/int_v | 3 - math/vc/include/Vc/internal/namespace.h | 28 - math/vc/include/Vc/limits | 57 - math/vc/include/Vc/scalar/helperimpl.h | 58 - math/vc/include/Vc/scalar/helperimpl.tcc | 86 - .../include/Vc/scalar/interleavedmemory.tcc | 160 -- math/vc/include/Vc/scalar/limits.h | 24 - math/vc/include/Vc/scalar/macros.h | 25 - math/vc/include/Vc/scalar/mask.h | 103 -- math/vc/include/Vc/scalar/math.h | 253 --- math/vc/include/Vc/scalar/types.h | 44 - math/vc/include/Vc/scalar/undomacros.h | 25 - math/vc/include/Vc/scalar/vector.h | 480 ----- math/vc/include/Vc/scalar/vector.tcc | 244 --- math/vc/include/Vc/scalar/writemaskedvector.h | 91 - math/vc/include/Vc/sfloat_v | 3 - math/vc/include/Vc/short_v | 3 - math/vc/include/Vc/sse/casts.h | 104 -- math/vc/include/Vc/sse/const.h | 108 -- math/vc/include/Vc/sse/const_data.h | 77 - math/vc/include/Vc/sse/debug.h | 90 - math/vc/include/Vc/sse/deinterleave.tcc | 237 --- math/vc/include/Vc/sse/forceToRegisters.tcc | 141 -- math/vc/include/Vc/sse/helperimpl.h | 87 - math/vc/include/Vc/sse/helperimpl.tcc | 66 - math/vc/include/Vc/sse/interleavedmemory.tcc | 1014 ----------- math/vc/include/Vc/sse/intrinsics.h | 602 ------- math/vc/include/Vc/sse/limits.h | 81 - math/vc/include/Vc/sse/macros.h | 47 - math/vc/include/Vc/sse/mask.h | 578 ------ math/vc/include/Vc/sse/math.h | 217 --- math/vc/include/Vc/sse/prefetches.tcc | 58 - math/vc/include/Vc/sse/shuffle.h | 172 -- math/vc/include/Vc/sse/types.h | 163 -- math/vc/include/Vc/sse/undomacros.h | 
32 - math/vc/include/Vc/sse/vector.h | 550 ------ math/vc/include/Vc/sse/vector.tcc | 1545 ----------------- math/vc/include/Vc/sse/vectorhelper.h | 814 --------- math/vc/include/Vc/sse/vectorhelper.tcc | 493 ------ math/vc/include/Vc/support.h | 150 -- math/vc/include/Vc/uint_v | 3 - math/vc/include/Vc/ushort_v | 3 - math/vc/include/Vc/vector.h | 151 -- math/vc/include/Vc/version.h | 53 - math/vc/makeTest.py | 13 - math/vc/src/avx_sorthelper.cpp | 427 ----- math/vc/src/const.cpp | 529 ------ math/vc/src/cpuid.cpp | 623 ------- math/vc/src/support.cpp | 119 -- math/vc/src/trigonometric.cpp | 463 ----- math/vc/tests/CMakeLists.txt | 326 ---- math/vc/tests/Makefile | 160 -- math/vc/tests/arithmetics.cpp | 583 ------- math/vc/tests/casts.cpp | 144 -- math/vc/tests/const.h | 66 - math/vc/tests/convert-sincos-reference.cpp | 126 -- math/vc/tests/deinterleave.cpp | 421 ----- math/vc/tests/download.cmake | 1 - math/vc/tests/expandandmerge.cpp | 88 - math/vc/tests/gather.cpp | 212 --- math/vc/tests/implicit_type_conversion.cpp | 275 --- .../implicit_type_conversion_failures.cpp | 13 - math/vc/tests/linkTest0.cpp | 15 - math/vc/tests/linkTest1.cpp | 26 - math/vc/tests/linkTestLib0.cpp | 17 - math/vc/tests/linkTestLib1.cpp | 17 - math/vc/tests/linkTestLib2.cpp | 10 - math/vc/tests/linkTestLib3.cpp | 10 - math/vc/tests/load.cpp | 270 --- math/vc/tests/mask.cpp | 312 ---- math/vc/tests/math.cpp | 957 ---------- math/vc/tests/memory.cpp | 314 ---- math/vc/tests/scalaraccess.cpp | 146 -- math/vc/tests/scatter.cpp | 173 -- math/vc/tests/sse_blend.cpp | 125 -- math/vc/tests/stlcontainer.cpp | 75 - math/vc/tests/store.cpp | 164 -- math/vc/tests/supportfunctions.cpp | 68 - math/vc/tests/swizzles.cpp | 145 -- math/vc/tests/ulp.h | 96 - math/vc/tests/unittest.h | 681 -------- math/vc/tests/utils.cpp | 401 ----- math/vc/tests/vectormemoryhelper.h | 41 - 178 files changed, 2 insertions(+), 38143 deletions(-) delete mode 100644 math/vc/CMakeLists.txt delete mode 100644 math/vc/Module.mk 
delete mode 100644 math/vc/cmake/AddCompilerFlag.cmake delete mode 100644 math/vc/cmake/AddTargetProperty.cmake delete mode 100644 math/vc/cmake/CheckCCompilerFlag.cmake delete mode 100644 math/vc/cmake/CheckCXXCompilerFlag.cmake delete mode 100644 math/vc/cmake/OptimizeForArchitecture.cmake delete mode 100644 math/vc/cmake/UserWarning.cmake delete mode 100644 math/vc/cmake/VcMacros.cmake delete mode 100644 math/vc/examples/CMakeLists.txt delete mode 100644 math/vc/examples/buddhabrot/CMakeLists.txt delete mode 100644 math/vc/examples/buddhabrot/main.cpp delete mode 100644 math/vc/examples/buddhabrot/main.h delete mode 100644 math/vc/examples/finitediff/CMakeLists.txt delete mode 100644 math/vc/examples/finitediff/main.cpp delete mode 100644 math/vc/examples/fit/GaussFunction.h delete mode 100644 math/vc/examples/fit/Makefile delete mode 100644 math/vc/examples/fit/MinimizerTypes.h delete mode 100644 math/vc/examples/fit/testFitPerf.cxx delete mode 100644 math/vc/examples/genvector/Makefile delete mode 100644 math/vc/examples/genvector/vectorOperation.cxx delete mode 100644 math/vc/examples/mandelbrot/CMakeLists.txt delete mode 100644 math/vc/examples/mandelbrot/main.cpp delete mode 100644 math/vc/examples/mandelbrot/main.h delete mode 100644 math/vc/examples/mandelbrot/mandel.cpp delete mode 100644 math/vc/examples/mandelbrot/mandel.h delete mode 100644 math/vc/examples/matrix/CMakeLists.txt delete mode 100644 math/vc/examples/matrix/main.cpp delete mode 100644 math/vc/examples/polarcoord/CMakeLists.txt delete mode 100644 math/vc/examples/polarcoord/main.cpp delete mode 100644 math/vc/examples/smatrix/Makefile delete mode 100644 math/vc/examples/smatrix/TestTimer.h delete mode 100644 math/vc/examples/smatrix/matrix_op_vec.h delete mode 100644 math/vc/examples/smatrix/matrix_util.h delete mode 100644 math/vc/examples/smatrix/testKalman.cxx delete mode 100644 math/vc/examples/smatrix/testOperations.cxx delete mode 100644 math/vc/examples/tsc.h delete mode 100644 
math/vc/include/Vc/Allocator delete mode 100644 math/vc/include/Vc/IO delete mode 100644 math/vc/include/Vc/Memory delete mode 100644 math/vc/include/Vc/Utils delete mode 100644 math/vc/include/Vc/Vc delete mode 100644 math/vc/include/Vc/avx/casts.h delete mode 100644 math/vc/include/Vc/avx/const.h delete mode 100644 math/vc/include/Vc/avx/const_data.h delete mode 100644 math/vc/include/Vc/avx/debug.h delete mode 100644 math/vc/include/Vc/avx/deinterleave.tcc delete mode 100644 math/vc/include/Vc/avx/forceToRegisters.tcc delete mode 100644 math/vc/include/Vc/avx/helperimpl.h delete mode 100644 math/vc/include/Vc/avx/helperimpl.tcc delete mode 100644 math/vc/include/Vc/avx/interleavedmemory.tcc delete mode 100644 math/vc/include/Vc/avx/intrinsics.h delete mode 100644 math/vc/include/Vc/avx/limits.h delete mode 100644 math/vc/include/Vc/avx/macros.h delete mode 100644 math/vc/include/Vc/avx/mask.h delete mode 100644 math/vc/include/Vc/avx/mask.tcc delete mode 100644 math/vc/include/Vc/avx/math.h delete mode 100644 math/vc/include/Vc/avx/prefetches.tcc delete mode 100644 math/vc/include/Vc/avx/shuffle.h delete mode 100644 math/vc/include/Vc/avx/sorthelper.h delete mode 100644 math/vc/include/Vc/avx/types.h delete mode 100644 math/vc/include/Vc/avx/undomacros.h delete mode 100644 math/vc/include/Vc/avx/vector.h delete mode 100644 math/vc/include/Vc/avx/vector.tcc delete mode 100644 math/vc/include/Vc/avx/vectorhelper.h delete mode 100644 math/vc/include/Vc/avx/vectorhelper.tcc delete mode 100644 math/vc/include/Vc/avx/writemaskedvector.h delete mode 100644 math/vc/include/Vc/avx/writemaskedvector.tcc delete mode 100644 math/vc/include/Vc/common/aliasingentryhelper.h delete mode 100644 math/vc/include/Vc/common/bitscanintrinsics.h delete mode 100644 math/vc/include/Vc/common/deinterleave.h delete mode 100644 math/vc/include/Vc/common/exponential.h delete mode 100644 math/vc/include/Vc/common/fix_clang_emmintrin.h delete mode 100644 math/vc/include/Vc/common/iif.h delete 
mode 100644 math/vc/include/Vc/common/interleavedmemory.h delete mode 100644 math/vc/include/Vc/common/logarithm.h delete mode 100644 math/vc/include/Vc/common/macros.h delete mode 100644 math/vc/include/Vc/common/memory.h delete mode 100644 math/vc/include/Vc/common/memorybase.h delete mode 100644 math/vc/include/Vc/common/memoryfwd.h delete mode 100644 math/vc/include/Vc/common/operand.h delete mode 100644 math/vc/include/Vc/common/operators.h delete mode 100644 math/vc/include/Vc/common/storage.h delete mode 100644 math/vc/include/Vc/common/support.h delete mode 100644 math/vc/include/Vc/common/trigonometric.h delete mode 100644 math/vc/include/Vc/common/types.h delete mode 100644 math/vc/include/Vc/common/undomacros.h delete mode 100644 math/vc/include/Vc/common/vectortuple.h delete mode 100644 math/vc/include/Vc/common/windows_fix_intrin.h delete mode 100644 math/vc/include/Vc/cpuid.h delete mode 100644 math/vc/include/Vc/double_v delete mode 100644 math/vc/include/Vc/float_v delete mode 100644 math/vc/include/Vc/global.h delete mode 100644 math/vc/include/Vc/int_v delete mode 100644 math/vc/include/Vc/internal/namespace.h delete mode 100644 math/vc/include/Vc/limits delete mode 100644 math/vc/include/Vc/scalar/helperimpl.h delete mode 100644 math/vc/include/Vc/scalar/helperimpl.tcc delete mode 100644 math/vc/include/Vc/scalar/interleavedmemory.tcc delete mode 100644 math/vc/include/Vc/scalar/limits.h delete mode 100644 math/vc/include/Vc/scalar/macros.h delete mode 100644 math/vc/include/Vc/scalar/mask.h delete mode 100644 math/vc/include/Vc/scalar/math.h delete mode 100644 math/vc/include/Vc/scalar/types.h delete mode 100644 math/vc/include/Vc/scalar/undomacros.h delete mode 100644 math/vc/include/Vc/scalar/vector.h delete mode 100644 math/vc/include/Vc/scalar/vector.tcc delete mode 100644 math/vc/include/Vc/scalar/writemaskedvector.h delete mode 100644 math/vc/include/Vc/sfloat_v delete mode 100644 math/vc/include/Vc/short_v delete mode 100644 
math/vc/include/Vc/sse/casts.h delete mode 100644 math/vc/include/Vc/sse/const.h delete mode 100644 math/vc/include/Vc/sse/const_data.h delete mode 100644 math/vc/include/Vc/sse/debug.h delete mode 100644 math/vc/include/Vc/sse/deinterleave.tcc delete mode 100644 math/vc/include/Vc/sse/forceToRegisters.tcc delete mode 100644 math/vc/include/Vc/sse/helperimpl.h delete mode 100644 math/vc/include/Vc/sse/helperimpl.tcc delete mode 100644 math/vc/include/Vc/sse/interleavedmemory.tcc delete mode 100644 math/vc/include/Vc/sse/intrinsics.h delete mode 100644 math/vc/include/Vc/sse/limits.h delete mode 100644 math/vc/include/Vc/sse/macros.h delete mode 100644 math/vc/include/Vc/sse/mask.h delete mode 100644 math/vc/include/Vc/sse/math.h delete mode 100644 math/vc/include/Vc/sse/prefetches.tcc delete mode 100644 math/vc/include/Vc/sse/shuffle.h delete mode 100644 math/vc/include/Vc/sse/types.h delete mode 100644 math/vc/include/Vc/sse/undomacros.h delete mode 100644 math/vc/include/Vc/sse/vector.h delete mode 100644 math/vc/include/Vc/sse/vector.tcc delete mode 100644 math/vc/include/Vc/sse/vectorhelper.h delete mode 100644 math/vc/include/Vc/sse/vectorhelper.tcc delete mode 100644 math/vc/include/Vc/support.h delete mode 100644 math/vc/include/Vc/uint_v delete mode 100644 math/vc/include/Vc/ushort_v delete mode 100644 math/vc/include/Vc/vector.h delete mode 100644 math/vc/include/Vc/version.h delete mode 100644 math/vc/makeTest.py delete mode 100644 math/vc/src/avx_sorthelper.cpp delete mode 100644 math/vc/src/const.cpp delete mode 100644 math/vc/src/cpuid.cpp delete mode 100644 math/vc/src/support.cpp delete mode 100644 math/vc/src/trigonometric.cpp delete mode 100644 math/vc/tests/CMakeLists.txt delete mode 100644 math/vc/tests/Makefile delete mode 100644 math/vc/tests/arithmetics.cpp delete mode 100644 math/vc/tests/casts.cpp delete mode 100644 math/vc/tests/const.h delete mode 100644 math/vc/tests/convert-sincos-reference.cpp delete mode 100644 
math/vc/tests/deinterleave.cpp delete mode 100644 math/vc/tests/download.cmake delete mode 100644 math/vc/tests/expandandmerge.cpp delete mode 100644 math/vc/tests/gather.cpp delete mode 100644 math/vc/tests/implicit_type_conversion.cpp delete mode 100644 math/vc/tests/implicit_type_conversion_failures.cpp delete mode 100644 math/vc/tests/linkTest0.cpp delete mode 100644 math/vc/tests/linkTest1.cpp delete mode 100644 math/vc/tests/linkTestLib0.cpp delete mode 100644 math/vc/tests/linkTestLib1.cpp delete mode 100644 math/vc/tests/linkTestLib2.cpp delete mode 100644 math/vc/tests/linkTestLib3.cpp delete mode 100644 math/vc/tests/load.cpp delete mode 100644 math/vc/tests/mask.cpp delete mode 100644 math/vc/tests/math.cpp delete mode 100644 math/vc/tests/memory.cpp delete mode 100644 math/vc/tests/scalaraccess.cpp delete mode 100644 math/vc/tests/scatter.cpp delete mode 100644 math/vc/tests/sse_blend.cpp delete mode 100644 math/vc/tests/stlcontainer.cpp delete mode 100644 math/vc/tests/store.cpp delete mode 100644 math/vc/tests/supportfunctions.cpp delete mode 100644 math/vc/tests/swizzles.cpp delete mode 100644 math/vc/tests/ulp.h delete mode 100644 math/vc/tests/unittest.h delete mode 100644 math/vc/tests/utils.cpp delete mode 100644 math/vc/tests/vectormemoryhelper.h diff --git a/README/ReleaseNotes/v608/index.md b/README/ReleaseNotes/v608/index.md index a4b479e79a8ef..fa48eaa3bb0c3 100644 --- a/README/ReleaseNotes/v608/index.md +++ b/README/ReleaseNotes/v608/index.md @@ -140,6 +140,8 @@ We added a cache specifically for the fast option of the TTreeCloner to signific ## Math Libraries * Improve thread safety of TMinuit constructor [ROOT-8217] +* Vc has been removed from the ROOT sources. If the option 'vc' is enabled, the package will be searched for (by default), + alternatively the source tarfile can be downloaded and built with the option 'builtin_vc'. 
## RooFit Libraries diff --git a/math/CMakeLists.txt b/math/CMakeLists.txt index 06781363de8a9..e480fc6be0d68 100644 --- a/math/CMakeLists.txt +++ b/math/CMakeLists.txt @@ -1,11 +1,3 @@ -#if(vc) -# if(testing) -# set(vc_tests ON) -# endif() -# set(TARGET_ARCHITECTURE auto CACHE STRING "") -# add_subdirectory(vc) -# mark_as_advanced(TARGET_ARCHITECTURE TEST_OPERATOR_FAILURES VC_IMPL) -#endif() add_subdirectory(mathcore) if(mathmore) add_subdirectory(mathmore) diff --git a/math/vc/CMakeLists.txt b/math/vc/CMakeLists.txt deleted file mode 100644 index daff6a942b5ad..0000000000000 --- a/math/vc/CMakeLists.txt +++ /dev/null @@ -1,68 +0,0 @@ -set(Vc_INSIDE_ROOT TRUE) - -if (CMAKE_BUILD_TYPE STREQUAL Optimized) - set(VC_EXTRA_CONSERVATIVE_FLAGS "-fno-fast-math") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VC_EXTRA_CONSERVATIVE_FLAGS}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${VC_EXTRA_CONSERVATIVE_FLAGS}") -endif() - -include(cmake/VcMacros.cmake) -include(cmake/AddTargetProperty.cmake) -vc_determine_compiler() -vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS) -add_definitions("${Vc_DEFINITIONS}") -include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include) - -set(libvc_compile_flags "-DVC_COMPILE_LIB") -vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS ${libvc_compile_flags} - ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX SSE+XOP+FMA4 AVX+XOP+FMA4) -set(_srcs src/const.cpp src/cpuid.cpp src/support.cpp ${_objs}) -if(USE_AVX) - list(APPEND _srcs src/avx_sorthelper.cpp) -else() - if(NOT Vc_AVX_INTRINSICS_BROKEN) - # we'd still like to have avx_sorthelper.cpp built in, but that requires compilation with -mavx (or a comparable flag) - foreach(_flag "-xAVX" "-mavx" "/arch:AVX") - AddCompilerFlag("${_flag}" CXX_RESULT _flag_works) - if(_flag_works) - if(_flag STREQUAL "-xAVX") - set(_flag "${_flag} -diag-disable 10121") # disable the warning "overriding -xSSE4.2 with -xAVX" - endif() - list(APPEND _srcs src/avx_sorthelper.cpp) - 
set_source_files_properties(src/avx_sorthelper.cpp PROPERTIES COMPILE_FLAGS "${_flag}") - break() - endif() - endforeach() - endif() -endif() -add_library(Vc STATIC ${_srcs}) -add_target_property(Vc COMPILE_FLAGS ${libvc_compile_flags}) - -set_property(GLOBAL APPEND PROPERTY ROOT_EXPORTED_TARGETS Vc) - -if(Vc_COMPILER_IS_INTEL) - # per default icc is not IEEE compliant, but we need that for verification - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fp-model source") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fp-model source") -endif() - -install(TARGETS Vc RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - COMPONENT libraries) -ROOT_INSTALL_HEADERS(include/) - -ROOT_BUILD_OPTION(vc_examples OFF "Build Vc examples") -ROOT_BUILD_OPTION(vc_tests OFF "Build Vc unit tests") -if(vc_examples OR vc_tests) - add_custom_target(other VERBATIM) - add_custom_target(Scalar COMMENT "build Vc Scalar code" VERBATIM) - add_custom_target(SSE COMMENT "build Vc SSE code" VERBATIM) - add_custom_target(AVX COMMENT "build Vc AVX code" VERBATIM) -endif() -if(vc_examples) - add_subdirectory(examples) -endif() -if(vc_tests) - add_subdirectory(tests) -endif() diff --git a/math/vc/Module.mk b/math/vc/Module.mk deleted file mode 100644 index 8f84dda96f04e..0000000000000 --- a/math/vc/Module.mk +++ /dev/null @@ -1,85 +0,0 @@ -# Module.mk for Vc module -# Generated on Tue Apr 3 17:31:31 CEST 2012 by Vc/makeRootRelease.sh - -MODNAME := vc -#VCVERS := vc-0.6.70-root - -MODDIR := $(ROOT_SRCDIR)/math/$(MODNAME) -MODDIRS := $(MODDIR)/src -MODDIRI := $(MODDIR)/include -VCBUILDDIR := $(call stripsrc,$(MODDIRS)) - -ifeq ($(PLATFORM),win32) -VCLIBVC := $(LPATH)/libVc.lib -else -VCLIBVC := $(LPATH)/libVc.a -endif - -VCH := $(wildcard $(MODDIRI)/Vc/* $(MODDIRI)/Vc/*/*) -# Above includes Vc/scalar which is a directory; filter those out. 
-# Problem: $(dir $(VCH)) gives Vc/scalar/ thus patsubst %/, % -VCH := $(filter-out $(sort $(patsubst %/,%,$(dir $(VCH)))),$(VCH)) - -ALLHDRS += $(patsubst $(MODDIRI)/%,include/%,$(VCH)) -ALLLIBS += $(VCLIBVC) - -##### local rules ##### -.PHONY: all-$(MODNAME) clean-$(MODNAME) distclean-$(MODNAME) - -include/Vc/%: $(MODDIRI)/Vc/% - +@[ -d $(dir $@) ] || mkdir -p $(dir $@) - cp $< $@ - -escapeflag = $(subst ~,_,$(subst /,_,$(subst :,_,$(subst =,_,$(subst .,_,$(subst -,_,$(1))))))) - -VCFLAGS0 := -DVC_COMPILE_LIB $(filter-out -Wall,$(filter-out -x%,$(filter-out -m%,$(filter-out /arch:%,$(OPT) $(CXXFLAGS))))) -VCFLAGS := $(VCFLAGS0) $(VCFLAGS) -VCLIBVCOBJ := const.cpp cpuid.cpp support.cpp trigonometric.cpp \ - $(foreach flag,$(call escapeflag,$(SIMDCXXFLAGS)),trigonometric_$(flag).cpp) -ifdef AVXCXXFLAG -VCLIBVCOBJ += avx_sorthelper.cpp -endif -VCLIBVCOBJ := $(addprefix $(VCBUILDDIR)/,$(addsuffix .o,$(VCLIBVCOBJ))) - -$(VCLIBVC): $(VCLIBVCOBJ) - $(MAKEDIR) - @echo "Create static library $@" - @rm -f $@ - @ar r $@ $? - @ranlib $@ - -$(VCBUILDDIR)/avx_%.cpp.o: $(MODDIRS)/avx_%.cpp - $(MAKEDIR) - $(CXX) $(VCFLAGS) $(AVXCXXFLAG) -c $(CXXOUT)$@ $< - -$(VCBUILDDIR)/trigonometric_%.cpp.o: $(MODDIRS)/trigonometric.cpp - $(MAKEDIR) - @for flag in $(SIMDCXXFLAGS); do \ - flag=`echo $$flag|tr '~' ' '`; \ - if test "$*" = "`echo "$$flag"|tr ' /:=.-' '______'`"; then \ - echo "$(CXX) $(VCFLAGS) $$flag -c $(CXXOUT)$@ $<"; \ - $(CXX) $(VCFLAGS) $$flag -c $(CXXOUT)$@ $<; \ - break; \ - fi; \ - done - -$(VCBUILDDIR)/%.cpp.o: $(MODDIRS)/%.cpp - $(MAKEDIR) - $(CXX) $(VCFLAGS) -c $(CXXOUT)$@ $< - - -all-$(MODNAME): $(VCLIBVC) - -clean-$(MODNAME): - @rm -f $(VCLIBVC) $(VCLIBVCOBJ) - -clean:: clean-$(MODNAME) - -distclean-$(MODNAME): clean-$(MODNAME) - @rm -rf include/Vc - -distclean:: distclean-$(MODNAME) - -# FIXME: Temporarily until we understand where the errors come from. 
-$(VCLIBVCOBJ): CXXFLAGS := $(filter-out -Xclang -fmodules -Xclang -fmodules-cache-path=$(ROOTSYS)/pcm/, $(CXXFLAGS)) -$(VCLIBVCOBJ): VCFLAGS := $(filter-out -Xclang -fmodules -Xclang -fmodules-cache-path=$(ROOTSYS)/pcm/, $(VCFLAGS)) diff --git a/math/vc/cmake/AddCompilerFlag.cmake b/math/vc/cmake/AddCompilerFlag.cmake deleted file mode 100644 index f3e5fd0ee7aea..0000000000000 --- a/math/vc/cmake/AddCompilerFlag.cmake +++ /dev/null @@ -1,118 +0,0 @@ -# - Add a given compiler flag to flags variables. -# AddCompilerFlag( []) -# or -# AddCompilerFlag( [C_FLAGS ] [CXX_FLAGS ] [C_RESULT ] -# [CXX_RESULT ]) - -#============================================================================= -# Copyright 2010-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#============================================================================= - -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/CheckCCompilerFlag.cmake") -include("${_currentDir}/CheckCXXCompilerFlag.cmake") - -macro(AddCompilerFlag _flag) - string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") - - set(_c_flags "CMAKE_C_FLAGS") - set(_cxx_flags "CMAKE_CXX_FLAGS") - set(_c_result tmp) - set(_cxx_result tmp) - if(${ARGC} EQUAL 2) - message(WARNING "Deprecated use of the AddCompilerFlag macro.") - unset(_c_result) - set(_cxx_result ${ARGV1}) - elseif(${ARGC} GREATER 2) - set(state 0) - unset(_c_flags) - unset(_cxx_flags) - unset(_c_result) - unset(_cxx_result) - foreach(_arg ${ARGN}) - if(_arg STREQUAL "C_FLAGS") - set(state 1) - if(NOT DEFINED _c_result) - set(_c_result tmp) - endif() - elseif(_arg STREQUAL "CXX_FLAGS") - set(state 2) - if(NOT DEFINED _cxx_result) - set(_cxx_result tmp) - endif() - elseif(_arg STREQUAL "C_RESULT") - set(state 3) - elseif(_arg STREQUAL "CXX_RESULT") - set(state 4) - elseif(state EQUAL 1) - set(_c_flags "${_arg}") - elseif(state EQUAL 2) - set(_cxx_flags "${_arg}") - elseif(state EQUAL 3) - set(_c_result "${_arg}") - elseif(state EQUAL 4) - set(_cxx_result "${_arg}") - else() - message(FATAL_ERROR "Syntax error for AddCompilerFlag") - endif() - endforeach() - endif() - - if("${_flag}" STREQUAL "-mfma") - # Compiling with FMA3 support may fail only at the assembler 
level. - # In that case we need to have such an instruction in the test code - set(_code "#include - __m128 foo(__m128 x) { return _mm_fmadd_ps(x, x, x); } - int main() { return 0; }") - elseif("${_flag}" STREQUAL "-stdlib=libc++") - # Compiling with libc++ not only requires a compiler that understands it, but also - # the libc++ headers itself - set(_code "#include - int main() { return 0; }") - else() - set(_code "int main() { return 0; }") - endif() - - if(DEFINED _c_result) - check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_code}") - set(${_c_result} ${check_c_compiler_flag_${_flag_esc}}) - endif() - if(DEFINED _cxx_result) - check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_code}") - set(${_cxx_result} ${check_cxx_compiler_flag_${_flag_esc}}) - endif() - - if(check_c_compiler_flag_${_flag_esc} AND DEFINED _c_flags) - set(${_c_flags} "${${_c_flags}} ${_flag}") - endif() - if(check_cxx_compiler_flag_${_flag_esc} AND DEFINED _cxx_flags) - set(${_cxx_flags} "${${_cxx_flags}} ${_flag}") - endif() -endmacro(AddCompilerFlag) diff --git a/math/vc/cmake/AddTargetProperty.cmake b/math/vc/cmake/AddTargetProperty.cmake deleted file mode 100644 index c410135ea897a..0000000000000 --- a/math/vc/cmake/AddTargetProperty.cmake +++ /dev/null @@ -1,39 +0,0 @@ -#============================================================================= -# Copyright 2010-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#============================================================================= - -macro(add_target_property _target _prop _value) - get_target_property(_oldprop "${_target}" ${_prop}) - if(NOT _oldprop) - set_target_properties("${_target}" PROPERTIES ${_prop} "${_value}") - else(NOT _oldprop) - set_target_properties("${_target}" PROPERTIES ${_prop} "${_oldprop} ${_value}") - endif(NOT _oldprop) -endmacro(add_target_property) diff --git a/math/vc/cmake/CheckCCompilerFlag.cmake b/math/vc/cmake/CheckCCompilerFlag.cmake deleted file mode 100644 index a4c7b5510db30..0000000000000 --- a/math/vc/cmake/CheckCCompilerFlag.cmake +++ /dev/null @@ -1,69 +0,0 @@ -# - Check whether the C compiler supports a given flag. -# CHECK_C_COMPILER_FLAG( ) -# - the compiler flag -# - variable to store the result -# This internally calls the check_c_source_compiles macro. -# See help for CheckCSourceCompiles for a listing of variables -# that can modify the build. 
- -#============================================================================= -# Copyright 2006-2009 Kitware, Inc. -# Copyright 2006 Alexander Neundorf -# Copyright 2011-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#============================================================================= - -INCLUDE(CheckCSourceCompiles) - -MACRO (CHECK_C_COMPILER_FLAG _FLAG _RESULT) - SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") - SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") - if(${ARGC} GREATER 2) - SET(TEST_SOURCE "${ARGV2}") - else() - SET(TEST_SOURCE "int main() { return 0;}") - endif() - CHECK_C_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} - # Some compilers do not fail with a bad flag - FAIL_REGEX "error: bad value (.*) for .* switch" # GNU - FAIL_REGEX "argument unused during compilation" # clang - FAIL_REGEX "is valid for .* but not for C" # GNU - FAIL_REGEX "unrecognized .*option" # GNU - FAIL_REGEX "ignored for target" # GNU - FAIL_REGEX "ignoring unknown option" # MSVC - FAIL_REGEX "[Uu]nknown option" # HP - FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro - FAIL_REGEX "command option .* is not recognized" # XL - FAIL_REGEX "WARNING: unknown flag:" # Open64 - FAIL_REGEX " #10159: " # ICC - FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' - ) - SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") -ENDMACRO (CHECK_C_COMPILER_FLAG) - diff --git a/math/vc/cmake/CheckCXXCompilerFlag.cmake b/math/vc/cmake/CheckCXXCompilerFlag.cmake deleted file mode 100644 index c753a3a236078..0000000000000 --- a/math/vc/cmake/CheckCXXCompilerFlag.cmake +++ /dev/null @@ -1,69 +0,0 @@ -# - Check whether the CXX compiler supports a given flag. -# CHECK_CXX_COMPILER_FLAG( ) -# - the compiler flag -# - variable to store the result -# This internally calls the check_cxx_source_compiles macro. See help -# for CheckCXXSourceCompiles for a listing of variables that can -# modify the build. - -#============================================================================= -# Copyright 2006-2009 Kitware, Inc. 
-# Copyright 2006 Alexander Neundorf -# Copyright 2011-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#============================================================================= - -INCLUDE(CheckCXXSourceCompiles) - -MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) - SET(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}") - SET(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}") - if(${ARGC} GREATER 2) - SET(TEST_SOURCE "${ARGV2}") - else() - SET(TEST_SOURCE "int main() { return 0;}") - endif() - CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} - # Some compilers do not fail with a bad flag - FAIL_REGEX "error: bad value (.*) for .* switch" # GNU - FAIL_REGEX "argument unused during compilation" # clang - FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU - FAIL_REGEX "unrecognized .*option" # GNU - FAIL_REGEX "ignored for target" # GNU - FAIL_REGEX "ignoring unknown option" # MSVC - FAIL_REGEX "[Uu]nknown option" # HP - FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro - FAIL_REGEX "command option .* is not recognized" # XL - FAIL_REGEX "WARNING: unknown flag:" # Open64 - FAIL_REGEX " #10159: " # ICC - FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' - ) - SET (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}") -ENDMACRO (CHECK_CXX_COMPILER_FLAG) - diff --git a/math/vc/cmake/OptimizeForArchitecture.cmake b/math/vc/cmake/OptimizeForArchitecture.cmake deleted file mode 100644 index e7e6d3af02f64..0000000000000 --- a/math/vc/cmake/OptimizeForArchitecture.cmake +++ /dev/null @@ -1,466 +0,0 @@ -#============================================================================= -# Copyright 2010-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#============================================================================= - -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/AddCompilerFlag.cmake") -include(CheckIncludeFile) - -macro(_my_find _list _value _ret) - list(FIND ${_list} "${_value}" _found) - if(_found EQUAL -1) - set(${_ret} FALSE) - else(_found EQUAL -1) - set(${_ret} TRUE) - endif(_found EQUAL -1) -endmacro(_my_find) - -macro(AutodetectHostArchitecture) - set(TARGET_ARCHITECTURE "generic") - set(Vc_ARCHITECTURE_FLAGS) - set(_vendor_id) - set(_cpu_family) - set(_cpu_model) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") - string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") - string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") - string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor" OUTPUT_VARIABLE _vendor_id) - exec_program("/usr/sbin/sysctl -n machdep.cpu.model" OUTPUT_VARIABLE _cpu_model) - exec_program("/usr/sbin/sysctl -n machdep.cpu.family" OUTPUT_VARIABLE _cpu_family) - exec_program("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE _cpu_flags) - string(TOLOWER "${_cpu_flags}" _cpu_flags) - string(REPLACE "." 
"_" _cpu_flags "${_cpu_flags}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) - get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) - mark_as_advanced(_vendor_id _cpu_id) - string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") - string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") - endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") - if(_vendor_id STREQUAL "GenuineIntel") - if(_cpu_family EQUAL 6) - # Any recent Intel CPU except NetBurst - if(_cpu_model EQUAL 62) - set(TARGET_ARCHITECTURE "ivy-bridge") - elseif(_cpu_model EQUAL 58) - set(TARGET_ARCHITECTURE "ivy-bridge") - elseif(_cpu_model EQUAL 47) # Xeon E7 4860 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 46) # Xeon 7500 series - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 45) # Xeon TNG - set(TARGET_ARCHITECTURE "sandy-bridge") - elseif(_cpu_model EQUAL 44) # Xeon 5600 series - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 42) # Core TNG - set(TARGET_ARCHITECTURE "sandy-bridge") - elseif(_cpu_model EQUAL 37) # Core i7/i5/i3 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 31) # Core i7/i5 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 30) # Core i7/i5 - set(TARGET_ARCHITECTURE "westmere") - elseif(_cpu_model EQUAL 29) - set(TARGET_ARCHITECTURE "penryn") - elseif(_cpu_model EQUAL 28) - set(TARGET_ARCHITECTURE "atom") - elseif(_cpu_model EQUAL 26) - set(TARGET_ARCHITECTURE "nehalem") - elseif(_cpu_model EQUAL 23) - set(TARGET_ARCHITECTURE "penryn") - elseif(_cpu_model EQUAL 15) - set(TARGET_ARCHITECTURE "merom") - elseif(_cpu_model EQUAL 14) - set(TARGET_ARCHITECTURE "core") - elseif(_cpu_model LESS 14) - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is 
not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") - set(TARGET_ARCHITECTURE "generic") - else() - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") - set(TARGET_ARCHITECTURE "merom") - endif() - elseif(_cpu_family EQUAL 7) # Itanium (not supported) - message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") - elseif(_cpu_family EQUAL 15) # NetBurst - list(APPEND _available_vector_units_list "sse" "sse2") - if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - endif(_cpu_model GREATER 2) - endif(_cpu_family EQUAL 6) - elseif(_vendor_id STREQUAL "AuthenticAMD") - if(_cpu_family EQUAL 22) # 16h - set(TARGET_ARCHITECTURE "AMD 16h") - elseif(_cpu_family EQUAL 21) # 15h - if(_cpu_model LESS 2) - set(TARGET_ARCHITECTURE "bulldozer") - else() - set(TARGET_ARCHITECTURE "piledriver") - endif() - elseif(_cpu_family EQUAL 20) # 14h - set(TARGET_ARCHITECTURE "AMD 14h") - elseif(_cpu_family EQUAL 18) # 12h - elseif(_cpu_family EQUAL 16) # 10h - set(TARGET_ARCHITECTURE "barcelona") - elseif(_cpu_family EQUAL 15) - set(TARGET_ARCHITECTURE "k8") - if(_cpu_model GREATER 64) # I don't know the right number to put here. This is just a guess from the hardware I have access to - set(TARGET_ARCHITECTURE "k8-sse3") - endif(_cpu_model GREATER 64) - endif() - endif(_vendor_id STREQUAL "GenuineIntel") -endmacro() - -macro(OptimizeForArchitecture) - set(TARGET_ARCHITECTURE "none" CACHE STRING "CPU architecture to optimize for. 
Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"AMD 14h\", \"AMD 16h\".") - set(_force) - if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") - message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") - set(_force FORCE) - endif() - set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) - mark_as_advanced(_last_target_arch) - string(TOLOWER "${TARGET_ARCHITECTURE}" TARGET_ARCHITECTURE) - - set(_march_flag_list) - set(_available_vector_units_list) - - if(TARGET_ARCHITECTURE STREQUAL "auto") - AutodetectHostArchitecture() - message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}") - endif(TARGET_ARCHITECTURE STREQUAL "auto") - - if(TARGET_ARCHITECTURE STREQUAL "core") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "merom") - list(APPEND _march_flag_list "merom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "penryn") - list(APPEND _march_flag_list "penryn") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") - if(_cpu_flags MATCHES "sse4_1") - message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)") - list(APPEND _available_vector_units_list "sse4.1") - else() - message(STATUS 
"SSE4.1: disabled (auto-detected from this computer's CPU flags)") - endif() - elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") - list(APPEND _march_flag_list "nehalem") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") - elseif(TARGET_ARCHITECTURE STREQUAL "westmere") - list(APPEND _march_flag_list "westmere") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") - elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge") - list(APPEND _march_flag_list "core-avx-i") - list(APPEND _march_flag_list "corei7-avx") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx" "rdrnd" "f16c") - elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge") - list(APPEND _march_flag_list "sandybridge") - list(APPEND _march_flag_list "corei7-avx") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") - elseif(TARGET_ARCHITECTURE STREQUAL "atom") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "k8") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2") - elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") - list(APPEND _march_flag_list "k8-sse3") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "AMD 16h") - list(APPEND _march_flag_list "btver2") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") - 
elseif(TARGET_ARCHITECTURE STREQUAL "AMD 14h") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c") - elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # 
add this clause to remove it from the else clause - else(TARGET_ARCHITECTURE STREQUAL "core") - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif(TARGET_ARCHITECTURE STREQUAL "core") - - if(NOT TARGET_ARCHITECTURE STREQUAL "none") - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - _my_find(_available_vector_units_list "sse2" SSE2_FOUND) - _my_find(_available_vector_units_list "sse3" SSE3_FOUND) - _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND) - _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND) - _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND) - _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND) - if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN) - UserWarning("AVX disabled per default because of old/broken compiler") - set(AVX_FOUND false) - set(XOP_FOUND false) - set(FMA4_FOUND false) - else() - _my_find(_available_vector_units_list "avx" AVX_FOUND) - if(DEFINED Vc_FMA4_INTRINSICS_BROKEN AND Vc_FMA4_INTRINSICS_BROKEN) - UserWarning("FMA4 disabled per default because of old/broken compiler") - set(FMA4_FOUND false) - else() - _my_find(_available_vector_units_list "fma4" FMA4_FOUND) - endif() - if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN) - UserWarning("XOP disabled per default because of old/broken compiler") - set(XOP_FOUND false) - else() - _my_find(_available_vector_units_list "xop" XOP_FOUND) - endif() - endif() - set(USE_SSE2 ${SSE2_FOUND} CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force}) - set(USE_SSE3 ${SSE3_FOUND} CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSSE3 ${SSSE3_FOUND} CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. 
If SSE4.1 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." ${_force}) - set(USE_SSE4a ${SSE4a_FOUND} CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force}) - set(USE_AVX ${AVX_FOUND} CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force}) - set(USE_XOP ${XOP_FOUND} CACHE BOOL "Use XOP." ${_force}) - set(USE_FMA4 ${FMA4_FOUND} CACHE BOOL "Use FMA4." ${_force}) - mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_XOP USE_FMA4) - if(USE_SSE2) - list(APPEND _enable_vector_unit_list "sse2") - else(USE_SSE2) - list(APPEND _disable_vector_unit_list "sse2") - endif(USE_SSE2) - if(USE_SSE3) - list(APPEND _enable_vector_unit_list "sse3") - else(USE_SSE3) - list(APPEND _disable_vector_unit_list "sse3") - endif(USE_SSE3) - if(USE_SSSE3) - list(APPEND _enable_vector_unit_list "ssse3") - else(USE_SSSE3) - list(APPEND _disable_vector_unit_list "ssse3") - endif(USE_SSSE3) - if(USE_SSE4_1) - list(APPEND _enable_vector_unit_list "sse4.1") - else(USE_SSE4_1) - list(APPEND _disable_vector_unit_list "sse4.1") - endif(USE_SSE4_1) - if(USE_SSE4_2) - list(APPEND _enable_vector_unit_list "sse4.2") - else(USE_SSE4_2) - list(APPEND _disable_vector_unit_list "sse4.2") - endif(USE_SSE4_2) - if(USE_SSE4a) - list(APPEND _enable_vector_unit_list "sse4a") - else(USE_SSE4a) - list(APPEND _disable_vector_unit_list "sse4a") - endif(USE_SSE4a) - if(USE_AVX) - list(APPEND _enable_vector_unit_list "avx") - # we want SSE intrinsics to result in instructions using the VEX prefix. - # Otherwise integer ops (which require the older SSE intrinsics) would - # always have a large penalty. 
- list(APPEND _enable_vector_unit_list "sse2avx") - else(USE_AVX) - list(APPEND _disable_vector_unit_list "avx") - endif(USE_AVX) - if(USE_XOP) - list(APPEND _enable_vector_unit_list "xop") - else() - list(APPEND _disable_vector_unit_list "xop") - endif() - if(USE_FMA4) - list(APPEND _enable_vector_unit_list "fma4") - else() - list(APPEND _disable_vector_unit_list "fma4") - endif() - if(MSVC) - # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) - # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) - _my_find(_enable_vector_unit_list "avx" _avx) - set(_avx_flag FALSE) - if(_avx) - AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _avx_flag) - endif() - if(NOT _avx_flag) - _my_find(_enable_vector_unit_list "sse2" _found) - if(_found) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - endif() - endif() - foreach(_flag ${_enable_vector_unit_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." "_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) - _my_find(_available_vector_units_list "avx2" _found) - if(_found) - AddCompilerFlag("-xCORE-AVX2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - _my_find(_available_vector_units_list "f16c" _found) - if(_found) - AddCompilerFlag("-xCORE-AVX-I" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - _my_find(_available_vector_units_list "avx" _found) - if(_found) - AddCompilerFlag("-xAVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - _my_find(_available_vector_units_list "sse4.2" _found) - if(_found) - AddCompilerFlag("-xSSE4.2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - _my_find(_available_vector_units_list "sse4.1" _found) - if(_found) - AddCompilerFlag("-xSSE4.1" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - _my_find(_available_vector_units_list "ssse3" _found) - if(_found) - AddCompilerFlag("-xSSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - 
else(_found) - _my_find(_available_vector_units_list "sse3" _found) - if(_found) - # If the target host is an AMD machine then we still want to use -xSSE2 because the binary would refuse to run at all otherwise - _my_find(_march_flag_list "barcelona" _found) - if(NOT _found) - _my_find(_march_flag_list "k8-sse3" _found) - endif(NOT _found) - if(_found) - AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - else(_found) - AddCompilerFlag("-xSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - endif(_found) - else(_found) - _my_find(_available_vector_units_list "sse2" _found) - if(_found) - AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - endif(_found) - endif(_found) - endif(_found) - endif(_found) - endif(_found) - endif(_found) - endif(_found) - endif(_found) - else() # not MSVC and not ICC => GCC, Clang, Open64 - foreach(_flag ${_march_flag_list}) - AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-m${_flag}" CXX_RESULT _result) - if(_result) - set(_header FALSE) - if(_flag STREQUAL "sse3") - set(_header "pmmintrin.h") - elseif(_flag STREQUAL "ssse3") - set(_header "tmmintrin.h") - elseif(_flag STREQUAL "sse4.1") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4.2") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4a") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "avx") - set(_header "immintrin.h") - elseif(_flag STREQUAL "fma4") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "xop") - set(_header "x86intrin.h") - endif() - set(_resultVar "HAVE_${_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - if(_header) - CHECK_INCLUDE_FILE("${_header}" ${_resultVar} "-m${_flag}") - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - message(STATUS "disabling ${_useVar} because ${_header} is missing") - set(${_useVar} FALSE) - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endif() - if(NOT _header OR ${_resultVar}) - set(Vc_ARCHITECTURE_FLAGS "${Vc_ARCHITECTURE_FLAGS} -m${_flag}") - endif() - endif() - endforeach(_flag) - foreach(_flag ${_disable_vector_unit_list}) - AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS) - endforeach(_flag) - endif() - endif() -endmacro(OptimizeForArchitecture) diff --git a/math/vc/cmake/UserWarning.cmake b/math/vc/cmake/UserWarning.cmake deleted file mode 100644 index 0be6ad240d7fb..0000000000000 --- a/math/vc/cmake/UserWarning.cmake +++ /dev/null @@ -1,9 +0,0 @@ -macro(UserWarning _msg) - if("$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") - # developer (non-dashboard) build - message(WARNING "${_msg}") - else() - # dashboard build - message(STATUS "${_msg}") - endif() -endmacro() diff --git a/math/vc/cmake/VcMacros.cmake b/math/vc/cmake/VcMacros.cmake deleted file mode 100644 index 16000db82d410..0000000000000 --- a/math/vc/cmake/VcMacros.cmake +++ /dev/null @@ -1,543 +0,0 @@ -# Macros for use with the Vc library. Vc can be found at http://code.compeng.uni-frankfurt.de/projects/vc -# -# The following macros are provided: -# vc_determine_compiler -# vc_set_preferred_compiler_flags -# -#============================================================================= -# Copyright 2009-2013 Matthias Kretz -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#============================================================================= - -cmake_minimum_required(VERSION 2.8.3) - -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include ("${_currentDir}/UserWarning.cmake") -include ("${_currentDir}/AddCompilerFlag.cmake") -include ("${_currentDir}/OptimizeForArchitecture.cmake") - -macro(vc_determine_compiler) - if(NOT DEFINED Vc_COMPILER_IS_INTEL) - execute_process(COMMAND "${CMAKE_CXX_COMPILER}" "--version" OUTPUT_VARIABLE _cxx_compiler_version ERROR_VARIABLE _cxx_compiler_version) - set(Vc_COMPILER_IS_INTEL false) - set(Vc_COMPILER_IS_OPEN64 false) - set(Vc_COMPILER_IS_CLANG false) - set(Vc_COMPILER_IS_MSVC false) - set(Vc_COMPILER_IS_GCC false) - if(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - set(Vc_COMPILER_IS_INTEL true) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_ICC_VERSION) - message(STATUS "Detected Compiler: Intel ${Vc_ICC_VERSION}") - elseif(CMAKE_CXX_COMPILER MATCHES "(opencc|openCC)$") - set(Vc_COMPILER_IS_OPEN64 true) - message(STATUS "Detected Compiler: Open64") - elseif(CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$" OR "${_cxx_compiler_version}" MATCHES "clang") - set(Vc_COMPILER_IS_CLANG true) - exec_program(${CMAKE_CXX_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_CLANG_VERSION) - string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" Vc_CLANG_VERSION "${Vc_CLANG_VERSION}") - message(STATUS "Detected Compiler: Clang ${Vc_CLANG_VERSION}") - elseif(MSVC) - set(Vc_COMPILER_IS_MSVC true) - message(STATUS "Detected Compiler: MSVC ${MSVC_VERSION}") - elseif(CMAKE_COMPILER_IS_GNUCXX) - set(Vc_COMPILER_IS_GCC true) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpversion OUTPUT_VARIABLE Vc_GCC_VERSION) - message(STATUS "Detected Compiler: GCC ${Vc_GCC_VERSION}") - - # some distributions patch their GCC to return nothing or only major and minor version on -dumpversion. - # In that case we must extract the version number from --version. 
- if(NOT Vc_GCC_VERSION OR Vc_GCC_VERSION MATCHES "^[0-9]\\.[0-9]+$") - exec_program(${CMAKE_C_COMPILER} ARGS --version OUTPUT_VARIABLE Vc_GCC_VERSION) - string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" Vc_GCC_VERSION "${Vc_GCC_VERSION}") - message(STATUS "GCC Version from --version: ${Vc_GCC_VERSION}") - endif() - - # some distributions patch their GCC to be API incompatible to what the FSF released. In - # those cases we require a macro to identify the distribution version - find_program(_lsb_release lsb_release) - mark_as_advanced(_lsb_release) - if(_lsb_release) - execute_process(COMMAND ${_lsb_release} -is OUTPUT_VARIABLE _distributor_id OUTPUT_STRIP_TRAILING_WHITESPACE) - execute_process(COMMAND ${_lsb_release} -rs OUTPUT_VARIABLE _distributor_release OUTPUT_STRIP_TRAILING_WHITESPACE) - string(TOUPPER "${_distributor_id}" _distributor_id) - if(_distributor_id STREQUAL "UBUNTU") - execute_process(COMMAND ${CMAKE_C_COMPILER} --version OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _gcc_version) - string(REGEX MATCH "\\(.* ${Vc_GCC_VERSION}-([0-9]+).*\\)" _tmp "${_gcc_version}") - if(_tmp) - set(_patch ${CMAKE_MATCH_1}) - string(REGEX MATCH "^([0-9]+)\\.([0-9]+)$" _tmp "${_distributor_release}") - execute_process(COMMAND printf 0x%x%02x%02x ${CMAKE_MATCH_1} ${CMAKE_MATCH_2} ${_patch} OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE _tmp) - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -D__GNUC_UBUNTU_VERSION__=${_tmp}") - endif() - endif() - endif() - else() - message(WARNING "Untested/-supported Compiler for use with Vc.\nPlease fill out the missing parts in the CMake scripts and submit a patch to http://code.compeng.uni-frankfurt.de/projects/vc") - endif() - endif() -endmacro() - -macro(vc_set_gnu_buildtype_flags) - set(CMAKE_CXX_FLAGS_DEBUG "-g3" CACHE STRING "Flags used by the compiler during debug builds." FORCE) - set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG" CACHE STRING "Flags used by the compiler during release minsize builds." 
FORCE) - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) - set(CMAKE_CXX_FLAGS_RELWITHDEBUG "-O3" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBUG} -g" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." FORCE) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the compiler during debug builds." FORCE) - set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the compiler during release minsize builds." FORCE) - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the compiler during release builds (/MD /Ob1 /Oi /Ot /Oy /Gs will produce slightly less optimized but smaller files)." FORCE) - set(CMAKE_C_FLAGS_RELWITHDEBUG "${CMAKE_CXX_FLAGS_RELWITHDEBUG}" CACHE STRING "Flags used by the compiler during release builds containing runtime checks." FORCE) - set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the compiler during Release with Debug Info builds." 
FORCE) - if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebug") - set(ENABLE_STRICT_ALIASING true CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") - if(NOT ENABLE_STRICT_ALIASING) - AddCompilerFlag(-fno-strict-aliasing) - endif(NOT ENABLE_STRICT_ALIASING) - endif() - mark_as_advanced(CMAKE_CXX_FLAGS_RELWITHDEBUG CMAKE_C_FLAGS_RELWITHDEBUG) -endmacro() - -macro(vc_add_compiler_flag VAR _flag) - AddCompilerFlag("${_flag}" CXX_FLAGS ${VAR}) -endmacro() - -macro(vc_check_assembler) - if(APPLE) - if(NOT Vc_COMPILER_IS_CLANG) - message(WARNING "Apple does not provide an assembler with AVX support. AVX will not be available. Please use Clang if you want to use AVX.") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") - set(Vc_AVX_INTRINSICS_BROKEN true) - endif() - else(APPLE) - if(${ARGC} EQUAL 1) - set(_as "${ARGV1}") - else() - exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as) - mark_as_advanced(_as) - endif() - if(NOT _as) - message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...") - else() - exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version) - string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}") - string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}") - if(_as_version VERSION_LESS "2.18.93") - UserWarning("Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.") - add_definitions(-DVC_NO_XGETBV) # old assembler doesn't know the xgetbv instruction - set(Vc_AVX_INTRINSICS_BROKEN true) - set(Vc_XOP_INTRINSICS_BROKEN true) - set(Vc_FMA4_INTRINSICS_BROKEN true) - elseif(_as_version VERSION_LESS "2.21.0") - UserWarning("Your binutils is too old (${_as_version}) for XOP instructions. 
They will therefore not be provided in libVc.") - set(Vc_XOP_INTRINSICS_BROKEN true) - endif() - endif() - endif(APPLE) -endmacro() - -macro(vc_check_fpmath) - # if compiling for 32 bit x86 we need to use the -mfpmath=sse since the x87 is broken by design - include (CheckCXXSourceRuns) - check_cxx_source_runs("int main() { return sizeof(void*) != 8; }" Vc_VOID_PTR_IS_64BIT) - if(NOT Vc_VOID_PTR_IS_64BIT) - exec_program(${CMAKE_C_COMPILER} ARGS -dumpmachine OUTPUT_VARIABLE _gcc_machine) - if(_gcc_machine MATCHES "[x34567]86" OR _gcc_machine STREQUAL "mingw32") - vc_add_compiler_flag(Vc_DEFINITIONS "-mfpmath=sse") - endif() - endif() -endmacro() - -macro(vc_set_preferred_compiler_flags) - vc_determine_compiler() - - set(_add_warning_flags false) - set(_add_buildtype_flags false) - foreach(_arg ${ARGN}) - if(_arg STREQUAL "WARNING_FLAGS") - set(_add_warning_flags true) - elseif(_arg STREQUAL "BUILDTYPE_FLAGS") - set(_add_buildtype_flags true) - endif() - endforeach() - - set(Vc_SSE_INTRINSICS_BROKEN false) - set(Vc_AVX_INTRINSICS_BROKEN false) - set(Vc_XOP_INTRINSICS_BROKEN false) - set(Vc_FMA4_INTRINSICS_BROKEN false) - - if(Vc_COMPILER_IS_OPEN64) - ################################################################################################## - # Open64 # - ################################################################################################## - if(_add_warning_flags) - AddCompilerFlag("-W") - AddCompilerFlag("-Wall") - AddCompilerFlag("-Wimplicit") - AddCompilerFlag("-Wswitch") - AddCompilerFlag("-Wformat") - AddCompilerFlag("-Wchar-subscripts") - AddCompilerFlag("-Wparentheses") - AddCompilerFlag("-Wmultichar") - AddCompilerFlag("-Wtrigraphs") - AddCompilerFlag("-Wpointer-arith") - AddCompilerFlag("-Wcast-align") - AddCompilerFlag("-Wreturn-type") - AddCompilerFlag("-ansi") - AddCompilerFlag("-pedantic") - AddCompilerFlag("-Wno-long-long") - AddCompilerFlag("-Wshadow") - AddCompilerFlag("-Wold-style-cast") - AddCompilerFlag("-Wno-variadic-macros") - 
endif() - if(_add_buildtype_flags) - vc_set_gnu_buildtype_flags() - endif() - - vc_check_assembler() - - # Open64 4.5.1 still doesn't ship immintrin.h - set(Vc_AVX_INTRINSICS_BROKEN true) - elseif(Vc_COMPILER_IS_GCC) - ################################################################################################## - # GCC # - ################################################################################################## - if(_add_warning_flags) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wno-long-long -Wshadow") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wall -Wswitch -Wformat -Wchar-subscripts -Wparentheses -Wmultichar -Wtrigraphs -Wpointer-arith -Wcast-align -Wreturn-type -pedantic -Wno-long-long -Wshadow") - if(NOT WIN32) - # the -ansi flag makes MinGW unusable, so maybe it's better to omit it - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ansi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ansi") - endif() - AddCompilerFlag("-Wundef") - AddCompilerFlag("-Wold-style-cast") - AddCompilerFlag("-Wno-variadic-macros") - if(Vc_GCC_VERSION VERSION_GREATER "4.5.2" AND Vc_GCC_VERSION VERSION_LESS "4.6.4") - # GCC gives bogus "array subscript is above array bounds" warnings in math.cpp - AddCompilerFlag("-Wno-array-bounds") - endif() - endif() - vc_add_compiler_flag(Vc_DEFINITIONS "-Wabi") - vc_add_compiler_flag(Vc_DEFINITIONS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version. - - if(_add_buildtype_flags) - vc_set_gnu_buildtype_flags() - endif() - - # GCC 4.5.[01] fail at inlining some functions, creating functions with a single instructions, - # thus creating a large overhead. - if(Vc_GCC_VERSION VERSION_LESS "4.5.2" AND NOT Vc_GCC_VERSION VERSION_LESS "4.5.0") - UserWarning("GCC 4.5.0 and 4.5.1 have problems with inlining correctly. 
Setting early-inlining-insns=12 as workaround.") - AddCompilerFlag("--param early-inlining-insns=12") - endif() - - if(Vc_GCC_VERSION VERSION_LESS "4.1.99") - UserWarning("Your GCC is ancient and crashes on some important optimizations. The full set of SSE2 intrinsics is not supported. Vc will fall back to the scalar implementation. Use of the may_alias and always_inline attributes will be disabled. In turn all code using Vc must be compiled with -fno-strict-aliasing") - vc_add_compiler_flag(Vc_DEFINITIONS "-fno-strict-aliasing") - set(Vc_AVX_INTRINSICS_BROKEN true) - set(Vc_SSE_INTRINSICS_BROKEN true) - elseif(Vc_GCC_VERSION VERSION_LESS "4.4.6") - UserWarning("Your GCC is older than 4.4.6. This is known to cause problems/bugs. Please update to the latest GCC if you can.") - set(Vc_AVX_INTRINSICS_BROKEN true) - if(Vc_GCC_VERSION VERSION_LESS "4.3.0") - UserWarning("Your GCC is older than 4.3.0. It is unable to handle the full set of SSE2 intrinsics. All SSE code will be disabled. Please update to the latest GCC if you can.") - set(Vc_SSE_INTRINSICS_BROKEN true) - endif() - endif() - - if(Vc_GCC_VERSION VERSION_LESS 4.5.0) - UserWarning("GCC 4.4.x shows false positives for -Wparentheses, thus we rather disable the warning.") - string(REPLACE " -Wparentheses " " " CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - string(REPLACE " -Wparentheses " " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -Wno-parentheses") - - UserWarning("GCC 4.4.x shows false positives for -Wstrict-aliasing, thus we rather disable the warning. Use a newer GCC for better warnings.") - AddCompilerFlag("-Wno-strict-aliasing") - - UserWarning("GCC 4.4.x shows false positives for -Wuninitialized, thus we rather disable the warning. Use a newer GCC for better warnings.") - AddCompilerFlag("-Wno-uninitialized") - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.6.0) - UserWarning("GCC 4.6.0 miscompiles AVX loads/stores, leading to spurious segfaults. 
Disabling AVX per default.") - set(Vc_AVX_INTRINSICS_BROKEN true) - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.7.0) - UserWarning("GCC 4.7.0 miscompiles at -O3, adding -fno-predictive-commoning to the compiler flags as workaround") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-predictive-commoning") - elseif(Vc_GCC_VERSION VERSION_EQUAL 4.8.0) - UserWarning("GCC 4.8.0 miscompiles at -O3, adding -fno-tree-vectorize to the compiler flags as workaround") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -fno-tree-vectorize") - endif() - - vc_check_fpmath() - vc_check_assembler() - elseif(Vc_COMPILER_IS_INTEL) - ################################################################################################## - # Intel Compiler # - ################################################################################################## - - if(_add_buildtype_flags) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3") - - set(ALIAS_FLAGS "-no-ansi-alias") - if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - # default ICC to -no-ansi-alias because otherwise tests/utils_sse fails. So far I suspect a miscompilation... - set(ENABLE_STRICT_ALIASING false CACHE BOOL "Enables strict aliasing rules for more aggressive optimizations") - if(ENABLE_STRICT_ALIASING) - set(ALIAS_FLAGS "-ansi-alias") - endif(ENABLE_STRICT_ALIASING) - endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALIAS_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALIAS_FLAGS}") - endif() - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 913") - # Disable warning #13211 "Immediate parameter to intrinsic call too large". 
(sse/vector.tcc rotated(int)) - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 13211") - - if(NOT "$ENV{DASHBOARD_TEST_FROM_CTEST}" STREQUAL "") - # disable warning #2928: the __GXX_EXPERIMENTAL_CXX0X__ macro is disabled when using GNU version 4.6 with the c++0x option - # this warning just adds noise about problems in the compiler - but I'm only interested in seeing problems in Vc - vc_add_compiler_flag(Vc_DEFINITIONS "-diag-disable 2928") - endif() - - # Intel doesn't implement the XOP or FMA4 intrinsics - set(Vc_XOP_INTRINSICS_BROKEN true) - set(Vc_FMA4_INTRINSICS_BROKEN true) - elseif(Vc_COMPILER_IS_MSVC) - if(_add_warning_flags) - AddCompilerFlag("/wd4800") # Disable warning "forcing value to bool" - AddCompilerFlag("/wd4996") # Disable warning about strdup vs. _strdup - AddCompilerFlag("/wd4244") # Disable warning "conversion from 'unsigned int' to 'float', possible loss of data" - AddCompilerFlag("/wd4146") # Disable warning "unary minus operator applied to unsigned type, result still unsigned" - AddCompilerFlag("/wd4227") # Disable warning "anachronism used : qualifiers on reference are ignored" (this is about 'restrict' usage on references, stupid MSVC) - AddCompilerFlag("/wd4722") # Disable warning "destructor never returns, potential memory leak" (warns about ~_UnitTest_Global_Object which we don't care about) - AddCompilerFlag("/wd4748") # Disable warning "/GS can not protect parameters and local variables from local buffer overrun because optimizations are disabled in function" (I don't get it) - add_definitions(-D_CRT_SECURE_NO_WARNINGS) - endif() - - # MSVC does not support inline assembly on 64 bit! :( - # searching the help for xgetbv doesn't turn up anything. 
So just fall back to not supporting AVX on Windows :( - # TODO: apparently MSVC 2010 SP1 added _xgetbv - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_NO_XGETBV") - - # get rid of the min/max macros - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DNOMINMAX") - - # MSVC doesn't implement the XOP or FMA4 intrinsics - set(Vc_XOP_INTRINSICS_BROKEN true) - set(Vc_FMA4_INTRINSICS_BROKEN true) - - if(MSVC_VERSION LESS 1700) - UserWarning("MSVC before 2012 has a broken std::vector::resize implementation. STL + Vc code will probably not compile.") - endif() - elseif(Vc_COMPILER_IS_CLANG) - # for now I don't know of any arguments I want to pass. -march and stuff is tried by OptimizeForArchitecture... - if(Vc_CLANG_VERSION VERSION_EQUAL "3.0") - UserWarning("Clang 3.0 has serious issues to compile Vc code and will most likely crash when trying to do so.\nPlease update to a recent clang version.") - elseif(Vc_CLANG_VERSION VERSION_EQUAL "3.2" AND NOT APPLE) - # the LLVM assembler gets FMAs wrong (bug 15040) - vc_add_compiler_flag(Vc_DEFINITIONS "-no-integrated-as") - endif() - - # disable these warnings because clang shows them for function overloads that were discarded via SFINAE - vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-local-type-template-args") - vc_add_compiler_flag(Vc_DEFINITIONS "-Wno-unnamed-type-template-args") - - if(NOT DEFINED Vc_INSIDE_ROOT) # ROOT has to set this up - AddCompilerFlag(-stdlib=libc++) - endif() - endif() - - if(NOT Vc_COMPILER_IS_MSVC) - vc_add_compiler_flag(Vc_DEFINITIONS "-ffp-contract=fast") - endif() - - OptimizeForArchitecture() - set(Vc_DEFINITIONS "${Vc_ARCHITECTURE_FLAGS} ${Vc_DEFINITIONS}") - - set(VC_IMPL "auto" CACHE STRING "Force the Vc implementation globally to the selected instruction set. 
\"auto\" lets Vc use the best available instructions.") - if(NOT VC_IMPL STREQUAL "auto") - set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DVC_IMPL=${VC_IMPL}") - if(NOT VC_IMPL STREQUAL "Scalar") - set(_use_var "USE_${VC_IMPL}") - if(VC_IMPL STREQUAL "SSE") - set(_use_var "USE_SSE2") - endif() - if(NOT ${_use_var}) - message(WARNING "The selected value for VC_IMPL (${VC_IMPL}) will not work because the relevant instructions are not enabled via compiler flags.") - endif() - endif() - endif() -endmacro() - -# helper macro for vc_compile_for_all_implementations -macro(_vc_compile_one_implementation _objs _impl) - list(FIND _disabled_targets "${_impl}" _disabled_index) - list(FIND _only_targets "${_impl}" _only_index) - if(${_disabled_index} EQUAL -1 AND (NOT _only_targets OR ${_only_index} GREATER -1)) - set(_extra_flags) - set(_ok FALSE) - foreach(_flag ${ARGN}) - if(_flag STREQUAL "NO_FLAG") - set(_ok TRUE) - break() - endif() - string(REPLACE " " ";" _flag_list "${_flag}") - foreach(_flag ${_flag_list}) - AddCompilerFlag(${_flag} CXX_RESULT _ok) - if(NOT _ok) - break() - endif() - endforeach() - if(_ok) - set(_extra_flags ${_flag_list}) - break() - endif() - endforeach() - - set(_outfile_flag -c -o) - if(Vc_COMPILER_IS_MSVC) - # MSVC for 64bit does not recognize /arch:SSE2 anymore. 
Therefore we set override _ok if _impl - # says SSE - if("${_impl}" MATCHES "SSE") - set(_ok TRUE) - endif() - set(_outfile_flag /c /Fo) - endif() - - if(_ok) - get_filename_component(_out "${_vc_compile_src}" NAME_WE) - get_filename_component(_ext "${_vc_compile_src}" EXT) - if(Vc_COMPILER_IS_MSVC) - set(_out "${_out}_${_impl}${_ext}.obj") - else() - set(_out "${_out}_${_impl}${_ext}.o") - endif() - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_out} - COMMAND ${CMAKE_CXX_COMPILER} ${_flags} ${_extra_flags} - -DVC_IMPL=${_impl} - ${_outfile_flag}${_out} ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - IMPLICIT_DEPENDS CXX ${CMAKE_CURRENT_SOURCE_DIR}/${_vc_compile_src} - COMMENT "Building CXX object ${_out}" - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - ) - list(APPEND ${_objs} "${CMAKE_CURRENT_BINARY_DIR}/${_out}") - endif() - endif() -endmacro() - -# Generate compile rules for the given C++ source file for all available implementations and return -# the resulting list of object files in _obj -# all remaining arguments are additional flags -# Example: -# vc_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS -DCOMPILE_BLAH EXCLUDE Scalar) -# add_executable(executable main.cpp ${_objs}) -macro(vc_compile_for_all_implementations _objs _src) - set(${_objs}) - - # remove all -march, -msse, etc. 
flags from the flags we want to pass - string(REPLACE "${Vc_ARCHITECTURE_FLAGS}" "" _flags "${Vc_DEFINITIONS}") - string(REPLACE "-DVC_IMPL=[^ ]*" "" _flags "${_flags}") - - # capture the -march= switch as -mtune; if there is none skip it - if(Vc_ARCHITECTURE_FLAGS MATCHES "-march=") - string(REGEX REPLACE "^.*-march=([^ ]*).*$" "-mtune=\\1" _tmp "${Vc_ARCHITECTURE_FLAGS}") - set(_flags "${_flags} ${_tmp}") - endif() - - unset(_disabled_targets) - unset(_only_targets) - set(_state 0) - foreach(_arg ${ARGN}) - if(_arg STREQUAL "FLAGS") - set(_state 1) - elseif(_arg STREQUAL "EXCLUDE") - set(_state 2) - elseif(_arg STREQUAL "ONLY") - set(_state 3) - elseif(_state EQUAL 1) - set(_flags "${_flags} ${_arg}") - elseif(_state EQUAL 2) - list(APPEND _disabled_targets "${_arg}") - elseif(_state EQUAL 3) - list(APPEND _only_targets "${_arg}") - else() - message(FATAL_ERROR "incorrect argument to vc_compile_for_all_implementations") - endif() - endforeach() - - # make a semicolon separated list of all flags - string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp) - set(_tmp "CMAKE_CXX_FLAGS_${_tmp}") - string(REPLACE " " ";" _tmp "${CMAKE_CXX_FLAGS} ${${_tmp}} ${_flags}") - set(_flags) - foreach(item ${_tmp}) - if(item MATCHES "^[^']*'[^']*$") - if(_str) - list(APPEND _flags "${_str} ${item}") - unset(_str) - else() - set(_str "${item}") - endif() - else() - list(APPEND _flags "${item}") - endif() - endforeach() - get_directory_property(_inc INCLUDE_DIRECTORIES) - foreach(_i ${_inc}) - list(APPEND _flags "-I${_i}") - endforeach() - - set(_vc_compile_src "${_src}") - - _vc_compile_one_implementation(${_objs} Scalar NO_FLAG) - if(NOT Vc_SSE_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} SSE2 "-msse2" "-xSSE2" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE3 "-msse3" "-xSSE3" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSSE3 "-mssse3" "-xSSSE3" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE4_1 "-msse4.1" "-xSSE4.1" "/arch:SSE2") - 
_vc_compile_one_implementation(${_objs} SSE4_2 "-msse4.2" "-xSSE4.2" "/arch:SSE2") - _vc_compile_one_implementation(${_objs} SSE3+SSE4a "-msse4a") - endif() - if(NOT Vc_AVX_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} AVX "-mavx" "-xAVX" "/arch:AVX") - if(NOT Vc_XOP_INTRINSICS_BROKEN) - if(NOT Vc_FMA4_INTRINSICS_BROKEN) - _vc_compile_one_implementation(${_objs} SSE+XOP+FMA4 "-mxop -mfma4" "" "") - _vc_compile_one_implementation(${_objs} AVX+XOP+FMA4 "-mavx -mxop -mfma4" "" "") - endif() - _vc_compile_one_implementation(${_objs} SSE+XOP+FMA "-mxop -mfma" "" "") - _vc_compile_one_implementation(${_objs} AVX+XOP+FMA "-mavx -mxop -mfma" "" "") - endif() - _vc_compile_one_implementation(${_objs} AVX+FMA "-mavx -mfma" "" "") - endif() -endmacro() diff --git a/math/vc/examples/CMakeLists.txt b/math/vc/examples/CMakeLists.txt deleted file mode 100644 index 19ba0bd58c759..0000000000000 --- a/math/vc/examples/CMakeLists.txt +++ /dev/null @@ -1,71 +0,0 @@ -find_package(Qt4) -set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") -set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") -set(CMAKE_REQUIRED_INCLUDES "${QT_INCLUDES}") -set(CMAKE_REQUIRED_LIBRARIES "${QT_QTCORE_LIBRARY}") -CHECK_CXX_SOURCE_COMPILES("#include -int main() { QObject o; return 0;}" QT4_USABLE) -mark_as_advanced(QT4_USABLE) -set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}") -set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") - -macro(build_example name) - set(_SRCS) - set(_LIBS) - set(_state 1) - foreach(ARG ${ARGN}) - if(ARG STREQUAL "LIBS") - set(_state 2) - elseif(_state EQUAL 1) - set(_SRCS ${_SRCS} ${ARG}) - elseif(_state EQUAL 2) - set(_LIBS ${_LIBS} ${ARG}) - endif() - endforeach() - add_executable("example_${name}_default" ${_SRCS}) - target_link_libraries("example_${name}_default" Vc ${_LIBS}) - - add_executable("example_${name}_scalar" ${_SRCS}) - add_target_property("example_${name}_scalar" COMPILE_FLAGS "-DVC_IMPL=Scalar") - 
add_target_property("example_${name}_scalar" LABELS "Scalar") - add_dependencies(Scalar "example_${name}_scalar") - target_link_libraries("example_${name}_scalar" Vc ${_LIBS}) - - if(USE_SSE2) - add_executable("example_${name}_sse" ${_SRCS}) - add_target_property("example_${name}_sse" COMPILE_FLAGS "-DVC_IMPL=SSE") - add_target_property("example_${name}_sse" LABELS "SSE") - add_dependencies(SSE "example_${name}_sse") - target_link_libraries("example_${name}_sse" Vc ${_LIBS}) - endif() - - if(USE_AVX) - add_executable("example_${name}_avx" ${_SRCS}) - add_target_property("example_${name}_avx" COMPILE_FLAGS "-DVC_IMPL=AVX") - add_target_property("example_${name}_avx" LABELS "AVX") - add_dependencies(AVX "example_${name}_avx") - target_link_libraries("example_${name}_avx" Vc ${_LIBS}) - - add_target_property("example_${name}_default" LABELS "AVX") - add_dependencies(AVX "example_${name}_default") - elseif(USE_SSE2) - add_target_property("example_${name}_default" LABELS "SSE") - add_dependencies(SSE "example_${name}_default") - else() - add_target_property("example_${name}_default" LABELS "Scalar") - add_dependencies(Scalar "example_${name}_default") - endif() -endmacro(build_example) - -macro(my_add_subdirectory _name) - list(FIND disabled_targets "example_${_name}" _disabled) - if(_disabled EQUAL -1) - add_subdirectory(${_name}) - endif() -endmacro() - -my_add_subdirectory(polarcoord) -my_add_subdirectory(matrix) -my_add_subdirectory(mandelbrot) -my_add_subdirectory(buddhabrot) -my_add_subdirectory(finitediff) diff --git a/math/vc/examples/buddhabrot/CMakeLists.txt b/math/vc/examples/buddhabrot/CMakeLists.txt deleted file mode 100644 index e03ee3ea24b84..0000000000000 --- a/math/vc/examples/buddhabrot/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -if(QT4_FOUND AND QT4_USABLE) - include(${QT_USE_FILE}) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) - - add_executable(buddhabrot_sse main.cpp) - add_target_property(buddhabrot_sse COMPILE_FLAGS "-DVC_IMPL=SSE") - 
target_link_libraries(buddhabrot_sse ${QT_LIBRARIES} Vc) - - add_executable(buddhabrot_scalar main.cpp) - add_target_property(buddhabrot_scalar COMPILE_FLAGS "-DVC_IMPL=Scalar") - target_link_libraries(buddhabrot_scalar ${QT_LIBRARIES} Vc) - - add_executable(buddhabrot_scalar2 main.cpp) - add_target_property(buddhabrot_scalar2 COMPILE_FLAGS "-DScalar") - target_link_libraries(buddhabrot_scalar2 ${QT_LIBRARIES}) -endif() diff --git a/math/vc/examples/buddhabrot/main.cpp b/math/vc/examples/buddhabrot/main.cpp deleted file mode 100644 index ff82c0820572e..0000000000000 --- a/math/vc/examples/buddhabrot/main.cpp +++ /dev/null @@ -1,643 +0,0 @@ -/* - Copyright (C) 2010-2011 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. 
- -*/ - -#include "main.h" -#include "../tsc.h" -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef Scalar -typedef float float_v; -typedef int int_v; -typedef bool int_m; -#else -#include - -using Vc::float_v; -using Vc::float_m; -using Vc::int_v; -using Vc::int_m; -#endif - -ProgressWriter::ProgressWriter() - : m_out(stdout) -{ -} - -void ProgressWriter::setValue(float vf) -{ - static int lastPercent = -1; - static int lastHash = 0; - int p = static_cast(vf + 0.5f); - int h = static_cast(vf * 0.78f + 0.5f); - bool flush = false; - if (p != lastPercent) { - flush = true; - if (lastPercent == -1) { - m_out << "\033[80D\033[K" - << "[ "; - m_out.setFieldWidth(3); - m_out << p; - m_out.setFieldWidth(0); - m_out << "% ]" - << "\033[79D"; - } else { - m_out << "\033[s\033[80D\033[37C"; - m_out.setFieldWidth(3); - m_out << p; - m_out.setFieldWidth(0); - m_out << "\033[u"; - } - lastPercent = p; - } - for (; lastHash < h; ++lastHash) { - flush = true; - if (lastHash < 36 || lastHash > 39) { - m_out << '#'; - } else { - m_out << "\033[1C"; - } - } - if (flush) { - m_out.flush(); - } -} - -void ProgressWriter::done() -{ - setValue(100.f); - m_out << "\033[2C"; - m_out.flush(); -} - -Baker::Baker() -{ -} - -void Baker::setSize(int w, int h) -{ - m_y = -1.f; - m_height = 2.f; - - m_width = w * m_height / h; - m_x = m_width * -0.667f; - - m_image = QImage(w, h, QImage::Format_RGB32); -} - -void Baker::setFilename(const QString &filename) -{ - m_filename = filename; -} - -typedef std::complex Z; - -static inline Z P(Z z, Z c) -{ - return z * z + c; -} - -static inline Z::value_type fastNorm(const Z &z) -{ - return z.real() * z.real() + z.imag() * z.imag(); -} - -template static inline T square(T a) { return a * a; } -template static inline T minOf(T a, T b) { return a < b ? a : b; } -template static inline T maxOf(T a, T b) { return a < b ? 
b : a; } -template static inline T clamp(T min, T value, T max) -{ - if (value > max) { - return max; - } - return value < min ? min : value; -} - -struct Pixel -{ - float blue; - float green; - float red; -}; - -static const Pixel NULL_PIXEL = { 0, 0, 0 }; - -class Canvas -{ - public: - Canvas(int h, int w); - void addDot(float x, float y, int red, int green, int blue); - void toQImage(QImage *); - - private: - void addDot(int x, int y, float red, float green, float blue) { - Pixel &p = m_pixels[x + y * m_width]; - p.blue += blue; - p.green += green; - p.red += red; - } - const int m_width; - std::vector m_pixels; -}; - -Canvas::Canvas(int h, int w) - : m_width(w), m_pixels(h * w, NULL_PIXEL) -{ -} - -void Canvas::addDot(float x, float y, int red, int green, int blue) -{ - const int x1 = static_cast(std::floor(x)); - const int x2 = static_cast(std::ceil (x)); - const int y1 = static_cast(std::floor(y)); - const int y2 = static_cast(std::ceil (y)); - const float xfrac = x - std::floor(x); - const float yfrac = y - std::floor(y); - const float r = red; - const float g = green; - const float b = blue; - const float frac11 = (1.f - xfrac) * (1.f - yfrac); - const float frac12 = (1.f - xfrac) * yfrac; - const float frac21 = xfrac * (1.f - yfrac); - const float frac22 = xfrac * yfrac; - addDot(x1, y1, r * frac11, g * frac11, b * frac11); - addDot(x2, y1, r * frac21, g * frac21, b * frac21); - addDot(x1, y2, r * frac12, g * frac12, b * frac12); - addDot(x2, y2, r * frac22, g * frac22, b * frac22); -} - -#define BUDDHABROT_USE_FUNCTION1 - -#ifdef BUDDHABROT_USE_FUNCTION2 -static inline uchar reduceRange(float x, float m, float h) -{ - /* m: max, h: median - * +- -+ - * | 3 3 2 | - * | 510 h + 127 m - 765 h m | - * | -------------------------- | - * | 3 3 2 2 | - * | h m + h m - 2 h m | - * | | - * | 3 3 2 | - * | - 255 h - 254 m + 765 h m | - * | ---------------------------- | - * | 4 2 3 3 2 | - * | h m - 2 h m + h m | - * | | - * | 2 2 | - * | - 510 h m + 255 h + 127 m 
| - * | --------------------------- | - * | 4 2 3 3 2 | - * | h m - 2 h m + h m | - * +- -+ - */ - const float h2 = h * h; - const float h3 = h2 * h; - const float m2 = m * m; - const float m3 = m2 * m; - const float denom = h * m * square(m - h); - return minOf(255.f, 0.5f //rounding - + x / denom * ( - 510.f * h3 + 127.f * m3 - 765.f * h2 * m - + x / m * ( - 765.f * h * m2 - 255.f * h3 - 254.f * m3 - + x * ( - 255.f * h2 + 127.f * m2 - 510.f * h * m) - ))); -} -#elif defined(BUDDHABROT_USE_FUNCTION1) -static inline unsigned int reduceRange(float x, float m, float h) -{ - if (x <= m) { - return 0.5f // rounding - + 4.f / 255.f * h * h / m * x - + square(x) * (h / square(m)) * (4.f - 8.f / 255.f * h); - } else { - return 0.5f // rounding - + 255.f - 4.f * h + 4.f / 255.f * square(h) - + x / m * (16.f * h - 1020.f - 12.f / 255.f * square(h)) - + square(x / m) * (1020.f - 12.f * h + 8.f / 255.f * square(h)); - } -} -#endif - -void Canvas::toQImage(QImage *img) -{ - uchar *line = img->scanLine(0); - const Pixel *p = &m_pixels[0]; -#ifdef BUDDHABROT_USE_FUNCTION2 - float max [3] = { 0.f, 0.f, 0.f }; - std::vector sorted[3]; - for (int i = 0; i < 3; ++i) { - sorted[i].reserve(m_pixels.size()); - } - for (unsigned int i = 0; i < m_pixels.size(); ++i) { - max[0] = maxOf(max[0], m_pixels[i].red); - max[1] = maxOf(max[1], m_pixels[i].green); - max[2] = maxOf(max[2], m_pixels[i].blue); - if (m_pixels[i].red > 1.f) { - sorted[0].push_back(m_pixels[i].red); - } - if (m_pixels[i].green > 1.f) { - sorted[1].push_back(m_pixels[i].green); - } - if (m_pixels[i].blue > 1.f) { - sorted[2].push_back(m_pixels[i].blue); - } - } - for (int i = 0; i < 3; ++i) { - std::sort(sorted[i].begin(), sorted[i].end()); - } - const float median[3] = { - sorted[0][sorted[0].size() / 2], - sorted[1][sorted[1].size() / 2], - sorted[2][sorted[2].size() / 2] - }; - - /* - int hist[3][2]; - for (int i = 0; i < 3; ++i) { - hist[i][0] = hist[i][1] = 0; - } - for (unsigned int i = 0; i < m_pixels.size(); 
++i) { - ++hist[0][reduceRange(m_pixels[i].red , max[0], median[0]) / 128]; - ++hist[1][reduceRange(m_pixels[i].green, max[1], median[1]) / 128]; - ++hist[2][reduceRange(m_pixels[i].blue , max[2], median[2]) / 128]; - } - qDebug() << "Histogram:\n red:" - << median[0] << hist[0][0] << hist[0][1] << "\ngreen:" - << median[1] << hist[1][0] << hist[1][1] << "\n blue:" - << median[2] << hist[2][0] << hist[2][1]; - */ - - for (int yy = 0; yy < img->height(); ++yy) { - for (int xx = 0; xx < img->width(); ++xx) { - line[0] = reduceRange(p->blue , max[2], median[2]); - line[1] = reduceRange(p->green, max[1], median[1]); - line[2] = reduceRange(p->red , max[0], median[0]); - line += 4; - ++p; - } - } -#elif defined(BUDDHABROT_USE_FUNCTION1) - float max[3] = { 0.f, 0.f, 0.f }; - for (unsigned int i = 0; i < m_pixels.size(); ++i) { - max[0] = maxOf(max[0], m_pixels[i].red); - max[1] = maxOf(max[1], m_pixels[i].green); - max[2] = maxOf(max[2], m_pixels[i].blue); - } - float h[3] = { 220.f, 220.f, 220.f }; - - /* - int hist[3][2]; - for (int i = 0; i < 3; ++i) { - hist[i][0] = hist[i][1] = 0; - } - for (unsigned int i = 0; i < m_pixels.size(); ++i) { - ++hist[0][reduceRange(m_pixels[i].red , max[0], h[0]) / 128]; - ++hist[1][reduceRange(m_pixels[i].green, max[1], h[1]) / 128]; - ++hist[2][reduceRange(m_pixels[i].blue , max[2], h[2]) / 128]; - } - qDebug() << "Histogram:\n red:" - << hist[0][0] << hist[0][1] << "\ngreen:" - << hist[1][0] << hist[1][1] << "\n blue:" - << hist[2][0] << hist[2][1]; - */ - - for (int yy = 0; yy < img->height(); ++yy) { - for (int xx = 0; xx < img->width(); ++xx) { - line[0] = reduceRange(p->blue , max[2], h[2]); - line[1] = reduceRange(p->green, max[1], h[1]); - line[2] = reduceRange(p->red , max[0], h[0]); - line += 4; - ++p; - } - } -#else - float max [3] = { 0.f, 0.f, 0.f }; - float mean [3] = { 0.f, 0.f, 0.f }; - float stddev[3] = { 0.f, 0.f, 0.f }; - for (unsigned int i = 0; i < m_pixels.size(); ++i) { - max[0] = maxOf(max[0], m_pixels[i].red); 
- max[1] = maxOf(max[1], m_pixels[i].green); - max[2] = maxOf(max[2], m_pixels[i].blue); - mean[0] += m_pixels[i].red; - mean[1] += m_pixels[i].green; - mean[2] += m_pixels[i].blue; - stddev[0] += square(m_pixels[i].red); - stddev[1] += square(m_pixels[i].green); - stddev[2] += square(m_pixels[i].blue); - } - const float normalization = 1.f / m_pixels.size(); - mean[0] *= normalization; - mean[1] *= normalization; - mean[2] *= normalization; - stddev[0] = std::sqrt(stddev[0] * normalization - square(mean[0])); - stddev[1] = std::sqrt(stddev[1] * normalization - square(mean[1])); - stddev[2] = std::sqrt(stddev[2] * normalization - square(mean[2])); - qDebug() << " max:" << max[0] << max[1] << max[2]; - qDebug() << " mean:" << mean[0] << mean[1] << mean[2]; - qDebug() << "stddev:" << stddev[0] << stddev[1] << stddev[2]; - - // colors have the range 0..max at this point - // they should be transformed such that for the resulting mean and stddev: - // mean - stddev = 0 - // mean + stddev = min(min(2 * mean, max), 255) - // - // newColor = (c - mean) * min(min(2 * mean, max), 255) * 0.5 / stddev + 127.5 - - const float center[3] = { - minOf(minOf(2.f * mean[0], max[0]), 255.f) * 0.5f, - minOf(minOf(2.f * mean[1], max[1]), 255.f) * 0.5f, - minOf(minOf(2.f * mean[2], max[2]), 255.f) * 0.5f - }; - - const float sdFactor[3] = { 2.f, 2.f, 2.f }; - const float redFactor = center[0] / (sdFactor[0] * stddev[0]); - const float greenFactor = center[1] / (sdFactor[1] * stddev[1]); - const float blueFactor = center[2] / (sdFactor[2] * stddev[2]); - - for (int yy = 0; yy < img->height(); ++yy) { - for (int xx = 0; xx < img->width(); ++xx) { - line[0] = clamp(0, static_cast(center[2] + (p->blue - mean[2]) * blueFactor ), 255); - line[1] = clamp(0, static_cast(center[1] + (p->green - mean[1]) * greenFactor), 255); - line[2] = clamp(0, static_cast(center[0] + (p->red - mean[0]) * redFactor ), 255); - line += 4; - ++p; - } - } -#endif -} - -Baker::Options::Options() -{ - red[0] = 2; - 
red[1] = 10; - green[0] = 0; - green[1] = 1; - blue[0] = 11; - blue[1] = 20; - it[0] = 10000; - it[1] = 50000; - steps[0] = steps[1] = -1; -} - -void Baker::createImage() -{ - const int iHeight = m_image.height(); - const int iWidth = m_image.width(); - - // Parameters Begin - const float S = 4.f; - const float nSteps[2] = { - static_cast(m_opt.steps[0] == -1 ? std::sqrt(iWidth) * iWidth : m_opt.steps[0]), - static_cast(m_opt.steps[1] == -1 ? std::sqrt(iHeight) * iHeight : m_opt.steps[1]) - }; - const int upperBound[3] = { m_opt.red[1], m_opt.green[1], m_opt.blue[1] }; - const int lowerBound[3] = { m_opt.red[0], m_opt.green[0], m_opt.blue[0] }; - int overallLowerBound = m_opt.it[0]; - int maxIterations = m_opt.it[1];// maxOf(maxOf(overallLowerBound, upperBound[0]), maxOf(upperBound[1], upperBound[2])); - float realMin = -2.102613f; - float realMax = 1.200613f; - float imagMin = 0.f; - float imagMax = 1.23971f; - // Parameters End - - TimeStampCounter timer; - timer.Start(); - - // helper constants - const int overallUpperBound = maxOf(upperBound[0], maxOf(upperBound[1], upperBound[2])); - const float maxX = static_cast(iWidth ) - 1.f; - const float maxY = static_cast(iHeight) - 1.f; - const float xFact = iWidth / m_width; - const float yFact = iHeight / m_height; - const float realStep = (realMax - realMin) / nSteps[0]; - const float imagStep = (imagMax - imagMin) / nSteps[1]; - - Canvas canvas(iHeight, iWidth); -#ifdef Scalar - for (float real = realMin; real <= realMax; real += realStep) { - m_progress.setValue(99.f * (real - realMin) / (realMax - realMin)); - for (float imag = imagMin; imag <= imagMax; imag += imagStep) { - Z c(real, imag); - Z c2 = Z(1.08f * real + 0.15f, imag); - if (fastNorm(Z(real + 1.f, imag)) < 0.06f || (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f)) { - continue; - } - Z z = c; - int n; - for (n = 0; n <= maxIterations && fastNorm(z) < S; ++n) { - z = P(z, c); - } - if (n <= maxIterations && n >= overallLowerBound) { - // point is 
outside of the Mandelbrot set and required enough (overallLowerBound) - // iterations to reach the cut-off value S - Z cn(real, -imag); - Z zn = cn; - z = c; - for (int i = 0; i <= overallUpperBound; ++i) { - const float y2 = (std::imag(z) - m_y) * yFact; - const float yn2 = (std::imag(zn) - m_y) * yFact; - if (y2 >= 0.f && y2 < maxY && yn2 >= 0.f && yn2 < maxY) { - const float x2 = (std::real(z) - m_x) * xFact; - if (x2 >= 0.f && x2 < maxX) { - const int red = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0; - const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0; - const int blue = (i >= lowerBound[2] && i <= upperBound[2]) ? 1 : 0; - canvas.addDot(x2, y2 , red, green, blue); - canvas.addDot(x2, yn2, red, green, blue); - } - } - z = P(z, c); - zn = P(zn, cn); - if (fastNorm(z) >= S) { // optimization: skip some useless looping - break; - } - } - } - } - } -#else - const float imagStep2 = imagStep * float_v::Size; - const float_v imagMin2 = imagMin + imagStep * static_cast(int_v::IndexesFromZero()); - for (float real = realMin; real <= realMax; real += realStep) { - m_progress.setValue(99.f * (real - realMin) / (realMax - realMin)); - for (float_v imag = imagMin2; imag <= imagMax; imag += imagStep2) { - // FIXME: extra "tracks" if nSteps[1] is not a multiple of float_v::Size - Z c(float_v(real), imag); - Z c2 = Z(float_v(1.08f * real + 0.15f), imag); - if (fastNorm(Z(float_v(real + 1.f), imag)) < 0.06f || (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f)) { - continue; - } - Z z = c; - int_v n(Vc::Zero); - int_m inside = fastNorm(z) < S; - while (!(inside && n <= maxIterations).isEmpty()) { - z = P(z, c); - ++n(inside); - inside &= fastNorm(z) < S; - } - inside |= n < overallLowerBound; - if (inside.isFull()) { - continue; - } - Z cn(float_v(real), -imag); - Z zn = cn; - z = c; - for (int i = 0; i <= overallUpperBound; ++i) { - const float_v y2 = (std::imag(z) - m_y) * yFact; - const float_v yn2 = (std::imag(zn) - m_y) * yFact; - const float_v x2 
= (std::real(z) - m_x) * xFact; - z = P(z, c); - zn = P(zn, cn); - const float_m drawMask = !inside && y2 >= 0.f && x2 >= 0.f && y2 < maxY && x2 < maxX && yn2 >= 0.f && yn2 < maxY; - - const int red = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0; - const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0; - const int blue = (i >= lowerBound[2] && i <= upperBound[2]) ? 1 : 0; - - foreach_bit(int j, drawMask) { - canvas.addDot(x2[j], y2 [j], red, green, blue); - canvas.addDot(x2[j], yn2[j], red, green, blue); - } - if (fastNorm(z) >= S) { // optimization: skip some useless looping - break; - } - } - } - } -#endif - canvas.toQImage(&m_image); - - timer.Stop(); - m_progress.done(); - qDebug() << timer.Cycles() << "cycles"; - - if (m_filename.isEmpty()) { - m_filename = QString("r%1-%2_g%3-%4_b%5-%6_s%7-%8_i%9-%10_%11x%12.png") - .arg(lowerBound[0]).arg(upperBound[0]) - .arg(lowerBound[1]).arg(upperBound[1]) - .arg(lowerBound[2]).arg(upperBound[2]) - .arg(nSteps[0]).arg(nSteps[1]) - .arg(overallLowerBound).arg(maxIterations) - .arg(m_image.width()).arg(m_image.height()); - } - - m_image.save(m_filename); -} - -static void usage(const char *argv0) -{ - Baker::Options o; - - QTextStream out(stdout); - out << "Usage: " << argv0 << " [options] []\n\n" - << "Options:\n" - << " -h|--help This message.\n" - << " -s|--size Specify the width and height of the resulting image file. [1024 768]\n" - << " -r|--red Specify lower and upper iteration bounds for a red trace. [" - << o.red[0] << ' ' << o.red[1] << "]\n" - << " -g|--green Specify lower and upper iteration bounds for a green trace. [" - << o.green[0] << ' ' << o.green[1] << "]\n" - << " -b|--blue Specify lower and upper iteration bounds for a blue trace. [" - << o.blue[0] << ' ' << o.blue[1] << "]\n" - << " --steps Specify the steps in real and imaginary direction. [width^1.5 height^1.5]\n" - << " --minIt Overall lower iteration bound. [" << o.it[0] << "]\n" - << " --maxIt Overall upper iteration bound. 
[" << o.it[1] << "]\n" - ; -} - -int main(int argc, char **argv) -{ - QCoreApplication app(argc, argv); - const QStringList &args = QCoreApplication::arguments(); - if (args.contains("--help") || args.contains("-h")) { - usage(argv[0]); - return 0; - } - - Baker b; - - Baker::Options opt; - int width = 1024; - int height = 768; - - // parse args - for (int i = 1; i < args.size(); ++i) { - const QString &arg = args[i]; - bool ok = true; - if (arg == QLatin1String("--red") || arg == QLatin1String("-r")) { - opt.red[0] = args[++i].toInt(&ok); - if (ok) { - opt.red[1] = args[++i].toInt(&ok); - } - } else if (arg == QLatin1String("--green") || arg == QLatin1String("-g")) { - opt.green[0] = args[++i].toInt(&ok); - if (ok) { - opt.green[1] = args[++i].toInt(&ok); - } - } else if (arg == QLatin1String("--blue") || arg == QLatin1String("-b")) { - opt.blue[0] = args[++i].toInt(&ok); - if (ok) { - opt.blue[1] = args[++i].toInt(&ok); - } - } else if (arg == QLatin1String("--steps")) { - opt.steps[0] = args[++i].toInt(&ok); - if (ok) { - opt.steps[1] = args[++i].toInt(&ok); - } - } else if (arg == QLatin1String("--minIt")) { - opt.it[0] = args[++i].toInt(&ok); - } else if (arg == QLatin1String("--maxIt")) { - opt.it[1] = args[++i].toInt(&ok); - } else if (arg == QLatin1String("--size") || arg == QLatin1String("-s")) { - width = args[++i].toInt(&ok); - if (ok) { - height = args[++i].toInt(&ok); - } - } else { - static bool filenameSet = false; - ok = !filenameSet; - filenameSet = true; - b.setFilename(arg); - } - if (!ok) { - usage(argv[0]); - return 1; - } - } - - b.setOptions(opt); - b.setSize(width, height); - b.createImage(); - return 0; -} diff --git a/math/vc/examples/buddhabrot/main.h b/math/vc/examples/buddhabrot/main.h deleted file mode 100644 index df29f5573c52e..0000000000000 --- a/math/vc/examples/buddhabrot/main.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - Copyright (C) 2010-2011 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its 
documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -#ifndef MAIN_H -#define MAIN_H - -#include -#include -#include -#include - -class ProgressWriter -{ - public: - ProgressWriter(); - void setValue(float v); - void done(); - - private: - QTextStream m_out; -}; - -class Baker -{ - public: - struct Options - { - int red[2]; - int green[2]; - int blue[2]; - int steps[2]; - int it[2]; - Options(); - }; - - Baker(); - void setOptions(Options o) { m_opt = o; } - void setSize(int w, int h); - void setFilename(const QString &); - void createImage(); - - private: - Options m_opt; - float m_x; // left - float m_y; // top - float m_width; - float m_height; - QImage m_image; - QString m_filename; - ProgressWriter m_progress; -}; -#endif // MAIN_H diff --git a/math/vc/examples/finitediff/CMakeLists.txt b/math/vc/examples/finitediff/CMakeLists.txt deleted file mode 100644 index a8f9e07e459ae..0000000000000 --- a/math/vc/examples/finitediff/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -build_example(finitediff main.cpp) diff --git a/math/vc/examples/finitediff/main.cpp b/math/vc/examples/finitediff/main.cpp deleted file mode 100644 index 
042b071f9e4f9..0000000000000 --- a/math/vc/examples/finitediff/main.cpp +++ /dev/null @@ -1,249 +0,0 @@ -/* - Copyright (C) 2010 Jochen Gerhard - Copyright (C) 2010-2012 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -/*! - Finite difference method example - - We calculate central differences for a given function and - compare it to the analytical solution. - -*/ - -#include -#include -#include -#include -#include "../tsc.h" -#include - -#define USE_SCALAR_SINCOS - -enum { - N = 10240000, - PrintStep = 1000000 -}; - -static const float epsilon = 1e-7f; -static const float lower = 0.f; -static const float upper = 40000.f; -static const float h = (upper - lower) / N; - -// dfu is the derivative of fu. 
This is really easy for sine and cosine: -static inline float fu(float x) { return ( std::sin(x) ); } -static inline float dfu(float x) { return ( std::cos(x) ); } - -static inline Vc::float_v fu(Vc::float_v::AsArg x) { -#ifdef USE_SCALAR_SINCOS - Vc::float_v r; - for (int i = 0; i < Vc::float_v::Size; ++i) { - r[i] = std::sin(x[i]); - } - return r; -#else - return Vc::sin(x); -#endif -} - -static inline Vc::float_v dfu(Vc::float_v::AsArg x) { -#ifdef USE_SCALAR_SINCOS - Vc::float_v r; - for (int i = 0; i < Vc::float_v::Size; ++i) { - r[i] = std::cos(x[i]); - } - return r; -#else - return Vc::cos(x); -#endif -} - -using Vc::float_v; - -// It is important for this example that the following variables (especially dy_points) are global -// variables. Else the compiler can optimze all calculations of dy away except for the few places -// where the value is used in printResults. -Vc::Memory x_points; -Vc::Memory y_points; -float *VC_RESTRICT dy_points; - -void printResults() -{ - std::cout - << "------------------------------------------------------------\n" - << std::setw(15) << "fu(x_i)" - << std::setw(15) << "FD fu'(x_i)" - << std::setw(15) << "SYM fu'(x)" - << std::setw(15) << "error %\n"; - for (int i = 0; i < N; i += PrintStep) { - std::cout - << std::setw(15) << y_points[i] - << std::setw(15) << dy_points[i] - << std::setw(15) << dfu(x_points[i]) - << std::setw(15) << std::abs((dy_points[i] - dfu(x_points[i])) / (dfu(x_points[i] + epsilon)) * 100) - << "\n"; - } - std::cout - << std::setw(15) << y_points[N - 1] - << std::setw(15) << dy_points[N - 1] - << std::setw(15) << dfu(x_points[N - 1]) - << std::setw(15) << std::abs((dy_points[N - 1] - dfu(x_points[N - 1])) / (dfu(x_points[N - 1] + epsilon)) * 100) - << std::endl; -} - -int main() -{ - { - float_v x_i(float_v::IndexType::IndexesFromZero()); - for ( unsigned int i = 0; i < x_points.vectorsCount(); ++i, x_i += float_v::Size ) { - const float_v x = x_i * h; - x_points.vector(i) = x; - y_points.vector(i) = 
fu(x); - } - } - - dy_points = Vc::malloc(N + float_v::Size - 1) + (float_v::Size - 1); - - double speedup; - TimeStampCounter timer; - - { ///////// ignore this part - it only wakes up the CPU //////////////////////////// - const float oneOver2h = 0.5f / h; - - // set borders explicit as up- or downdifferential - dy_points[0] = (y_points[1] - y_points[0]) / h; - // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and - // Vc::SSE are faster, though. - for ( int i = 1; i < N - 1; ++i) { - dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h; - } - dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; - } ////////////////////////////////////////////////////////////////////////////////// - - { - std::cout << "\n" << std::setw(60) << "Classical finite difference method" << std::endl; - timer.Start(); - - const float oneOver2h = 0.5f / h; - - // set borders explicit as up- or downdifferential - dy_points[0] = (y_points[1] - y_points[0]) / h; - // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and - // Vc::SSE are faster, though. - for ( int i = 1; i < N - 1; ++i) { - dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h; - } - dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; - - timer.Stop(); - printResults(); - std::cout << "cycle count: " << timer.Cycles() - << " | " << static_cast(N * 2) / timer.Cycles() << " FLOP/cycle" - << " | " << static_cast(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle" - << "\n"; - } - - speedup = timer.Cycles(); - { - std::cout << std::setw(60) << "Vectorized finite difference method" << std::endl; - timer.Start(); - - // All the differentials require to calculate (r - l) / 2h, where we calculate 1/2h as a - // constant before the loop to avoid unnecessary calculations. Note that a good compiler can - // already do this for you. 
- const float_v oneOver2h = 0.5f / h; - - // Calculate the left border - dy_points[0] = (y_points[1] - y_points[0]) / h; - - // Calculate the differentials streaming through the y and dy memory. The picture below - // should give an idea of what values in y get read and what values are written to dy in - // each iteration: - // - // y [...................................] - // 00001111222233334444555566667777 - // 00001111222233334444555566667777 - // dy [...................................] - // 00001111222233334444555566667777 - // - // The loop is manually unrolled four times to improve instruction level parallelism and - // prefetching on architectures where four vectors fill one cache line. (Note that this - // unrolling breaks auto-vectorization of the Vc::Scalar implementation when compiling with - // GCC.) - for (unsigned int i = 0; i < (y_points.entriesCount() - 2) / float_v::Size; i += 4) { - // Prefetches make sure the data which is going to be used in 24/4 iterations is already - // in the L1 cache. The prefetchForOneRead additionally instructs the CPU to not evict - // these cache lines to L2/L3. - Vc::prefetchForOneRead(&y_points[(i + 24) * float_v::Size]); - - // calculate float_v::Size differentials per (left - right) / 2h - const float_v dy0 = (y_points.vector(i + 0, 2) - y_points.vector(i + 0)) * oneOver2h; - const float_v dy1 = (y_points.vector(i + 1, 2) - y_points.vector(i + 1)) * oneOver2h; - const float_v dy2 = (y_points.vector(i + 2, 2) - y_points.vector(i + 2)) * oneOver2h; - const float_v dy3 = (y_points.vector(i + 3, 2) - y_points.vector(i + 3)) * oneOver2h; - - // Use streaming stores to reduce the required memory bandwidth. Without streaming - // stores the CPU would first have to load the cache line, where the store occurs, from - // memory into L1, then overwrite the data, and finally write it back to memory. 
But - // since we never actually need the data that the CPU fetched from memory we'd like to - // keep that bandwidth free for real work. Streaming stores allow us to issue stores - // which the CPU gathers in store buffers to form full cache lines, which then get - // written back to memory directly without the costly read. Thus we make better use of - // the available memory bandwidth. - dy0.store(&dy_points[(i + 0) * float_v::Size + 1], Vc::Streaming); - dy1.store(&dy_points[(i + 1) * float_v::Size + 1], Vc::Streaming); - dy2.store(&dy_points[(i + 2) * float_v::Size + 1], Vc::Streaming); - dy3.store(&dy_points[(i + 3) * float_v::Size + 1], Vc::Streaming); - } - - // Process the last vector. Note that this works for any N because Vc::Memory adds padding - // to y_points and dy_points such that the last scalar value is somewhere inside lastVector. - // The correct right border value for dy_points is overwritten in the last step unless N is - // a multiple of float_v::Size + 2. - // y [...................................] - // 8888 - // 8888 - // dy [...................................] - // 8888 - { - const size_t i = y_points.vectorsCount() - 1; - const float_v left = y_points.vector(i, -2); - const float_v right = y_points.lastVector(); - ((right - left) * oneOver2h).store(&dy_points[i * float_v::Size - 1], Vc::Unaligned); - } - - // ... 
and finally the right border - dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h; - - timer.Stop(); - printResults(); - std::cout << "cycle count: " << timer.Cycles() - << " | " << static_cast(N * 2) / timer.Cycles() << " FLOP/cycle" - << " | " << static_cast(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle" - << "\n"; - } - speedup /= timer.Cycles(); - std::cout << "Speedup: " << speedup << "\n"; - - Vc::free(dy_points - float_v::Size + 1); - return 0; -} diff --git a/math/vc/examples/fit/GaussFunction.h b/math/vc/examples/fit/GaussFunction.h deleted file mode 100644 index 535efe971b358..0000000000000 --- a/math/vc/examples/fit/GaussFunction.h +++ /dev/null @@ -1,83 +0,0 @@ - -#include "Math/IParamFunction.h" -#include - - -class GaussFunction : public ROOT::Math::IParamMultiGradFunction { - -public: - - enum { - kNPar = 3 - }; - - GaussFunction(double amp = 1, double mean = 0, double sigma = 1) { - fParams[0] = amp; - fParams[1] = mean; - fParams[2] = sigma; - fLogAmp = std::log(amp); - } - - unsigned int NDim() const { return 1; } - - unsigned int NPar() const { return kNPar; } - - inline double amp() const { return fParams[0]; } - inline double logamp() const { return fLogAmp; } - inline double mean() const { return fParams[1]; } - inline double sigma() const { return fParams[2]; } - - const double * Parameters() const { return fParams; } - - void SetParameters(const double * p) { std::copy(p,p+kNPar,fParams); /* fLogAmp = std::log( p[0] ); */ } - - - ROOT::Math::IMultiGenFunction * Clone() const { return new GaussFunction(amp(), mean(), sigma() ); } - - - // implementing this is much faster - double operator()(const double *x, const double * p) { - double y = (x[0]-p[1])/p[2]; - return p[0]*std::exp(-0.5*y*y); - } - - using ROOT::Math::IParamMultiGradFunction::operator(); - - void ParameterGradient(const double *x, const double * p, double * g) const { - double a = p[0]; - double m = p[1]; - double s = p[2]; - double y = (x[0]- m )/s; - g[0] = 
std::exp(-0.5*y*y); - g[1] = a *g[0]*y/s; - g[2] = g[1]*y; - } - - -private: - - - double DoEvalPar(const double * x, const double * p) const { - double a = p[0]; - double m = p[1]; - double s = p[2]; - double y = (x[0]-m)/s; - return a*std::exp(-0.5*y*y); - } - - double DoDerivative(const double *x, unsigned int icoord) const { - assert (icoord == 0); - double dGdx = -(*this)(x) * (x[0]-mean())/(sigma()*sigma()); - return dGdx; - } - - double DoParameterDerivative(const double *x, const double * p, unsigned int ipar) const { - double grad[3]; - ParameterGradient(x, p, &grad[0] ); - return grad[ipar]; - } - - - double fParams[kNPar]; - double fLogAmp; -}; diff --git a/math/vc/examples/fit/Makefile b/math/vc/examples/fit/Makefile deleted file mode 100644 index a0f6f2c0dd691..0000000000000 --- a/math/vc/examples/fit/Makefile +++ /dev/null @@ -1,121 +0,0 @@ -# Makefile for the ROOT test programs. -# This Makefile shows nicely how to compile and link applications -# using the ROOT libraries on all supported platforms. -# -# Copyright (c) 2000 Rene Brun and Fons Rademakers -# -# Author: Fons Rademakers, 29/2/2000 - -#ROOTSYS = ../../../.. 
-include $(ROOTSYS)/etc/Makefile.arch -include $(ROOTSYS)/config/Makefile.config - -#------------------------------------------------------------------------------ - -# ifeq ($(PLATFORM),macosx) -# #unroll loop better on gcc > 4 -#CXXFLAGS+= -O3 -g -# endif -AVXCXXFLAG := -mavx -SIMDCXXFLAGS := -mavx -msse4.2 -msse4.1 -msse4a -mssse3 -msse3 -msse2 -VCFLAGS := -fabi-version=0 -Wno-unused-function - -CXXFLAGS+= $(VCFLAGS) - - -ifeq ($(NOAVX),) -CXXFLAGS+= $(AVXCXXFLAG) -LDFLAGS += $(AVXCXXFLAG) -endif -#CXXFLAGS+= -O3 -#CXXFLAGS+= --fast-math -#CXXFLAGS += -ftree-vectorize -ifneq ($(AUTOVEC),) -CXXFLAGS+= -ftree-vectorize -endif -ifneq ($(OPT3),) -CXXFLAGS+= -O3 -LDFLAGS += -O3 -endif -ifneq ($(OPT2EXT),) -CXXFLAGS+= -finline-functions -ftree-vectorize -funswitch-loops -fgcse-after-reload -fipa-cp-clone -fpredictive-commoning -ftree-loop-distribute-patterns -LDFLAGS+= -finline-functions -ftree-vectorize -funswitch-loops -fgcse-after-reload -fipa-cp-clone -fpredictive-commoning -ftree-loop-distribute-patterns -endif -ifneq ($(FASTM),) -CXXFLAGS+= -O3 --fast-math -ftree-vectorize -Ofast -LDFLAGS+= -O3 --fast-math -ftree-vectorize -Ofast -endif -# -ifneq ($(USEVC),) -CXXFLAGS+= -DUSE_VC -EXTRALIBS += $(ROOTSYS)/lib/libVc.a -ifneq ($(VCSCALAR),) -CXXFLAGS+= -DVC_IMPL=Scalar -endif -endif - -ifneq ($(USEVDT),) -CXXFLAGS+= -DUSE_VDT -I/home/data/moneta/vdt_trunk/include -ifeq ($(NOAVX),) -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-avx -lvdt -else -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-sse -lvdt -endif -endif - -ifneq ($(DEBUG),) -CXXFLAGS += -g -DDEBUG -endif - -#for debugging vectorization -#CXXFLAGS+= -ftree-vectorizer-verbose=2 - - -ifneq ($(NDIM1),) -CXXFLAGS += -DNDIM1=$(NDIM1) -endif -ifneq ($(NDIM2),) -CXXFLAGS += -DNDIM2=$(NDIM2) -endif - - -LIBS = -L$(ROOTSYS)/lib -lCore -lMathCore -lMatrix -lHist -lTree - - -TESTFITOBJ = testFitPerf.$(ObjSuf) -TESTFITSRC = testFitPerf.$(SrcSuf) -TESTFIT = testFitPerf$(ExeSuf) - - -OBJS = $(TESTFITOBJ) - -PROGRAMS = 
$(TESTFIT) - - -.SUFFIXES: .$(SrcSuf) .$(ObjSuf) $(ExeSuf) - - -$(TESTFIT): $(TESTFITOBJ) - $(LD) $(LDFLAGS) $^ $(LIBS) $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - - - - -check: all - for prog in $(PROGRAMS); do \ - ./$$prog > $$prog.out; \ - done; - -clean: - @rm -f $(OBJS) $(PROGRAMS) - -distclean: clean - @rm -f $(PROGRAMS) - - -.SUFFIXES: .$(SrcSuf) - - -.$(SrcSuf).$(ObjSuf): - $(CXX) $(CXXFLAGS) -c $< diff --git a/math/vc/examples/fit/MinimizerTypes.h b/math/vc/examples/fit/MinimizerTypes.h deleted file mode 100644 index 7b66a1cbbae45..0000000000000 --- a/math/vc/examples/fit/MinimizerTypes.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef ROOT_Fit_MinimizerTypes -#define ROOT_Fit_MinimizerTypes - -// define a different type so easy to see in Shark -struct TMINUIT { - static std::string name() { return "Minuit"; } - static std::string name2() { return ""; } -}; -struct TFUMILI { - static std::string name() { return "Fumili"; } - static std::string name2() { return ""; } -}; -struct MINUIT2 { - static std::string name() { return "Minuit2"; } - static std::string name2() { return ""; } -}; -struct FUMILI2 { - static std::string name() { return "Fumili2"; } - static std::string name2() { return ""; } -}; -struct DUMMY { - static std::string name() { return "Dummy"; } - static std::string name2() { return ""; } -}; -struct GSL_FR { - static std::string name() { return "GSLMultiMin"; } - static std::string name2() { return "ConjugateFR"; } -}; -struct GSL_PR { - static std::string name() { return "GSLMultiMin"; } - static std::string name2() { return "ConjugatePR"; } -}; -struct GSL_BFGS { - static std::string name() { return "GSLMultiMin"; } - static std::string name2() { return "BFGS"; } -}; -struct GSL_BFGS2 { - static std::string name() { return "GSLMultiMin"; } - static std::string name2() { return "BFGS2"; } -}; -struct GSL_NLS { - static std::string name() { return "GSLMultiFit"; } - static std::string name2() { return ""; } -}; - -struct LINEAR { - static 
std::string name() { return "Linear"; } - static std::string name2() { return ""; } -}; - -#endif diff --git a/math/vc/examples/fit/testFitPerf.cxx b/math/vc/examples/fit/testFitPerf.cxx deleted file mode 100644 index 9739693f313f4..0000000000000 --- a/math/vc/examples/fit/testFitPerf.cxx +++ /dev/null @@ -1,1105 +0,0 @@ -#include "TH1.h" -#include "TF1.h" -#include "TF2.h" -#include "TMath.h" -#include "TSystem.h" -#include "TRandom3.h" -#include "TTree.h" -#include "TROOT.h" - -#include "Fit/BinData.h" -#include "Fit/UnBinData.h" -//#include "Fit/BinPoint.h" -#include "Fit/Fitter.h" -#include "HFitInterface.h" - -#include "Math/IParamFunction.h" -#include "Math/WrappedTF1.h" -#include "Math/WrappedMultiTF1.h" -#include "Math/WrappedParamFunction.h" -#include "Math/MultiDimParamFunctionAdapter.h" - -#include "TGraphErrors.h" - -#include "TStyle.h" - -#include "TSeqCollection.h" - -#include "Math/Polynomial.h" -#include "Math/DistFunc.h" - -#include -#include - -#include "TStopwatch.h" - -#include "TVirtualFitter.h" -// #include "TFitterFumili.h" -// #include "TFumili.h" - -#include "GaussFunction.h" - -// #include "RooDataHist.h" -// #include "RooDataSet.h" -// #include "RooRealVar.h" -// #include "RooGaussian.h" -// #include "RooMinuit.h" -// #include "RooChi2Var.h" -// #include "RooGlobalFunc.h" -// #include "RooFitResult.h" -// #include "RooProdPdf.h" - -#include - -#include "MinimizerTypes.h" - -#ifdef USE_VC -#include "Vc/Vc" -//#include "Vc/Allocator" -//Vc_DECLARE_ALLOCATOR(Vc::double_v) -#endif - -//#define USE_AOS - -#ifdef USE_VDT -#include "vdtMath.h" -#endif - -//#define DEBUG - -int nfit; -const int N = 20; -double iniPar[2*N]; -int ndimObs; -int ndimPars; - -void printData(const ROOT::Fit::UnBinData & data) { - for (unsigned int i = 0; i < data.Size(); ++i) { - std::cout << data.Coords(i)[0] << "\t"; - } - std::cout << "\ndata size is " << data.Size() << std::endl; -} - -void printResult(int iret) { - std::cout << 
"\n************************************************************\n"; - std::cout << "Test\t\t\t\t"; - if (iret == 0) std::cout << "OK"; - else std::cout << "FAILED"; - std::cout << "\n************************************************************\n"; -} - -bool USE_BRANCH = false; -ROOT::Fit::UnBinData * FillUnBinData(TTree * tree, bool copyData = true, unsigned int dim = 1 ) { - - // fill the unbin data set from a TTree - ROOT::Fit::UnBinData * d = 0; - // for the large tree - if (std::string(tree->GetName()) == "t2") { - d = new ROOT::Fit::UnBinData(); - // large tree - unsigned int n = tree->GetEntries(); -#ifdef DEBUG - std::cout << "number of unbin data is " << n << " of dim " << N << std::endl; -#endif - d->Initialize(n,N); - TBranch * bx = tree->GetBranch("x"); - double vx[N]; - bx->SetAddress(vx); - std::vector m(N); - for (int unsigned i = 0; i < n; ++i) { - bx->GetEntry(i); - d->Add(vx); - for (int j = 0; j < N; ++j) - m[j] += vx[j]; - } - -#ifdef DEBUG - std::cout << "average values of means :\n"; - for (int j = 0; j < N; ++j) - std::cout << m[j]/n << " "; - std::cout << "\n"; -#endif - - return d; - } - if (USE_BRANCH) - { - d = new ROOT::Fit::UnBinData(); - unsigned int n = tree->GetEntries(); - //std::cout << "number of unbin data is " << n << std::endl; - - if (dim == 2) { - d->Initialize(n,2); - TBranch * bx = tree->GetBranch("x"); - TBranch * by = tree->GetBranch("y"); - double v[2]; - bx->SetAddress(&v[0]); - by->SetAddress(&v[1]); - for (int unsigned i = 0; i < n; ++i) { - bx->GetEntry(i); - by->GetEntry(i); - d->Add(v); - } - } - else if (dim == 1) { - d->Initialize(n,1); - TBranch * bx = tree->GetBranch("x"); - double v[1]; - bx->SetAddress(&v[0]); - for (int unsigned i = 0; i < n; ++i) { - bx->GetEntry(i); - d->Add(v); - } - } - - return d; - - //printData(d); - } - else { - tree->SetEstimate(tree->GetEntries()); - - // use TTREE::Draw - if (dim == 2) { - tree->Draw("x:y",0,"goff"); // goff is used to turn off the graphics - double * x = 
tree->GetV1(); - double * y = tree->GetV2(); - - if (x == 0 || y == 0) { - USE_BRANCH= true; - return FillUnBinData(tree, true, dim); - } - - // use array pre-allocated in tree->Draw . This is faster - //assert(x != 0); - unsigned int n = tree->GetSelectedRows(); - - if (copyData) { - d = new ROOT::Fit::UnBinData(n,2); - double vx[2]; - for (int unsigned i = 0; i < n; ++i) { - vx[0] = x[i]; - vx[1] = y[i]; - d->Add(vx); - } - } - else // use data pointers directly - d = new ROOT::Fit::UnBinData(n,x,y); - - } - else if ( dim == 1) { - - tree->Draw("x",0,"goff"); // goff is used to turn off the graphics - double * x = tree->GetV1(); - - if (x == 0) { - USE_BRANCH= true; - return FillUnBinData(tree, true, dim); - } - unsigned int n = tree->GetSelectedRows(); - - if (copyData) { - d = new ROOT::Fit::UnBinData(n,1); - for (int unsigned i = 0; i < n; ++i) { - d->Add(x[i]); - } - } - else - d = new ROOT::Fit::UnBinData(n,x); - } - return d; - } - - //std::copy(x,x+n, d.begin() ); - return 0; -} - - - - -// print the data -template -void printData(const T & data) { - for (typename T::const_iterator itr = data.begin(); itr != data.end(); ++itr) { - std::cout << itr->Coords()[0] << " " << itr->Value() << " " << itr->Error() << std::endl; - } - std::cout << "\ndata size is " << data.Size() << std::endl; -} - - -// new likelihood function for unbinned data -using namespace ROOT::Fit; - -template -struct VecNLL { - - VecNLL( ROOT::Fit::UnBinData & data, F & f) : - fData(&data), fFunc(f) - { } //GetData(); } - - // void GetData() { - - // const UnBinData & data = *fData; - // //const ROOT::Math::IParamMultiFunction & func = *fFunc; //fNLL.ModelFunction(); - - // unsigned int n = data.Size(); - - // fX = std::vector(n); - // fY = std::vector(n); - // for (int i = 0; iGetNpar()); - pol.SetParameters(func->GetParameters() ); - ROOT::Math::WrappedParamFunction f(pol,1,func->GetParameters(),func->GetParameters()+func->GetNpar() ); -#endif - GaussFunction f; - 
f.SetParameters(func.Parameters()); - fitter.SetFunction(f); - } - - - bool ret = fitter.Fit(d); - if (!ret) { - std::cout << " Fit Failed " << std::endl; - return -1; - } - if (debug) - fitter.Result().Print(std::cout); - return 0; -} - -// unbin fit -template -int DoUnBinFit(T * tree, Func & func, bool debug = false, bool copyData = false ) { - - ROOT::Fit::UnBinData * d = FillUnBinData(tree, copyData, func.NDim() ); - // need to have done Tree->Draw() before fit - //FillUnBinData(d,tree); - - //std::cout << "data size type and size is " << typeid(*d).name() << " " << d->Size() << std::endl; - if (debug) { - if (copyData) - std::cout << "\tcopy data in FitData\n"; - else - std::cout << "\tre-use original data \n"; - } - - - //printData(d); - - // create the fitter - //std::cout << "Fit parameter 2 " << f.Parameters()[2] << std::endl; - - ROOT::Fit::Fitter fitter; - fitter.Config().SetMinimizer(MinType::name().c_str(),MinType::name2().c_str()); - - if (debug) - fitter.Config().MinimizerOptions().SetPrintLevel(3); - - // set tolerance 1 for tree to be same as in TTTreePlayer::UnBinFIt - fitter.Config().MinimizerOptions().SetTolerance(1); - - - // create the function - - // need to fix param 0 , normalization in the unbinned fits - //fitter.Config().ParSettings(0).Fix(); - - fitter.SetFunction(func); - bool ret = fitter.Fit(*d); - - if (!ret) { - std::cout << " Fit Failed " << std::endl; - return -1; - } - if (debug) - fitter.Result().Print(std::cout); - - delete d; - - return 0; - -} - - -template -int DoUnBinFitVec(T * tree, VFunc & func, int ndim, int npar, const double * p0, bool debug = false, bool copyData = false ) { - - ROOT::Fit::UnBinData * d = FillUnBinData(tree, copyData, ndim ); - // need to have done Tree->Draw() before fit - //FillUnBinData(d,tree); - - //std::cout << "data size type and size is " << typeid(*d).name() << " " << d->Size() << std::endl; - if (debug) { - if (copyData) - std::cout << "\tcopy data in FitData\n"; - else - std::cout << 
"\tre-use original data \n"; - } - - - //printData(d); - - // create the fitter - //std::cout << "Fit parameter 2 " << f.Parameters()[2] << std::endl; - - ROOT::Fit::Fitter fitter; - fitter.Config().SetMinimizer(MinType::name().c_str(),MinType::name2().c_str()); - - if (debug) - fitter.Config().MinimizerOptions().SetPrintLevel(3); - - // set tolerance 1 for tree to be same as in TTTreePlayer::UnBinFIt - fitter.Config().MinimizerOptions().SetTolerance(1); - - - // create the function - - // need to fix param 0 , normalization in the unbinned fits - //fitter.Config().ParSettings(0).Fix(); - - VecNLL fcn(*d, func); - bool ret = fitter.FitFCN(npar, fcn, p0); - - if (!ret) { - std::cout << " Fit Failed " << std::endl; - return -1; - } - if (debug) - fitter.Result().Print(std::cout); - - std::cout << "use vec nll : nll = " << fitter.Result().MinFcnValue() << std::endl; - - - delete d; - - return 0; - -} - - -template -int DoFit(TTree * tree, Func & func, bool debug = false, bool copyData = false ) { - return DoUnBinFit(tree, func, debug, copyData); -} - -template -int DoFit(TH1 * h1, Func & func, bool debug = false, bool copyData = false ) { - return DoBinFit(h1, func, debug, copyData); -} -template -int DoFit(TGraph * gr, Func & func, bool debug = false, bool copyData = false ) { - return DoBinFit(gr, func, debug, copyData); -} - -template -int DoFitVec(TTree * tree, F & func, int n1, int n2, const double * p, bool debug = false, bool copyData = false ) { - return DoUnBinFitVec(tree, func, n1,n2,p,debug, copyData); -} - -template -int DoFitVec(TH1 * h1, F & func, int, int , const double *, bool debug = false, bool copyData = false ) { - return DoBinFit(h1, func, debug, copyData); -} -template -int DoFitVec(TGraph * gr, F & func, int, int , const double *, bool debug = false, bool copyData = false ) { - return DoBinFit(gr, func, debug, copyData); -} - - -template -int FitUsingNewFitter(FitObj * fitobj, FuncObj func, bool useGrad=false) { - - std::cout << 
"\n************************************************************\n"; - std::cout << "\tFit using new Fit::Fitter " << typeid(*fitobj).name() << std::endl; - std::cout << "\tMinimizer is " << MinType::name() << " " << MinType::name2() << " func dim = " << ndimObs << std::endl; - - int iret = 0; - TStopwatch w; w.Start(); - -#define USE_VECNLL -#ifndef USE_VECNLL - -#ifdef DEBUG - // std::cout << "initial Parameters " << iniPar << " " << *iniPar << " " << *(iniPar+1) << std::endl; - func.SetParameters(iniPar); - iret |= DoFit(fitobj,func,true, useGrad); - if (iret != 0) { - std::cout << "Fit failed " << std::endl; - } - -#else - for (int i = 0; i < nfit; ++i) { - func.SetParameters(iniPar); - iret = DoFit(fitobj,func, false, useGrad); - if (iret != 0) { - std::cout << "Fit failed " << std::endl; - break; - } - } -#endif - -#else - - // use vectorized function - - for (int i = 0; i < nfit; ++i) { - iret = DoFitVec(fitobj,func, ndimObs, ndimPars, iniPar, false, useGrad); - if (iret != 0) { - std::cout << "Fit failed " << std::endl; - break; - } - } - -#endif - - w.Stop(); - std::cout << "\nTime: \t" << w.RealTime() << " , " << w.CpuTime() << std::endl; - std::cout << "\n************************************************************\n"; - - return iret; -} - - - -double poly2(const double *x, const double *p) { - return p[0] + (p[1]+p[2]*x[0] ) * x[0]; -} - -int testPolyFit() { - - int iret = 0; - - - std::cout << "\n\n************************************************************\n"; - std::cout << "\t POLYNOMIAL FIT\n"; - std::cout << "************************************************************\n"; - - std::string fname("pol2"); - //TF1 * func = (TF1*)gROOT->GetFunction(fname.c_str()); - TF1 * f1 = new TF1("pol2",fname.c_str(),-5,5.); - - f1->SetParameter(0,1); - f1->SetParameter(1,0.0); - f1->SetParameter(2,1.0); - - - // fill an histogram - TH1D * h1 = new TH1D("h1","h1",20,-5.,5.); -// h1->FillRandom(fname.c_str(),100); - for (int i = 0; i <1000; ++i) - h1->Fill( 
f1->GetRandom() ); - - //h1->Print(); - //h1->Draw(); - iniPar[0] = 2.; iniPar[1] = 2.; iniPar[2] = 2.; - - - // dummy for testing - //iret |= FitUsingNewFitter(h1,f1); - - // use simply TF1 wrapper - //ROOT::Math::WrappedMultiTF1 f2(*f1); - ROOT::Math::WrappedParamFunction<> f2(&poly2,1,iniPar,iniPar+3); - - - // if Minuit2 is later than TMinuit on Interl is much slower , why ?? - iret |= FitUsingNewFitter(h1,f2); - iret |= FitUsingNewFitter(h1,f2); - - // test with linear fitter - // for this test need to pass a multi-dim function - ROOT::Math::WrappedTF1 wf(*f1); - ROOT::Math::MultiDimParamGradFunctionAdapter lfunc(wf); - iret |= FitUsingNewFitter(h1,lfunc,true); - - // test with a graph - - gStyle->SetErrorX(0.); // to seto zero error on X - TGraphErrors * gr = new TGraphErrors(h1); - - iret |= FitUsingNewFitter(gr,f2); - - - std::cout << "\n-----> test now TGraphErrors with errors in X coordinates\n\n"; - // try with error in X - gStyle->SetErrorX(0.5); // to set zero error on X - TGraphErrors * gr2 = new TGraphErrors(h1); - - iret |= FitUsingNewFitter(gr2,f2); - - printResult(iret); - - return iret; -} - -template -struct GausFunctions { - typedef T value_type; - - static T gaussian(const T *x, const double *p) { - //return p[0]*TMath::Gaus(x[0],p[1],p[2]); - T tmp = (x[0]-p[1])/p[2]; - return p[0] * std::exp(-tmp*tmp/2); -} - - static T gausnorm(const T *x, const double *p) { - //return p[0]*TMath::Gaus(x[0],p[1],p[2]); - T invsig = 1./p[1]; - T tmp = (x[0]-p[0]) * invsig; - const T sqrt_2pi = 1./std::sqrt(2.* 3.14159 ); - return std::exp(-0.5 * tmp*tmp ) * sqrt_2pi * invsig; -} - static T gausnorm2D(const T *x, const double *p) { - //return p[0]*TMath::Gaus(x[0],p[1],p[2]); - return gausnorm(x,p)*gausnorm(x+1,p+2); - } - static T gausnormN(const T *x, const double *p) { - //return p[0]*TMath::Gaus(x[0],p[1],p[2]); - T f = 1.0; - for (int i = 0; i < N; ++i) - f *= gausnorm(x+i,p+2*i); - - return f; - } - - static void gausnorm_v(unsigned int n, unsigned int 
stride, const double *x, const double *p, double * res) { - - double invsig = 1./p[1]; - std::vector arg(n); - for (unsigned int i = 0; i< n; ++i){ - double tmp = (x[i*stride]-p[0]) * invsig; - arg[i] = -0.5 *tmp*tmp; - } - const double sqrt_2pi = 1./std::sqrt(2.* 3.14159 ); -#ifdef USE_VDT - vdt::fast_expv(n,&arg[0],res); - for (unsigned int i = 0; i< n; ++i){ -#else - for (unsigned int i = 0; i< n; ++i){ - res[i] = std::exp(arg[i]); -#endif - res[i] *= sqrt_2pi * invsig; - } - - } - - static void gausnorm2D_v(unsigned int n, unsigned int stride,const double *x, const double *p, double * res) { - std::vector tmp(n); - gausnorm_v(n,2,x,p,&tmp[0]); - gausnorm_v(n,2,x+1,p+2,res); - for (unsigned int i = 0; i< n; ++i){ - res[i] *= tmp[i]; - } - } - static void gausnormN_v(unsigned int n, unsigned int stride,const double *x, const double *p, double * res) { - std::vector tmp(n); - gausnorm_v(n,stride,x,p,res); - for (int j = 1; j < stride; ++j) { - gausnorm_v(n,stride,x+j,p+2*j,&tmp[0]); - for (unsigned int i = 0; i< n; ++i){ - res[i] *= tmp[i]; - } - } - } - -}; - -double gaussian (const double *x, const double *p) { - return GausFunctions::gaussian(x,p); -} -double gausnorm (const double *x, const double *p) { - return GausFunctions::gausnorm(x,p); -} -double gausnorm2D (const double *x, const double *p) { - return GausFunctions::gausnorm2D(x,p); -} -double gausnormN (const double *x, const double *p) { - return GausFunctions::gausnormN(x,p); -} - -int testGausFit() { - - int iret = 0; - - std::cout << "\n\n************************************************************\n"; - std::cout << "\t GAUSSIAN FIT\n"; - std::cout << "************************************************************\n"; - - - - //std::string fname = std::string("gaus"); - //TF1 * func = (TF1*)gROOT->GetFunction(fname.c_str()); - //TF1 * f1 = new TF1("gaus",fname.c_str(),-5,5.); - //TF1 * f1 = new TF1("gaussian",gaussian,-5,5.,3); - //f2->SetParameters(0,1,1); - - // fill an histogram - int nbin = 
10000; - TH1D * h2 = new TH1D("h2","h2",nbin,-5.,5.); -// h1->FillRandom(fname.c_str(),100); - for (int i = 0; i < 10000000; ++i) - h2->Fill( gRandom->Gaus(0,10) ); - - iniPar[0] = 100.; iniPar[1] = 2.; iniPar[2] = 2.; - - - // use simply TF1 wrapper - //ROOT::Math::WrappedMultiTF1 f2(*f1); - ROOT::Math::WrappedParamFunction<> f2(&gaussian,1,iniPar,iniPar+3); - - - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - -// iret |= FitUsingNewFitter(h2,f2); - - - - - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - - - // test also fitting a TGraphErrors with histogram data - gStyle->SetErrorX(0.); // to seto zero error on X - TGraphErrors * gr = new TGraphErrors(h2); - - - iret |= FitUsingNewFitter(gr,f2); - - // try with error in X - gStyle->SetErrorX(0.5); // to seto zero error on X - TGraphErrors * gr2 = new TGraphErrors(h2); - - iret |= FitUsingNewFitter(gr2,f2); - - - -//#ifdef LATER - // test using grad function - std::cout << "\n\nTest Using pre-calculated gradients\n\n"; - bool useGrad=true; - iret |= FitUsingNewFitter(h2,f2,useGrad); - iret |= FitUsingNewFitter(h2,f2,useGrad); - iret |= FitUsingNewFitter(h2,f2,useGrad); - iret |= FitUsingNewFitter(h2,f2,useGrad); - iret |= FitUsingNewFitter(h2,f2,useGrad); - iret |= FitUsingNewFitter(h2,f2,useGrad); - - - // test LS algorithm - std::cout << "\n\nTest Least Square algorithms\n\n"; - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - iret |= FitUsingNewFitter(h2,f2); - -// iret |= FitUsingTFit(h2,f1); -// iret |= FitUsingTFit(h2,f1); -//#endif - - //iret |= FitUsingRooFit(h2,f1); - - printResult(iret); - - return iret; -} - -int testTreeFit() { - - std::cout << "\n\n************************************************************\n"; - std::cout << "\t UNBINNED TREE (GAUSSIAN) FIT\n"; - std::cout << "************************************************************\n"; - - - TTree 
t1("t1","a simple Tree with simple variables"); - double x, y; - Int_t ev; - t1.Branch("x",&x,"x/D"); - t1.Branch("y",&y,"y/D"); -// t1.Branch("pz",&pz,"pz/F"); -// t1.Branch("random",&random,"random/D"); - t1.Branch("ev",&ev,"ev/I"); - - //fill the tree - int nrows = 10000; -#ifdef TREE_FIT2D - nrows = 10000; -#endif - for (Int_t i=0;iRannor(x,y); - x *= 2; x += 1.; - y *= 3; y -= 2; - - ev = i; - t1.Fill(); - - } - //t1.Draw("x"); // to select fit variable - - //TF1 * f1 = new TF1("gausnorm", gausnorm, -10,10, 2); - //TF2 * f2 = new TF2("gausnorm2D", gausnorm2D, -10,10, -10,10, 4); - - ROOT::Math::WrappedParamFunction<> wf1(&gausnorm,1,iniPar,iniPar+2); - ROOT::Math::WrappedParamFunction<> wf2(&gausnorm2D,2,iniPar,iniPar+4); - - - iniPar[0] = 0; - iniPar[1] = 1; - iniPar[2] = 0; - iniPar[3] = 1; - - // use simply TF1 wrapper - //ROOT::Math::WrappedMultiTF1 f2(*f1); - - int iret = 0; - - // fit 1D first - - - - // iret |= FitUsingNewFitter(&t1,wf1,false); // not copying the data - // iret |= FitUsingNewFitter(&t1,wf1,false); // not copying the data - - - ndimObs = wf1.NDim(); - ndimPars = wf1.NPar(); - -#ifdef USE_AOS - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm_v,true); // copying the data -#else - - -#ifndef USE_VC - iret |= FitUsingNewFitter(&t1,wf1,true); // copying the data - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm,true); // copying the data -#else - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm,true); // copying the data -#endif -// iret |= FitUsingNewFitter(&t1,wf1,true); // copying the data - -#endif - // fit 2D - - ndimObs = wf2.NDim(); - ndimPars = wf2.NPar(); - - -#ifndef USE_AOS -#ifndef USE_VC - iret |= FitUsingNewFitter(&t1,wf2, true); - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm2D, true); -#else - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm2D, true); -#endif - -#else - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnorm2D_v,true); // copying the data -#endif - - //iret |= 
FitUsingNewFitter(&t1,wf2, false); - - - - - printResult(iret); - return iret; - -} - -int testLargeTreeFit(int nevt = 1000) { - - - - std::cout << "\n\n************************************************************\n"; - std::cout << "\t UNBINNED TREE (GAUSSIAN MULTI-DIM) FIT\n"; - std::cout << "************************************************************\n"; - - TTree t1("t2","a large Tree with simple variables"); - double x[N]; - Int_t ev; - t1.Branch("x",x,"x[20]/D"); - t1.Branch("ev",&ev,"ev/I"); - - //fill the tree - TRandom3 r; - for (Int_t i=0;i f2(&gausnormN,N,2*N,iniPar); - - ndimObs = f2.NDim(); - ndimPars = f2.NPar(); - - - int iret = 0; - - -#ifndef USE_VC - -#ifndef USE_AOS - iret |= FitUsingNewFitter(&t1,f2); - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnormN,true); -#else - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnormN_v,true); -#endif - // iret |= FitUsingNewFitter(&t1,f2); -#else - iret |= FitUsingNewFitter(&t1,&GausFunctions::gausnormN,true); -#endif - - - printResult(iret); - return iret; - - -} - - -int testFitPerf() { - - int iret = 0; - - - - -#ifdef DEBUG - nfit = 1; -#else - nfit = 10; -#endif - iret |= testTreeFit(); - - nfit = 1; - iret |= testLargeTreeFit(2000); - - -#ifdef LATER - -#ifndef DEBUG - nfit = 10; -#endif - iret |= testGausFit(); - - - -#ifndef DEBUG - nfit = 1000; -#endif - iret |= testPolyFit(); - - -#endif - - - //return iret; - - - - if (iret != 0) - std::cerr << "testFitPerf :\t FAILED " << std::endl; - else - std::cerr << "testFitPerf :\t OK " << std::endl; - return iret; -} - -int main() { - return testFitPerf(); -} - diff --git a/math/vc/examples/genvector/Makefile b/math/vc/examples/genvector/Makefile deleted file mode 100644 index f371c0e587aa3..0000000000000 --- a/math/vc/examples/genvector/Makefile +++ /dev/null @@ -1,109 +0,0 @@ -# Makefile for the ROOT test programs. -# This Makefile shows nicely how to compile and link applications -# using the ROOT libraries on all supported platforms. 
-# -# Copyright (c) 2000 Rene Brun and Fons Rademakers -# -# Author: Fons Rademakers, 29/2/2000 - -#ROOTSYS = ../../../.. -include $(ROOTSYS)/etc/Makefile.arch -include $(ROOTSYS)/config/Makefile.config - -#------------------------------------------------------------------------------ - -# ifeq ($(PLATFORM),macosx) -# #unroll loop better on gcc > 4 -#CXXFLAGS+= -O3 -g -# endif -AVXCXXFLAG := -mavx -SIMDCXXFLAGS := -mavx -msse4.2 -msse4.1 -msse4a -mssse3 -msse3 -msse2 -#VCFLAGS := -fabi-version=0 -Wno-unused-function -VCFLAGS := -Wno-unused-function - -CXXFLAGS+= $(VCFLAGS) - -ifeq ($(NOAVX),) -CXXFLAGS+= $(AVXCXXFLAG) -LDFLAGS += $(AVXCXXFLAG) -endif -#CXXFLAGS+= -O3 -#CXXFLAGS+= --fast-math -#CXXFLAGS += -ftree-vectorize -ifneq ($(AUTOVEC),) -CXXFLAGS+= -ftree-vectorize -endif -ifneq ($(FASTM),) -CXXFLAGS+= -O3 -ffast-math -ftree-vectorize -Ofast -LDFLAGS+= -O3 -ffast-math -ftree-vectorize -Ofast -endif -#CXXFLAGS+= -ftree-vectorizer-verbose=2 - -ifneq ($(USEVC),) -CXXFLAGS+= -DUSE_VC -EXTRALIBS += $(ROOTSYS)/lib/libVc.a -ifneq ($(VCSCALAR),) -CXXFLAGS+= -DVC_IMPL=Scalar -endif -endif - - -ifneq ($(USEVDT),) -CXXFLAGS+= -DUSE_VDT -I/home/data/moneta/vdt_trunk/include -ifeq ($(NOAVX),) -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-avx -lvdt -else -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-sse -lvdt -endif -endif - -ifneq ($(DEBUG),) -CXXFLAGS += -g -DDEBUG -endif - - -LIBS = -L$(ROOTSYS)/lib -lCore -lCling -lMathCore -lMatrix -lGenVector - -VECTOROPOBJ = vectorOperation.$(ObjSuf) -VECTOROPSRC = vectorOperation.$(SrcSuf) -VECTOROP = vectorOperation$(ExeSuf) - - - -OBJS = $(VECTOROPOBJ) - -PROGRAMS = $(VECTOROP) - - -.SUFFIXES: .$(SrcSuf) .$(ObjSuf) $(ExeSuf) - - -$(VECTOROP): $(VECTOROPOBJ) - $(LD) $(LDFLAGS) $^ $(LIBS) $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -vc: clean usevc all - @echo "compiled for VC" - -novc: clean all - @echo "compiled without VC" - - - -check: all - for prog in $(PROGRAMS); do \ - ./$$prog > $$prog.out; \ - done; - -clean: - 
@rm -f $(OBJS) $(PROGRAMS) - -distclean: clean - @rm -f $(PROGRAMS) - - -.SUFFIXES: .$(SrcSuf) - - -.$(SrcSuf).$(ObjSuf): - $(CXX) $(CXXFLAGS) -c $< diff --git a/math/vc/examples/genvector/vectorOperation.cxx b/math/vc/examples/genvector/vectorOperation.cxx deleted file mode 100644 index f1cf8f3402504..0000000000000 --- a/math/vc/examples/genvector/vectorOperation.cxx +++ /dev/null @@ -1,907 +0,0 @@ -// test performance of all vectors operations +,- and * - - - -#include - -#ifdef USE_VDT -#include "vdtMath.h" -#endif - -//#define USE_VC -#ifdef USE_VC -#include "Vc/Vc" -#include -typedef Vc::double_v Double_type; -#define ZERO Vc::Zero -#else -typedef double Double_type; -#define ZERO 0 -#endif - - -#include "TRandom2.h" -#include "TStopwatch.h" - -#include -#include -#include - -#ifdef DIM_2 -#ifdef USE_POINT -#include "Math/Point2D.h" -typedef ROOT::Math::XYPoint VecType; -#elif USE_TVECTOR -#include "TVector2.h" -typedef TVector2 VecType; -#else -#include "Math/Vector2D.h" -typedef ROOT::Math::XYVector VecType; -#endif - -#ifndef USE_ROOT -#define VSUM(v) v.x() + v.y() -#else -#define VSUM(v) v.X() + v.Y() -#endif - - -#elif DIM_3 // 3 Dimensions - -#ifdef USE_POINT -#include "Math/Point3D.h" -typedef ROOT::Math::XYZPoint VecType; -#elif USE_TVECTOR -#include "TVector3.h" -typedef TVector3 VecType; -#else -#include "Math/Vector3D.h" -typedef ROOT::Math::XYZVector VecType; -#endif - -#ifndef USE_TVECTOR -#define VSUM(v) v.x() + v.y() + v.z() -#else -#define VSUM(v) v.X() + v.Y() + v.Z() -#endif - -#else // default is 4D - -#undef USE_POINT -#if USE_TVECTOR -#include "TLorentzVector.h" -typedef TLorentzVector VecType; -#else -#include "Math/Vector4D.h" -typedef ROOT::Math::LorentzVector > VecType; -//#ifdef USE_VC -//Vc_DECLARE_ALLOCATOR(VecType) -//#endif -#endif - -#include "Math/VectorUtil.h" - -#ifndef USE_TVECTOR -#define VSUM(v) v.x() + v.y() + v.z() -//#define VSUM(v) v.x() -#else -#define VSUM(v) v.X() + v.Y() + v.Z() -#endif - - -#endif - -//#define 
VLISTSIZE 8 -#define VLISTSIZE 100 - -#ifdef USE_VC -const int N = VLISTSIZE/Vc::double_v::Size; -#else -const int N = VLISTSIZE; -#endif - -const int NLOOP = 5000000; -//const int NLOOP = 1; - -const int NOP = NLOOP*VLISTSIZE; -const double TSCALE = 1.E9/double(NOP); - -template -class TestVector { -public: - - TestVector(); - - void Operations(); - - void Add(); - void Add2(); - void Sub(); - void Sub2(); - void Scale(); - void Scale2(); - void Divide(); - void Divide2(); - - void MathFunction_sin(); - void MathFunction_exp(); - void MathFunction_log(); - void MathFunction_atan(); - void MathFunction_atan2(); - - void Boost(); - - void Read(); - - void PrintSummary(); - - void PrintResult(Double_type s); - -private: - - std::vector vlist; - std::vector vlist2; - std::vector scale; - std::vector > vcoords; - double fTime[50]; // timing results - int fTest; -}; - -//utility function to aggregate vectors in a vc type -template -void MakeVcVector( const Vector * vlist, Vector_V & vret ) { - const int dim = 4; - typename Vector_V::Scalar vcoord[dim]; - typename Vector::Scalar coord[dim]; - for (int i = 0; i < Size; ++i) { - vlist[i].GetCoordinates(coord); - for (int j = 0; j < dim; ++j) - vcoord[j][i] = coord[j]; - } - vret.SetCootdinates(vcoord); - return vret; -} -// template -// void UnpackVcVector( const Vector * vlist, Vector_V & vret ) { -// const int dim = 4; -// typename Vector_V::Scalar vcoord[dim]; -// typename Vector::Scalar coord[dim]; -// for (int i = 0; i < Size; ++i) { -// vlist[i].GetCoordinates(coord); -// for (int j = 0; j < dim; ++j) -// vcoord[j][i] = coord[j]; -// } -// vret.SetCootdinates(vcoord); -// return vret; -// } - - -template -TestVector::TestVector() : - vlist(N), - vlist2(N), - scale(N), -#ifdef USE_VC - vcoords(N*Vc::double_v::Size), -#else - vcoords(N), -#endif - fTest(0) -{ - // create list of vectors and fill them - - TRandom2 r(111); - - double coord[4]; - for (int i = 0; i< N; ++i) { -#ifndef USE_VC - Double_type x = 
r.Uniform(-1,1); - Double_type y = r.Uniform(-1,1); - Double_type z = r.Uniform(-1,1); - Double_type t = r.Uniform(2,10); - Double_type s = r.Uniform(0,1); - coord[0] = x; coord[1] = y; coord[2] = z; coord[3] = t; - vcoords[i] = std::vector(coord,coord+4); -#else - Double_type x = 0.; - Double_type y = 0.; - Double_type z = 0.; - Double_type t = 0.; - Double_type s = 0.; - for (int j = 0; j< Vc::double_v::Size; ++j) { - x[j] = r.Uniform(-1,1); - y[j] = r.Uniform(-1,1); - z[j] = r.Uniform(-1,1); - t[j] = r.Uniform(2,10); - s[j] = r.Uniform(0,1); - coord[0] = x[j]; coord[1] = y[j]; coord[2] = z[j]; coord[3] = t[j]; - vcoords[i*Vc::double_v::Size+j] = std::vector(coord,coord+4); - } -#endif - -#ifdef DIM_2 - vlist[i] = Vector( x, y ); -#elif DIM_3 - vlist[i] = Vector( x, y, z); -#else // 4D - vlist[i] = Vector( x, y, z, t); -#endif - scale[i] = s; - } - - std::cout << "test using " << typeid(vlist[0]).name() << std::endl; - std::cout << "Vector used " << vlist[0] << std::endl; - std::cout << "Vector used " << vlist[1] << std::endl; - - // create second list of vectors which is same vector shifted by 1 - for (int i = 0; i< N; ++i) { -#ifndef USE_VC - vlist2[i] = (i < N-1) ? vlist[i+1] : vlist[0]; -#else - Double_type x1 = vlist[i].X(); - Double_type y1 = vlist[i].Y(); - Double_type z1 = vlist[i].Z(); - Double_type t1 = vlist[i].E(); - Double_type x2 = (i< N-1) ? vlist[i+1].X() : vlist[0].X(); - Double_type y2 = (i< N-1) ? vlist[i+1].Y() : vlist[0].Y(); - Double_type z2 = (i< N-1) ? vlist[i+1].Z() : vlist[0].Z(); - Double_type t2 = (i< N-1) ? 
vlist[i+1].E() : vlist[0].E(); - Double_type x; - Double_type y; - Double_type z; - Double_type t; - int j = 0; - for (j = 0; j< Vc::double_v::Size-1; ++j) { - x[j] = x1[j+1]; - y[j] = y1[j+1]; - z[j] = z1[j+1]; - t[j] = t1[j+1]; - } - j = Vc::double_v::Size-1; - x[j] = x2[0]; - y[j] = y2[0]; - z[j] = z2[0]; - t[j] = t2[0]; - vlist2[i] = Vector( x, y, z, t); -#endif - } - -} - - -template -void TestVector::PrintResult(Double_type s) - // print result -{ -#ifndef USE_VC - std::cout << "value " << s << std::endl << std::endl; -#else - Double_t s2 = 0; - for (int i = 0; i < Vc::double_v::Size; ++i) - s2 += s[i]; -// std::cout << "s = " << s << " sum "; - std::cout << "value " << s2 << std::endl << std::endl; -#endif -} - -template -void TestVector::Read() - // just read vector -{ - TStopwatch w; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Add() - // normal addition -{ - TStopwatch w; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Add2() -{ - // self addition - TStopwatch w; - Vector v3; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Sub() -{ - // normal sub - TStopwatch w; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Sub2() -{ - // self subtruction - TStopwatch w; - Vector v3; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Scale() -{ -// normal multiply - TStopwatch w; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Scale2() -{ - // self scale - TStopwatch w; - Vector v3; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Divide() -{ -// normal divide - TStopwatch w; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Divide2() -{ - // self divide - TStopwatch w; - Vector v3; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::Operations() -{ - // test several operations - TStopwatch w; - Vector v1; - Vector v2; - Vector v3; - Vector v4; - 
w.Start(); - Double_type s(0.0); - const int n = sqrt(N)+0.5; - for (int l = 0; l vc1x(ncomb+4); - std::vector vc2x(ncomb+4); - std::vector vc1y(ncomb+4); - std::vector vc2y(ncomb+4); - std::vector vc1z(ncomb+4); - std::vector vc2z(ncomb+4); - std::vector vc1t(ncomb+4); - std::vector vc2t(ncomb+4); - double c1[4]; - double c2[4]; - int k = 0; - for (int i = 0; i< nn; ++i) { - std::copy(vcoords[i].begin(), vcoords[i].end(),c1); - //std::cout << "vcoord " << vcoords[i][0] << " " << c1[0] << std::endl; - for (int j = 0; j< i; ++j) { - std::copy(vcoords[j].begin(), vcoords[j].end(),c2); - vc1x[k] = c1[0]; - vc2x[k] = c2[0]; - vc1y[k] = c1[1]; - vc2y[k] = c2[1]; - vc1z[k] = c1[2]; - vc2z[k] = c2[2]; - vc1t[k] = c1[3]; - vc2t[k] = c2[3]; - k++; - } - } - int ncomb2 = ncomb/Vc::double_v::Size; - if (ncomb%Vc::Double_v::Size != 0) ncomb2 += 1; - Vector v1; Vector v2; - for (int i = 0; i< ncomb2; ++i) { - - typename Vector::Scalar cv[4]; - cv[0].load( &vc1x[i*Vc::double_v::Size], Vc::Unaligned ); - cv[1].load( &vc1y[i*Vc::double_v::Size], Vc::Unaligned ); - cv[2].load( &vc1z[i*Vc::double_v::Size], Vc::Unaligned ); - cv[3].load( &vc1t[i*Vc::double_v::Size], Vc::Unaligned ); - //std::cout << cv[0] << " " << vc1x[i*Vc::double_v::Size] << std::endl; - v1.SetCoordinates( cv); - - typename Vector::Scalar cv2[4]; - cv2[0].load( &vc2x[i*Vc::double_v::Size], Vc::Unaligned ); - cv2[1].load( &vc2y[i*Vc::double_v::Size], Vc::Unaligned ); - cv2[2].load( &vc2z[i*Vc::double_v::Size], Vc::Unaligned ); - cv2[3].load( &vc2t[i*Vc::double_v::Size], Vc::Unaligned ); - //std::cout << cv[0] << " " << vc2x[i*Vc::double_v::Size] << std::endl; - v2.SetCoordinates( cv2); - - s+= ROOT::Math::VectorUtil::InvariantMass(v1, v2); - //std::cout << "inv mass of " << v1 << " " << v2 << " is " << s << std::endl; - } - -#endif - } - - std::cout << "Time for Operation :\t" << w.RealTime() << "\t" << w.CpuTime() << std::endl; - PrintResult(s); - fTime[fTest++] = w.CpuTime()*TSCALE; -} - -#else - -template -void 
TestVector::Operations() -{ - // test several operations - TStopwatch w; - Vector v1; - Vector v2; - Vector v3; - Vector v4; - w.Start(); - Double_type s(0.0); - -#ifndef USE_VC - const int n = sqrt(N)+0.5; -#endif - - const int nn = sqrt(vcoords.size()) + 0.5; - int ncomb = nn*(nn-1)/2; - std::vector vc1x(ncomb+4); - std::vector vc2x(ncomb+4); - std::vector vc1y(ncomb+4); - std::vector vc2y(ncomb+4); - std::vector vc1z(ncomb+4); - std::vector vc2z(ncomb+4); - std::vector vc1t(ncomb+4); - std::vector vc2t(ncomb+4); - double c1[4]; - double c2[4]; - int k = 0; - for (int i = 0; i< nn; ++i) { - std::copy(vcoords[i].begin(), vcoords[i].end(),c1); - //std::cout << "vcoord " << vcoords[i][0] << " " << c1[0] << std::endl; - for (int j = 0; j< i; ++j) { - std::copy(vcoords[j].begin(), vcoords[j].end(),c2); - vc1x[k] = c1[0]; - vc2x[k] = c2[0]; - vc1y[k] = c1[1]; - vc2y[k] = c2[1]; - vc1z[k] = c1[2]; - vc2z[k] = c2[2]; - vc1t[k] = c1[3]; - vc2t[k] = c2[3]; - k++; - } - } - - - - for (int l = 0; l -void TestVector::Boost() -{ - // test several operations - TStopwatch w; - Vector v1; - Vector v2; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction_exp() -{ - // test math function - TStopwatch w; - Vector v1; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction_log() -{ - // test several operations - TStopwatch w; - Vector v1; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction_sin() -{ - // test math function - TStopwatch w; - Vector v1; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction_atan() -{ - // test several operations - TStopwatch w; - Vector v1; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction_atan2() -{ - // test several operations - TStopwatch w; - Vector v1; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::MathFunction() -{ - // test several operations - TStopwatch w; - 
Vector v; - double x[N]; - double r[N]; - w.Start(); - Double_type s(0.0); - for (int l = 0; l -void TestVector::PrintSummary() -{ - std::cout << "\nResults for " << typeid(vlist[0]).name() << std::endl; - std::cout << " v3 = v1+v2" - << " v2 += v1 " - << " v3 = v1-v2" - << " v2 -= v1 " - << " v2 = a*v1 " - << " v1 *= a " - << " v2 = v1/a " - << " v1 /= a " - << " log " - << " exp " - << " sin " - << " atan " - << " atan2 " - << std::endl; - - // start from 3 - for (int i = 3; i < fTest; ++i) { - std::cout << std::setw(8) << fTime[i] << " "; - } - std::cout << std::endl << std::endl; -} - -int main() { - TestVector t; - -#ifdef USE_VC - std::cout << "testing using Vc: Vc size is " << Vc::double_v::Size << " Looping on " << N << " vectors" << std::endl; - std::cout << "Implementation type: " << VC_IMPL << std::endl; -#else - std::cout << "testing using standard double's. Looping on " << N << " vectors" << std::endl; -#endif - - - t.Read(); - - t.Operations(); - t.Boost(); - - -#ifndef USE_POINT - t.Add(); - t.Add2(); - t.Sub(); - t.Sub2(); -#endif - t.Scale(); - t.Scale2(); -#ifndef USE_TVECTOR - t.Divide(); - t.Divide2(); -#endif - - - t.MathFunction_log(); - t.MathFunction_exp(); - t.MathFunction_sin(); - t.MathFunction_atan(); - t.MathFunction_atan2(); - - - - - - // summurize test - t.PrintSummary(); -} diff --git a/math/vc/examples/mandelbrot/CMakeLists.txt b/math/vc/examples/mandelbrot/CMakeLists.txt deleted file mode 100644 index 021dafa79b727..0000000000000 --- a/math/vc/examples/mandelbrot/CMakeLists.txt +++ /dev/null @@ -1,56 +0,0 @@ -if(QT4_FOUND AND QT4_USABLE) - include(${QT_USE_FILE}) - include_directories(${CMAKE_CURRENT_BINARY_DIR}) - qt4_generate_moc(main.h moc_main.cpp) - qt4_generate_moc(mandel.h moc_mandel.cpp) - - set(SOURCES main.cpp ${CMAKE_CURRENT_BINARY_DIR}/moc_main.cpp mandel.cpp - ${CMAKE_CURRENT_BINARY_DIR}/moc_mandel.cpp) - - build_example(mandelbrot ${SOURCES} LIBS ${QT_LIBRARIES}) - - # It is an interesting test if we can compare 
against the autovect - # capabilities of Open64 and ICC, so we try to find those and compile extra - # binaries with them - find_program(O64_CXX openCC HINTS /opt/x86_open64-4.2.4/bin /opt/open64/bin) - find_program(ICC_CXX icpc HINTS - /opt/intel/bin - $ENV{HOME}/intel/Compiler/11.1/072/bin/intel64) - - if(FALSE AND O64_CXX) - add_custom_command(OUTPUT mandelbrot_open64 - COMMAND ${O64_CXX} -O3 -Wall -msse3 - -o ${CMAKE_CURRENT_BINARY_DIR}/mandelbrot_open64 - -I ${CMAKE_CURRENT_BINARY_DIR} -I ${QT_INCLUDE_DIR} - -I ${QT_QTCORE_INCLUDE_DIR} -I ${QT_QTGUI_INCLUDE_DIR} - -I ${CMAKE_SOURCE_DIR} -I ${CMAKE_SOURCE_DIR}/include - -L ${QT_LIBRARY_DIR} -lQtGui ${SOURCES} - -DVC_IMPL=Scalar - DEPENDS ${SOURCES} Vc - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Bulding mandelbrot_open64" - VERBATIM) - add_custom_target(build_mandelbrot_open64 ALL DEPENDS mandelbrot_open64) - add_target_property(build_mandelbrot_open64 LABELS "other") - add_dependencies(other build_mandelbrot_open64) - endif() - - if(FALSE AND ICC_CXX) - get_target_property(VcLocation Vc LOCATION) - add_custom_command(OUTPUT mandelbrot_icc - COMMAND ${ICC_CXX} -O3 -xSSE3 - -o ${CMAKE_CURRENT_BINARY_DIR}/mandelbrot_icc - -I ${CMAKE_CURRENT_BINARY_DIR} -I ${QT_INCLUDE_DIR} - -I ${QT_QTCORE_INCLUDE_DIR} -I ${QT_QTGUI_INCLUDE_DIR} - -I ${CMAKE_SOURCE_DIR} -I ${CMAKE_SOURCE_DIR}/include - -L ${QT_LIBRARY_DIR} -lQtGui ${SOURCES} ${VcLocation} - -DVC_IMPL=Scalar - DEPENDS ${SOURCES} Vc - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Bulding mandelbrot_icc" - VERBATIM) - add_custom_target(build_mandelbrot_icc ALL DEPENDS mandelbrot_icc) - add_target_property(build_mandelbrot_icc LABELS "other") - add_dependencies(other build_mandelbrot_icc) - endif() -endif() diff --git a/math/vc/examples/mandelbrot/main.cpp b/math/vc/examples/mandelbrot/main.cpp deleted file mode 100644 index 19284de4a7dc3..0000000000000 --- a/math/vc/examples/mandelbrot/main.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - Copyright 
(C) 2010 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -#include "main.h" - -#include -#include -//#include - -MainWindow::MainWindow(QWidget *_parent) - : QWidget(_parent), - m_scale(0.01f) -{ - m_x = width() * m_scale * -0.667f; - m_y = height() * m_scale * -0.5f; - - m_rect1 = m_rect2 = rect(); - m_rect1.setWidth(m_rect1.width() / 2); - m_rect2.setX(m_rect1.width()); - - qRegisterMetaType(); - qRegisterMetaType(); - connect(&m_mandelVc, SIGNAL(ready(QImage, quint64)), SLOT(vcImage(QImage, quint64))); - connect(&m_mandelScalar, SIGNAL(ready(QImage, quint64)), SLOT(scalarImage(QImage, quint64))); - - setWindowTitle(tr("Mandelbrot")); - setCursor(Qt::CrossCursor); -} - -void MainWindow::vcImage(const QImage &img, quint64 cycles) -{ - m_img1 = img; - update(m_rect1); - if (cycles > 1) { - m_cycles1 = cycles; - updateTitle(); - } - if (QCoreApplication::arguments().contains("--benchmark")) { - m_mandelScalar.brot(m_rect2.size(), m_x, m_y, m_scale); - } -} - -void MainWindow::scalarImage(const QImage &img, quint64 cycles) -{ - m_img2 = img; - 
update(m_rect2); - if (cycles > 1) { - m_cycles2 = cycles; - updateTitle(); - } -} - -void MainWindow::updateTitle() -{ - setWindowTitle(tr("Mandelbrot [Speedup: %1] [%2]").arg(m_cycles2 / m_cycles1).arg(m_img1 == m_img2 ? "Equal" : "Not Equal")); -} - -void MainWindow::paintEvent(QPaintEvent *e) -{ - QPainter p(this); - QRect r1 = m_rect1 & e->rect(); - p.drawImage(r1, m_img1, r1.translated(m_dragDelta)); - QRect r2 = m_rect2 & e->rect(); - p.drawImage(r2, m_img2, QRect(QPoint(), r2.size()).translated(m_dragDelta)); -} - -void MainWindow::mousePressEvent(QMouseEvent *e) -{ - m_dragStart = e->pos(); -} - -void MainWindow::mouseMoveEvent(QMouseEvent *e) -{ - m_dragDelta = m_dragStart - e->pos(); - update(); -} - -void MainWindow::mouseReleaseEvent(QMouseEvent *e) -{ - m_dragDelta = m_dragStart - e->pos(); - // translate m_x, m_y accordingly and recreate the image - m_x += m_dragDelta.x() * m_scale; - m_y += m_dragDelta.y() * m_scale; - recreateImage(); - m_dragDelta = QPoint(); -} - -void MainWindow::wheelEvent(QWheelEvent *e) -{ - if (e->delta() < 0 && width() * m_scale > 3.f && height() * m_scale > 2.f) { - return; - } - const float xx = e->x() >= m_rect1.width() ? 
e->x() - m_rect1.width() : e->x(); - const float constX = m_x + m_scale * xx; - const float constY = m_y + m_scale * e->y(); - if (e->delta() > 0) { - m_scale *= 1.f / (1.f + e->delta() * 0.001f); - } else { - m_scale *= 1.f - e->delta() * 0.001f; - } - m_x = constX - m_scale * xx; - m_y = constY - m_scale * e->y(); - recreateImage(); - //update(); -} - -void MainWindow::resizeEvent(QResizeEvent *e) -{ - if (e->oldSize().isValid()) { - m_x += 0.25f * m_scale * (e->oldSize().width() - e->size().width()); - m_y += 0.5f * m_scale * (e->oldSize().height() - e->size().height()); - } else { - m_x = e->size().width() * m_scale * -0.333f; - m_y = e->size().height() * m_scale * -0.5f; - } - - m_rect1 = m_rect2 = QRect(QPoint(), e->size()); - m_rect1.setWidth(m_rect1.width() / 2); - m_rect2.setX(m_rect1.width()); - - recreateImage(); - update(); -} - -void MainWindow::recreateImage() -{ - if (!QCoreApplication::arguments().contains("--benchmark")) { - m_mandelScalar.brot(m_rect2.size(), m_x, m_y, m_scale); - } - m_mandelVc.brot(m_rect1.size(), m_x, m_y, m_scale); -} - -int main(int argc, char **argv) -{ - QApplication app(argc, argv); - MainWindow w; - w.resize(600, 200); - w.show(); - return app.exec(); -} diff --git a/math/vc/examples/mandelbrot/main.h b/math/vc/examples/mandelbrot/main.h deleted file mode 100644 index d7927ded5d57c..0000000000000 --- a/math/vc/examples/mandelbrot/main.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - Copyright (C) 2010 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. 
- - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -#ifndef MAIN_H -#define MAIN_H - -#include -#include -#include -#include -#include -#include - -#include "mandel.h" - -class MainWindow : public QWidget -{ - Q_OBJECT - public: - MainWindow(QWidget *parent = 0); - - protected: - void paintEvent(QPaintEvent *); - void resizeEvent(QResizeEvent *); - void mousePressEvent(QMouseEvent *); - void mouseMoveEvent(QMouseEvent *); - void mouseReleaseEvent(QMouseEvent *); - void wheelEvent(QWheelEvent *); - - private slots: - void vcImage(const QImage &, quint64); - void scalarImage(const QImage &, quint64); - - private: - void recreateImage(); - void updateTitle(); - - float m_x; // left - float m_y; // top - float m_scale; - QImage m_img1; - QImage m_img2; - QRect m_rect1; - QRect m_rect2; - QPoint m_dragStart; - QPoint m_dragDelta; - - float m_cycles1, m_cycles2; - - Mandel m_mandelVc; - Mandel m_mandelScalar; -}; -#endif // MAIN_H diff --git a/math/vc/examples/mandelbrot/mandel.cpp b/math/vc/examples/mandelbrot/mandel.cpp deleted file mode 100644 index bb675965950f6..0000000000000 --- a/math/vc/examples/mandelbrot/mandel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -/* - Copyright (C) 2010-2011 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author 
not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -#include "mandel.h" -#include -#include -#include "../tsc.h" - -#include -#include - -using Vc::float_v; -using Vc::float_m; -using Vc::uint_v; -using Vc::uint_m; - -template -Mandel::Mandel(QObject *_parent) - : MandelBase(_parent) -{ -} - -MandelBase::MandelBase(QObject *_parent) - : QThread(_parent), - m_restart(false), m_abort(false) -{ -} - -MandelBase::~MandelBase() -{ - m_mutex.lock(); - m_abort = true; - m_wait.wakeOne(); - m_mutex.unlock(); - - wait(); -} - -void MandelBase::brot(const QSize &size, float x, float y, float scale) -{ - QMutexLocker lock(&m_mutex); - - m_size = size; - m_x = x; - m_y = y; - m_scale = scale; - - if (!isRunning()) { - start(LowPriority); - } else { - m_restart = true; - m_wait.wakeOne(); - } -} - -void MandelBase::run() -{ - while (!m_abort) { - // first we copy the parameters to our local data so that the main main thread can give a - // new task while we're working - m_mutex.lock(); - // destination image, RGB is good - no need for alpha - QImage image(m_size, QImage::Format_RGB32); - float x = m_x; - float y = m_y; - float scale = m_scale; - m_mutex.unlock(); - - // benchmark the number of cycles it takes - TimeStampCounter timer; - timer.Start(); - - // calculate the mandelbrot set/image - mandelMe(image, x, y, scale, 255); - - timer.Stop(); - - // if no new set was requested in the meantime - return the finished image - if (!m_restart) 
{ - emit ready(image, timer.Cycles()); - } - - // wait for more work - m_mutex.lock(); - if (!m_restart) { - m_wait.wait(&m_mutex); - } - m_restart = false; - m_mutex.unlock(); - } -} - -static const float S = 4.f; - -/** - * std::complex is way too slow for our limited purposes: - * - * norm is implemented as std::abs(z) * std::abs(z) for float - * z * z is implemented as multiplication & lots of branches looking for NaN and inf - * - * since we know that we require the square of r and i for norm and multiplication we can - * explicitely cache it in the object - */ -//! [MyComplex] -template -class MyComplex -{ - public: - MyComplex(T r, T i) - : m_real(r), m_imag(i), - m_real2(r * r), m_imag2(i * i) - { - } - - MyComplex squaredPlus(T r, T i) const - { - return MyComplex( - m_real2 + r - m_imag2, - (m_real + m_real) * m_imag + i - ); - } - - T norm() const - { - return m_real2 + m_imag2; - } - - private: - T m_real, m_imag; - T m_real2, m_imag2; -}; -//! [MyComplex] - -//! [P function] -template inline MyComplex P(MyComplex z, T c_real, T c_imag) -{ - return z.squaredPlus(c_real, c_imag); -} -//! 
[P function] - -template<> void Mandel::mandelMe(QImage &image, float x0, - float y0, float scale, int maxIt) -{ - typedef MyComplex Z; - const unsigned int height = image.height(); - const unsigned int width = image.width(); - const float_v colorScale = 0xff / static_cast(maxIt); - for (unsigned int y = 0; y < height; ++y) { - unsigned int *VC_RESTRICT line = reinterpret_cast(image.scanLine(y)); - const float_v c_imag = y0 + y * scale; - uint_m toStore; - for (uint_v x = uint_v::IndexesFromZero(); !(toStore = x < width).isEmpty(); - x += float_v::Size) { - const float_v c_real = x0 + x * scale; - Z z(c_real, c_imag); - float_v n = 0.f; - float_m inside = z.norm() < S; - while (!(inside && n < maxIt).isEmpty()) { - z = P(z, c_real, c_imag); - ++n(inside); - inside = z.norm() < S; - } - uint_v colorValue = static_cast((maxIt - n) * colorScale) * 0x10101; - if (toStore.isFull()) { - colorValue.store(line, Vc::Unaligned); - line += uint_v::Size; - } else { - colorValue.store(line, toStore, Vc::Unaligned); - break; // we don't need to check again wether x[0] + float_v::Size < width to break out of the loop - } - } - if (restart()) { - break; - } - } -} - -template<> void Mandel::mandelMe(QImage &image, float x0, - float y0, float scale, int maxIt) -{ - typedef MyComplex Z; - const int height = image.height(); - const int width = image.width(); - const float colorScale = 0xff / static_cast(maxIt); - for (int y = 0; y < height; ++y) { - unsigned int *VC_RESTRICT line = reinterpret_cast(image.scanLine(y)); - const float c_imag = y0 + y * scale; - for (int x = 0; x < width; ++x) { - const float c_real = x0 + x * scale; - Z z(c_real, c_imag); - int n = 0; - for (; z.norm() < S && n < maxIt; ++n) { - z = P(z, c_real, c_imag); - } - *line++ = static_cast((maxIt - n) * colorScale) * 0x10101; - } - if (restart()) { - break; - } - } -} - -template class Mandel; -template class Mandel; - -// vim: sw=4 sts=4 et tw=100 diff --git a/math/vc/examples/mandelbrot/mandel.h 
b/math/vc/examples/mandelbrot/mandel.h deleted file mode 100644 index 4587dfc0ca99c..0000000000000 --- a/math/vc/examples/mandelbrot/mandel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - Copyright (C) 2010 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. 
- -*/ - -#include -#include -#include -#include -#include -#include - -enum MandelImpl { - VcImpl, ScalarImpl -}; - -class MandelBase : public QThread -{ - Q_OBJECT - public: - void brot(const QSize &size, float x, float y, float scale); - - protected: - MandelBase(QObject* _parent = 0); - ~MandelBase(); - void emitImage(const QImage &image, quint64 cycles) { emit ready(image, cycles); } - - void run(); - virtual void mandelMe(QImage &image, float x, float y, float scale, int maxIterations) = 0; - inline bool restart() const { return m_restart; } - - signals: - void ready(const QImage &image, quint64 cycles); - - private: - QMutex m_mutex; - QWaitCondition m_wait; - QSize m_size; - float m_x, m_y, m_scale; - bool m_restart; - bool m_abort; -}; - -template -class Mandel : public MandelBase -{ - public: - Mandel(QObject *_parent = 0); - - protected: - void mandelMe(QImage &image, float x, float y, float scale, int maxIterations); -}; - diff --git a/math/vc/examples/matrix/CMakeLists.txt b/math/vc/examples/matrix/CMakeLists.txt deleted file mode 100644 index 94b51d8f97ec0..0000000000000 --- a/math/vc/examples/matrix/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -build_example(matrix main.cpp) diff --git a/math/vc/examples/matrix/main.cpp b/math/vc/examples/matrix/main.cpp deleted file mode 100644 index e274566dc2f27..0000000000000 --- a/math/vc/examples/matrix/main.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* This file is part of the Vc project - Copyright (C) 2009-2010 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. 
- - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -#include -#include -#include -#include - -template class Matrix; -template std::ostream &operator<<(std::ostream &, const Matrix &); - -template -class Matrix -{ - friend std::ostream &operator<< <>(std::ostream &, const Matrix &); - private: - typedef Vc::Vector V; - Vc::Memory m_mem; - public: - Matrix &operator=(const T &val) { - V vec(val); - for (unsigned int i = 0; i < m_mem.vectorsCount(); ++i) { - m_mem.vector(i) = vec; - } - return *this; - } - - Matrix &operator+=(const Matrix &rhs) { - for (unsigned int i = 0; i < m_mem.vectorsCount(); ++i) { - V v1(m_mem.vector(i)); - v1 += V(rhs.m_mem.vector(i)); - m_mem.vector(i) = v1; - } - return *this; - } -}; - -template -std::ostream &operator<<(std::ostream &out, const Matrix &m) -{ - for (unsigned int i = 0; i < Size; ++i) { - std::cout << "[" << std::setw(6) << m.m_mem[i * Size]; - for (unsigned int j = 1; j < Size; ++j) { - std::cout << std::setw(6) << m.m_mem[i * Size + j]; - } - std::cout << " ]\n"; - } - return out; -} - -int main() -{ - Matrix m1; - m1 = 1.f; - Matrix m2; - m2 = 2.f; - m1 += m2; - std::cout << m1 << std::endl; - return 0; -} diff --git a/math/vc/examples/polarcoord/CMakeLists.txt b/math/vc/examples/polarcoord/CMakeLists.txt deleted file mode 100644 index 777bcb222047f..0000000000000 --- a/math/vc/examples/polarcoord/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -build_example(polarcoord main.cpp) diff --git a/math/vc/examples/polarcoord/main.cpp b/math/vc/examples/polarcoord/main.cpp deleted file mode 100644 index 
43161223b7a23..0000000000000 --- a/math/vc/examples/polarcoord/main.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -*/ - -//! [includes] -#include -#include -#include - -using Vc::float_v; -//! [includes] - -//! [memory allocation] -int main() -{ - // allocate memory for our initial x and y coordinates. Note that you can also put it into a - // normal float C-array but that you then must ensure alignment to Vc::VectorAlignment! - Vc::Memory x_mem; - Vc::Memory y_mem; - Vc::Memory r_mem; - Vc::Memory phi_mem; -//! [memory allocation] - -//! [random init] - // fill the memory with values from -1.f to 1.f - for (size_t i = 0; i < x_mem.vectorsCount(); ++i) { - x_mem.vector(i) = float_v::Random() * 2.f - 1.f; - y_mem.vector(i) = float_v::Random() * 2.f - 1.f; - } -//! [random init] - -//! 
[conversion] - // calculate the polar coordinates for all coordinates and overwrite the euclidian coordinates - // with the result - for (size_t i = 0; i < x_mem.vectorsCount(); ++i) { - const float_v x = x_mem.vector(i); - const float_v y = y_mem.vector(i); - - r_mem.vector(i) = Vc::sqrt(x * x + y * y); - float_v phi = Vc::atan2(y, x) * 57.295780181884765625f; // 180/pi - phi(phi < 0.f) += 360.f; - phi_mem.vector(i) = phi; - } -//! [conversion] - -//! [output] - // print the results - for (size_t i = 0; i < x_mem.entriesCount(); ++i) { - std::cout << std::setw(3) << i << ": "; - std::cout << std::setw(10) << x_mem[i] << ", " << std::setw(10) << y_mem[i] << " -> "; - std::cout << std::setw(10) << r_mem[i] << ", " << std::setw(10) << phi_mem[i] << '\n'; - } - - return 0; -} -//! [output] diff --git a/math/vc/examples/smatrix/Makefile b/math/vc/examples/smatrix/Makefile deleted file mode 100644 index 3257b9655e4bc..0000000000000 --- a/math/vc/examples/smatrix/Makefile +++ /dev/null @@ -1,145 +0,0 @@ -# Makefile for the ROOT test programs. -# This Makefile shows nicely how to compile and link applications -# using the ROOT libraries on all supported platforms. -# -# Copyright (c) 2000 Rene Brun and Fons Rademakers -# -# Author: Fons Rademakers, 29/2/2000 - -#ROOTSYS = ../../../.. -#include $(ROOTSYS)/etc/Makefile.arch -#include $(ROOTSYS)/config/Makefile.config - - -RC := root-config -ifeq ($(shell which $(RC) 2>&1 | sed -ne "s@.*/$(RC)@$(RC)@p"),$(RC)) -MKARCH := $(wildcard $(shell $(RC) --etcdir)/Makefile.arch) -RCONFIG := $(wildcard $(shell $(RC) --incdir)/RConfigure.h) -endif -ifneq ($(MKARCH),) -include $(MKARCH) -else -ifeq ($(ROOTSYS),) -ROOTSYS = .. 
-endif -include $(ROOTSYS)/etc/Makefile.arch -endif - -#------------------------------------------------------------------------------ - -# ifeq ($(PLATFORM),macosx) -# #unroll loop better on gcc > 4 -#CXXFLAGS+= -O3 -g -# endif -AVXCXXFLAG := -mavx -SIMDCXXFLAGS := -mavx -msse4.2 -msse4.1 -msse4a -mssse3 -msse3 -msse2 -#VCFLAGS := -fabi-version=0 -Wno-unused-function - -CXXFLAGS+= $(VCFLAGS) - -ifeq ($(NOAVX),) -CXXFLAGS+= $(AVXCXXFLAG) -LDFLAGS += $(AVXCXXFLAG) -endif -#CXXFLAGS+= -O3 -#CXXFLAGS+= --fast-math -#CXXFLAGS += -ftree-vectorize -ifneq ($(AUTOVEC),) -CXXFLAGS+= -ftree-vectorize -endif -ifneq ($(OPT3),) -CXXFLAGS+= -O3 -LDFLAGS += -O3 -endif -ifneq ($(OPT2EXT),) -CXXFLAGS+= -finline-functions -ftree-vectorize -funswitch-loops -fgcse-after-reload -fipa-cp-clone -fpredictive-commoning -ftree-loop-distribute-patterns -LDFLAGS+= -finline-functions -ftree-vectorize -funswitch-loops -fgcse-after-reload -fipa-cp-clone -fpredictive-commoning -ftree-loop-distribute-patterns -endif -ifneq ($(FASTM),) -CXXFLAGS+= -O3 -ffast-math -ftree-vectorize -Ofast -LDFLAGS+= -O3 -ffast-math -ftree-vectorize -Ofast -endif -# -ifneq ($(USEVC),) -CXXFLAGS+= -DUSE_VC -EXTRALIBS += $(ROOTSYS)/lib/libVc.a -ifneq ($(VCSCALAR),) -CXXFLAGS+= -DVC_IMPL=Scalar -endif -endif - -ifneq ($(USEVDT),) -CXXFLAGS+= -DUSE_VDT -I/home/data/moneta/vdt_trunk/include -ifeq ($(NOAVX),) -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-avx -lvdt -else -EXTRALIBS += -L/home/data/moneta/vdt_trunk/lib-sse -lvdt -endif -endif - -ifneq ($(DEBUG),) -CXXFLAGS += -g -DDEBUG -endif - -#for debugging vectorization -#CXXFLAGS+= -ftree-vectorizer-verbose=2 - - -ifneq ($(NDIM1),) -CXXFLAGS += -DNDIM1=$(NDIM1) -endif -ifneq ($(NDIM2),) -CXXFLAGS += -DNDIM2=$(NDIM2) -endif - - -LIBS = -L$(ROOTSYS)/lib -lCore -lCling -lMathCore -lMatrix -lGenVector - -MATRIXOPOBJ = testOperations.$(ObjSuf) -MATRIXOPSRC = testOperations.$(SrcSuf) -MATRIXOP = testOperations$(ExeSuf) - -TESTKALMANOBJ = testKalman.$(ObjSuf) -TESTKALMANSRC 
= testKalman.$(SrcSuf) -TESTKALMAN = testKalman$(ExeSuf) - - -OBJS = $(MATRIXOPOBJ) $(TESTKALMANOBJ) - -PROGRAMS = $(MATRIXOP) $(TESTKALMAN) - - -.SUFFIXES: .$(SrcSuf) .$(ObjSuf) $(ExeSuf) - - -$(MATRIXOP): $(MATRIXOPOBJ) - $(LD) $(LDFLAGS) $^ $(LIBS) $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - - -$(TESTKALMAN): $(TESTKALMANOBJ) - $(LD) $(LDFLAGS) $^ $(LIBS) $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - - - -all: $(PROGRAMS) - - -check: all - for prog in $(PROGRAMS); do \ - ./$$prog > $$prog.out; \ - done; - -clean: - @rm -f $(OBJS) $(PROGRAMS) - -distclean: clean - @rm -f $(PROGRAMS) - - -.SUFFIXES: .$(SrcSuf) - - -.$(SrcSuf).$(ObjSuf): - $(CXX) $(CXXFLAGS) -c $< diff --git a/math/vc/examples/smatrix/TestTimer.h b/math/vc/examples/smatrix/TestTimer.h deleted file mode 100644 index 22c969d0d230a..0000000000000 --- a/math/vc/examples/smatrix/TestTimer.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef TESTTIMER_H -#define TESTTIMER_H - -// simple class to measure time - -#include "TStopwatch.h" - - -namespace ROOT { - - namespace Math{ - - namespace test { - -#ifdef REPORT_TIME - void reportTime( std::string s, double time); -#endif - - void printTime(TStopwatch & time, std::string s) { - int pr = std::cout.precision(8); - std::cout << s << "\t" << " time = " << time.RealTime() << "\t(sec)\t" - // << time.CpuTime() - << std::endl; - std::cout.precision(pr); - } - - - - class Timer { - - public: - - Timer(const std::string & s = "") : fName(s), fTime(0) - { - fWatch.Start(); - } - Timer(double & t, const std::string & s = "") : fName(s), fTime(&t) - { - fWatch.Start(); - } - - ~Timer() { - fWatch.Stop(); - printTime(fWatch,fName); -#ifdef REPORT_TIME - // report time - reportTime(fName, fWatch.RealTime() ); -#endif - if (fTime) *fTime += fWatch.RealTime(); - } - - - private: - - std::string fName; - double * fTime; - TStopwatch fWatch; - - }; - } - - } -} - -#endif diff --git a/math/vc/examples/smatrix/matrix_op_vec.h b/math/vc/examples/smatrix/matrix_op_vec.h deleted 
file mode 100644 index f7dc87c6af60d..0000000000000 --- a/math/vc/examples/smatrix/matrix_op_vec.h +++ /dev/null @@ -1,643 +0,0 @@ -#ifndef MATRIX_OP_H -#define MATRIX_OP_H - -#include "TestTimer.h" - -// define functions for matrix operations - -//#define DEBUG -//#ifndef NLOOP -//#define NLOOP 1000000 -//#endif -#include - -using namespace ROOT::Math; - -std::vector gV; - -void initValues() { - gV.reserve(10*NLOOP); - TRandom3 r; - std::cout << "init smearing vector "; - for (int l = 0; l < 10*NLOOP; l++) - { - gV.push_back( r.Rndm() ); - } - std::cout << " with size " << gV.size() << std::endl; - -} - -// function for summing elements of matrix or vector -template -typename V::value_type SumOfElements(const V & v) { - typename V::value_type sum = 0.0; - for (typename V::const_iterator itr = v.begin(); itr != v.end(); ++itr) { - sum += *itr; - } - return sum; -} - - - -// vector assignment -template -void testVeq(const V * v, double & time, V * result) { - Stype tmp = 0.0; - test::Timer t(time,"V=V "); - for (int l = 0; l < 10*NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = v[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -// matrix assignmnent -template -void testMeq(const M * m, double & time, M * result) { - Stype tmp = 0.0; - test::Timer t(time,"M=M "); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - - - -// vector sum -template -void testVad(const V * v1, const V * v2, double & time, V * result) { - Stype tmp = 0.0; - test::Timer t(time,"V+V "); - for (int l = 0; l < 10*NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = v1[k] + v2[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -// matrix sum -template -void testMad(const M * m1, const M * m2, double & time, M * result) { - Stype tmp = 0.0; - test::Timer t(time,"M+M ");; - for (int l = 0; l < NLOOP; 
l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m1[k]; - result[k] += m2[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -// vector * constant -template -void testVscale(const V * v, T a, double & time, V * result) { - Stype tmp = 0.0; - test::Timer t(time,"a*V ");; - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = a * v[k]; // v1 * a does not exist in ROOT - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - - -// matrix * constant -template -void testMscale(const M * m1, T a, double & time, M * result) { - Stype tmp = 0.0; - test::Timer t(time,"a*M ");; - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m1[k]; result[k] *= a; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - - -// simple Matrix vector op -template -void testMV(const M * mat, const V * v, double & time, V * result) { - Stype tmp = 0.0; - test::Timer t(time,"M*V "); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = mat[k] * v[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -// general matrix vector op -template -void testGMV(const M * mat, const V * v1, const V *v2, double & time, V * result) { - Stype tmp = 0.0; - test::Timer t(time,"M*V+"); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = mat[k] * v1[k] + v2[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - - -// general matrix matrix op -template -void testMM(const A * a, const B * b, const C * c, double & time, C * result) { - Stype tmp = 0.0; - test::Timer t(time,"M*M "); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = a[k] * b[k] + c[k]; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - - - -// specialized functions (depending on the package) - -//smatrix -template -void testDot_S(const V * v1, 
const V * v2, T * result, double & time) { - Stype tmp = 0.0; - test::Timer t(time,"dot "); - for (int l = 0; l < 10*NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = Dot(v1[k],v2[k]); - } - tmp += result[NLIST-1]; - } - gResultSum += tmp; -} - - -// double testDot_S(const std::vector & w1, const std::vector & w2, double & time) { -// test::Timer t(time,"dot "); -// double result=0; -// for (int l = 0; l < NLOOP; l++) -// { -// V & v1 = *w1[l]; -// V & v2 = *w2[l]; -// result = Dot(v1,v2); -// } -// return result; -// } - -template -void testInnerProd_S(const M * a, const V * v, T * result, double & time) { - Stype tmp = 0.0; - test::Timer t(time,"prod"); - for (int l = 0; l < 10*NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = Similarity(v[k],a[k]); - } - tmp += result[NLIST-1]; - } - gResultSum += tmp; -} - -//inversion -template -void testInv_S( const M * m, double & time, M * result){ - Stype tmp = 0.0; - test::Timer t(time,"inv "); - int ierr = 0; - int ifail = 0; - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m[k].Inverse(ifail); - ierr += ifail; - //result = mtmp.Inverse(ifail); - // assert(ifail == 0); - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; - assert(ierr == 0); -} - -template -void testInvFast_S( const M * m, double & time, M * result){ - Stype tmp = 0.0; - test::Timer t(time,"invF"); - int ierr = 0; - int ifail = 0; - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m[k].InverseFast(ifail); - ierr += ifail; - //result = mtmp.Inverse(ifail); - // assert(ifail == 0); - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; - assert(ierr == 0); -} - -template -void testInvChol_S( const M * m, double & time, M * result){ - Stype tmp = 0.0; - test::Timer t(time,"invC"); - int ierr = 0; - int ifail = 0; - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - result[k] = m[k].InverseChol(ifail); - 
ierr += ifail; - //result = mtmp.Inverse(ifail); - // assert(ifail == 0); - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; - assert(ierr == 0); -} - - -// general matrix matrix op -template -void testATBA_S(const A * a, const B * b, double & time, C * result) { - Stype tmp = 0.0; - test::Timer t(time,"At*M*A"); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - C tmp = b[k] * Transpose(a[k]); - result[k] = a[k] * tmp; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -// general matrix matrix op -template -void testATBA_S2(const A * a, const B * b, double & time, C * result) { - Stype tmp = 0.0; - test::Timer t(time,"At*M*A"); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - //result = Transpose(a) * b * a; - //result = a * b * Transpose(a); - //result = a * b * a; - result[k] = SimilarityT(a[k],b[k]); - //result = a * result; - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -template -void testMT_S(const A * a, double & time, C * result) { - Stype tmp = 0.0; - test::Timer t(time,"Transp"); - for (int l = 0; l < NLOOP; l++) { - for (int k = 0; k < NLIST; k++) { - //result = Transpose(a) * b * a; - //result = a * b * Transpose(a); - //result = a * b * a; - result[k] = Transpose(a[k]); - } - tmp += SumOfElements(result[NLIST-1]); - } - gResultSum += tmp; -} - -///////////////////////////////////// -// for root -////////////////////////////////// - -// simple Matrix vector op -template -void testMV_T(const M & mat, const V & v, double & time, V & result) { - V vtmp = v; - test::Timer t(time,"M*V "); - for (int l = 0; l < NLOOP; l++) - { - vtmp[0] = gV[l]; - Add(result,0.0,mat,vtmp); - } -} - -// general matrix vector op -template -void testGMV_T(const M & mat, const V & v1, const V & v2, double & time, V & result) { - V vtmp = v1; - test::Timer t(time,"M*V+"); - for (int l = 0; l < NLOOP; l++) - { - vtmp[0] = gV[l]; - 
memcpy(result.GetMatrixArray(),v2.GetMatrixArray(),v2.GetNoElements()*sizeof(Double_t)); - Add(result,1.0,mat,vtmp); - } -} - -// general matrix matrix op -template -void testMM_T(const A & a, const B & b, const C & c, double & time, C & result) { - B btmp = b; - test::Timer t(time,"M*M "); - for (int l = 0; l < NLOOP; l++) - { - btmp(0,0) = gV[l]; - result.Mult(a,btmp); - result += c; - } -} - -// matrix sum -template -void testMad_T(const M & m1, const M & m2, double & time, M & result) { - M mtmp = m2; - test::Timer t(time,"M+M "); - for (int l = 0; l < NLOOP; l++) - { - mtmp(0,0) = gV[l]; - result.Plus(m1,mtmp); - } -} - -template -void testATBA_T(const A & a, const B & b, double & time, C & result) { - B btmp = b; - test::Timer t(time,"At*M*A"); - C tmp = a; - for (int l = 0; l < NLOOP; l++) - { - btmp(0,0) = gV[l]; - tmp.Mult(a,btmp); - result.MultT(tmp,a); - } -} - -template -double testDot_T(const V & v1, const V & v2, double & time) { - V vtmp = v2; - test::Timer t(time,"dot "); - double result=0; - for (int l = 0; l < 10*NLOOP; l++) - { - vtmp[0] = gV[l]; - result = Dot(v1,vtmp); - } - return result; -} - -template -double testInnerProd_T(const M & a, const V & v, double & time) { - V vtmp = v; - test::Timer t(time,"prod"); - double result=0; - for (int l = 0; l < NLOOP; l++) { - vtmp[0] = gV[l]; - result = a.Similarity(vtmp); - } - return result; -} - -//inversion -template -void testInv_T(const M & m, double & time, M& result){ - M mtmp = m; - test::Timer t(time,"inv "); - for (int l = 0; l < NLOOP; l++) - { - mtmp(0,0) = gV[l]; - memcpy(result.GetMatrixArray(),mtmp.GetMatrixArray(),mtmp.GetNoElements()*sizeof(Double_t)); - result.InvertFast(); - } -} - -template -void testInv_T2(const M & m, double & time, M& result){ - M mtmp = m; - test::Timer t(time,"inv2"); - for (int l = 0; l < NLOOP; l++) - { - memcpy(result.GetMatrixArray(),mtmp.GetMatrixArray(),mtmp.GetNoElements()*sizeof(Double_t)); - result.InvertFast(); - } -} - - -// vector sum -template 
-void testVad_T(const V & v1, const V & v2, double & time, V & result) { - V vtmp = v2; - test::Timer t(time,"V+V ");; - for (int l = 0; l < 10*NLOOP; l++) - { - vtmp[0] = gV[l]; - result.Add(v1,vtmp); - } -} - -// vector * constant -template -void testVscale_T(const V & v1, double a, double & time, V & result) { - V vtmp = v1; - test::Timer t(time,"a*V ");; - for (int l = 0; l < NLOOP; l++) - { - // result = a * v1; - result.Zero(); - vtmp[0] = gV[l]; - Add(result,a,vtmp); - } -} - -// general matrix matrix op -template -void testATBA_T2(const A & a, const B & b, double & time, C & result) { - B btmp = b; - test::Timer t(time,"At*M*A"); - for (int l = 0; l < NLOOP; l++) - { - btmp(0,0) = gV[l]; - memcpy(result.GetMatrixArray(),btmp.GetMatrixArray(),btmp.GetNoElements()*sizeof(Double_t)); - result.Similarity(a); - } -} - -// matrix * constant -template -void testMscale_T(const M & m1, double a, double & time, M & result) { - M mtmp = m1; - test::Timer t(time,"a*M ");; - for (int l = 0; l < NLOOP; l++) - { - //result = a * m1; - result.Zero(); - mtmp(0,0) = gV[l]; - Add(result,a,mtmp); - } -} - -template -void testMT_T(const A & a, double & time, C & result) { - A atmp = a; - test::Timer t(time,"Transp"); - for (int l = 0; l < NLOOP; l++) - { - atmp(0,0) = gV[l]; - result.Transpose(atmp); - } -} - -//////////////////////////////////////////// -// for clhep -//////////////////////////////////////////// - -//smatrix -template -double testDot_C(const V & v1, const V & v2, double & time) { - V vtmp = v2; - test::Timer t(time,"dot "); - double result=0; - for (int l = 0; l < 10*NLOOP; l++) - { - vtmp[0] = gV[l]; - result = dot(v1,vtmp); - } - return result; -} - -template -double testInnerProd_C(const M & a, const V & v, double & time) { - V vtmp = v; - test::Timer t(time,"prod"); - double result=0; - for (int l = 0; l < NLOOP; l++) - { - vtmp[0] = gV[l]; - V tmp = a*vtmp; - result = dot(vtmp,tmp); - } - return result; -} - - -// matrix assignmnent(index starts from 1) 
-template -void testMeq_C(const M & m, double & time, M & result) { - M mtmp = m; - test::Timer t(time,"M=M "); - for (int l = 0; l < NLOOP; l++) - { - mtmp(1,1) = gV[l]; - result = mtmp; - } -} - -// matrix sum -template -void testMad_C(const M & m1, const M & m2, double & time, M & result) { - M mtmp = m2; - test::Timer t(time,"M+M ");; - for (int l = 0; l < NLOOP; l++) - { - mtmp(1,1) = gV[l]; - result = m1; result += mtmp; - } -} - - -// matrix * constant -template -void testMscale_C(const M & m1, double a, double & time, M & result) { - M mtmp = m1; - test::Timer t(time,"a*M ");; - for (int l = 0; l < NLOOP; l++) - { - mtmp(1,1) = gV[l]; - result = mtmp * a; - } -} - - -// clhep matrix matrix op (index starts from 1) -template -void testMM_C(const A & a, const B & b, const C & c, double & time, C & result) { - B btmp = b; - test::Timer t(time,"M*M "); - for (int l = 0; l < NLOOP; l++) - { - btmp(1,1) = gV[l]; - result = a * btmp + c; - } -} - - -//inversion -template -void testInv_C( const M & a, double & time, M& result){ - M mtmp = a; - test::Timer t(time,"inv "); - int ifail = 0; - for (int l = 0; l < NLOOP; l++) - { - mtmp(1,1) = gV[l]; - result = mtmp.inverse(ifail); - if (ifail) {std::cout <<"error inverting" << mtmp << std::endl; return; } - } -} - -// general matrix matrix op -template -void testATBA_C(const A & a, const B & b, double & time, C & result) { - B btmp = b; - test::Timer t(time,"At*M*A"); - for (int l = 0; l < NLOOP; l++) - { - btmp(1,1) = gV[l]; - //result = a.T() * b * a; - result = a * btmp * a.T(); - } -} - - -template -void testATBA_C2(const A & a, const B & b, double & time, C & result) { - B btmp = b; - test::Timer t(time,"At*M*A"); - for (int l = 0; l < NLOOP; l++) - { - btmp(1,1) = gV[l]; - result = btmp.similarity(a); - } -} - - -template -void testMT_C(const A & a, double & time, C & result) { - A atmp = a; - test::Timer t(time,"Transp"); - for (int l = 0; l < NLOOP; l++) - { - atmp(1,1) = gV[l]; - result = atmp.T(); - } -} - - 
-#endif diff --git a/math/vc/examples/smatrix/matrix_util.h b/math/vc/examples/smatrix/matrix_util.h deleted file mode 100644 index c299d75eb2fbe..0000000000000 --- a/math/vc/examples/smatrix/matrix_util.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef MATRIX_UTIL_H -#define MATRIX_UTIL_H - -// utility functions to fill with random data - -#ifndef USE_VC - -template -void fillRandomVec(TRandom & r, V & v, unsigned int len, unsigned int start = 0, double offset = 1) { - for(unsigned int i = start; i < len+start; ++i) - v[i] = r.Rndm() + offset; -} - -template -void fillRandomMat(TRandom & r, M & m, unsigned int first, unsigned int second, unsigned int start = 0, double offset = 1) { - for(unsigned int i = start; i < first+start; ++i) - for(unsigned int j = start; j < second+start; ++j) - m(i,j) = r.Rndm() + offset; -} - -template -void fillRandomSym(TRandom & r, M & m, unsigned int first, unsigned int start = 0, double offset = 1) { - for(unsigned int i = start; i < first+start; ++i) { - for(unsigned int j = i; j < first+start; ++j) { - if ( i != j ) { - m(i,j) = r.Rndm() + offset; - m(j,i) = m(i,j); - } - else // add extra offset to make no singular when inverting - m(i,i) = r.Rndm() + 3*offset; - } - } -} - -#else - -// case of Vc -template -void fillRandomVec(TRandom & r, V & v, unsigned int len, unsigned int start = 0, double offset = 1) { - for (int k = 0; k < Vc::double_v::Size; ++k) { - for(unsigned int i = start; i < len+start; ++i) { - typename V::value_type x = v[i]; - x[k] = r.Rndm() + offset; - v[i] = x; - } - } -} - -template -void fillRandomMat(TRandom & r, M & m, unsigned int first, unsigned int second, unsigned int start = 0, double offset = 1) { - for (int k = 0; k < Vc::double_v::Size; ++k) { - for(unsigned int i = start; i < first+start; ++i) { - for(unsigned int j = start; j < second+start; ++j) { - typename M::value_type x = m(i,j); - x[k] = r.Rndm() + offset; - m(i,j) = x; - } - } - } -} - -template -void fillRandomSym(TRandom & r, M & m, unsigned int 
first, unsigned int start = 0, double offset = 1) { - for (int k = 0; k < Vc::double_v::Size; ++k) { - for(unsigned int i = start; i < first+start; ++i) { - for(unsigned int j = i; j < first+start; ++j) { - typename M::value_type x = m(i,j); - if ( i != j ) { - x[k] = r.Rndm() + offset; - m(i,j) = x; - m(j,i) = m(i,j); - } - else {// add extra offset to make no singular when inverting - x[k] = r.Rndm() + 3*offset; - m(i,i) = x; - } - } - } - } -} -#endif - -#endif diff --git a/math/vc/examples/smatrix/testKalman.cxx b/math/vc/examples/smatrix/testKalman.cxx deleted file mode 100644 index b940f13e73569..0000000000000 --- a/math/vc/examples/smatrix/testKalman.cxx +++ /dev/null @@ -1,655 +0,0 @@ -#ifdef USE_VC -//using namespace Vc; -#include "Vc/Vc" -#endif - - - -#include - - - -#include "Math/SVector.h" -#include "Math/SMatrix.h" - -#include "TMatrixD.h" -#include "TVectorD.h" - -#include "TRandom3.h" - - - -#include "matrix_util.h" - - -#define TEST_SYM - -//#define HAVE_CLHEP -#ifdef HAVE_CLHEP -#include "CLHEP/Matrix/SymMatrix.h" -#include "CLHEP/Matrix/Matrix.h" -#include "CLHEP/Matrix/Vector.h" -#endif - -// #include "SealUtil/SealTimer.h" -// #include "SealUtil/SealHRRTChrono.h" -// #include "SealUtil/TimingReport.h" - -#include - - -#include -#include - -double cpuTime() -{ - struct tms usage; - times(&usage); - return ((double) usage.tms_utime) / sysconf(_SC_CLK_TCK); -} - -double clockTime() -{ - struct tms usage; - return ((double) times(&usage)) / sysconf(_SC_CLK_TCK); -} - - -#ifndef NDIM1 -#define NDIM1 5 -#endif -#ifndef NDIM2 -#define NDIM2 5 -#endif - -#define NITER 1 // number of iterations - -#define NLOOP 500000 // number of time the test is repeted -#define NLISTSIZE 64 // size of matrix/vector lists - -using namespace ROOT::Math; - -#include "TestTimer.h" - -#ifdef USE_VC -typedef Vc::double_v Stype; -#else -typedef double Stype; -#endif - -#ifdef USE_VC -const int NLIST = NLISTSIZE / Vc::double_v::Size; -#else -const int NLIST = NLISTSIZE; 
-#endif - - -int test_smatrix_kalman() { - - // need to write explicitly the dimensions - - - typedef SMatrix MnMatrixNN; - typedef SMatrix MnMatrixMM; - typedef SMatrix MnMatrixNM; - typedef SMatrix MnMatrixMN; - typedef SMatrix MnSymMatrixNN; - typedef SMatrix MnSymMatrixMM; - typedef SVector MnVectorN; - typedef SVector MnVectorM; - - - - int first = NDIM1; //Can change the size of the matrices - int second = NDIM2; - - - std::cout << "************************************************\n"; - std::cout << " SMatrix kalman test " << first << " x " << second << std::endl; - std::cout << "************************************************\n"; - - - - - int npass = NITER; - TRandom3 r(111); - Stype x2sum = 0.0; - Stype c2sum = 0.0; - - for (int ipass = 0; ipass < npass; ipass++) { - - - MnMatrixNM H[NLIST]; - MnMatrixMN K0[NLIST]; - MnSymMatrixMM Cp[NLIST]; - MnSymMatrixNN V[NLIST]; - MnVectorN m[NLIST]; - MnVectorM xp[NLIST]; - - - // fill matrices with random data - for (int j = 0; j < NLIST; j++) fillRandomMat(r,H[j],first,second); - for (int j = 0; j < NLIST; j++) fillRandomMat(r,K0[j],second,first); - for (int j = 0; j < NLIST; j++) fillRandomSym(r,Cp[j],second); - for (int j = 0; j < NLIST; j++) fillRandomSym(r,V[j],first); - for (int j = 0; j < NLIST; j++) fillRandomVec(r,m[j],first); - for (int j = 0; j < NLIST; j++) fillRandomVec(r,xp[j],second); - - - // MnSymMatrixMM I; - // for(int i = 0; i < second; i++) - // I(i,i) = 1; - -#ifdef DEBUG - std::cout << "pass " << ipass << std::endl; - if (k == 0) { - std::cout << " K0 = " << K0[0] << std::endl; - std::cout << " H = " << K0[0] << std::endl; - std::cout << " Cp = " << Cp[0] << std::endl; - std::cout << " V = " << V[0] << std::endl; - std::cout << " m = " << m[0] << std::endl; - std::cout << " xp = " << xp[0] << std::endl; - } -#endif - - - { - Stype x2 = 0.0,c2 = 0.0; - test::Timer t("SMatrix Kalman "); - - MnVectorM x; - MnMatrixMN tmp; - MnSymMatrixNN Rinv; - MnMatrixMN K; - MnSymMatrixMM C; - MnVectorN 
vtmp1; - MnVectorN vtmp; - - for (int l = 0; l < NLOOP; l++) { - - // loop on the list of matrices - for (int k = 0; k < NLIST; k++) { - - - - vtmp1 = H[k]*xp[k] -m[k]; - //x = xp + K0 * (m- H * xp); - x = xp[k] - K0[k] * vtmp1; - tmp = Cp[k] * Transpose(H[k]); - Rinv = V[k]; Rinv += H[k] * tmp; - - bool test = Rinv.InvertFast(); - if(!test) { - std::cout<<"inversion failed" < - -// #ifndef NDIM1 -// #define NDIM1 5 -// #endif -// #ifndef NDIM2 -// #define NDIM2 5 -// #endif - - -int NLOOP; - - -//#define NLOOP 1 - -//#define DEBUG - -#define NLISTSIZE 64 // size of matrix/vector lists - - -#ifdef USE_VC -typedef Vc::double_v Stype; -const int NLIST = NLISTSIZE / Vc::double_v::Size; -#else -typedef double Stype; -const int NLIST = NLISTSIZE; -#endif - - -Stype gResultSum; - - -#include "matrix_op_vec.h" -#include "matrix_util.h" -#include - - - -template -int test_smatrix_op() { - - // need to write explicitly the dimensions - - - typedef SMatrix MnMatrixNN; - typedef SMatrix MnMatrixMM; - typedef SMatrix MnMatrixNM; - typedef SMatrix MnMatrixMN; - typedef SVector MnVectorN; - typedef SVector MnVectorM; - - - - int first = ND1; //Can change the size of the matrices - int second = ND2; - - - std::cout << "************************************************\n"; - std::cout << " SMatrix operations test " << first << " x " << second << std::endl; - std::cout << "************************************************\n"; - - - double t_veq, t_meq, t_vad, t_mad, t_dot, t_mv, t_gmv, t_mm, t_prd, t_inv, t_vsc, t_msc, t_ama, t_tra; - double totTime1, totTime2; - - - - int npass = NITER; - TRandom3 r(111); - - Stype r1[NLIST]; - Stype r2[NLIST]; - - gResultSum = 0; - - for (int k = 0; k < npass; k++) { - - - MnMatrixNM A[NLIST]; - MnMatrixMN B[NLIST]; - MnMatrixNN C[NLIST]; - MnMatrixMM D[NLIST]; - MnVectorN v[NLIST]; - MnVectorM p[NLIST]; - - TStopwatch w; - - // fill matrices with random data - for (int j = 0; j < NLIST; j++) fillRandomMat(r,A[j],first,second); - for (int j = 0; j 
< NLIST; j++) fillRandomMat(r,B[j],second,first); - for (int j = 0; j < NLIST; j++) fillRandomMat(r,C[j],first,first); - for (int j = 0; j < NLIST; j++) fillRandomMat(r,D[j],second,second); - for (int j = 0; j < NLIST; j++) fillRandomVec(r,v[j],first); - for (int j = 0; j < NLIST; j++) fillRandomVec(r,p[j],second); - - - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - std::cout << " A = " << A[0] << std::endl; - std::cout << " B = " << B[0] << std::endl; - std::cout << " C = " << C[0] << std::endl; - std::cout << " D = " << D[0] << std::endl; - std::cout << " v = " << v[0] << std::endl; - std::cout << " p = " << p[0] << std::endl; - } -#endif - - w.Start(); - - - - - MnVectorN v1[NLIST]; testMV(A,v,t_mv,v1); - //if (k == 0) v1.Print(std::cout); - MnVectorN v2[NLIST]; testGMV(A,v,v1,t_gmv,v2); - //if (k == 0) v2.Print(std::cout); - MnMatrixNN C0[NLIST]; testMM(A,B,C,t_mm,C0); - //if (k == 0) C0.Print(std::cout); - MnMatrixNN C1[NLIST]; testATBA_S(B,C0,t_ama,C1); - //if (k == 0) C1.Print(std::cout); - - MnMatrixNN C2[NLIST]; testInv_S(C1,t_inv,C2); - MnMatrixNN C3[NLIST]; testInvFast_S(C2,t_inv,C3); - //MnMatrixNN C2 = C1; - - MnVectorN v3[NLIST]; testVeq(v,t_veq,v3); - MnVectorN v4[NLIST]; testVad(v2,v3,t_vad,v4); - MnVectorN v5[NLIST]; testVscale(v4,2.0,t_vsc,v5); - MnMatrixNN C4[NLIST]; testMeq(C,t_meq,C4); - MnMatrixNN C5[NLIST]; testMad(C3,C4,t_mad,C5); - MnMatrixNN C6[NLIST]; testMscale(C5,0.5,t_msc,C6); - MnMatrixNN C7[NLIST]; testMT_S(C6,t_tra,C7); - -#ifdef DEBUG - if (k == 0) { - std::cout << " C6 = " << C5 << std::endl; - std::cout << " v5 = " << v5 << std::endl; - } -#endif - - testDot_S(v3,v5,r1,t_dot); - - testInnerProd_S(C7,v5,r2,t_prd); - - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - - } - //tr.dump(); - - //double totTime = t_veq + t_meq + t_vad + t_mad + t_dot + t_mv + t_gmv + t_mm + t_prd + t_inv + t_vsc + t_msc + t_ama + t_tra; - std::cout << "Total Time = " << totTime1 << " (s) " << " cpu " << 
totTime2 << " (s) " << std::endl; - std::cerr << "SMatrix: r1[0] = " << r1[0] << " r2[0] = " << r2[0] << std::endl; - std::cerr << "SMatrix: r1[N] = " << r1[NLIST-1] << " r2[N] = " << r2[NLIST-1] << std::endl; - std::cerr << "sum of results = " << gResultSum << std::endl; - - return 0; -} - - - -#ifdef TEST_SYM -template -int test_smatrix_sym_op() { - - // need to write explicitly the dimensions - - - typedef SMatrix > MnSymMatrixNN; - typedef SMatrix MnMatrixNN; - typedef SVector MnVectorN; - - - - int first = ND1; //Can change the size of the matrices - - - std::cout << "************************************************\n"; - std::cout << " SMatrixSym operations test " << first << " x " << first << std::endl; - std::cout << "************************************************\n"; - - - double t_meq, t_mad, t_mv, t_gmv, t_mm, t_prd, t_inv, t_msc, t_ama = 0; - double totTime1, totTime2; - - - - Stype r1[NLIST]; - int npass = NITER; - TRandom3 r(111); - gResultSum = 0; - - for (int k = 0; k < npass; k++) { - - - MnSymMatrixNN A[NLIST]; - MnSymMatrixNN B[NLIST]; - MnMatrixNN C[NLIST]; - MnVectorN v[NLIST]; - - - TStopwatch w; - - // fill matrices with random data - for (int j = 0; j < NLIST; j++) fillRandomSym(r,A[j],first); - for (int j = 0; j < NLIST; j++) fillRandomSym(r,B[j],first); - for (int j = 0; j < NLIST; j++) fillRandomMat(r,C[j],first,first); - for (int j = 0; j < NLIST; j++) fillRandomVec(r,v[j],first); - - - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - std::cout << " A = " << A[0] << std::endl; - std::cout << " B = " << B[0] << std::endl; - std::cout << " C = " << C[0] << std::endl; - std::cout << " v = " << v[0] << std::endl; - } -#endif - - w.Start(); - - MnVectorN v1[NLIST]; testMV(A,v,t_mv,v1); - MnVectorN v2[NLIST]; testGMV(A,v,v1,t_gmv,v2); - MnMatrixNN C0[NLIST]; testMM(A,B,C,t_mm,C0); - MnSymMatrixNN C1[NLIST]; testATBA_S2(C0,B,t_ama,C1); - MnSymMatrixNN C2[NLIST]; testInv_S(A,t_inv,C2); - MnSymMatrixNN C3[NLIST]; 
testInvFast_S(C2,t_inv,C3); - MnSymMatrixNN C4[NLIST]; testInvChol_S(C3,t_inv,C4); - //C2 = C1; - MnSymMatrixNN C5[NLIST]; testMeq(C4,t_meq,C5); - MnSymMatrixNN C6[NLIST]; testMad(A,C5,t_mad,C6); - MnSymMatrixNN C7[NLIST]; testMscale(C6,0.5,t_msc,C7); - - testInnerProd_S(C7,v2,r1,t_prd); - - -#ifdef DEBUG - std::cout << "output matrices" << std::endl; - if (k == 0) { - std::cout << " C1 = " << C1 << std::endl; - std::cout << " C3 = " << C3 << std::endl; - std::cout << " C4 = " << C4 << std::endl; - std::cout << " C5 = " << C5 << std::endl; - } -#endif - - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - - - } - //tr.dump(); - - //double totTime = t_meq + t_mv + t_gmv + t_mm + t_prd + t_inv + t_mad + t_msc + t_ama; - std::cout << "Total Time = " << totTime1 << " (s) - cpu " << totTime2 << " (s) " << std::endl; - std::cerr << "SMatrixSym: r1[0] = " << r1[0] << std::endl; - std::cerr << "SMatrixSym: r1[N] = " << r1[NLIST-1] << std::endl; - std::cerr << "sum of results = " << gResultSum << std::endl; - - return 0; -} -#endif - - -// ROOT test - - -template -int test_tmatrix_op() { - -#ifdef USE_TMATRIX - - - - typedef TMatrixD MnMatrix; - typedef TVectorD MnVector; - -// typedef boost::numeric::ublas::matrix MnMatrix; - //typedef HepSymMatrix MnSymMatrixHep; - - - int first = ND1; //Can change the size of the matrices - int second = ND2; - - - std::cout << "************************************************\n"; - std::cout << " TMatrix operations test " << first << " x " << second << std::endl; - std::cout << "************************************************\n"; - - double t_veq, t_meq, t_vad, t_mad, t_dot, t_mv, t_gmv, t_mm, t_prd, t_inv, t_vsc, t_msc, t_ama, t_tra = 0; - double totTime1, totTime2; - - double r1,r2; - int npass = NITER; - TRandom3 r(111); - gMatrixCheck = 0; - - for (int k = 0; k < npass; k++) { - - - MnMatrix A(ND1,ND2); - MnMatrix B(ND2,ND1); - MnMatrix C(ND1,ND1); - MnMatrix D(ND2,ND2); - MnVector v(ND1); - MnVector p(ND2); - - - 
TStopwatch w; - { - // fill matrices with random data - fillRandomMat(r,A,first,second); - fillRandomMat(r,B,second,first); - fillRandomMat(r,C,first,first); - fillRandomMat(r,D,second,second); - - fillRandomVec(r,v,first); - fillRandomVec(r,p,second); - - } - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - A.Print(); B.Print(); C.Print(); D.Print(); v.Print(); p.Print(); - } -#endif - w.Start(); - - - MnVector v1(ND1); testMV_T(A,v,t_mv,v1); - //if (k == 0) v1.Print(); - MnVector v2(ND1); testGMV_T(A,v,v1,t_gmv,v2); - //if (k == 0) v2.Print(); - MnMatrix C0(ND1,ND1); testMM_T(A,B,C,t_mm,C0); - //if (k == 0) C0.Print(); - MnMatrix C1(ND1,ND1); testATBA_T(B,C0,t_ama,C1); - //if (k == 0) C1.Print(); - MnMatrix C2(ND1,ND1); testInv_T(C1,t_inv,C2); - //if (k == 0) C2.Print(); - MnVector v3(ND1); testVeq(v,t_veq,v3); - MnVector v4(ND1); testVad_T(v2,v3,t_vad,v4); - MnVector v5(ND1); testVscale_T(v4,2.0,t_vsc,v5); - MnMatrix C3(ND1,ND1); testMeq(C,t_meq,C3); - MnMatrix C4(ND1,ND1); testMad_T(C2,C3,t_mad,C4); - //if (k == 0) C4.Print(); - MnMatrix C5(ND1,ND1); testMscale_T(C4,0.5,t_msc,C5); - //if (k == 0) C5.Print(); - MnMatrix C6(ND1,ND1); testMT_T(C5,t_tra,C6); - -#ifdef DEBUG - if (k == 0) { - C6.Print(); - v5.Print(); - } -#endif - - r1 = testDot_T(v3,v5,t_dot); - - r2 = testInnerProd_T(C6,v5,t_prd); - - //MnMatrix C2b(ND1,ND1); testInv_T2(C1,t_inv2,C2b); - - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - } - // tr.dump(); - - //double totTime = t_veq + t_meq + t_vad + t_mad + t_dot + t_mv + t_gmv + t_mm + t_prd + t_inv + t_inv2 + t_vsc + t_msc + t_ama + t_tra; - std::cout << "Total Time = " << totTime1 << " (s) - cpu " << totTime2 << " (s) " << std::endl; - std::cerr << "TMatrix: r1 = " << r1 << " r2 = " << r2 << std::endl; - -#endif - - return 0; - -} - - - -#ifdef TEST_SYM -template -int test_tmatrix_sym_op() { - -#ifdef USE_TMATRIX - - // need to write explicitly the dimensions - - - typedef TMatrixDSym MnSymMatrix; - 
typedef TMatrixD MnMatrix; - typedef TVectorD MnVector; - - - - int first = ND1; //Can change the size of the matrices - - - std::cout << "************************************************\n"; - std::cout << " TMatrixSym operations test " << first << " x " << first << std::endl; - std::cout << "************************************************\n"; - - - double t_meq, t_mad, t_mv, t_gmv, t_mm, t_prd, t_inv, t_msc, t_ama = 0; - double totTime1, totTime2; - - - - double r1; - int npass = NITER; - TRandom3 r(111); - for (int k = 0; k < npass; k++) { - - - MnSymMatrix A(ND1); - MnSymMatrix B(ND1); - MnMatrix C(ND1,ND1); - MnVector v(ND1); -#define N ND1 - - TStopwatch w; - - { - // fill matrices with random data - fillRandomSym(r,A,first); - fillRandomSym(r,B,first); - fillRandomMat(r,C,first,first); - - fillRandomVec(r,v,first); - - } - - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - A.Print(); B.Print(); C.Print(); v.Print(); - } -#endif - - w.Start(); - - MnVector v1(N); testMV_T(A,v,t_mv,v1); - MnVector v2(N); testGMV_T(A,v,v1,t_gmv,v2); - MnMatrix C0(N,N); testMM_T(A,B,C,t_mm,C0); - MnSymMatrix C1(N); testATBA_T2(C0,B,t_ama,C1); - MnSymMatrix C2(N); testInv_T(A,t_inv,C2); - MnSymMatrix C3(N); testMeq(C2,t_meq,C3); - MnSymMatrix C4(N); testMad_T(A,C3,t_mad,C4); - MnSymMatrix C5(N); testMscale_T(C4,0.5,t_msc,C5); - - r1 = testInnerProd_T(C5,v2,t_prd); - -#ifdef DEBUG - std::cout << "output matrices" << std::endl; - if (k == 0) { - C1.Print(); C3.Print(); C4.Print(); C5.Print(); - } -#endif - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - - } - //tr.dump(); - - //double totTime = t_meq + t_mv + t_gmv + t_mm + t_prd + t_inv + t_mad + t_msc + t_ama; - std::cout << "Total Time = " << totTime1 << " (s) - cpu " << totTime2 << " (s) " << std::endl; - std::cerr << "TMatrixSym: r1 = " << r1 << std::endl; - -#endif - - return 0; -} -#endif // end TEST_SYM - -#ifdef HAVE_CLHEP - -template -int test_hepmatrix_op() { - - - - - typedef 
HepMatrix MnMatrix; - typedef HepVector MnVector; - - - - int first = ND1; //Can change the size of the matrices - int second = ND2; - - - std::cout << "************************************************\n"; - std::cout << " HepMatrix operations test " << first << " x " << second << std::endl; - std::cout << "************************************************\n"; - - double t_veq, t_meq, t_vad, t_mad, t_dot, t_mv, t_gmv, t_mm, t_prd, t_inv, t_vsc, t_msc, t_ama, t_tra = 0; - - - double totTime1, totTime2; - - double r1,r2; - int npass = NITER; - TRandom3 r(111); - - for (int k = 0; k < npass; k++) { - - - MnMatrix A(ND1,ND2); - MnMatrix B(ND2,ND1); - MnMatrix C(ND1,ND1); - MnMatrix D(ND2,ND2); - MnVector v(ND1); - MnVector p(ND2); - - TStopwatch w; - - { - // fill matrices with random data - fillRandomMat(r,A,first,second,1); - fillRandomMat(r,B,second,first,1); - fillRandomMat(r,C,first,first,1); - fillRandomMat(r,D,second,second,1); - - fillRandomVec(r,v,first); - fillRandomVec(r,p,second); - } - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - std::cout << " A = " << A << std::endl; - std::cout << " B = " << B << std::endl; - std::cout << " C = " << C << std::endl; - std::cout << " D = " << D << std::endl; - std::cout << " v = " << v << std::endl; - std::cout << " p = " << p << std::endl; - } -#endif - - w.Start(); - - MnVector v1(ND1); testMV(A,v,t_mv,v1); - MnVector v2(ND1); testGMV(A,v,v1,t_gmv,v2); - MnMatrix C0(ND1,ND1); testMM_C(A,B,C,t_mm,C0); - MnMatrix C1(ND1,ND1); testATBA_C(B,C0,t_ama,C1); - //std::cout << " C1 = " << C1 << std::endl; - MnMatrix C2(ND1,ND1); testInv_C(C1,t_inv,C2); - //std::cout << " C2 = " << C2 << std::endl; - MnVector v3(ND1); testVeq(v,t_veq,v3); - MnVector v4(ND1); testVad(v2,v3,t_vad,v4); - MnVector v5(ND1); testVscale(v4,2.0,t_vsc,v5); - MnMatrix C3(ND1,ND1); testMeq_C(C,t_meq,C3); - MnMatrix C4(ND1,ND1); testMad_C(C2,C3,t_mad,C4); - //std::cout << " C4 = " << C4 << std::endl; - MnMatrix C5(ND1,ND1); 
testMscale_C(C4,0.5,t_msc,C5); - //std::cout << " C5 = " << C5 << std::endl; - MnMatrix C6(ND1,ND1); testMT_C(C5,t_tra,C6); - - - r1 = testDot_C(v3,v5,t_dot); - r2 = testInnerProd_C(C6,v5,t_prd); - -#ifdef DEBUG - if (k == 0) { - std::cout << " C6 = " << C6 << std::endl; - std::cout << " v5 = " << v5 << std::endl; - } -#endif - - // MnMatrix C2b(ND1,ND1); testInv_T2(C1,t_inv2,C2b); - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - - } - // tr.dump(); - - std::cout << "Total Time = " << totTime1 << " (s) - cpu " << totTime2 << " (s) " << std::endl; - std::cerr << "HepMatrix: r1 = " << r1 << " r2 = " << r2 << std::endl; - - return 0; -} - - -#ifdef TEST_SYM -template -int test_hepmatrix_sym_op() { - - // need to write explicitly the dimensions - - - typedef HepSymMatrix MnSymMatrix; - typedef HepMatrix MnMatrix; - typedef HepVector MnVector; - - - - int first = ND1; //Can change the size of the matrices - - - std::cout << "************************************************\n"; - std::cout << " HepMatrixSym operations test " << first << " x " << first << std::endl; - std::cout << "************************************************\n"; - - - double t_meq, t_mad, t_mv, t_gmv, t_mm, t_prd, t_inv, t_msc, t_ama = 0; - - double totTime1, totTime2; - - - double r1; - int npass = NITER; - TRandom3 r(111); - for (int k = 0; k < npass; k++) { - - - MnSymMatrix A(ND1); - MnSymMatrix B(ND1); - MnMatrix C(ND1,ND1); - MnVector v(ND1); -#define N ND1 - - TStopwatch w; - - { - // fill matrices with random data - fillRandomSym(r,A,first,1); - fillRandomSym(r,B,first,1); - fillRandomMat(r,C,first,first,1); - fillRandomVec(r,v,first); - - } - - -#ifdef DEBUG - std::cout << "pass " << k << std::endl; - if (k == 0) { - } -#endif - - w.Start(); - - MnVector v1(N); testMV(A,v,t_mv,v1); - MnVector v2(N); testGMV(A,v,v1,t_gmv,v2); - MnMatrix C0(N,N); testMM_C(A,B,C,t_mm,C0); - MnSymMatrix C1(N); testATBA_C2(C0,B,t_ama,C1); - MnSymMatrix C2(N); testInv_C(A,t_inv,C2); - 
MnSymMatrix C3(N); testMeq_C(C2,t_meq,C3); - MnSymMatrix C4(N); testMad_C(A,C3,t_mad,C4); - MnSymMatrix C5(N); testMscale_C(C4,0.5,t_msc,C5); - - r1 = testInnerProd_C(C5,v2,t_prd); - -#ifdef DEBUG - std::cout << "output matrices" << std::endl; - if (k == 0) { - } -#endif - - w.Stop(); - totTime1 = w.RealTime(); - totTime2 = w.CpuTime(); - - } - //tr.dump(); - - std::cout << "Total Time = " << totTime1 << " (s) - cpu " << totTime2 << " (s) " << std::endl; - std::cerr << "HepMatrixSym: r1 = " << r1 << std::endl; - - return 0; -} - -#endif // TEST_SYM -#endif // HAVE_CLHEP - - -#if defined(HAVE_CLHEP) && defined (TEST_SYM) -#define NTYPES 6 -#define TEST(N) \ - MATRIX_SIZE=N; \ - TEST_TYPE=0; test_smatrix_op(); \ - TEST_TYPE=1; test_tmatrix_op(); \ - TEST_TYPE=2; test_hepmatrix_op(); \ - TEST_TYPE=3; test_smatrix_sym_op(); \ - TEST_TYPE=4; test_tmatrix_sym_op(); \ - TEST_TYPE=5; test_hepmatrix_sym_op(); -#elif !defined(HAVE_CLHEP) && !defined(USE_TMATRIX) && defined (TEST_SYM) -#define NTYPES 2 -#define TEST(N) \ - MATRIX_SIZE=N; \ - TEST_TYPE=0; test_smatrix_op(); \ - TEST_TYPE=1; test_smatrix_sym_op(); -#elif !defined(HAVE_CLHEP) && defined(USE_TMATRIX) && defined (TEST_SYM) -#define NTYPES 4 -#define TEST(N) \ - MATRIX_SIZE=N; \ - TEST_TYPE=0; test_smatrix_op(); \ - TEST_TYPE=1; test_tmatrix_op(); \ - TEST_TYPE=2; test_smatrix_sym_op(); \ - TEST_TYPE=3; test_tmatrix_sym_op(); -#elif defined(HAVE_CLHEP) && !defined (TEST_SYM) -#define NTYPES 3 -#define TEST(N) \ - MATRIX_SIZE=N; \ - TEST_TYPE=0; test_smatrix_op(); \ - TEST_TYPE=1; test_tmatrix_op(); \ - TEST_TYPE=2; test_hepmatrix_op(); -#else -#define NTYPES 2 -#define TEST(N) \ - TEST_TYPE=0; test_smatrix_op(); \ - TEST_TYPE=1; test_tmatrix_op(); -#endif - - - -int TEST_TYPE; -int MATRIX_SIZE; -#ifdef REPORT_TIME -std::vector< std::map > testTimeResults(NTYPES); -std::vector< std::string > typeNames(NTYPES); - -void ROOT::Math::test::reportTime(std::string s, double time) { - assert( TEST_TYPE >= 0 && TEST_TYPE < 
NTYPES ); - std::map & result = testTimeResults[TEST_TYPE]; - - std::map::iterator pos = result.find(s); - TH1D * h = 0; - if ( pos != result.end() ) { - h = pos->second; - } - else { - // add new elements in map - //std::cerr << "insert element in map" << s << typeNames[TEST_TYPE] << std::endl; - std::string name = typeNames[TEST_TYPE] + "_" + s; - h = new TProfile(name.c_str(), name.c_str(),100,0.5,100.5); - //result.insert(std::map::value_type(s,h) ); - result[s] = h; - } - double scale=1; - if (s.find("dot") != std::string::npos || - s.find("V=V") != std::string::npos || - s.find("V+V") != std::string::npos ) scale = 10; - h->Fill(double(MATRIX_SIZE),time/double(NLOOP*NITER*scale) ); -} -#endif - -int testOperations() { - - NLOOP = 1000*NLOOP_MIN; - -#ifdef USE_VC - std::cout << "Using VC library - size = " << Vc::double_v::Size << " VC_IMPL = " << VC_IMPL << std::endl; -#endif - - std::cout << " making vector/matrix lists of size = " << NLIST << std::endl; - - - initValues(); - -#ifndef NDIM1 -#define NDIM1 5 -#endif - TEST(NDIM1) - - return 0; -} - - -int main(int argc , char *argv[] ) { - - - std::string fname = "testOperations"; - if (argc > 1) { - std::string platf(argv[1]); - fname = fname + "_" + platf; - } - fname = fname + ".root"; - - -#ifdef REPORT_TIME - TFile * f = new TFile(fname.c_str(),"recreate"); - - typeNames[0] = "SMatrix"; - typeNames[1] = "TMatrix"; -#if !defined(HAVE_CLHEP) && defined (TEST_SYM) - typeNames[2] = "SMatrix_sym"; - typeNames[3] = "TMatrix_sym"; -#elif defined(HAVE_CLHEP) && defined (TEST_SYM) - typeNames[2] = "HepMatrix"; - typeNames[3] = "SMatrix_sym"; - typeNames[4] = "TMatrix_sym"; - typeNames[5] = "HepMatrix_sym"; -#elif defined(HAVE_CLHEP) && !defined (TEST_SYM) - typeNames[2] = "HepMatrix"; -#endif - -#endif - -#ifndef TEST_ALL_MATRIX_SIZES -// NLOOP = 1000*NLOOP_MIN -// initValues(); - -// TEST(5) -// NLOOP = 50*NLOOP_MIN; -// TEST(30); - - return testOperations(); - -#else - NLOOP = 5000*NLOOP_MIN; - initValues(); - 
- - - TEST(2); - TEST(3); - TEST(4); - NLOOP = 1000*NLOOP_MIN - TEST(5); - TEST(6); - TEST(7); - TEST(10); - NLOOP = 100*NLOOP_MIN; - TEST(15); - TEST(20); - NLOOP = 50*NLOOP_MIN; - TEST(30); -// NLOOP = NLOOP_MIN; -// TEST(50); -// TEST(75); -// TEST(100); -#endif - -#ifdef REPORT_TIME - f->Write(); - f->Close(); -#endif - -} - diff --git a/math/vc/examples/tsc.h b/math/vc/examples/tsc.h deleted file mode 100644 index b9526ccffa8b9..0000000000000 --- a/math/vc/examples/tsc.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - Copyright (C) 2009-2012 Matthias Kretz - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) version 3. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. 
- -*/ - -#ifndef TSC_H -#define TSC_H - -#ifdef _MSC_VER -#include -#pragma intrinsic(__rdtsc) -#endif - -class TimeStampCounter -{ - public: - void Start(); - void Stop(); - unsigned long long Cycles() const; - - private: - union Data { - unsigned long long a; - unsigned int b[2]; - } m_start, m_end; -}; - -inline void TimeStampCounter::Start() -{ -#ifdef _MSC_VER - unsigned int tmp; - m_start.a = __rdtscp(&tmp); -#else - asm volatile("rdtscp" : "=a"(m_start.b[0]), "=d"(m_start.b[1]) :: "ecx" ); -#endif -} - -inline void TimeStampCounter::Stop() -{ -#ifdef _MSC_VER - unsigned int tmp; - m_end.a = __rdtscp(&tmp); -#else - asm volatile("rdtscp" : "=a"(m_end.b[0]), "=d"(m_end.b[1]) :: "ecx" ); -#endif -} - -inline unsigned long long TimeStampCounter::Cycles() const -{ - return m_end.a - m_start.a; -} - -#endif // TSC_H diff --git a/math/vc/include/Vc/Allocator b/math/vc/include/Vc/Allocator deleted file mode 100644 index 8ea407ddbf564..0000000000000 --- a/math/vc/include/Vc/Allocator +++ /dev/null @@ -1,247 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_ALLOCATOR_H -#define VC_ALLOCATOR_H - -#include -#include -#include -#include "global.h" -#ifdef VC_CXX11 -#include -#endif -#include "common/macros.h" - -namespace ROOT { -namespace Vc -{ - using std::size_t; - using std::ptrdiff_t; - - /** - * \headerfile Allocator - * \ingroup Utilities - * - * Convenience macro to set the default allocator for a given \p Type to Vc::Allocator. - * - * \param Type Your type that you want to use with STL containers. - * - * \note You have to use this macro in the global namespace. - */ -#define VC_DECLARE_ALLOCATOR(Type) \ -namespace std \ -{ \ - template<> class allocator : public ::ROOT::Vc::Allocator \ - { \ - public: \ - template struct rebind { typedef ::std::allocator other; }; \ - }; \ -} -#ifdef VC_MSVC -#undef Vc_DECLARE_ALLOCATOR -#define Vc_DECLARE_ALLOCATOR(Type) \ -namespace std \ -{ \ - template<> class allocator : public ::ROOT::Vc::Allocator \ - { \ - public: \ - template struct rebind { typedef ::std::allocator other; }; \ - /* MSVC brokenness: the following function is optional - just doesn't compile without it */ \ - const allocator &select_on_container_copy_construction() const { return *this; } \ - }; \ -} -#endif - - /** - * \headerfile Allocator - * An allocator that uses global new and supports over-aligned types, as per [C++11 20.6.9]. - * - * Meant as a simple replacement for the allocator defined in the C++ Standard. - * Allocation is done using the global new/delete operators. But if the alignment property of \p - * T is larger than the size of a pointer, the allocate function allocates slightly more memory - * to adjust the pointer for correct alignment. - * - * If the \p T does not require over-alignment no additional memory will be allocated. - * - * \tparam T The type of objects to allocate. 
- * - * Example: - * \code - * struct Data { - * Vc::float_v x, y, z; - * }; - * - * void fun() - * { - * std::vector dat0; // this will use std::allocator, which probably ignores the - * // alignment requirements for Data. Thus any access to dat0 may - * // crash your program. - * - * std::vector > dat1; // now std::vector will get correctly aligned - * // memory. Accesses to dat1 are safe. - * ... - * \endcode - * - * %Vc ships a macro to conveniently tell STL to use Vc::Allocator per default for a given type: - * \code - * struct Data { - * Vc::float_v x, y, z; - * }; - * VC_DECLARE_ALLOCATOR(Data) - * - * void fun() - * { - * std::vector dat0; // good now - * ... - * \endcode - * - * \ingroup Utilities - */ - template class Allocator - { - private: - enum Constants { -#ifdef VC_HAVE_STD_MAX_ALIGN_T - NaturalAlignment = alignof(std::max_align_t), -#elif defined(VC_HAVE_MAX_ALIGN_T) - NaturalAlignment = alignof(::max_align_t), -#else - NaturalAlignment = sizeof(void *) > Vc_ALIGNOF(long double) ? sizeof(void *) : - (Vc_ALIGNOF(long double) > Vc_ALIGNOF(long long) ? Vc_ALIGNOF(long double) : Vc_ALIGNOF(long long)), -#endif -#ifdef VC_IMPL_AVX - SimdAlignment = 32, -#elif defined VC_IMPL_SSE - SimdAlignment = 16, -#else - SimdAlignment = 1, -#endif - Alignment = Vc_ALIGNOF(T) > SimdAlignment ? Vc_ALIGNOF(T) : SimdAlignment, - /* The number of extra bytes allocated must be large enough to put a pointer right - * before the adjusted address. This pointer stores the original address, which is - * required to call ::operator delete in deallocate. - * - * The address we get from ::operator new is a multiple of NaturalAlignment: - * p = N * NaturalAlignment - * - * Since all alignments are powers of two, Alignment is a multiple of NaturalAlignment: - * Alignment = k * NaturalAlignment - * - * two cases: - * 1. If p is already aligned to Alignment then allocate will return p + Alignment. In - * this case there are Alignment Bytes available to store a pointer. - * 2. 
If p is not aligned then p + (k - (N modulo k)) * NaturalAlignment will be - * returned. Since NaturalAlignment >= sizeof(void*) the pointer fits. - */ - ExtraBytes = Alignment > NaturalAlignment ? Alignment : 0, - AlignmentMask = Alignment - 1 - }; - public: - typedef size_t size_type; - typedef ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; - - template struct rebind { typedef Allocator other; }; - - Allocator() throw() { } - Allocator(const Allocator&) throw() { } - template Allocator(const Allocator&) throw() { } - - pointer address(reference x) const { return &x; } - const_pointer address(const_reference x) const { return &x; } - - pointer allocate(size_type n, const void* = 0) - { - if (n > this->max_size()) { - throw std::bad_alloc(); - } - - char *p = static_cast(::operator new(n * sizeof(T) + ExtraBytes)); - if (ExtraBytes > 0) { - char *const pp = p; - p += ExtraBytes; - const char *null = 0; - p -= ((p - null) & AlignmentMask); // equivalent to p &= ~AlignmentMask; - reinterpret_cast(p)[-1] = pp; - } - return reinterpret_cast(p); - } - - void deallocate(pointer p, size_type) - { - if (ExtraBytes > 0) { - p = reinterpret_cast(p)[-1]; - } - ::operator delete(p); - } - - size_type max_size() const throw() { return size_t(-1) / sizeof(T); } - -#ifdef VC_MSVC - // MSVC brokenness: the following function is optional - just doesn't compile without it - const Allocator &select_on_container_copy_construction() const { return *this; } - - // MSVC also requires a function that neither C++98 nor C++11 mention - // but it doesn't support variadic templates... 
otherwise the VC_CXX11 clause would be nice - void construct(pointer p) { ::new(p) T(); } - - // we still need the C++98 version: - void construct(pointer p, const T& __val) { ::new(p) T(__val); } - void destroy(pointer p) { p->~T(); } -#elif defined(VC_CXX11) - template void construct(U* p, Args&&... args) - { - ::new(p) U(std::forward(args)...); - } - template void destroy(U* p) { p->~U(); } -#else - void construct(pointer p, const T& __val) { ::new(p) T(__val); } - void destroy(pointer p) { p->~T(); } -#endif - }; - - template inline bool operator==(const Allocator&, const Allocator&) { return true; } - template inline bool operator!=(const Allocator&, const Allocator&) { return false; } - -} // namespace Vc -} // namespace ROOT - -#include "common/undomacros.h" -#include "vector.h" -namespace std -{ - template class allocator > : public ::ROOT::Vc::Allocator > - { - public: - template struct rebind { typedef ::std::allocator other; }; -#ifdef VC_MSVC - // MSVC brokenness: the following function is optional - just doesn't compile without it - const allocator &select_on_container_copy_construction() const { return *this; } -#endif - }; -} - -#endif // VC_ALLOCATOR_H - -// vim: ft=cpp et sw=4 sts=4 diff --git a/math/vc/include/Vc/IO b/math/vc/include/Vc/IO deleted file mode 100644 index 084b8d94dec60..0000000000000 --- a/math/vc/include/Vc/IO +++ /dev/null @@ -1,195 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VECIO_H -#define VECIO_H - -#include "vector.h" -#include "Memory" -#include - -#if defined(__GNUC__) && !defined(_WIN32) && defined(_GLIBCXX_OSTREAM) -#define VC_HACK_OSTREAM_FOR_TTY 1 -#endif - -#ifdef VC_HACK_OSTREAM_FOR_TTY -#include -#include -#endif - -#include "internal/namespace.h" - -namespace -{ - namespace AnsiColor - { - struct Type { const char *data; }; - static const Type green = { "\033[1;40;32m" }; - static const Type yellow = { "\033[1;40;33m" }; - static const Type blue = { "\033[1;40;34m" }; - static const Type normal = { "\033[0m" }; - } // namespace AnsiColor - -#ifdef VC_HACK_OSTREAM_FOR_TTY - class hacked_ostream : public std::ostream - { - public: - using std::ostream::_M_streambuf; - }; - __attribute__((__const__)) bool mayUseColor(const std::ostream &os) - { - std::basic_streambuf *hack1 = const_cast *>(os.*(&hacked_ostream::_M_streambuf)); - __gnu_cxx::stdio_sync_filebuf *hack = dynamic_cast<__gnu_cxx::stdio_sync_filebuf *>(hack1); - if (!hack) { - return false; - } - FILE *file = hack->file(); - return 1 == isatty(fileno(file)); - } -#else - inline bool mayUseColor(const std::ostream &) { return false; } -#endif -} // anonymous namespace - -namespace std -{ -inline std::ostream &operator<<(std::ostream &out, const AnsiColor::Type &c) -{ - if (mayUseColor(out)) { - out << c.data; - } - return out; -} - -template -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) -{ - out << AnsiColor::green << "["; - out << v[0]; - for (int i = 1; i < v.Size; ++i) { - out << ", " << v[i]; - } - out << "]" << AnsiColor::normal; - return out; -} - -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) -{ - out << AnsiColor::green << "["; - out << int(v[0]); - for (int i = 1; i < v.Size; ++i) { - out << ", " << int(v[i]); - } - out << "]" << AnsiColor::normal; - 
return out; -} -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector &v) -{ - out << AnsiColor::green << "["; - out << int(v[0]); - for (int i = 1; i < v.Size; ++i) { - out << ", " << int(v[i]); - } - out << "]" << AnsiColor::normal; - return out; -} - -#ifdef VC_HAVE_FMA -template -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::VectorMultiplication &v) -{ - return out << VECTOR_NAMESPACE::Vector(v); -} -#endif - -#ifdef VC_IMPL_AVX -template -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask &m) -#else -template -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask &m) -#endif -{ - out << AnsiColor::blue << "m["; - for (unsigned int i = 0; i < VectorSize; ++i) { - if (i > 0 && (i % 4) == 0) { - out << " "; - } - if ( m[i] ) { - out << AnsiColor::yellow << '1'; - } else { - out << AnsiColor::blue << '0'; - } - } - out << AnsiColor::blue << "]" << AnsiColor::normal; - return out; -} -#ifdef VC_IMPL_SSE -inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Float8Mask &m) -{ - out << AnsiColor::blue << "m["; - for (unsigned int i = 0; i < 8; ++i) { - if (i > 0 && (i % 4) == 0) { - out << " "; - } - if ( m[i] ) { - out << AnsiColor::yellow << '1'; - } else { - out << AnsiColor::blue << '0'; - } - } - out << AnsiColor::blue << "]" << AnsiColor::normal; - return out; -} -#endif - -template -inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase &m ) -{ - out << AnsiColor::blue << "{" << AnsiColor::normal; - for (unsigned int i = 0; i < m.vectorsCount(); ++i) { - out << V(m.vector(i)); - } - out << AnsiColor::blue << "}" << AnsiColor::normal; - return out; -} - -template -inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase &m ) -{ - out << AnsiColor::blue << "{" << AnsiColor::normal; - for (size_t i = 0; i < m.rowsCount(); ++i) { - if (i > 0) { - out << "\n "; - } - const size_t vcount = 
m[i].vectorsCount(); - for (size_t j = 0; j < vcount; ++j) { - out << V(m[i].vector(j)); - } - } - out << AnsiColor::blue << "}" << AnsiColor::normal; - return out; -} -} // namespace std - -#undef VECTOR_NAMESPACE - -#endif // VECIO_H - -// vim: ft=cpp diff --git a/math/vc/include/Vc/Memory b/math/vc/include/Vc/Memory deleted file mode 100644 index 214cc349c20a8..0000000000000 --- a/math/vc/include/Vc/Memory +++ /dev/null @@ -1,37 +0,0 @@ - -/* This file is part of the Vc library. - - Copyright (C) 2009 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef INCLUDE_VC_MEMORY -#define INCLUDE_VC_MEMORY - -#include "vector.h" -#include "common/memory.h" -#include "common/interleavedmemory.h" -#ifdef VC_IMPL_Scalar -# include "scalar/interleavedmemory.tcc" -#elif defined(VC_IMPL_AVX) -# include "avx/interleavedmemory.tcc" -#elif defined(VC_IMPL_SSE) -# include "sse/interleavedmemory.tcc" -#endif - -#endif // INCLUDE_VC_MEMORY - -// vim: ft=cpp diff --git a/math/vc/include/Vc/Utils b/math/vc/include/Vc/Utils deleted file mode 100644 index ef5a0eaad864b..0000000000000 --- a/math/vc/include/Vc/Utils +++ /dev/null @@ -1,33 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_UTILS -#define VC_UTILS - -#include "global.h" - -#ifdef VC_IMPL_Scalar -# define VECTOR_NAMESPACE Scalar -#else -# define VECTOR_NAMESPACE SSE -#endif - -#include "common/deinterleave.h" - -#endif // VC_UTILS diff --git a/math/vc/include/Vc/Vc b/math/vc/include/Vc/Vc deleted file mode 100644 index 47c8c4c726e71..0000000000000 --- a/math/vc/include/Vc/Vc +++ /dev/null @@ -1,29 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_VC -#define VC_VC -#include "vector.h" -#include "IO" -#include "Memory" -#include "Utils" -#include "Allocator" -#endif // VC_VC - -// vim: ft=cpp diff --git a/math/vc/include/Vc/avx/casts.h b/math/vc/include/Vc/avx/casts.h deleted file mode 100644 index 9945d33ea3577..0000000000000 --- a/math/vc/include/Vc/avx/casts.h +++ /dev/null @@ -1,190 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef AVX_CASTS_H -#define AVX_CASTS_H - -#include "intrinsics.h" -#include "types.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - template static Vc_INTRINSIC_L T avx_cast(param128 v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param128i v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param128d v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256 v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256i v) Vc_INTRINSIC_R; - template static Vc_INTRINSIC_L T avx_cast(param256d v) Vc_INTRINSIC_R; - -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - template static Vc_INTRINSIC T avx_cast(__m128 v) { return avx_cast(param128 (v)); } - template static Vc_INTRINSIC T avx_cast(__m128i v) { return avx_cast(param128i(v)); } - template static Vc_INTRINSIC T avx_cast(__m128d v) { return avx_cast(param128d(v)); } - template static Vc_INTRINSIC T avx_cast(__m256 v) { return avx_cast(param256 (v)); } - template static Vc_INTRINSIC T avx_cast(__m256i v) { return avx_cast(param256i(v)); } - template static Vc_INTRINSIC T avx_cast(__m256d v) { return avx_cast(param256d(v)); } -#endif - - // 128 -> 128 - template<> Vc_INTRINSIC m128 avx_cast(param128 v) { return v; } - template<> Vc_INTRINSIC m128 avx_cast(param128i v) { return _mm_castsi128_ps(v); } - template<> Vc_INTRINSIC m128 avx_cast(param128d v) { return _mm_castpd_ps(v); } - template<> Vc_INTRINSIC m128i avx_cast(param128 v) { return _mm_castps_si128(v); } - template<> Vc_INTRINSIC m128i avx_cast(param128i v) { return v; } - template<> Vc_INTRINSIC m128i avx_cast(param128d v) { return _mm_castpd_si128(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128 v) { return _mm_castps_pd(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128i v) { return _mm_castsi128_pd(v); } - template<> Vc_INTRINSIC m128d avx_cast(param128d v) { return v; } - - // 128 -> 256 - // FIXME: the following casts leave the upper 128bits undefined. 
With GCC and ICC I've never - // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's - // upper 128bits are zero. Thus using the same register as AVX register will have the upper - // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory - // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do - // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck, - // do we really want to rely on specific compiler behavior here? - template<> Vc_INTRINSIC m256 avx_cast(param128 v) { return _mm256_castps128_ps256(v); } - template<> Vc_INTRINSIC m256 avx_cast(param128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } - template<> Vc_INTRINSIC m256 avx_cast(param128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } - template<> Vc_INTRINSIC m256i avx_cast(param128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } - template<> Vc_INTRINSIC m256i avx_cast(param128i v) { return _mm256_castsi128_si256(v); } - template<> Vc_INTRINSIC m256i avx_cast(param128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } - template<> Vc_INTRINSIC m256d avx_cast(param128d v) { return _mm256_castpd128_pd256(v); } - -#ifdef VC_MSVC - static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } -#else - 
static Vc_INTRINSIC Vc_CONST m256 zeroExtend(param128 v) { return _mm256_castps128_ps256(v); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(param128i v) { return _mm256_castsi128_si256(v); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(param128d v) { return _mm256_castpd128_pd256(v); } -#ifdef VC_ICC - static Vc_INTRINSIC Vc_CONST m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } - static Vc_INTRINSIC Vc_CONST m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } - static Vc_INTRINSIC Vc_CONST m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } -#endif -#endif - - // 256 -> 128 - template<> Vc_INTRINSIC m128 avx_cast(param256 v) { return _mm256_castps256_ps128(v); } - template<> Vc_INTRINSIC m128 avx_cast(param256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } - template<> Vc_INTRINSIC m128 avx_cast(param256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } - template<> Vc_INTRINSIC m128i avx_cast(param256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } - template<> Vc_INTRINSIC m128i avx_cast(param256i v) { return _mm256_castsi256_si128(v); } - template<> Vc_INTRINSIC m128i avx_cast(param256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } - template<> Vc_INTRINSIC m128d avx_cast(param256d v) { return _mm256_castpd256_pd128(v); } - - // 256 -> 256 - template<> Vc_INTRINSIC m256 avx_cast(param256 v) { return v; } - template<> Vc_INTRINSIC m256 avx_cast(param256i v) { return _mm256_castsi256_ps(v); } - template<> Vc_INTRINSIC m256 avx_cast(param256d v) { return _mm256_castpd_ps(v); } - template<> Vc_INTRINSIC m256i avx_cast(param256 v) { return _mm256_castps_si256(v); } - template<> Vc_INTRINSIC m256i avx_cast(param256i v) { return v; } - template<> 
Vc_INTRINSIC m256i avx_cast(param256d v) { return _mm256_castpd_si256(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256 v) { return _mm256_castps_pd(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256i v) { return _mm256_castsi256_pd(v); } - template<> Vc_INTRINSIC m256d avx_cast(param256d v) { return v; } - - // simplify splitting 256-bit registers in 128-bit registers - Vc_INTRINSIC Vc_CONST m128 lo128(param256 v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128d lo128(param256d v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128i lo128(param256i v) { return avx_cast(v); } - Vc_INTRINSIC Vc_CONST m128 hi128(param256 v) { return _mm256_extractf128_ps(v, 1); } - Vc_INTRINSIC Vc_CONST m128d hi128(param256d v) { return _mm256_extractf128_pd(v, 1); } - Vc_INTRINSIC Vc_CONST m128i hi128(param256i v) { return _mm256_extractf128_si256(v, 1); } - - // simplify combining 128-bit registers in 256-bit registers - Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(param128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(param128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, param128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, param128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, param128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256 concat(param128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(param128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(param128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 
1); } - Vc_INTRINSIC Vc_CONST m256 concat(__m128 a, __m128 b) { return _mm256_insertf128_ps (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256d concat(__m128d a, __m128d b) { return _mm256_insertf128_pd (avx_cast(a), b, 1); } - Vc_INTRINSIC Vc_CONST m256i concat(__m128i a, __m128i b) { return _mm256_insertf128_si256(avx_cast(a), b, 1); } -#endif - - template struct StaticCastHelper {}; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256 v) { return _mm256_cvttps_epi32(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return concat(_mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; - template<> struct StaticCastHelper { static inline Vc_CONST m256i cast(param256 v) { - return _mm256_castps_si256(_mm256_blendv_ps( - _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), - _mm256_castsi256_ps(_mm256_add_epi32(m256i(_mm256_cvttps_epi32(_mm256_sub_ps(v, _mm256_set2power31_ps()))), _mm256_set2power31_epu32())), - _mm256_cmpge_ps(v, _mm256_set2power31_ps()) - )); - - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256d v) { return avx_cast(_mm256_cvttpd_epi32(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param256i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256i cast(param128i v) { return 
concat(_mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16), _mm_srli_epi32(_mm_unpackhi_epi16(v, v), 16)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256 v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256d v) { return avx_cast(_mm256_cvtpd_ps(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param256i v) { return _mm256_cvtepi32_ps(v); } }; - template<> struct StaticCastHelper { static inline Vc_CONST m256 cast(param256i v) { - return _mm256_blendv_ps( - _mm256_cvtepi32_ps(v), - _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_sub_epi32(v, _mm256_set2power31_epu32())), _mm256_set2power31_ps()), - _mm256_castsi256_ps(_mm256_cmplt_epi32(v, _mm256_setzero_si256())) - ); - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256 v) { return _mm256_cvtps_pd(avx_cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256d v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256d cast(param256i v) { return _mm256_cvtepi32_pd(avx_cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packs_epi32(lo128(v), hi128(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct 
StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256i v) { return _mm_packus_epi32(lo128(v), hi128(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param128i v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m128i cast(param256 v) { return StaticCastHelper::cast(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE Vc_CONST m256 cast(param128i v) { return _mm256_cvtepi32_ps(StaticCastHelper::cast(v)); } }; -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // AVX_CASTS_H diff --git a/math/vc/include/Vc/avx/const.h b/math/vc/include/Vc/avx/const.h deleted file mode 100644 index ef7ae3fc33196..0000000000000 --- a/math/vc/include/Vc/avx/const.h +++ /dev/null @@ -1,112 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_CONST_H -#define VC_AVX_CONST_H - -#include -#include "const_data.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - template class Vector; - - template struct IndexesFromZeroData; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast(&_IndexesFromZero32[0]); } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast(&_IndexesFromZero16[0]); } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } - }; - template<> struct IndexesFromZeroData { - static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; } - }; - - template struct Const - { - typedef Vector<_T> 
V; - typedef typename V::EntryType T; - typedef typename V::Mask M; - - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig::data[0]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig::data[1]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig::data[2]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig::data[3]); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig::data[4]); } - static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig::data[5]); } - - static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return V(c_trig::data[( 8 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int i) { return V(c_trig::data[(14 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig::data[(24 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig::data[(29 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig::data[34]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig::data[35]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig::data[36]); } - static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig::data[20]); } - static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig::data[21]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig::data[22]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig::data[23]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig::data[(40 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig::data[(45 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig::data[(49 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig::data[(55 + i)]); } - static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig::data[37]); } - static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig::data[38]); } 
- - static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log::d(1)).data()); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log::d(18)); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log::d(15)); } - static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log::d(2 + i)); } - static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log::d(8 + i)); } - static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log::d(14)); } - static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log::d(17)); } - static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log::d(16)); } - static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log::d(13)); } - static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log::d(19)); } - static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log::d(20)); } - - static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; - }; - - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::highMaskFloat)); } -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_AVX_CONST_H diff --git a/math/vc/include/Vc/avx/const_data.h b/math/vc/include/Vc/avx/const_data.h deleted file mode 100644 index 45002bdd45fb1..0000000000000 --- a/math/vc/include/Vc/avx/const_data.h +++ /dev/null @@ -1,74 +0,0 @@ -/* This file is part of the Vc library. 
{{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_AVX_CONST_DATA_H -#define VC_AVX_CONST_DATA_H - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -ALIGN(64) extern const unsigned int _IndexesFromZero32[8]; -ALIGN(16) extern const unsigned short _IndexesFromZero16[8]; -ALIGN(16) extern const unsigned char _IndexesFromZero8[16]; - -struct STRUCT_ALIGN1(64) c_general -{ - static const float oneFloat; - static const unsigned int absMaskFloat[2]; - static const unsigned int signMaskFloat[2]; - static const unsigned int highMaskFloat; - static const unsigned short minShort[2]; - static const unsigned short one16[2]; - static const float _2power31; - static const double oneDouble; - static const unsigned long long frexpMask; - static const unsigned long long highMaskDouble; -} STRUCT_ALIGN2(64); - -template struct c_trig -{ - ALIGN(64) static const T data[]; -}; - -template struct c_log -{ - typedef float floatAlias Vc_MAY_ALIAS; - static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast(&data[i]); } - ALIGN(64) static const unsigned int data[]; -}; - -template<> struct c_log -{ - enum VectorSize { Size = 16 / sizeof(double) }; - typedef double doubleAlias Vc_MAY_ALIAS; - static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast(&data[i]); } - ALIGN(64) static const unsigned long long data[]; -}; - -} // namespace AVX -} // 
namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_AVX_CONST_DATA_H diff --git a/math/vc/include/Vc/avx/debug.h b/math/vc/include/Vc/avx/debug.h deleted file mode 100644 index b569d29a3b5b7..0000000000000 --- a/math/vc/include/Vc/avx/debug.h +++ /dev/null @@ -1,102 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_DEBUG_H -#define VC_AVX_DEBUG_H - -#ifndef NDEBUG -#include "vectorbase.h" -#include -#include -#endif - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -#ifdef NDEBUG -class DebugStream -{ - public: - DebugStream(const char *, const char *, int) {} - template inline DebugStream &operator<<(const T &) { return *this; } -}; -#else -class DebugStream -{ - private: - template static void printVector(V _x) - { - enum { Size = sizeof(V) / sizeof(T) }; - union { V v; T m[Size]; } x = { _x }; - std::cerr << '[' << std::setprecision(24) << x.m[0]; - for (int i = 1; i < Size; ++i) { - std::cerr << ", " << std::setprecision(24) << x.m[i]; - } - std::cerr << ']'; - } - public: - DebugStream(const char *func, const char *file, int line) - { - std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' '; - } - - template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } - - DebugStream &operator<<(__m128 x) { - printVector(x); - return *this; - } - 
DebugStream &operator<<(__m256 x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m128d x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m256d x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m128i x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m256i x) { - printVector(x); - return *this; - } - - ~DebugStream() - { - std::cerr << "\033[0m" << std::endl; - } -}; -#endif - -#define VC_DEBUG ::ROOT::Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) - -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#endif // VC_AVX_DEBUG_H diff --git a/math/vc/include/Vc/avx/deinterleave.tcc b/math/vc/include/Vc/avx/deinterleave.tcc deleted file mode 100644 index 28032a81073d7..0000000000000 --- a/math/vc/include/Vc/avx/deinterleave.tcc +++ /dev/null @@ -1,282 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -inline void deinterleave(double_v &VC_RESTRICT a, double_v &VC_RESTRICT b, double_v &VC_RESTRICT c) -{ // estimated latency (AVX): 4.5 cycles - const m256d tmp0 = Mem::shuffle128(a.data(), b.data()); - const m256d tmp1 = Mem::shuffle128(a.data(), c.data()); - const m256d tmp2 = Mem::shuffle128(b.data(), c.data()); - a.data() = Mem::shuffle(tmp0, tmp1); - b.data() = Mem::shuffle(tmp0, tmp2); - c.data() = Mem::shuffle(tmp1, tmp2); -} - -inline void deinterleave(float_v &VC_RESTRICT a, float_v &VC_RESTRICT b, float_v &VC_RESTRICT c) -{ - // abc abc abc - // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 - // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 - // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 - const m256 ac0 = Mem::shuffle128(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 - const m256 ac1 = Mem::shuffle128(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 - - m256 tmp0 = Mem::blend( ac0, b.data()); - tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 - m256 tmp1 = Mem::blend( ac0, b.data()); - tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 - m256 tmp2 = Mem::blend( ac0, b.data()); - tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 - - a.data() = Mem::permute(tmp0); - b.data() = Mem::permute(tmp1); - c.data() = Mem::permute(tmp2); -} - -inline void deinterleave(int_v &VC_RESTRICT a, int_v &VC_RESTRICT b, int_v &VC_RESTRICT c) -{ - deinterleave(reinterpret_cast(a), reinterpret_cast(b), - reinterpret_cast(c)); -} - -inline void deinterleave(uint_v &VC_RESTRICT a, uint_v &VC_RESTRICT b, uint_v &VC_RESTRICT c) -{ - deinterleave(reinterpret_cast(a), reinterpret_cast(b), - reinterpret_cast(c)); -} - -inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, - Vector &VC_RESTRICT c) -{ - // abc abc abc - // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121 - // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211 - // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112 - m128i ac0 = 
_mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6 - m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7 - - m128i tmp0 = Mem::blend( ac0, b.data()); - tmp0 = Mem::blend(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5 - m128i tmp1 = Mem::blend( ac0, b.data()); - tmp1 = Mem::blend(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6 - m128i tmp2 = Mem::blend( ac0, b.data()); - tmp2 = Mem::blend(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7 - - a.data() = Mem::permuteHi(Mem::permuteLo(tmp0)); - b.data() = Mem::permuteHi(Mem::permuteLo(tmp1)); - c.data() = Mem::permuteHi(Mem::permuteLo(tmp2)); -} - -inline void deinterleave(Vector &VC_RESTRICT a, Vector &VC_RESTRICT b, - Vector &VC_RESTRICT c) -{ - deinterleave(reinterpret_cast &>(a), reinterpret_cast &>(b), - reinterpret_cast &>(c)); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - // a7 a6 a5 a4 a3 a2 a1 a0 - // b7 b6 b5 b4 b3 b2 b1 b0 - const m256 tmp0 = Reg::permute128(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0 - const m256 tmp1 = Reg::permute128(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4 - - const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 - const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 - - a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0 - b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1 -} - -inline void deinterleave(Vector &a, Vector &b) -{ - m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = _mm_unpackhi_epi16(tmp2, tmp3); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - m128i 
tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = _mm_unpackhi_epi16(tmp2, tmp3); -} - -} // namespace AVX - - -namespace Internal -{ - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const float *m, A align) -{ - a.load(m, align); - b.load(m + float_v::Size, align); - Vc::AVX::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const short *m, A align) -{ - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16))); - b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srai_epi32(AVX::lo128(tmp), 16), - _mm_srai_epi32(AVX::hi128(tmp), 16))); -} - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const unsigned short *m, A align) -{ - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_blend_epi16(AVX::lo128(tmp), _mm_setzero_si128(), 0xaa), - _mm_blend_epi16(AVX::hi128(tmp), _mm_setzero_si128(), 0xaa))); - b.data() = _mm256_cvtepi32_ps(Vc::AVX::concat( - _mm_srli_epi32(AVX::lo128(tmp), 16), - _mm_srli_epi32(AVX::hi128(tmp), 16))); -} - -template inline void HelperImpl::deinterleave( - sfloat_v &_a, sfloat_v &_b, const MemT *m, A align) -{ - float_v &a = reinterpret_cast(_a); - float_v &b = reinterpret_cast(_b); - HelperImpl::deinterleave(a, b, m, align); -} - -template inline void HelperImpl::deinterleave( - double_v &a, double_v &b, const double *m, A align) -{ - a.load(m, align); - b.load(m + double_v::Size, align); - - m256d tmp0 = Mem::shuffle128(a.data(), b.data()); // 
b1 b0 a1 a0 - m256d tmp1 = Mem::shuffle128(a.data(), b.data()); // b3 b2 a3 a2 - - a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0 - b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1 -} - -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const int *m, A align) -{ - using Vc::AVX::m256; - a.load(m, align); - b.load(m + int_v::Size, align); - - const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - - const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 - const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 - - a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 - b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 -} - -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const short *m, A align) -{ - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = Vc::AVX::concat( - _mm_srai_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); - b.data() = Vc::AVX::concat( - _mm_srai_epi32(AVX::lo128(tmp), 16), - _mm_srai_epi32(AVX::hi128(tmp), 16)); -} - -template inline void HelperImpl::deinterleave( - uint_v &a, uint_v &b, const unsigned int *m, A align) -{ - using Vc::AVX::m256; - a.load(m, align); - b.load(m + uint_v::Size, align); - - const m256 tmp0 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - const m256 tmp1 = AVX::avx_cast(Mem::shuffle128(a.data(), b.data())); - - const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0 - const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2 - - a.data() = AVX::avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0 - b.data() = AVX::avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1 -} - -template inline 
void HelperImpl::deinterleave( - uint_v &a, uint_v &b, const unsigned short *m, A align) -{ - using Vc::AVX::m256i; - const m256i tmp = Vc::AVX::VectorHelper::load(m, align); - a.data() = Vc::AVX::concat( - _mm_srli_epi32(_mm_slli_epi32(AVX::lo128(tmp), 16), 16), - _mm_srli_epi32(_mm_slli_epi32(AVX::hi128(tmp), 16), 16)); - b.data() = Vc::AVX::concat( - _mm_srli_epi32(AVX::lo128(tmp), 16), - _mm_srli_epi32(AVX::hi128(tmp), 16)); -} - -template inline void HelperImpl::deinterleave( - short_v &a, short_v &b, const short *m, A align) -{ - a.load(m, align); - b.load(m + short_v::Size, align); - Vc::AVX::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - ushort_v &a, ushort_v &b, const unsigned short *m, A align) -{ - a.load(m, align); - b.load(m + ushort_v::Size, align); - Vc::AVX::deinterleave(a, b); -} - -// only support M == V::EntryType -> no specialization -template -inline Vc_FLATTEN void HelperImpl::deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) -{ - a.load(&memory[0 * V::Size], align); - b.load(&memory[1 * V::Size], align); - c.load(&memory[2 * V::Size], align); - Vc::AVX::deinterleave(a, b, c); -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/avx/forceToRegisters.tcc b/math/vc/include/Vc/avx/forceToRegisters.tcc deleted file mode 100644 index 7906a2770bfeb..0000000000000 --- a/math/vc/include/Vc/avx/forceToRegisters.tcc +++ /dev/null @@ -1,141 +0,0 @@ -#ifdef __GNUC__ -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { - __asm__ __volatile__(""::"x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x1) { - __asm__ __volatile__("":"+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void 
forceToRegistersDirty(Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static 
Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x8, const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x8.data()), "x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x8, Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x8.data()), "+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -#elif defined(VC_MSVC) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x3*/, const Vector &/*x2*/, const Vector 
&/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x8*/, const Vector &/*x7*/, 
const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x8*/, Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#else -#error "forceToRegisters unsupported on this compiler" -#endif diff --git a/math/vc/include/Vc/avx/helperimpl.h b/math/vc/include/Vc/avx/helperimpl.h deleted file mode 100644 index b080b86057d3b..0000000000000 --- a/math/vc/include/Vc/avx/helperimpl.h +++ /dev/null @@ -1,104 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_HELPERIMPL_H -#define VC_AVX_HELPERIMPL_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template<> struct HelperImpl -{ - typedef AVX::Vector float_v; - typedef AVX::Vector sfloat_v; - typedef AVX::Vector double_v; - typedef AVX::Vector int_v; - typedef AVX::Vector uint_v; - typedef AVX::Vector short_v; - typedef AVX::Vector ushort_v; - - template static void deinterleave(float_v &, float_v &, const float *, A); - template static void deinterleave(float_v &, float_v &, const short *, A); - template static void deinterleave(float_v &, float_v &, const unsigned short *, A); - - template static void deinterleave(sfloat_v &, sfloat_v &, const MemT *, A); - - template static void deinterleave(double_v &, double_v &, const double *, A); - - template static void deinterleave(int_v &, int_v &, const int *, A); - template static void deinterleave(int_v &, int_v &, const short *, A); - - template static void deinterleave(uint_v &, uint_v &, const unsigned int *, A); - template static void deinterleave(uint_v &, uint_v &, const unsigned short *, A); - - template static void deinterleave(short_v &, short_v &, const short *, A); - - template static void deinterleave(ushort_v &, ushort_v &, const unsigned short *, A); - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V 
&VC_RESTRICT e, - V &VC_RESTRICT f, const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void deinterleave(V &VC_RESTRICT a, V &VC_RESTRICT b, - V &VC_RESTRICT c, V &VC_RESTRICT d, V &VC_RESTRICT e, - V &VC_RESTRICT f, V &VC_RESTRICT g, V &VC_RESTRICT h, - const M *VC_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; - - static Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; -}; - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#include "deinterleave.tcc" -#include "prefetches.tcc" -#include "helperimpl.tcc" -#include "undomacros.h" - -#endif // VC_AVX_HELPERIMPL_H diff --git a/math/vc/include/Vc/avx/helperimpl.tcc b/math/vc/include/Vc/avx/helperimpl.tcc deleted file mode 100644 index 65a6be2cc5430..0000000000000 --- a/math/vc/include/Vc/avx/helperimpl.tcc +++ /dev/null @@ -1,64 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_HELPERIMPL_TCC -#define VC_AVX_HELPERIMPL_TCC - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template -static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) -{ - return (value % X) > 0 ? value + X - (value % X) : value; -} - -template -Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) -{ - switch (A) { - case Vc::AlignOnVector: - return _mm_malloc(nextMultipleOf(n), Vc::AVX::VectorAlignment); - case Vc::AlignOnCacheline: - // TODO: hardcoding 64 is not such a great idea - return _mm_malloc(nextMultipleOf<64>(n), 64); - case Vc::AlignOnPage: - // TODO: hardcoding 4096 is not such a great idea - return _mm_malloc(nextMultipleOf<4096>(n), 4096); - default: -#ifndef NDEBUG - abort(); -#endif - return _mm_malloc(n, 8); - } -} - -Vc_ALWAYS_INLINE void HelperImpl::free(void *p) -{ - _mm_free(p); -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#endif // VC_AVX_HELPERIMPL_TCC diff --git a/math/vc/include/Vc/avx/interleavedmemory.tcc b/math/vc/include/Vc/avx/interleavedmemory.tcc deleted file mode 100644 index 919fea55588ea..0000000000000 --- a/math/vc/include/Vc/avx/interleavedmemory.tcc +++ /dev/null @@ -1,890 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_AVX_INTERLEAVEDMEMORY_TCC -#define VC_AVX_INTERLEAVEDMEMORY_TCC - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -namespace -{ -template struct InterleaveImpl; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); - const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); -#ifdef __x86_64__ - const long long tmp00 = _mm_cvtsi128_si64(tmp0); - const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); - const long long tmp10 = _mm_cvtsi128_si64(tmp1); - const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); - *reinterpret_cast(&data[i[0]]) = tmp00; - *reinterpret_cast(&data[i[1]]) = tmp00 >> 32; - *reinterpret_cast(&data[i[2]]) = tmp01; - *reinterpret_cast(&data[i[3]]) = tmp01 >> 32; - *reinterpret_cast(&data[i[4]]) = tmp10; - *reinterpret_cast(&data[i[5]]) = tmp10 >> 32; - *reinterpret_cast(&data[i[6]]) = tmp11; - *reinterpret_cast(&data[i[7]]) = tmp11 >> 32; -#else - *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); - *reinterpret_cast(&data[i[1]]) = _mm_extract_epi32(tmp0, 1); - *reinterpret_cast(&data[i[2]]) = _mm_extract_epi32(tmp0, 2); - *reinterpret_cast(&data[i[3]]) = _mm_extract_epi32(tmp0, 3); - *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); - *reinterpret_cast(&data[i[5]]) = _mm_extract_epi32(tmp1, 1); - *reinterpret_cast(&data[i[6]]) = _mm_extract_epi32(tmp1, 2); - *reinterpret_cast(&data[i[7]]) = _mm_extract_epi32(tmp1, 3); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { -#ifdef VC_USE_MASKMOV_SCATTER - const m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); - 
const m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0); - typename V::EntryType *const dataHi = data - 4; - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); - const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); - const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); - - const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast(&data[i[0]])); - _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast(&dataHi[i[1]])); - _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast(&data[i[2]])); - _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast(&dataHi[i[3]])); - _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast(&data[i[4]])); - _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast(&dataHi[i[5]])); - _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast(&data[i[6]])); - _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast(&dataHi[i[7]])); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - const m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); - const m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); - const m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); - - const m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); - 
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - using namespace Vc::AVX; - // [0a 1a 0b 1b 0e 1e 0f 1f]: - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); - // [0c 1c 0d 1d 0g 1g 0h 1h]: - const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v1.data())); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { - using namespace Vc::AVX; -#ifdef VC_USE_MASKMOV_SCATTER - // [0a 2a 0b 2b 0e 2e 0f 2f]: - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - // [0c 2c 0d 2d 0g 2g 0h 2h]: 
- const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - // [1a __ 1b __ 1e __ 1f __]: - const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); - // [1c __ 1d __ 1g __ 1h __]: - const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v1.data())); - const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); - const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); - const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); - const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); - const m128i mask = _mm_set_epi32(0, -1, -1, -1); - _mm_maskstore_ps(reinterpret_cast(&data[i[0]]), mask, lo128(tmp4)); - _mm_maskstore_ps(reinterpret_cast(&data[i[1]]), mask, lo128(tmp5)); - _mm_maskstore_ps(reinterpret_cast(&data[i[2]]), mask, lo128(tmp6)); - _mm_maskstore_ps(reinterpret_cast(&data[i[3]]), mask, lo128(tmp7)); - _mm_maskstore_ps(reinterpret_cast(&data[i[4]]), mask, hi128(tmp4)); - _mm_maskstore_ps(reinterpret_cast(&data[i[5]]), mask, hi128(tmp5)); - _mm_maskstore_ps(reinterpret_cast(&data[i[6]]), mask, hi128(tmp6)); - _mm_maskstore_ps(reinterpret_cast(&data[i[7]]), mask, hi128(tmp7)); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - using namespace Vc::AVX; - const m256 tmp0 = _mm256_unpacklo_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - const m256 tmp1 = _mm256_unpackhi_ps(AVX::avx_cast(v0.data()), AVX::avx_cast(v2.data())); - const m256 tmp2 = _mm256_unpacklo_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); - const m256 tmp3 = _mm256_unpackhi_ps(AVX::avx_cast(v1.data()), AVX::avx_cast(v3.data())); - const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); - const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); - const m256 tmp6 = 
_mm256_unpacklo_ps(tmp1, tmp3); - const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); - _mm_storeu_ps(reinterpret_cast(&data[i[0]]), lo128(tmp4)); - _mm_storeu_ps(reinterpret_cast(&data[i[1]]), lo128(tmp5)); - _mm_storeu_ps(reinterpret_cast(&data[i[2]]), lo128(tmp6)); - _mm_storeu_ps(reinterpret_cast(&data[i[3]]), lo128(tmp7)); - _mm_storeu_ps(reinterpret_cast(&data[i[4]]), hi128(tmp4)); - _mm_storeu_ps(reinterpret_cast(&data[i[5]]), hi128(tmp5)); - _mm_storeu_ps(reinterpret_cast(&data[i[6]]), hi128(tmp6)); - _mm_storeu_ps(reinterpret_cast(&data[i[7]]), hi128(tmp7)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - using namespace Vc::AVX; - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - _mm_storeu_pd(&data[i[0]], lo128(tmp0)); - _mm_storeu_pd(&data[i[1]], lo128(tmp1)); - _mm_storeu_pd(&data[i[2]], hi128(tmp0)); - _mm_storeu_pd(&data[i[3]], hi128(tmp1)); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { - using namespace Vc::AVX; -#ifdef VC_USE_MASKMOV_SCATTER - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); - const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); - -#if defined(VC_MSVC) && (VC_MSVC < 170000000 || !defined(_WIN64)) - // MSVC needs to be at Version 2012 before _mm256_set_epi64x works - const m256i mask = AVX::concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); -#else - const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); -#endif - _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); - 
_mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); - _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); - _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - using namespace Vc::AVX; - // 0a 1a 0c 1c: - const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); - // 0b 1b 0b 1b: - const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); - // 2a 3a 2c 3c: - const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); - // 2b 3b 2b 3b: - const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); - _mm256_storeu_pd(&data[i[0]], Mem::shuffle128(tmp0, tmp2)); - _mm256_storeu_pd(&data[i[1]], Mem::shuffle128(tmp1, tmp3)); - _mm256_storeu_pd(&data[i[2]], Mem::shuffle128(tmp0, tmp2)); - _mm256_storeu_pd(&data[i[3]], Mem::shuffle128(tmp1, tmp3)); - }/*}}}*/ -}; -} // anonymous namespace - -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const 
typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); - v4.scatter(m_data + 4, m_indexes); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5) -{ - InterleaveImpl::interleave(m_data , m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1) const/*{{{*/ -{ - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 - const m128 il6 = 
_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v0.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = 
_mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/ -{ - v4.gather(m_data, m_indexes + I(4)); - deinterleave(v0, v1, v2, v3); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), 
reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = 
AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/ -{ - const m128 il0 = 
_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0]])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2]])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4]])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6]])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1]])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3]])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5]])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7]])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v0.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v1.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = 
AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/ -{ - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0]]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1]]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2]]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3]]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4]]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5]]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6]]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7]]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v0.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v1.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v2.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v3.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/ -{ - v4.gather(m_data, m_indexes + I(4)); - deinterleave(v0, v1, v2, v3); -}/*}}}*/ -template<> inline 
void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[0] + 4])); // a0 b0 - const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[2] + 4])); // a2 b2 - const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[4] + 4])); // a4 b4 - const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&m_data[m_indexes[6] + 4])); // a6 b6 - const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&m_data[m_indexes[1] + 4])); // a0 b0 a1 b1 - const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&m_data[m_indexes[3] + 4])); // a2 b2 a3 b3 - const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&m_data[m_indexes[5] + 4])); // a4 b4 a5 b5 - const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&m_data[m_indexes[7] + 4])); // a6 b6 a7 b7 - - const m256 tmp2 = AVX::concat(il01, il45); - const m256 tmp3 = AVX::concat(il23, il67); - - const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); - const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); - - v4.data() = _mm256_unpacklo_ps(tmp0, tmp1); - v5.data() = _mm256_unpackhi_ps(tmp0, tmp1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 
b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); - v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/ -{ - deinterleave(v0, v1, v2, v3); - const m128 il0 = _mm_loadu_ps(&m_data[m_indexes[0] + 4]); // a0 b0 c0 d0 - const m128 il1 = _mm_loadu_ps(&m_data[m_indexes[1] + 4]); // a1 b1 c1 d1 - const m128 il2 = _mm_loadu_ps(&m_data[m_indexes[2] + 4]); // a2 b2 c2 d2 - const m128 il3 = _mm_loadu_ps(&m_data[m_indexes[3] + 4]); // a3 b3 c3 d3 - const m128 il4 = _mm_loadu_ps(&m_data[m_indexes[4] + 4]); // a4 b4 c4 d4 - const m128 il5 = _mm_loadu_ps(&m_data[m_indexes[5] + 4]); // a5 b5 c5 d5 - const m128 il6 = _mm_loadu_ps(&m_data[m_indexes[6] + 4]); // a6 b6 c6 d6 - const m128 il7 = _mm_loadu_ps(&m_data[m_indexes[7] + 4]); // a7 b7 c7 d7 - - const m256 il04 = AVX::concat(il0, il4); - const m256 il15 = AVX::concat(il1, il5); - const m256 il26 = AVX::concat(il2, il6); - const m256 il37 = AVX::concat(il3, il7); - const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); - const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); - const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); - const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); 
- v4.data() = _mm256_unpacklo_ps(ab0246, ab1357); - v5.data() = _mm256_unpackhi_ps(ab0246, ab1357); - v6.data() = _mm256_unpacklo_ps(cd0246, cd1357); - v7.data() = _mm256_unpackhi_ps(cd0246, cd1357); -}/*}}}*/ - -static Vc_ALWAYS_INLINE void _avx_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/ -{ - const m256d ab02 = AVX::concat(_mm_loadu_pd(&data[indexes[0]]), _mm_loadu_pd(&data[indexes[2]])); - const m256d ab13 = AVX::concat(_mm_loadu_pd(&data[indexes[1]]), _mm_loadu_pd(&data[indexes[3]])); - - v0.data() = _mm256_unpacklo_pd(ab02, ab13); - v1.data() = _mm256_unpackhi_pd(ab02, ab13); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - v2.gather(m_data + 2, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - v4.gather(m_data + 4, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); -}/*}}}*/ 
-template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); - v6.gather(m_data + 6, m_indexes); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1, double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const/*{{{*/ -{ - _avx_deinterleave_double(m_data , m_indexes, v0, v1); - _avx_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _avx_deinterleave_double(m_data + 4, m_indexes, v4, v5); - _avx_deinterleave_double(m_data + 6, m_indexes, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/ - const m128i a = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = 
_mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2) const { - const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3) const { - const m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = 
_mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = 
_mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = 
_mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d 
= _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); -}/*}}}*/ -template<> inline void 
InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const { - const m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 
g3 g5 g7 h1 h3 h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); - v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); -}/*}}}*/ - -// forward types of equal size - ugly, but it works/*{{{*/ -#define _forward(V, V2) \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), 
reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6, V &v7) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6), reinterpret_cast(v7)); \ -} -_forward( int_v, float_v) -_forward(uint_v, float_v) -_forward(ushort_v, short_v) -#undef _forward/*}}}*/ - -} // namespace Common -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_AVX_INTERLEAVEDMEMORY_TCC - -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/avx/intrinsics.h b/math/vc/include/Vc/avx/intrinsics.h deleted file mode 100644 index e531a7dec92bf..0000000000000 --- a/math/vc/include/Vc/avx/intrinsics.h +++ /dev/null @@ -1,611 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_INTRINSICS_H -#define VC_AVX_INTRINSICS_H - -#include "../common/windows_fix_intrin.h" - -#include - -// see comment in sse/intrinsics.h -extern "C" { -// AVX -#include - -#if (defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)) && !defined(VC_MSVC) -#include -#endif -} - -#include "../common/fix_clang_emmintrin.h" - -#if defined(VC_CLANG) && VC_CLANG < 0x30100 -// _mm_permute_ps is broken: http://llvm.org/bugs/show_bug.cgi?id=12401 -#undef _mm_permute_ps -#define _mm_permute_ps(A, C) __extension__ ({ \ - m128 __A = (A); \ - (m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ - (C) & 0x3, ((C) & 0xc) >> 2, \ - ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) -#endif - -#include "const_data.h" -#include "macros.h" -#include - -#if defined(VC_CLANG) || defined(VC_MSVC) || (defined(VC_GCC) && !defined(__OPTIMIZE__)) -#define VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT -#endif - -#if defined(VC_CLANG) && VC_CLANG <= 0x30000 -// _mm_alignr_epi8 doesn't specify its return type, thus breaking overload resolution -#undef _mm_alignr_epi8 -#define _mm_alignr_epi8(a, b, n) ((m128i)__builtin_ia32_palignr128((a), (b), (n))) -#endif - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - /* super evil hacking around C++ features: - * consider - * void fun(int); - * namespace X { void fun(int); } - * namespace X { void bar() { fun(0); } } // this will be a call to X::fun(int) - * - * void fun(m256); - * namespace X { void fun(m256); } - * namespace X { void bar() { fun(0); } } // this will be ambiguous because m256 is a - * non-fundamental type in the global namespace, thus - * adding ::fun(m256) to the candidates - * - * To make my own overloads of the intrinsics distinct I have to use a type that is inside the - * Vc::AVX namespace. To reduce porting effort and increase generality I want to use the same - * function names as used in the global namespace. 
The type name may not be the same, though - * because identifiers starting with two underscores are reserved by the standard. Thus using - * those would mean to depend on undefined behavior. - * Sadly a typedef is not enough. - * Public inheritance also does not work, because at least ICC considers the __m??? types to be - * some sort of fundamental types. - * Thus composition is the only solution. - */ -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - template struct Alias - { - typedef T Base; - T _d; - Vc_ALWAYS_INLINE operator T &() { return _d; } - Vc_ALWAYS_INLINE operator const T &() const { return _d; } - Vc_ALWAYS_INLINE Alias() {} - Vc_ALWAYS_INLINE Alias(T x) : _d(x) {} - Vc_ALWAYS_INLINE Alias(const Alias &x) : _d(x._d) {} - Vc_ALWAYS_INLINE Alias &operator=(T x) { _d = x; return *this; } - Vc_ALWAYS_INLINE Alias &operator=(const Alias &x) { _d = x._d; return *this; } - }; - typedef Alias<__m128 > m128 ; - typedef Alias<__m128d> m128d; - typedef Alias<__m128i> m128i; - typedef Alias<__m256 > m256 ; - typedef Alias<__m256d> m256d; - typedef Alias<__m256i> m256i; -#else - typedef __m128 m128 ; - typedef __m128d m128d; - typedef __m128i m128i; - typedef __m256 m256 ; - typedef __m256d m256d; - typedef __m256i m256i; -#endif -#if defined(VC_UNCONDITIONAL_AVX2_INTRINSICS) && defined(VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN) - typedef const m128 & param128 ; - typedef const m128d & param128d; - typedef const m128i & param128i; - typedef const m256 & param256 ; - typedef const m256d & param256d; - typedef const m256i & param256i; -#else - typedef const m128 param128 ; - typedef const m128d param128d; - typedef const m128i param128i; - typedef const m256 param256 ; - typedef const m256d param256d; - typedef const m256i param256i; -#endif - -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - // Make use of cast intrinsics easier. But if param256 == const __m256 then these would lead to - // ambiguities. 
- static Vc_INTRINSIC m256i Vc_CONST _mm256_castps_si256(param256 a) { return ::_mm256_castps_si256(a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_castps_pd (param256 a) { return ::_mm256_castps_pd (a); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_castpd_si256(param256d a) { return ::_mm256_castpd_si256(a); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_castpd_ps (param256d a) { return ::_mm256_castpd_ps (a); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_castsi256_ps(param256i a) { return ::_mm256_castsi256_ps(a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_castsi256_pd(param256i a) { return ::_mm256_castsi256_pd(a); } -#endif - -#ifdef VC_GCC - // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin - // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :) - static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) * static_cast<__v4df>(b)); } - static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) + static_cast<__v4df>(b)); } - static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) - static_cast<__v4df>(b)); } - static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); } - static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); } - static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); } -#endif - - static Vc_INTRINSIC m256 Vc_CONST _mm256_set1_ps (float a) { return ::_mm256_set1_ps (a); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_set1_pd (double a) { return ::_mm256_set1_pd (a); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epi32(int a) { return ::_mm256_set1_epi32(a); } - //static 
Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epu32(unsigned int a) { return ::_mm256_set1_epu32(a); } - -#if defined(VC_GNU_ASM) && !defined(NVALGRIND) - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } -#else - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone() { m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } -#endif - static Vc_INTRINSIC m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } - static Vc_INTRINSIC m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } - static Vc_INTRINSIC m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } - - static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::one16))); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } - -#if defined(VC_GNU_ASM) && !defined(NVALGRIND) - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { __m256 r; __asm__("vcmpps $8,%0,%0,%0":"=x"(r)); return r; } -#elif defined(VC_MSVC) - // MSVC puts temporaries of this value on the stack, but sometimes at misaligned addresses, try - // some other generator instead... 
- static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { return _mm256_castsi256_ps(_mm256_set1_epi32(-1)); } -#else - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone() { m256 r = _mm256_setzero_ps(); return _mm256_cmp_ps(r, r, _CMP_EQ_UQ); } -#endif - static Vc_INTRINSIC m256i Vc_CONST _mm256_setallone_si256() { return _mm256_castps_si256(_mm256_setallone()); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setallone_pd() { return _mm256_castps_pd(_mm256_setallone()); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone_ps() { return _mm256_setallone(); } - - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi8 () { return _mm256_set1_epi8(1); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu8 () { return _mm256_setone_epi8(); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu16() { return _mm256_setone_epi16(); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&_IndexesFromZero32[1]))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu32() { return _mm256_setone_epi32(); } - - static Vc_INTRINSIC m256 Vc_CONST _mm256_setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); } - - static Vc_INTRINSIC m256d Vc_CONST _mm256_setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast(&c_general::absMaskFloat[0])); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast(&c_general::absMaskFloat[1])); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast(&c_general::signMaskFloat[0])); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_setsignmask_ps(){ return 
_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1])); } - - static Vc_INTRINSIC m256 Vc_CONST _mm256_set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - - //X static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi8 () { return _mm256_slli_epi8 (_mm256_setallone_si256(), 7); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(c_general::minShort))); } - static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::minShort))); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(&c_general::signMaskFloat[1]))); } - -#ifdef VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT -#define _mm_extract_epu8 (x, i) (static_cast (_mm_extract_epi8 ((x), (i)))) -#define _mm_extract_epu16(x, i) (static_cast(_mm_extract_epi16((x), (i)))) -#define _mm_extract_epu32(x, i) (static_cast (_mm_extract_epi32((x), (i)))) -#else - static Vc_INTRINSIC unsigned char Vc_CONST _mm_extract_epu8(param128i x, const int i) { return _mm_extract_epi8(x, i); } - static Vc_INTRINSIC unsigned short Vc_CONST _mm_extract_epu16(param128i x, const int i) { return _mm_extract_epi16(x, i); } - static Vc_INTRINSIC unsigned int Vc_CONST _mm_extract_epu32(param128i x, const int i) { return _mm_extract_epi32(x, i); } -#endif - - /////////////////////// COMPARE OPS /////////////////////// - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpeq_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpneq_pd 
(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmplt_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnlt_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmple_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnle_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpord_pd (param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } - static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpunord_pd(param256d a, param256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); } - - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpeq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpneq_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmplt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnlt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpge_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmple_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnle_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpgt_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpord_ps (param256 a, param256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } - static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpunord_ps(param256 a, 
param256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } - - static Vc_INTRINSIC m128i _mm_cmplt_epu16(param128i a, param128i b) { - return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); - } - static Vc_INTRINSIC m128i _mm_cmpgt_epu16(param128i a, param128i b) { - return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); - } - - /////////////////////// INTEGER OPS /////////////////////// -#define AVX_TO_SSE_2(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, param256i b0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i b1 = _mm256_extractf128_si256(b0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ - m128i r1 = _mm_##name(a1, b1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_2_si128_si256(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name##_si256(param256i a0, param256i b0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i b1 = _mm256_extractf128_si256(b0, 1); \ - m128i r0 = _mm_##name##_si128(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ - m128i r1 = _mm_##name##_si128(a1, b1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_1(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ - m128i r1 = _mm_##name(a1); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } -#define AVX_TO_SSE_1i(name) \ - static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, const int i) { \ - m128i a1 = _mm256_extractf128_si256(a0, 1); \ - m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ - m128i r1 = _mm_##name(a1, i); \ - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ - } - - AVX_TO_SSE_2(cmplt_epi8) - 
AVX_TO_SSE_2(cmplt_epi16) - AVX_TO_SSE_2(cmplt_epi32) - AVX_TO_SSE_2(cmpeq_epi8) - AVX_TO_SSE_2(cmpeq_epi16) - AVX_TO_SSE_2(cmpeq_epi32) - AVX_TO_SSE_2(cmpgt_epi8) - AVX_TO_SSE_2(cmpgt_epi16) - AVX_TO_SSE_2(cmpgt_epi32) - - // This code is AVX only (without AVX2). We never asked for AVX2 intrinsics. So go away... :) -#if defined _mm256_srli_si256 -#undef _mm256_srli_si256 -#endif -#if defined _mm256_slli_si256 -#undef _mm256_slli_si256 -#endif -#if defined _mm256_blend_epi16 -#undef _mm256_blend_epi16 -#endif - static Vc_INTRINSIC m256i Vc_CONST _mm256_srli_si256(param256i a0, const int i) { - const m128i vLo = _mm256_castsi256_si128(a0); - const m128i vHi = _mm256_extractf128_si256(a0, 1); - switch (i) { - case 0: return a0; - case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 1)), _mm_srli_si128(vHi, 1), 1); - case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 2)), _mm_srli_si128(vHi, 2), 1); - case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 3)), _mm_srli_si128(vHi, 3), 1); - case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 4)), _mm_srli_si128(vHi, 4), 1); - case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 5)), _mm_srli_si128(vHi, 5), 1); - case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 6)), _mm_srli_si128(vHi, 6), 1); - case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 7)), _mm_srli_si128(vHi, 7), 1); - case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 8)), _mm_srli_si128(vHi, 8), 1); - case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 9)), _mm_srli_si128(vHi, 9), 1); - case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 10)), _mm_srli_si128(vHi, 10), 1); - case 11: return 
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 11)), _mm_srli_si128(vHi, 11), 1); - case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 12)), _mm_srli_si128(vHi, 12), 1); - case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 13)), _mm_srli_si128(vHi, 13), 1); - case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 14)), _mm_srli_si128(vHi, 14), 1); - case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 15)), _mm_srli_si128(vHi, 15), 1); - case 16: return _mm256_permute2f128_si256(a0, a0, 0x81); - case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), 0x80); - case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), 0x80); - case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), 0x80); - case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), 0x80); - case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), 0x80); - case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), 0x80); - case 23: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), 0x80); - case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), 0x80); - case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), 0x80); - 
case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), 0x80); - case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), 0x80); - case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), 0x80); - case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), 0x80); - case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), 0x80); - case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), 0x80); - } - return _mm256_setzero_si256(); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_slli_si256(param256i a0, const int i) { - const m128i vLo = _mm256_castsi256_si128(a0); - const m128i vHi = _mm256_extractf128_si256(a0, 1); - switch (i) { - case 0: return a0; - case 1: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm_alignr_epi8(vHi, vLo, 15), 1); - case 2: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm_alignr_epi8(vHi, vLo, 14), 1); - case 3: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm_alignr_epi8(vHi, vLo, 13), 1); - case 4: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm_alignr_epi8(vHi, vLo, 12), 1); - case 5: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm_alignr_epi8(vHi, vLo, 11), 1); - case 6: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm_alignr_epi8(vHi, vLo, 10), 1); - case 7: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), 
_mm_alignr_epi8(vHi, vLo, 9), 1); - case 8: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm_alignr_epi8(vHi, vLo, 8), 1); - case 9: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm_alignr_epi8(vHi, vLo, 7), 1); - case 10: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm_alignr_epi8(vHi, vLo, 6), 1); - case 11: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm_alignr_epi8(vHi, vLo, 5), 1); - case 12: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm_alignr_epi8(vHi, vLo, 4), 1); - case 13: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm_alignr_epi8(vHi, vLo, 3), 1); - case 14: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm_alignr_epi8(vHi, vLo, 2), 1); - case 15: return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm_alignr_epi8(vHi, vLo, 1), 1); - case 16: return _mm256_permute2f128_si256(a0, a0, 0x8); - case 17: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), 0x8); - case 18: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), 0x8); - case 19: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), 0x8); - case 20: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), 0x8); - case 21: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), 0x8); - case 22: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), 0x8); - case 23: return 
_mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), 0x8); - case 24: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), 0x8); - case 25: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), 0x8); - case 26: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), 0x8); - case 27: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), 0x8); - case 28: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), 0x8); - case 29: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), 0x8); - case 30: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), 0x8); - case 31: return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), 0x8); - } - return _mm256_setzero_si256(); - } - - static Vc_INTRINSIC m256i Vc_CONST _mm256_and_si256(param256i x, param256i y) { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_andnot_si256(param256i x, param256i y) { - return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_or_si256(param256i x, param256i y) { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_xor_si256(param256i x, param256i y) { - return 
_mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); - } - - AVX_TO_SSE_2(packs_epi16) - AVX_TO_SSE_2(packs_epi32) - AVX_TO_SSE_2(packus_epi16) - AVX_TO_SSE_2(unpackhi_epi8) - AVX_TO_SSE_2(unpackhi_epi16) - AVX_TO_SSE_2(unpackhi_epi32) - AVX_TO_SSE_2(unpackhi_epi64) - AVX_TO_SSE_2(unpacklo_epi8) - AVX_TO_SSE_2(unpacklo_epi16) - AVX_TO_SSE_2(unpacklo_epi32) - AVX_TO_SSE_2(unpacklo_epi64) - AVX_TO_SSE_2(add_epi8) - AVX_TO_SSE_2(add_epi16) - AVX_TO_SSE_2(add_epi32) - AVX_TO_SSE_2(add_epi64) - AVX_TO_SSE_2(adds_epi8) - AVX_TO_SSE_2(adds_epi16) - AVX_TO_SSE_2(adds_epu8) - AVX_TO_SSE_2(adds_epu16) - AVX_TO_SSE_2(sub_epi8) - AVX_TO_SSE_2(sub_epi16) - AVX_TO_SSE_2(sub_epi32) - AVX_TO_SSE_2(sub_epi64) - AVX_TO_SSE_2(subs_epi8) - AVX_TO_SSE_2(subs_epi16) - AVX_TO_SSE_2(subs_epu8) - AVX_TO_SSE_2(subs_epu16) - AVX_TO_SSE_2(madd_epi16) - AVX_TO_SSE_2(mulhi_epi16) - AVX_TO_SSE_2(mullo_epi16) - AVX_TO_SSE_2(mul_epu32) - AVX_TO_SSE_1i(slli_epi16) - AVX_TO_SSE_1i(slli_epi32) - AVX_TO_SSE_1i(slli_epi64) - AVX_TO_SSE_1i(srai_epi16) - AVX_TO_SSE_1i(srai_epi32) - AVX_TO_SSE_1i(srli_epi16) - AVX_TO_SSE_1i(srli_epi32) - AVX_TO_SSE_1i(srli_epi64) - AVX_TO_SSE_2(sll_epi16) - AVX_TO_SSE_2(sll_epi32) - AVX_TO_SSE_2(sll_epi64) - AVX_TO_SSE_2(sra_epi16) - AVX_TO_SSE_2(sra_epi32) - AVX_TO_SSE_2(srl_epi16) - AVX_TO_SSE_2(srl_epi32) - AVX_TO_SSE_2(srl_epi64) - AVX_TO_SSE_2(max_epi16) - AVX_TO_SSE_2(max_epu8) - AVX_TO_SSE_2(min_epi16) - AVX_TO_SSE_2(min_epu8) - Vc_INTRINSIC int Vc_CONST _mm256_movemask_epi8(param256i a0) - { - m128i a1 = _mm256_extractf128_si256(a0, 1); - return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0)); - } - AVX_TO_SSE_2(mulhi_epu16) - // shufflehi_epi16 - // shufflelo_epi16 (param128i __A, const int __mask) - // shuffle_epi32 (param128i __A, const int __mask) - // maskmoveu_si128 (param128i __A, param128i __B, char *__C) - AVX_TO_SSE_2(avg_epu8) - AVX_TO_SSE_2(avg_epu16) - AVX_TO_SSE_2(sad_epu8) - // stream_si32 
(int *__A, int __B) - // stream_si128 (param128i *__A, param128i __B) - // cvtsi32_si128 (int __A) - // cvtsi64_si128 (long long __A) - // cvtsi64x_si128 (long long __A) - AVX_TO_SSE_2(hadd_epi16) - AVX_TO_SSE_2(hadd_epi32) - AVX_TO_SSE_2(hadds_epi16) - AVX_TO_SSE_2(hsub_epi16) - AVX_TO_SSE_2(hsub_epi32) - AVX_TO_SSE_2(hsubs_epi16) - AVX_TO_SSE_2(maddubs_epi16) - AVX_TO_SSE_2(mulhrs_epi16) - AVX_TO_SSE_2(shuffle_epi8) - AVX_TO_SSE_2(sign_epi8) - AVX_TO_SSE_2(sign_epi16) - AVX_TO_SSE_2(sign_epi32) - // alignr_epi8(param128i __X, param128i __Y, const int __N) - AVX_TO_SSE_1(abs_epi8) - AVX_TO_SSE_1(abs_epi16) - AVX_TO_SSE_1(abs_epi32) -#if !defined(VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT) - m256i Vc_INTRINSIC Vc_CONST _mm256_blend_epi16(param256i a0, param256i b0, const int m) { - m128i a1 = _mm256_extractf128_si256(a0, 1); - m128i b1 = _mm256_extractf128_si256(b0, 1); - m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff); - m128i r1 = _mm_blend_epi16(a1, b1, m >> 8); - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); - } -#else -# define _mm256_blend_epi16(a0, b0, m) \ - _mm256_insertf128_si256( \ - _mm256_castsi128_si256( \ - _mm_blend_epi16( \ - _mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff)), \ - _mm_blend_epi16(_mm256_extractf128_si256(a0, 1), _mm256_extractf128_si256(b0, 1), m >> 8);, 1) -#endif - Vc_INTRINSIC m256i Vc_CONST _mm256_blendv_epi8(param256i a0, param256i b0, param256i m0) { - m128i a1 = _mm256_extractf128_si256(a0, 1); - m128i b1 = _mm256_extractf128_si256(b0, 1); - m128i m1 = _mm256_extractf128_si256(m0, 1); - m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0)); - m128i r1 = _mm_blendv_epi8(a1, b1, m1); - return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); - } - AVX_TO_SSE_2(cmpeq_epi64) - AVX_TO_SSE_2(min_epi8) - AVX_TO_SSE_2(max_epi8) - AVX_TO_SSE_2(min_epu16) - AVX_TO_SSE_2(max_epu16) - 
AVX_TO_SSE_2(min_epi32) - AVX_TO_SSE_2(max_epi32) - AVX_TO_SSE_2(min_epu32) - AVX_TO_SSE_2(max_epu32) - AVX_TO_SSE_2(mullo_epi32) - AVX_TO_SSE_2(mul_epi32) -#if !defined(VC_CLANG) || VC_CLANG > 0x30100 - // clang is missing _mm_minpos_epu16 from smmintrin.h - // http://llvm.org/bugs/show_bug.cgi?id=12399 - AVX_TO_SSE_1(minpos_epu16) -#endif - AVX_TO_SSE_1(cvtepi8_epi32) - AVX_TO_SSE_1(cvtepi16_epi32) - AVX_TO_SSE_1(cvtepi8_epi64) - AVX_TO_SSE_1(cvtepi32_epi64) - AVX_TO_SSE_1(cvtepi16_epi64) - AVX_TO_SSE_1(cvtepi8_epi16) - AVX_TO_SSE_1(cvtepu8_epi32) - AVX_TO_SSE_1(cvtepu16_epi32) - AVX_TO_SSE_1(cvtepu8_epi64) - AVX_TO_SSE_1(cvtepu32_epi64) - AVX_TO_SSE_1(cvtepu16_epi64) - AVX_TO_SSE_1(cvtepu8_epi16) - AVX_TO_SSE_2(packus_epi32) - // mpsadbw_epu8 (param128i __X, param128i __Y, const int __M) - // stream_load_si128 (param128i *__X) - AVX_TO_SSE_2(cmpgt_epi64) - -//X static Vc_INTRINSIC m256i _mm256_cmplt_epu8 (param256i a, param256i b) { return _mm256_cmplt_epi8 ( -//X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } -//X static Vc_INTRINSIC m256i _mm256_cmpgt_epu8 (param256i a, param256i b) { return _mm256_cmpgt_epi8 ( -//X _mm256_xor_si256(a, _mm256_setmin_epi8 ()), _mm256_xor_si256(b, _mm256_setmin_epi8 ())); } - static Vc_INTRINSIC m256i Vc_CONST _mm256_cmplt_epu32(param256i _a, param256i _b) { - m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - return _mm256_insertf128_si256(_mm256_castsi128_si256( - _mm_cmplt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), - _mm_cmplt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); - } - static Vc_INTRINSIC m256i Vc_CONST _mm256_cmpgt_epu32(param256i _a, param256i _b) { - m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), 
_mm256_castsi256_ps(_mm256_setmin_epi32()))); - m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(_mm256_setmin_epi32()))); - return _mm256_insertf128_si256(_mm256_castsi128_si256( - _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))), - _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1); - } - - static Vc_INTRINSIC void _mm256_maskstore(float *mem, const param256 mask, const param256 v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); -#else - _mm256_maskstore_ps(mem, mask, v); -#endif - } - static Vc_INTRINSIC void _mm256_maskstore(double *mem, const param256d mask, const param256d v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); -#else - _mm256_maskstore_pd(mem, mask, v); -#endif - } - static Vc_INTRINSIC void _mm256_maskstore(int *mem, const param256i mask, const param256i v) { -#ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE - _mm256_maskstore_ps(reinterpret_cast(mem), mask, _mm256_castsi256_ps(v)); -#else - _mm256_maskstore_ps(reinterpret_cast(mem), _mm256_castsi256_ps(mask), _mm256_castsi256_ps(v)); -#endif - } - static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const param256i mask, const param256i v) { - _mm256_maskstore(reinterpret_cast(mem), mask, v); - } - -#if defined(VC_IMPL_FMA4) && defined(VC_CLANG) && VC_CLANG < 0x30300 - // clang miscompiles _mm256_macc_ps: http://llvm.org/bugs/show_bug.cgi?id=15040 - static Vc_INTRINSIC __m256 my256_macc_ps(__m256 a, __m256 b, __m256 c) { - __m256 r; - // avoid loading c from memory as that would trigger the bug - asm("vfmaddps %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); - return r; - } -#ifdef _mm256_macc_ps -#undef _mm256_macc_ps -#endif -#define _mm256_macc_ps(a, b, c) Vc::AVX::my256_macc_ps(a, b, c) - - static Vc_INTRINSIC __m256d my256_macc_pd(__m256d a, __m256d b, __m256d 
c) { - __m256d r; - // avoid loading c from memory as that would trigger the bug - asm("vfmaddpd %[c], %[b], %[a], %[r]" : [r]"=x"(r) : [a]"x"(a), [b]"x"(b), [c]"x"(c)); - return r; - } -#ifdef _mm256_macc_pd -#undef _mm256_macc_pd -#endif -#define _mm256_macc_pd(a, b, c) Vc::AVX::my256_macc_pd(a, b, c) -#endif -} // namespace AVX -} // namespace Vc -} // namespace ROOT -#include "undomacros.h" - -#include "shuffle.h" - -#endif // VC_AVX_INTRINSICS_H diff --git a/math/vc/include/Vc/avx/limits.h b/math/vc/include/Vc/avx/limits.h deleted file mode 100644 index 3248bb140a6c4..0000000000000 --- a/math/vc/include/Vc/avx/limits.h +++ /dev/null @@ -1,55 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_LIMITS_H -#define VC_AVX_LIMITS_H - -#include "intrinsics.h" -#include "types.h" - -namespace std -{ -#define _VC_NUM_LIM(T, _max, _min) \ -template<> struct numeric_limits< ::ROOT::Vc::AVX::Vector > : public numeric_limits \ -{ \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector max() _VC_NOEXCEPT { return _max; } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector min() _VC_NOEXCEPT { return _min; } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector lowest() _VC_NOEXCEPT { return min(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector epsilon() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector round_error() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector infinity() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector quiet_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector signaling_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::AVX::Vector denorm_min() _VC_NOEXCEPT { return ::ROOT::Vc::AVX::Vector::Zero(); } \ -} - -#ifndef VC_IMPL_AVX2 -namespace { - using ::ROOT::Vc::AVX::_mm256_srli_epi32; -} -#endif -_VC_NUM_LIM(unsigned short, ::ROOT::Vc::AVX::_mm_setallone_si128(), _mm_setzero_si128()); -_VC_NUM_LIM( short, _mm_srli_epi16(::ROOT::Vc::AVX::_mm_setallone_si128(), 1), ::ROOT::Vc::AVX::_mm_setmin_epi16()); -_VC_NUM_LIM( unsigned int, ::ROOT::Vc::AVX::_mm256_setallone_si256(), _mm256_setzero_si256()); -_VC_NUM_LIM( int, _mm256_srli_epi32(::ROOT::Vc::AVX::_mm256_setallone_si256(), 1), ::ROOT::Vc::AVX::_mm256_setmin_epi32()); -#undef _VC_NUM_LIM - -} // namespace std - -#endif // VC_AVX_LIMITS_H diff --git a/math/vc/include/Vc/avx/macros.h b/math/vc/include/Vc/avx/macros.h deleted file mode 100644 
index 3a755434e26b1..0000000000000 --- a/math/vc/include/Vc/avx/macros.h +++ /dev/null @@ -1,26 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "../common/macros.h" - -#ifndef VC_AVX_MACROS_H -#define VC_AVX_MACROS_H -#undef VC_AVX_UNDOMACROS_H - -#endif // VC_AVX_MACROS_H diff --git a/math/vc/include/Vc/avx/mask.h b/math/vc/include/Vc/avx/mask.h deleted file mode 100644 index dfa707b724325..0000000000000 --- a/math/vc/include/Vc/avx/mask.h +++ /dev/null @@ -1,246 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_MASK_H -#define VC_AVX_MASK_H - -#include "intrinsics.h" -#include "../common/bitscanintrinsics.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -template class Mask -{ - friend class Mask<4u, 32u>; // double_v - friend class Mask<8u, 32u>; // float_v, (u)int_v - friend class Mask<8u, 16u>; // (u)short_v - friend class Mask<16u, 16u>; // (u)char_v - public: - FREE_STORE_OPERATORS_ALIGNED(32) - - // abstracts the way Masks are passed to functions, it can easily be changed to const ref here -#if defined VC_MSVC && defined _WIN32 - typedef const Mask &AsArg; -#else - typedef Mask AsArg; -#endif - - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE Mask(param256 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(param256d x) : k(_mm256_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(param256i x) : k(_mm256_castsi256_ps(x)) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Mask(__m256 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(__m256d x) : k(_mm256_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(__m256i x) : k(_mm256_castsi256_ps(x)) {} -#endif - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm256_setzero_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm256_setallone_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm256_setallone_ps() : m256(_mm256_setzero_ps())) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast(concat( - _mm_unpacklo_epi16(rhs.dataI(), rhs.dataI()), - _mm_unpackhi_epi16(rhs.dataI(), rhs.dataI())))) {} - Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Mask(const Mask &m) Vc_ALWAYS_INLINE_R; - - Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm256_testc_ps(k, rhs.k); } - Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm256_testc_ps(k, rhs.k); } - - Vc_ALWAYS_INLINE Mask operator!() const { return _mm256_andnot_ps(data(), _mm256_setallone_ps()); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm256_and_ps(k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm256_or_ps (k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm256_xor_ps(k, rhs.k); return *this; } - - // no need for expression template optimizations because cmp(n)eq for floats are not bitwise - // compares - Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm256_testc_ps(k, _mm256_setallone_ps()); } - Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm256_testz_ps(k, k); } - Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm256_testnzc_ps(k, _mm256_setallone_ps()); } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE operator bool() const { return isFull(); } -#endif - - Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE m256 data () const { return k; } - Vc_ALWAYS_INLINE m256i dataI() const { return _mm256_castps_si256(k); } - Vc_ALWAYS_INLINE m256d dataD() const { return _mm256_castps_pd(k); } - - Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE_L Vc_PURE_L 
int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - m256 k; -}; - -template class Mask -{ - friend class Mask<4u, 32u>; // double_v - friend class Mask<8u, 32u>; // float_v, (u)int_v - friend class Mask<8u, 16u>; // (u)short_v - friend class Mask<16u, 16u>; // (u)char_v - public: - FREE_STORE_OPERATORS_ALIGNED(16) - - // abstracts the way Masks are passed to functions, it can easily be changed to const ref here -#if defined VC_MSVC && defined _WIN32 - typedef const Mask &AsArg; -#else - typedef Mask AsArg; -#endif - - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE Mask(param128 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(param128d x) : k(_mm_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(param128i x) : k(_mm_castsi128_ps(x)) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Mask(__m128 x) : k(x) {} - Vc_ALWAYS_INLINE Mask(__m128d x) : k(_mm_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(__m128i x) : k(_mm_castsi128_ps(x)) {} -#endif - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? 
_mm_setallone_ps() : m128(_mm_setzero_ps())) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(avx_cast( - _mm_packs_epi32(avx_cast(rhs.data()), _mm256_extractf128_si256(rhs.dataI(), 1)))) {} - Vc_ALWAYS_INLINE Mask(const Mask *a) : k(avx_cast( - _mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {} - - Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm_testc_si128(dataI(), rhs.dataI()); } - Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm_testc_si128(dataI(), rhs.dataI()); } - - Vc_ALWAYS_INLINE Mask operator!() const { return _mm_andnot_ps(data(), _mm_setallone_ps()); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; } - - // TODO: use expression templates to optimize (v1 == v2).isFull() and friends - Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm_testc_si128(dataI(), _mm_setallone_si128()); } - Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm_testz_si128(dataI(), dataI()); } - Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm_testnzc_si128(dataI(), _mm_setallone_si128()); } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE operator bool() const { return isFull(); } -#endif - - Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE m128 data () const { return k; } - Vc_ALWAYS_INLINE m128i dataI() const { return avx_cast(k); } - Vc_ALWAYS_INLINE m128d dataD() const { return avx_cast(k); } - - Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - 
Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - m128 k; -}; - -struct ForeachHelper -{ - size_t mask; - bool brk; - bool outerBreak; - Vc_ALWAYS_INLINE ForeachHelper(size_t _mask) : mask(_mask), brk(false), outerBreak(false) {} - Vc_ALWAYS_INLINE bool outer() const { return mask != 0 && !outerBreak; } - Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); } - Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; } - Vc_ALWAYS_INLINE size_t next() { - outerBreak = true; -#ifdef VC_GNU_ASM - const size_t bit = __builtin_ctzl(mask); - __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); -#else -#ifdef VC_MSVC -#pragma warning(suppress : 4267) // conversion from 'size_t' to 'unsigned long', possible loss of data -#endif - const size_t bit = _bit_scan_forward(mask); - mask &= ~(1 << bit); -#endif - return bit; - } -}; - -#define Vc_foreach_bit(_it_, _mask_) \ - for (Vc::AVX::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \ - for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak()) - -// Operators -namespace Intrinsics -{ - static Vc_ALWAYS_INLINE Vc_PURE m256 and_(param256 a, param256 b) { return _mm256_and_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m256 or_(param256 a, param256 b) { return _mm256_or_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m256 xor_(param256 a, param256 b) { return _mm256_xor_ps(a, b); } - - static Vc_ALWAYS_INLINE Vc_PURE m128 and_(param128 a, param128 b) { return _mm_and_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m128 or_(param128 a, param128 b) { return _mm_or_ps(a, b); } - static Vc_ALWAYS_INLINE Vc_PURE m128 xor_(param128 a, param128 b) { return _mm_xor_ps(a, b); } -} // namespace Intrinsics - -// binary and/or/xor cannot work with one operand larger than the other -template void operator&(const Mask &l, 
const Mask &r); -template void operator|(const Mask &l, const Mask &r); -template void operator^(const Mask &l, const Mask &r); - -// let binary and/or/xor work for any combination of masks (as long as they have the same sizeof) -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &l, const Mask &r) { return Intrinsics::and_(l.data(), r.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &l, const Mask &r) { return Intrinsics:: or_(l.data(), r.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &l, const Mask &r) { return Intrinsics::xor_(l.data(), r.data()); } - -// disable logical and/or for incompatible masks -template void operator&&(const Mask &lhs, const Mask &rhs); -template void operator||(const Mask &lhs, const Mask &rhs); - -// logical and/or for compatible masks -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return lhs && static_cast >(rhs); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return lhs || static_cast >(rhs); } - -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return Intrinsics::and_(lhs.data(), rhs.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return Intrinsics::or_ (lhs.data(), rhs.data()); } - -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "mask.tcc" -#include "undomacros.h" - -#endif // VC_AVX_MASK_H diff --git a/math/vc/include/Vc/avx/mask.tcc b/math/vc/include/Vc/avx/mask.tcc deleted file mode 100644 index 4e061e1dc2f25..0000000000000 --- a/math/vc/include/Vc/avx/mask.tcc +++ /dev/null @@ -1,75 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -template<> Vc_ALWAYS_INLINE Mask<4, 32>::Mask(const Mask<8, 32> &m) - : k(concat(_mm_unpacklo_ps(lo128(m.data()), lo128(m.data())), - _mm_unpackhi_ps(lo128(m.data()), lo128(m.data())))) -{ -} - -template<> Vc_ALWAYS_INLINE Mask<8, 32>::Mask(const Mask<4, 32> &m) - // aabb ccdd -> abcd 0000 - : k(concat(Mem::shuffle(lo128(m.data()), hi128(m.data())), - _mm_setzero_ps())) -{ -} - -template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const -{ - return _mm256_movemask_epi8(dataI()); -} -template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const -{ - return _mm_movemask_epi8(dataI()); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 4, 32>::toInt() const { return _mm256_movemask_pd(dataD()); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8, 32>::toInt() const { return _mm256_movemask_ps(data ()); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8, 16>::toInt() const { return _mm_movemask_epi8(_mm_packs_epi16(dataI(), _mm_setzero_si128())); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16, 16>::toInt() const { return _mm_movemask_epi8(dataI()); } - -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 4, 32>::operator[](int index) const { return toInt() & (1 << index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 32>::operator[](int index) const { return toInt() & (1 
<< index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8, 16>::operator[](int index) const { return shiftMask() & (1 << 2 * index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask<16, 16>::operator[](int index) const { return toInt() & (1 << index); } - -#ifndef VC_IMPL_POPCNT -static Vc_ALWAYS_INLINE Vc_CONST unsigned int _mm_popcnt_u32(unsigned int n) { - n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U); - n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U); - n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU); - //n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU); - //n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU); - return n; -} -#endif -template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::count() const { return _mm_popcnt_u32(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } -template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { return _bit_scan_forward(toInt()); } - -} // namespace AVX -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/avx/math.h b/math/vc/include/Vc/avx/math.h deleted file mode 100644 index 2f08afba22b7b..0000000000000 --- a/math/vc/include/Vc/avx/math.h +++ /dev/null @@ -1,119 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_MATH_H -#define VC_AVX_MATH_H - -#include "const.h" -#include "limits.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - /** - * splits \p v into exponent and mantissa, the sign is kept with the mantissa - * - * The return value will be in the range [0.5, 1.0[ - * The \p e value will be an integer defining the power-of-two exponent - */ - inline double_v frexp(double_v::AsArg v, int_v *e) { - const m256d exponentBits = Const::exponentMask().dataD(); - const m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); - e->data() = _mm256_sub_epi32(_mm256_srli_epi64(avx_cast(exponentPart), 52), _mm256_set1_epi32(0x3fe)); - const m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); - double_v ret = _mm256_and_pd(exponentMaximized, _mm256_broadcast_sd(reinterpret_cast(&c_general::frexpMask))); - double_m zeroMask = v == double_v::Zero(); - ret(isnan(v) || !isfinite(v) || zeroMask) = v; - e->setZero(zeroMask.data()); - return ret; - } - inline float_v frexp(float_v::AsArg v, int_v *e) { - const m256 exponentBits = Const::exponentMask().data(); - const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); - e->data() = _mm256_sub_epi32(_mm256_srli_epi32(avx_cast(exponentPart), 23), _mm256_set1_epi32(0x7e)); - const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); - float_v ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); - ret(isnan(v) || !isfinite(v) || v == float_v::Zero()) = v; - e->setZero(v == float_v::Zero()); - return ret; - } - inline sfloat_v frexp(sfloat_v::AsArg v, short_v *e) { - const m256 exponentBits = Const::exponentMask().data(); - const m256 exponentPart = _mm256_and_ps(v.data(), exponentBits); - e->data() = _mm_sub_epi16(_mm_packs_epi32(_mm_srli_epi32(avx_cast(exponentPart), 23), - _mm_srli_epi32(avx_cast(hi128(exponentPart)), 23)), _mm_set1_epi16(0x7e)); - const m256 exponentMaximized = _mm256_or_ps(v.data(), exponentBits); - sfloat_v 
ret = _mm256_and_ps(exponentMaximized, avx_cast(_mm256_set1_epi32(0xbf7fffffu))); - ret(isnan(v) || !isfinite(v) || v == sfloat_v::Zero()) = v; - e->setZero(v == sfloat_v::Zero()); - return ret; - } - - /* -> x * 2^e - * x == NaN -> NaN - * x == (-)inf -> (-)inf - */ - inline double_v ldexp(double_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero((v == double_v::Zero()).dataI()); - const m256i exponentBits = _mm256_slli_epi64(e.data(), 52); - return avx_cast(_mm256_add_epi64(avx_cast(v.data()), exponentBits)); - } - inline float_v ldexp(float_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero(static_cast(v == float_v::Zero())); - return (v.reinterpretCast() + (e << 23)).reinterpretCast(); - } - inline sfloat_v ldexp(sfloat_v::AsArg v, short_v::AsArg _e) { - short_v e = _e; - e.setZero(static_cast(v == sfloat_v::Zero())); - e = e << (23 - 16); - const m256i exponentBits = concat(_mm_unpacklo_epi16(_mm_setzero_si128(), e.data()), - _mm_unpackhi_epi16(_mm_setzero_si128(), e.data())); - return (v.reinterpretCast() + int_v(exponentBits)).reinterpretCast(); - } - - static Vc_ALWAYS_INLINE float_v trunc( float_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } - static Vc_ALWAYS_INLINE sfloat_v trunc(sfloat_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } - static Vc_ALWAYS_INLINE double_v trunc(double_v::AsArg v) { return _mm256_round_pd(v.data(), 0x3); } - - static Vc_ALWAYS_INLINE float_v floor(float_v::AsArg v) { return _mm256_floor_ps(v.data()); } - static Vc_ALWAYS_INLINE sfloat_v floor(sfloat_v::AsArg v) { return _mm256_floor_ps(v.data()); } - static Vc_ALWAYS_INLINE double_v floor(double_v::AsArg v) { return _mm256_floor_pd(v.data()); } - - static Vc_ALWAYS_INLINE float_v ceil(float_v::AsArg v) { return _mm256_ceil_ps(v.data()); } - static Vc_ALWAYS_INLINE sfloat_v ceil(sfloat_v::AsArg v) { return _mm256_ceil_ps(v.data()); } - static Vc_ALWAYS_INLINE double_v ceil(double_v::AsArg v) { return _mm256_ceil_pd(v.data()); } -} // namespace 
AVX -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" -#define VC__USE_NAMESPACE AVX -#include "../common/trigonometric.h" -#define VC__USE_NAMESPACE AVX -#include "../common/logarithm.h" -#define VC__USE_NAMESPACE AVX -#include "../common/exponential.h" -#undef VC__USE_NAMESPACE - -#endif // VC_AVX_MATH_H diff --git a/math/vc/include/Vc/avx/prefetches.tcc b/math/vc/include/Vc/avx/prefetches.tcc deleted file mode 100644 index b3359bcd4349a..0000000000000 --- a/math/vc/include/Vc/avx/prefetches.tcc +++ /dev/null @@ -1,58 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010, 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_PREFETCHES_TCC -#define VC_AVX_PREFETCHES_TCC - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -Vc_ALWAYS_INLINE void HelperImpl::prefetchForOneRead(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchClose(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchMid(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchFar(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchForModify(const void *addr) -{ -#ifdef __3dNOW__ - _m_prefetchw(const_cast(addr)); -#else - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -#endif -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#endif // VC_AVX_PREFETCHES_TCC diff --git a/math/vc/include/Vc/avx/shuffle.h b/math/vc/include/Vc/avx/shuffle.h deleted file mode 100644 index ffd7f99e8db6d..0000000000000 --- a/math/vc/include/Vc/avx/shuffle.h +++ /dev/null @@ -1,239 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_AVX_SHUFFLE_H -#define VC_AVX_SHUFFLE_H - -#include "../sse/shuffle.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ - using AVX::m128; - using AVX::m128d; - using AVX::m128i; - using AVX::m256; - using AVX::m256d; - using AVX::m256i; - using AVX::param128; - using AVX::param128d; - using AVX::param128i; - using AVX::param256; - using AVX::param256d; - using AVX::param256i; - namespace Mem - { - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_ps(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_pd(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) { - VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range); - VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range); - return _mm256_permute2f128_si256(x, x, L + H * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) { - return _mm256_castps_si256(permute(_mm256_castsi256_ps(x))); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); - return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); - return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); - } - template - static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 == X0 || 
Dst0 == Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); - VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range); - VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range); - VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range); - VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range); - return _mm256_blend_ps(x, y, - (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + - (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + - (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + - (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 - ); - } - template - static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) { - return _mm256_castps_si256(blend(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); - } - template struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; }; - template - static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range); - VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range); - if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) { - return permute(x); - } - const m128 loIn = _mm256_castps256_ps128(x); - const m128 hiIn = _mm256_extractf128_ps(x, 1); - m128 lo, hi; - - if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) { - lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) { - lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); 
- } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) { - lo = shuffle(loIn, hiIn); - } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) { - lo = shuffle(hiIn, loIn); - } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) { - lo = _mm_unpacklo_ps(loIn, hiIn); - } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) { - lo = _mm_unpacklo_ps(hiIn, loIn); - } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) { - lo = _mm_unpackhi_ps(loIn, hiIn); - } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) { - lo = _mm_unpackhi_ps(hiIn, loIn); - } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) { - lo = blend::Value, ScaleForBlend::Value, - ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); - } - - if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) { - hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); - } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) { - hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); - } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) { - hi = shuffle(loIn, hiIn); - } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) { - hi = shuffle(hiIn, loIn); - } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) { - hi = _mm_unpacklo_ps(loIn, hiIn); - } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) { - hi = _mm_unpacklo_ps(hiIn, loIn); - } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) { - hi = _mm_unpackhi_ps(loIn, hiIn); - } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) { - hi = _mm_unpackhi_ps(hiIn, loIn); - } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) { - hi = blend::Value, ScaleForBlend::Value, - ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); - } - - return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); - } 
- } // namespace Mem - - // little endian has the lo bits on the right and high bits on the left - // with vectors this becomes greatly confusing: - // Mem: abcd - // Reg: dcba - // - // The shuffles and permutes above use memory ordering. The ones below use register ordering: - namespace Reg - { - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) { - VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range); - VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range); - return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? 
H : H - Y0 + 2) * (1 << 4)); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - template static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range); - return _mm_permute_pd(x, Dst0 + Dst1 * 2); - } - template static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - template static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range); - return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); - } - template static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); - return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); 
- } - } // namespace Reg -} // namespace Vc -} // namespace ROOT -#include "undomacros.h" - -#endif // VC_AVX_SHUFFLE_H diff --git a/math/vc/include/Vc/avx/sorthelper.h b/math/vc/include/Vc/avx/sorthelper.h deleted file mode 100644 index f4e1b77ea9e93..0000000000000 --- a/math/vc/include/Vc/avx/sorthelper.h +++ /dev/null @@ -1,45 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_SORTHELPER_H -#define VC_AVX_SORTHELPER_H - -#include "types.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ -template struct SortHelper -{ - typedef typename VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - static VectorType sort(VTArg); - static void sort(VectorType &, VectorType &); -}; -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#endif // VC_AVX_SORTHELPER_H diff --git a/math/vc/include/Vc/avx/types.h b/math/vc/include/Vc/avx/types.h deleted file mode 100644 index 298dfcc69aa0e..0000000000000 --- a/math/vc/include/Vc/avx/types.h +++ /dev/null @@ -1,111 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef AVX_TYPES_H -#define AVX_TYPES_H - -#include "intrinsics.h" -#include "../common/storage.h" -#include "macros.h" - -#define VC_DOUBLE_V_SIZE 4 -#define VC_FLOAT_V_SIZE 8 -#define VC_SFLOAT_V_SIZE 8 -#define VC_INT_V_SIZE 8 -#define VC_UINT_V_SIZE 8 -#define VC_SHORT_V_SIZE 8 -#define VC_USHORT_V_SIZE 8 - -#include "../common/types.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - template class Vector; - - template class Mask; - - template struct VectorHelper {}; - template struct GatherHelper; - template struct ScatterHelper; - - template struct IndexTypeHelper; - template<> struct IndexTypeHelper< char > { typedef unsigned char Type; }; - template<> struct IndexTypeHelper { typedef unsigned char Type; }; - template<> struct IndexTypeHelper< short> { typedef unsigned short Type; }; - template<> struct IndexTypeHelper { typedef unsigned short Type; }; - template<> struct IndexTypeHelper< int > { typedef unsigned int Type; }; - template<> struct IndexTypeHelper { typedef unsigned int Type; }; - template<> struct IndexTypeHelper< float> { typedef unsigned int Type; }; - template<> struct IndexTypeHelper< sfloat> { typedef unsigned short Type; }; - template<> struct IndexTypeHelper< double> { typedef unsigned int Type; }; // _M128I based int32 would be nice - - template struct VectorTypeHelper; - template<> struct 
VectorTypeHelper< char > { typedef m128i Type; }; - template<> struct VectorTypeHelper { typedef m128i Type; }; - template<> struct VectorTypeHelper< short> { typedef m128i Type; }; - template<> struct VectorTypeHelper { typedef m128i Type; }; - template<> struct VectorTypeHelper< int > { typedef m256i Type; }; - template<> struct VectorTypeHelper { typedef m256i Type; }; - template<> struct VectorTypeHelper< float> { typedef m256 Type; }; - template<> struct VectorTypeHelper< sfloat> { typedef m256 Type; }; - template<> struct VectorTypeHelper< double> { typedef m256d Type; }; - - template struct SseVectorType; - template<> struct SseVectorType { typedef m128 Type; }; - template<> struct SseVectorType { typedef m128i Type; }; - template<> struct SseVectorType { typedef m128d Type; }; - template<> struct SseVectorType { typedef m128 Type; }; - template<> struct SseVectorType { typedef m128i Type; }; - template<> struct SseVectorType { typedef m128d Type; }; - - template struct HasVectorDivisionHelper { enum { Value = 1 }; }; - //template<> struct HasVectorDivisionHelper { enum { Value = 0 }; }; - - template struct VectorHelperSize; - -#ifdef VC_MSVC - // MSVC's __declspec(align(#)) only works with numbers, no enums or sizeof allowed ;( - template class _VectorAlignedBaseHack; - template<> class STRUCT_ALIGN1( 8) _VectorAlignedBaseHack< 8> {} STRUCT_ALIGN2( 8); - template<> class STRUCT_ALIGN1(16) _VectorAlignedBaseHack<16> {} STRUCT_ALIGN2(16); - template<> class STRUCT_ALIGN1(32) _VectorAlignedBaseHack<32> {} STRUCT_ALIGN2(32); - template<> class STRUCT_ALIGN1(64) _VectorAlignedBaseHack<64> {} STRUCT_ALIGN2(64); - template > - class VectorAlignedBaseT : public _VectorAlignedBaseHack - { - public: - FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) - }; -#else - template > - class STRUCT_ALIGN1(sizeof(V)) VectorAlignedBaseT - { - public: - FREE_STORE_OPERATORS_ALIGNED(sizeof(V)) - } STRUCT_ALIGN2(sizeof(V)); -#endif -} // namespace AVX -} // namespace Vc -} // namespace ROOT 
-#include "undomacros.h" - -#endif // AVX_TYPES_H diff --git a/math/vc/include/Vc/avx/undomacros.h b/math/vc/include/Vc/avx/undomacros.h deleted file mode 100644 index 637116df57fe4..0000000000000 --- a/math/vc/include/Vc/avx/undomacros.h +++ /dev/null @@ -1,26 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_UNDOMACROS_H -#define VC_AVX_UNDOMACROS_H -#undef VC_AVX_MACROS_H - -#endif // VC_AVX_UNDOMACROS_H - -#include "../common/undomacros.h" diff --git a/math/vc/include/Vc/avx/vector.h b/math/vc/include/Vc/avx/vector.h deleted file mode 100644 index 2b00d149eb3ce..0000000000000 --- a/math/vc/include/Vc/avx/vector.h +++ /dev/null @@ -1,466 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef AVX_VECTOR_H -#define AVX_VECTOR_H - -#include "intrinsics.h" -#include "vectorhelper.h" -#include "mask.h" -#include "writemaskedvector.h" -#include "sorthelper.h" -#include -#include -#include "../common/aliasingentryhelper.h" -#include "../common/memoryfwd.h" -#include "macros.h" - -#ifdef isfinite -#undef isfinite -#endif -#ifdef isnan -#undef isnan -#endif - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ -enum VectorAlignmentEnum { VectorAlignment = 32 }; - -template class Vector -{ - public: - FREE_STORE_OPERATORS_ALIGNED(32) - - typedef typename VectorTypeHelper::Type VectorType; - typedef typename DetermineEntryType::Type EntryType; - enum Constants { - Size = sizeof(VectorType) / sizeof(EntryType), - HasVectorDivision = HasVectorDivisionHelper::Value - }; - typedef Vector::Type> IndexType; - typedef typename Vc::AVX::Mask Mask; - typedef typename Mask::AsArg MaskArg; - typedef Vc::Memory, Size> Memory; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const Vector &AsArg; - typedef const VectorType &VectorTypeArg; -#else - typedef Vector AsArg; - typedef VectorType VectorTypeArg; -#endif - - protected: - // helper that specializes on VectorType - typedef VectorHelper HV; - - // helper that specializes on T - typedef VectorHelper HT; - - // cast any m256/m128 to VectorType - static Vc_INTRINSIC VectorType _cast(param128 v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param128i v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param128d v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param256 v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param256i v) { return avx_cast(v); } - static Vc_INTRINSIC VectorType _cast(param256d v) { return avx_cast(v); } - -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - typedef Common::VectorMemoryUnion StorageType; -#else - typedef Common::VectorMemoryUnion StorageType; -#endif - StorageType d; - - public: - 
/////////////////////////////////////////////////////////////////////////////////////////// - // uninitialized - Vc_ALWAYS_INLINE Vector() {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // constants - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerZero::ZEnum) Vc_ALWAYS_INLINE_R; - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerOne::OEnum) Vc_ALWAYS_INLINE_R; - explicit Vc_ALWAYS_INLINE_L Vector(VectorSpecialInitializerIndexesFromZero::IEnum) Vc_ALWAYS_INLINE_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector Zero() Vc_INTRINSIC_R Vc_CONST_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector One() Vc_INTRINSIC_R Vc_CONST_R; - static Vc_INTRINSIC_L Vc_CONST_L Vector IndexesFromZero() Vc_INTRINSIC_R Vc_CONST_R; - static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // internal: required to enable returning objects of VectorType - Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {} -#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS - Vc_ALWAYS_INLINE Vector(typename VectorType::Base x) : d(x) {} -#endif - - /////////////////////////////////////////////////////////////////////////////////////////// - // static_cast / copy ctor - template explicit Vector(VC_ALIGNED_PARAMETER(Vector) x); - - // implicit cast - template Vc_INTRINSIC_L Vector &operator=(const Vector &x) Vc_INTRINSIC_R; - - // copy assignment - Vc_ALWAYS_INLINE Vector &operator=(AsArg v) { d.v() = v.d.v(); return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // broadcast - explicit Vc_ALWAYS_INLINE_L Vector(EntryType a) Vc_ALWAYS_INLINE_R; - template Vc_INTRINSIC Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : d(HT::set(x)) {} - Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { d.v() = HT::set(a); return *this; } - - 
/////////////////////////////////////////////////////////////////////////////////////////// - // load ctors - explicit Vc_INTRINSIC_L - Vector(const EntryType *x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const EntryType *x, Alignment align) Vc_INTRINSIC_R; - template explicit Vc_INTRINSIC_L - Vector(const OtherT *x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const OtherT *x, Alignment align) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // load member functions - Vc_INTRINSIC_L - void load(const EntryType *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const EntryType *mem, Alignment align) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem, Alignment align) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? 
XXX - explicit inline Vector(const Vector *a); - inline void expand(Vector *x) const; - - /////////////////////////////////////////////////////////////////////////////////////////// - // zeroing - Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; - - Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setQnan(MaskArg k) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // stores - Vc_INTRINSIC_L void store(EntryType *mem) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, A align) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask, A align) const Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // swizzles - Vc_INTRINSIC_L Vc_PURE_L const Vector &abcd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cdab() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector badc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector aaaa() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bbbb() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cccc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dddd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcad() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcda() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dabc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector acbd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dbca() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dcba() const Vc_INTRINSIC_R Vc_PURE_R; - - 
/////////////////////////////////////////////////////////////////////////////////////////// - // gathers - template Vector(const EntryType *mem, const IndexT *indexes); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes); - template Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask); -#ifdef VC_USE_SET_GATHERS - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); -#endif - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, 
VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - - /////////////////////////////////////////////////////////////////////////////////////////// - // scatters - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const; - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const; - - /////////////////////////////////////////////////////////////////////////////////////////// - //prefix - Vc_ALWAYS_INLINE Vector &operator++() { data() = VectorHelper::add(data(), VectorHelper::one()); return *this; } - Vc_ALWAYS_INLINE Vector &operator--() { data() = VectorHelper::sub(data(), VectorHelper::one()); return *this; } - //postfix - Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = VectorHelper::add(data(), VectorHelper::one()); return r; } - Vc_ALWAYS_INLINE Vector operator--(int) { const 
Vector r = *this; data() = VectorHelper::sub(data(), VectorHelper::one()); return r; } - - Vc_INTRINSIC Common::AliasingEntryHelper operator[](size_t index) { -#if defined(VC_GCC) && VC_GCC >= 0x40300 && VC_GCC < 0x40400 - ::ROOT::Vc::Warnings::_operator_bracket_warning(); -#endif - return d.m(index); - } - Vc_ALWAYS_INLINE EntryType operator[](size_t index) const { - return d.m(index); - } - - Vc_ALWAYS_INLINE Vector operator~() const { return VectorHelper::andnot_(data(), VectorHelper::allone()); } - Vc_ALWAYS_INLINE_L Vc_PURE_L Vector::Type> operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } - -#define OP1(fun) \ - Vc_ALWAYS_INLINE Vector fun() const { return Vector(VectorHelper::fun(data())); } \ - Vc_ALWAYS_INLINE Vector &fun##_eq() { data() = VectorHelper::fun(data()); return *this; } - OP1(sqrt) - OP1(abs) -#undef OP1 - -#define OP(symbol, fun) \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(const Vector &x) { data() = VectorHelper::fun(data(), x.data()); return *this; } \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ - Vc_ALWAYS_INLINE Vector operator symbol(const Vector &x) const { return Vector(VectorHelper::fun(data(), x.data())); } \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Vector) operator symbol(TT x) const { return operator symbol(Vector(x)); } - - OP(+, add) - OP(-, sub) - OP(*, mul) -#undef OP - inline Vector &operator/=(EntryType x); - template inline Vc_PURE_L VC_EXACT_TYPE(TT, EntryType, Vector) operator/(TT x) const Vc_PURE_R; - inline Vector &operator/=(const Vector &x); - inline Vc_PURE_L Vector operator/ (const Vector &x) const Vc_PURE_R; - - // bitwise ops -#define OP_VEC(op) \ - Vc_ALWAYS_INLINE_L Vector &operator op##=(AsArg x) Vc_ALWAYS_INLINE_R; \ - Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator op (AsArg x) const Vc_ALWAYS_INLINE_R Vc_PURE_R; -#define OP_ENTRY(op) \ - Vc_ALWAYS_INLINE Vector &operator 
op##=(EntryType x) { return operator op##=(Vector(x)); } \ - template Vc_ALWAYS_INLINE Vc_PURE VC_EXACT_TYPE(TT, EntryType, Vector) operator op(TT x) const { return operator op(Vector(x)); } - VC_ALL_BINARY(OP_VEC) - VC_ALL_BINARY(OP_ENTRY) - VC_ALL_SHIFTS(OP_VEC) -#undef OP_VEC -#undef OP_ENTRY - - Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; - -#define OPcmp(symbol, fun) \ - Vc_ALWAYS_INLINE Mask operator symbol(AsArg x) const { return VectorHelper::fun(data(), x.data()); } \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Mask) operator symbol(TT x) const { return operator symbol(Vector(x)); } - - OPcmp(==, cmpeq) - OPcmp(!=, cmpneq) - OPcmp(>=, cmpnlt) - OPcmp(>, cmpnle) - OPcmp(<, cmplt) - OPcmp(<=, cmple) -#undef OPcmp - Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; - - Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { - VectorHelper::fma(data(), factor.data(), summand.data()); - } - - Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { - const VectorType k = avx_cast(mask.data()); - data() = VectorHelper::blend(data(), v.data(), k); - } - - template Vc_ALWAYS_INLINE V2 staticCast() const { return V2(*this); } - template Vc_ALWAYS_INLINE V2 reinterpretCast() const { return avx_cast(data()); } - - Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) { return WriteMaskedVector(this, k); } - - /** - * \return \p true This vector was completely filled. m2 might be 0 or != 0. You still have - * to test this. - * \p false This vector was not completely filled. m2 is all 0. 
- */ - //inline bool pack(Mask &m1, Vector &v2, Mask &m2) { - //return VectorHelper::pack(data(), m1.data, v2.data(), m2.data); - //} - - Vc_ALWAYS_INLINE VectorType &data() { return d.v(); } - Vc_ALWAYS_INLINE const VectorType data() const { return d.v(); } - - Vc_ALWAYS_INLINE EntryType min() const { return VectorHelper::min(data()); } - Vc_ALWAYS_INLINE EntryType max() const { return VectorHelper::max(data()); } - Vc_ALWAYS_INLINE EntryType product() const { return VectorHelper::mul(data()); } - Vc_ALWAYS_INLINE EntryType sum() const { return VectorHelper::add(data()); } - Vc_ALWAYS_INLINE_L EntryType min(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType max(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType product(MaskArg m) const Vc_ALWAYS_INLINE_R; - Vc_ALWAYS_INLINE_L EntryType sum(MaskArg m) const Vc_ALWAYS_INLINE_R; - - Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; - Vc_ALWAYS_INLINE Vector sorted() const { return SortHelper::sort(data()); } - - template void callWithValuesSorted(F &f) { - EntryType value = d.m(0); - f(value); - for (int i = 1; i < Size; ++i) { - if (d.m(i) != value) { - value = d.m(i); - f(value); - } - } - } - - template Vc_INTRINSIC void call(const F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); - } - template Vc_INTRINSIC void call(F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); - } - - template Vc_INTRINSIC void call(const F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { - f(EntryType(d.m(i))); - } - } - template Vc_INTRINSIC void call(F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { - f(EntryType(d.m(i))); - } - } - - template Vc_INTRINSIC Vector apply(const F &f) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = f(EntryType(d.m(i))); - ); - return r; - } - template Vc_INTRINSIC Vector apply(F &f) const { - Vector r; - 
for_all_vector_entries(i, - r.d.m(i) = f(EntryType(d.m(i))); - ); - return r; - } - - template Vc_INTRINSIC Vector apply(const F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); - } - return r; - } - template Vc_INTRINSIC Vector apply(F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); - } - return r; - } - - template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { - for_all_vector_entries(i, - d.m(i) = f(i); - ); - } - Vc_INTRINSIC void fill(EntryType (&f)()) { - for_all_vector_entries(i, - d.m(i) = f(); - ); - } - - Vc_INTRINSIC_L Vector copySign(AsArg reference) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; -}; - -typedef Vector double_v; -typedef Vector float_v; -typedef Vector sfloat_v; -typedef Vector int_v; -typedef Vector uint_v; -typedef Vector short_v; -typedef Vector ushort_v; -typedef double_v::Mask double_m; -typedef float_v::Mask float_m; -typedef sfloat_v::Mask sfloat_m; -typedef int_v::Mask int_m; -typedef uint_v::Mask uint_m; -typedef short_v::Mask short_m; -typedef ushort_v::Mask ushort_m; - -template class SwizzledVector : public Vector {}; - -static Vc_ALWAYS_INLINE int_v min(const int_v &x, const int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE uint_v min(const uint_v &x, const uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE short_v min(const short_v &x, const short_v &y) { return _mm_min_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE ushort_v min(const ushort_v &x, const ushort_v &y) { return _mm_min_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE float_v min(const float_v &x, const float_v &y) { return _mm256_min_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE sfloat_v min(const sfloat_v &x, const sfloat_v &y) { return _mm256_min_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE 
double_v min(const double_v &x, const double_v &y) { return _mm256_min_pd(x.data(), y.data()); } -static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE uint_v max(const uint_v &x, const uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE short_v max(const short_v &x, const short_v &y) { return _mm_max_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE ushort_v max(const ushort_v &x, const ushort_v &y) { return _mm_max_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE float_v max(const float_v &x, const float_v &y) { return _mm256_max_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE sfloat_v max(const sfloat_v &x, const sfloat_v &y) { return _mm256_max_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE double_v max(const double_v &x, const double_v &y) { return _mm256_max_pd(x.data(), y.data()); } - - template static Vc_ALWAYS_INLINE Vector sqrt (const Vector &x) { return VectorHelper::sqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vector rsqrt(const Vector &x) { return VectorHelper::rsqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vector abs (const Vector &x) { return VectorHelper::abs(x.data()); } - template static Vc_ALWAYS_INLINE Vector reciprocal(const Vector &x) { return VectorHelper::reciprocal(x.data()); } - template static Vc_ALWAYS_INLINE Vector round(const Vector &x) { return VectorHelper::round(x.data()); } - - template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Vector &x) { return VectorHelper::isFinite(x.data()); } - template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Vector &x) { return VectorHelper::isNaN(x.data()); } - -#include "forceToRegisters.tcc" -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "vector.tcc" -#include "math.h" -#include "undomacros.h" - -#endif // AVX_VECTOR_H diff --git a/math/vc/include/Vc/avx/vector.tcc 
b/math/vc/include/Vc/avx/vector.tcc deleted file mode 100644 index f7afa95be83d7..0000000000000 --- a/math/vc/include/Vc/avx/vector.tcc +++ /dev/null @@ -1,1406 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "limits.h" -#include "const.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -ALIGN(64) extern unsigned int RandomState[16]; - -namespace AVX -{ - -/////////////////////////////////////////////////////////////////////////////////////////// -// constants {{{1 -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerZero::ZEnum) : d(HT::zero()) {} -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerOne::OEnum) : d(HT::one()) {} -template Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) - : d(HV::load(IndexesFromZeroData::address(), Aligned)) {} - -template Vc_INTRINSIC Vector Vc_CONST Vector::Zero() { return HT::zero(); } -template Vc_INTRINSIC Vector Vc_CONST Vector::One() { return HT::one(); } -template Vc_INTRINSIC Vector Vc_CONST Vector::IndexesFromZero() { return HV::load(IndexesFromZeroData::address(), Aligned); } - -template template Vc_ALWAYS_INLINE Vector::Vector(VC_ALIGNED_PARAMETER(Vector) x) - : d(StaticCastHelper::cast(x.data())) {} - -template Vc_ALWAYS_INLINE Vector::Vector(EntryType x) : d(HT::set(x)) {} -template<> Vc_ALWAYS_INLINE 
Vector::Vector(EntryType x) : d(_mm256_set1_pd(x)) {} - - -/////////////////////////////////////////////////////////////////////////////////////////// -// load ctors {{{1 -template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x, A a) { load(x, a); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x, A a) { load(x, a); } - -/////////////////////////////////////////////////////////////////////////////////////////// -// load member functions {{{1 -template Vc_INTRINSIC void Vector::load(const EntryType *mem) -{ - load(mem, Aligned); -} - -template template Vc_INTRINSIC void Vector::load(const EntryType *mem, A align) -{ - d.v() = HV::load(mem, align); -} - -template template Vc_INTRINSIC void Vector::load(const OtherT *mem) -{ - load(mem, Aligned); -} - -// LoadHelper {{{2 -template struct LoadHelper; - -// float {{{2 -template struct LoadHelper { - static m256 load(const double *mem, Flags f) - { - return concat(_mm256_cvtpd_ps(VectorHelper::load(&mem[0], f)), - _mm256_cvtpd_ps(VectorHelper::load(&mem[4], f))); - } -}; -template struct LoadHelper { - static m256 load(const unsigned int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256 load(const unsigned char *mem, Flags f) - { - return StaticCastHelper::cast(LoadHelper::load(mem, f)); - } -}; -template 
struct LoadHelper { - static m256 load(const signed char *mem, Flags f) - { - return StaticCastHelper::cast(LoadHelper::load(mem, f)); - } -}; - -template struct LoadHelper : public LoadHelper {}; - -// int {{{2 -template struct LoadHelper { - static m256i load(const unsigned int *mem, Flags f) - { - return VectorHelper::load(mem, f); - } -}; -template struct LoadHelper { - static m256i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epu16 = _mm_cvtepu8_epi16(epu8); - return StaticCastHelper::cast(epu16); - } -}; -template struct LoadHelper { - static m256i load(const signed char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epi16 = _mm_cvtepi8_epi16(epi8); - return StaticCastHelper::cast(epi16); - } -}; - -// unsigned int {{{2 -template struct LoadHelper { - static m256i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m256i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - const m128i epu16 = 
_mm_cvtepu8_epi16(epu8); - return StaticCastHelper::cast(epu16); - } -}; - -// short {{{2 -template struct LoadHelper { - static m128i load(const unsigned short *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static m128i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepu8_epi16(epu8); - } -}; -template struct LoadHelper { - static m128i load(const signed char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epi8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepi8_epi16(epi8); - } -}; - -// unsigned short {{{2 -template struct LoadHelper { - static m128i load(const unsigned char *mem, Flags) - { - // the only available streaming load loads 16 bytes - twice as much as we need => can't use - // it, or we risk an out-of-bounds read and an unaligned load exception - const m128i epu8 = _mm_loadl_epi64(reinterpret_cast(mem)); - return _mm_cvtepu8_epi16(epu8); - } -}; - -// general load, implemented via LoadHelper {{{2 -template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) -{ - d.v() = LoadHelper::load(x, f); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// zeroing {{{1 -template Vc_INTRINSIC void Vector::setZero() -{ - data() = HV::zero(); -} -template Vc_INTRINSIC void Vector::setZero(const Mask &k) -{ - data() = HV::andnot_(avx_cast(k.data()), data()); -} - -template<> Vc_INTRINSIC void Vector::setQnan() -{ - data() = _mm256_setallone_pd(); -} -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) -{ - data() = _mm256_or_pd(data(), 
k.dataD()); -} -template<> Vc_INTRINSIC void Vector::setQnan() -{ - data() = _mm256_setallone_ps(); -} -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) -{ - data() = _mm256_or_ps(data(), k.data()); -} -template<> Vc_INTRINSIC void Vector::setQnan() -{ - data() = _mm256_setallone_ps(); -} -template<> Vc_INTRINSIC void Vector::setQnan(MaskArg k) -{ - data() = _mm256_or_ps(data(), k.data()); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// stores {{{1 -template Vc_INTRINSIC void Vector::store(EntryType *mem) const -{ - HV::store(mem, data(), Aligned); -} -template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const -{ - HV::store(mem, data(), avx_cast(mask.data()), Aligned); -} -template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const -{ - HV::store(mem, data(), align); -} -template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const -{ - HV::store(mem, data(), avx_cast(mask.data()), align); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// expand/merge 1 float_v <=> 2 double_v XXX rationale? remove it for release? 
XXX {{{1 -template Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(a[0]) -{ -} -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(concat(_mm256_cvtpd_ps(a[0].data()), _mm256_cvtpd_ps(a[1].data()))) -{ -} -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(_mm_packs_epi32(lo128(a->data()), hi128(a->data()))) -{ -} -template<> Vc_ALWAYS_INLINE Vc_FLATTEN Vector::Vector(const Vector *a) - : d(_mm_packus_epi32(lo128(a->data()), hi128(a->data()))) -{ -} -template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const -{ - x[0] = *this; -} -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const -{ - x[0].data() = _mm256_cvtps_pd(lo128(d.v())); - x[1].data() = _mm256_cvtps_pd(hi128(d.v())); -} -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const -{ - x[0].data() = concat(_mm_cvtepi16_epi32(d.v()), - _mm_cvtepi16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); -} -template<> Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::expand(Vector *x) const -{ - x[0].data() = concat(_mm_cvtepu16_epi32(d.v()), - _mm_cvtepu16_epi32(_mm_unpackhi_epi64(d.v(), d.v()))); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// swizzles {{{1 -template Vc_INTRINSIC const Vector Vc_PURE &Vector::abcd() const { return *this; } -template Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector 
Vc_PURE Vector::bcad() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } -template Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } - -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cdab() const { return Mem::shuffle128(data(), data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::badc() const { return Mem::permute(data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::aaaa() const { const double &tmp = d.m(0); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bbbb() const { const double &tmp = d.m(1); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::cccc() const { const double &tmp = d.m(2); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dddd() const { const double &tmp = d.m(3); return _mm256_broadcast_sd(&tmp); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcad() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::bcda() const { return Mem::shuffle(data(), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dabc() const { return Mem::shuffle(Mem::shuffle128(data(), data()), data()); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::acbd() const { return Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dbca() const { return 
Mem::shuffle(Mem::shuffle128(data(), data()), Mem::shuffle128(data(), data())); } -template<> Vc_INTRINSIC const double_v Vc_PURE Vector::dcba() const { return cdab().badc(); } - -#define VC_SWIZZLES_16BIT_IMPL(T) \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cdab() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::badc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::aaaa() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bbbb() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::cccc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dddd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcad() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::bcda() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dabc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::acbd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dbca() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC const Vector Vc_PURE Vector::dcba() const { return Mem::permute(data()); } -VC_SWIZZLES_16BIT_IMPL(short) -VC_SWIZZLES_16BIT_IMPL(unsigned short) -#undef VC_SWIZZLES_16BIT_IMPL - -/////////////////////////////////////////////////////////////////////////////////////////// -// division {{{1 -template inline Vector &Vector::operator/=(EntryType x) -{ - if (HasVectorDivision) { - return operator/=(Vector(x)); - } - for_all_vector_entries(i, - d.m(i) /= x; - ); - return *this; -} -template template inline Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const -{ - if (HasVectorDivision) { - 
return operator/(Vector(x)); - } - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x; - ); - return r; -} -// per default fall back to scalar division -template inline Vector &Vector::operator/=(const Vector &x) -{ - for_all_vector_entries(i, - d.m(i) /= x.d.m(i); - ); - return *this; -} - -template inline Vector Vc_PURE Vector::operator/(const Vector &x) const -{ - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x.d.m(i); - ); - return r; -} -// specialize division on type -static Vc_INTRINSIC m256i Vc_CONST divInt(param256i a, param256i b) { - const m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); - const m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); - const m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); - const m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); - return concat( - _mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), - _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)) - ); -} -template<> inline Vector &Vector::operator/=(const Vector &x) -{ - d.v() = divInt(d.v(), x.d.v()); - return *this; -} -template<> inline Vector Vc_PURE Vector::operator/(const Vector &x) const -{ - return divInt(d.v(), x.d.v()); -} -static inline m256i Vc_CONST divUInt(param256i a, param256i b) { - m256d loa = _mm256_cvtepi32_pd(lo128(a)); - m256d hia = _mm256_cvtepi32_pd(hi128(a)); - m256d lob = _mm256_cvtepi32_pd(lo128(b)); - m256d hib = _mm256_cvtepi32_pd(hi128(b)); - // if a >= 2^31 then after conversion to double it will contain a negative number (i.e. 
a-2^32) - // to get the right number back we have to add 2^32 where a >= 2^31 - loa = _mm256_add_pd(loa, _mm256_and_pd(_mm256_cmp_pd(loa, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); - hia = _mm256_add_pd(hia, _mm256_and_pd(_mm256_cmp_pd(hia, _mm256_setzero_pd(), _CMP_LT_OS), _mm256_set1_pd(4294967296.))); - // we don't do the same for b because division by b >= 2^31 should be a seldom corner case and - // we rather want the standard stuff fast - // - // there is one remaining problem: a >= 2^31 and b == 1 - // in that case the return value would be 2^31 - return avx_cast(_mm256_blendv_ps(avx_cast(concat( - _mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), - _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)) - )), avx_cast(a), avx_cast(concat( - _mm_cmpeq_epi32(lo128(b), _mm_setone_epi32()), - _mm_cmpeq_epi32(hi128(b), _mm_setone_epi32()))))); -} -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v() = divUInt(d.v(), x.d.v()); - return *this; -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const -{ - return divUInt(d.v(), x.d.v()); -} -template static inline m128i Vc_CONST divShort(param128i a, param128i b) -{ - const m256 r = _mm256_div_ps(StaticCastHelper::cast(a), - StaticCastHelper::cast(b)); - return StaticCastHelper::cast(r); -} -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v() = divShort(d.v(), x.d.v()); - return *this; -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const -{ - return divShort(d.v(), x.d.v()); -} -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v() = divShort(d.v(), x.d.v()); - return *this; -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vector::operator/(const Vector &x) const -{ - return divShort(d.v(), x.d.v()); -} -template<> Vc_INTRINSIC float_v &float_v::operator/=(const float_v &x) -{ - d.v() = _mm256_div_ps(d.v(), x.d.v()); - return *this; -} -template<> 
Vc_INTRINSIC float_v Vc_PURE float_v::operator/(const float_v &x) const -{ - return _mm256_div_ps(d.v(), x.d.v()); -} -template<> Vc_INTRINSIC sfloat_v &sfloat_v::operator/=(const sfloat_v &x) -{ - d.v() = _mm256_div_ps(d.v(), x.d.v()); - return *this; -} -template<> Vc_INTRINSIC sfloat_v Vc_PURE sfloat_v::operator/(const sfloat_v &x) const -{ - return _mm256_div_ps(d.v(), x.d.v()); -} -template<> Vc_INTRINSIC double_v &double_v::operator/=(const double_v &x) -{ - d.v() = _mm256_div_pd(d.v(), x.d.v()); - return *this; -} -template<> Vc_INTRINSIC double_v Vc_PURE double_v::operator/(const double_v &x) const -{ - return _mm256_div_pd(d.v(), x.d.v()); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// integer ops {{{1 -#define OP_IMPL(T, symbol) \ -template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(AsArg x) \ -{ \ - for_all_vector_entries(i, d.m(i) symbol##= x.d.m(i); ); \ - return *this; \ -} \ -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const \ -{ \ - Vector r; \ - for_all_vector_entries(i, r.d.m(i) = d.m(i) symbol x.d.m(i); ); \ - return r; \ -} -OP_IMPL(int, <<) -OP_IMPL(int, >>) -OP_IMPL(unsigned int, <<) -OP_IMPL(unsigned int, >>) -OP_IMPL(short, <<) -OP_IMPL(short, >>) -OP_IMPL(unsigned short, <<) -OP_IMPL(unsigned short, >>) -#undef OP_IMPL - -template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { - d.v() = VectorHelper::shiftRight(d.v(), shift); - return *static_cast *>(this); -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { - return VectorHelper::shiftRight(d.v(), shift); -} -template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { - d.v() = VectorHelper::shiftLeft(d.v(), shift); - return *static_cast *>(this); -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { - return VectorHelper::shiftLeft(d.v(), shift); -} - -#define OP_IMPL(T, symbol, fun) \ - template<> Vc_ALWAYS_INLINE 
Vector &Vector::operator symbol##=(AsArg x) { d.v() = HV::fun(d.v(), x.d.v()); return *this; } \ - template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(AsArg x) const { return Vector(HV::fun(d.v(), x.d.v())); } - OP_IMPL(int, &, and_) - OP_IMPL(int, |, or_) - OP_IMPL(int, ^, xor_) - OP_IMPL(unsigned int, &, and_) - OP_IMPL(unsigned int, |, or_) - OP_IMPL(unsigned int, ^, xor_) - OP_IMPL(short, &, and_) - OP_IMPL(short, |, or_) - OP_IMPL(short, ^, xor_) - OP_IMPL(unsigned short, &, and_) - OP_IMPL(unsigned short, |, or_) - OP_IMPL(unsigned short, ^, xor_) - OP_IMPL(float, &, and_) - OP_IMPL(float, |, or_) - OP_IMPL(float, ^, xor_) - OP_IMPL(sfloat, &, and_) - OP_IMPL(sfloat, |, or_) - OP_IMPL(sfloat, ^, xor_) - OP_IMPL(double, &, and_) - OP_IMPL(double, |, or_) - OP_IMPL(double, ^, xor_) -#undef OP_IMPL - -// operators {{{1 -#include "../common/operators.h" -// isNegative {{{1 -template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const -{ - return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); -} -template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const -{ - return avx_cast(_mm256_srai_epi32(avx_cast(_mm256_and_ps(_mm256_setsignmask_ps(), d.v())), 31)); -} -template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const -{ - return Mem::permute(avx_cast( - _mm256_srai_epi32(avx_cast(_mm256_and_pd(_mm256_setsignmask_pd(), d.v())), 31) - )); -} -// gathers {{{1 -// Better implementation (hopefully) with _mm256_set_ -//X template template Vector::Vector(const EntryType *mem, const Index *indexes) -//X { -//X for_all_vector_entries(int i, -//X d.m(i) = mem[indexes[i]]; -//X ); -//X } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) -{ - gather(mem, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) -{ - gather(mem, indexes); -} - -template template Vc_ALWAYS_INLINE 
Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(mem, indexes, mask); -} - -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(mem, indexes, mask); -} - -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, member2, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, member2, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - gather(array, ptrMember1, outerIndexes, innerIndexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, ptrMember1, outerIndexes, innerIndexes, mask); -} - -template struct IndexSizeChecker { static void check() {} }; -template struct IndexSizeChecker, Size> -{ - static void check() { - VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); - } -}; -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN 
Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - 
IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} - -#ifdef VC_USE_SET_GATHERS -template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) -{ - IndexSizeChecker, Size>::check(); - Vector indexesTmp = indexes; - indexesTmp.setZero(!mask); - (*this)(mask) = Vector(mem, indexesTmp); -} -#endif - -#ifdef VC_USE_BSF_GATHERS -#define VC_MASKED_GATHER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits &= ~(1 << i); /* btr? */ \ - d.m(i) = ith_value(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_GATHERS) -#define VC_MASKED_GATHER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (_mm_popcnt_u32(bits)) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - d.m(low) = ith_value(low); \ - case 0: \ - break; \ - } -#else -#define VC_MASKED_GATHER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) d.m(i) = ith_value(i); \ - ); -#endif - -template template -Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, 
MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (mem[indexes[_i_]]) - VC_MASKED_GATHER -#undef ith_value -} - -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1), - array[indexes[2]].*(member1), array[indexes[3]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - 
IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (array[indexes[_i_]].*(member1)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), - array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, 
const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - 
array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = 
_mm256_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - 
IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm256_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - 
(array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_GATHER -#undef ith_value -} - -#undef VC_MASKED_GATHER -#ifdef VC_USE_BSF_SCATTERS -#define VC_MASKED_SCATTER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits ^= (1 << i); /* btr? 
*/ \ - ith_value(i) = d.m(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_SCATTERS) -#define VC_MASKED_SCATTER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (_mm_popcnt_u32(bits)) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - ith_value(low) = d.m(low); \ - case 0: \ - break; \ - } -#else -#define VC_MASKED_SCATTER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) ith_value(i) = d.m(i); \ - ); -#endif - -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - for_all_vector_entries(i, - mem[indexes[i]] = d.m(i); - ); -} -#if defined(VC_MSVC) && VC_MSVC >= 170000000 -// MSVC miscompiles the store mem[indexes[1]] = d.m(1) for T = (u)short -template<> template Vc_ALWAYS_INLINE void short_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - const unsigned int tmp = d.v()._d.m128i_u32[0]; - mem[indexes[0]] = tmp & 0xffff; - mem[indexes[1]] = tmp >> 16; - mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); - mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); - mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); - mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); - mem[indexes[6]] = _mm_extract_epi16(d.v(), 
6); - mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); -} -template<> template Vc_ALWAYS_INLINE void ushort_v::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - const unsigned int tmp = d.v()._d.m128i_u32[0]; - mem[indexes[0]] = tmp & 0xffff; - mem[indexes[1]] = tmp >> 16; - mem[indexes[2]] = _mm_extract_epi16(d.v(), 2); - mem[indexes[3]] = _mm_extract_epi16(d.v(), 3); - mem[indexes[4]] = _mm_extract_epi16(d.v(), 4); - mem[indexes[5]] = _mm_extract_epi16(d.v(), 5); - mem[indexes[6]] = _mm_extract_epi16(d.v(), 6); - mem[indexes[7]] = _mm_extract_epi16(d.v(), 7); -} -#endif -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const -{ -#define ith_value(_i_) mem[indexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const -{ - for_all_vector_entries(i, - array[indexes[i]].*(member1) = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const -{ -#define ith_value(_i_) array[indexes[_i_]].*(member1) - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const -{ - for_all_vector_entries(i, - array[indexes[i]].*(member1).*(member2) = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const -{ -#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, 
VC_ALIGNED_PARAMETER(IT2) innerIndexes) const -{ - for_all_vector_entries(i, - (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const -{ -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// operator- {{{1 -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_pd(d.v(), _mm256_setsignmask_pd()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_xor_ps(d.v(), _mm256_setsignmask_ps()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm256_sign_epi32(d.v(), _mm256_setallone_si256()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// horizontal ops {{{1 -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArg m) const -{ - Vector tmp = std::numeric_limits >::max(); - tmp(m) = *this; - return tmp.min(); -} -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArg m) const -{ - Vector 
tmp = std::numeric_limits >::min(); - tmp(m) = *this; - return tmp.max(); -} -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArg m) const -{ - Vector tmp(VectorSpecialInitializerOne::One); - tmp(m) = *this; - return tmp.product(); -} -template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArg m) const -{ - Vector tmp(VectorSpecialInitializerZero::Zero); - tmp(m) = *this; - return tmp.sum(); -}//}}} -// copySign {{{1 -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_ps( - _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), - _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) - ); -} -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_ps( - _mm256_and_ps(reference.d.v(), _mm256_setsignmask_ps()), - _mm256_and_ps(d.v(), _mm256_setabsmask_ps()) - ); -} -template<> Vc_INTRINSIC Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm256_or_pd( - _mm256_and_pd(reference.d.v(), _mm256_setsignmask_pd()), - _mm256_and_pd(d.v(), _mm256_setabsmask_pd()) - ); -}//}}}1 -// exponent {{{1 -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.f).isFull()); - return Internal::exponent(d.v()); -} -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.f).isFull()); - return Internal::exponent(d.v()); -} -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.).isFull()); - return Internal::exponent(d.v()); -} -// }}}1 -// Random {{{1 -static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, - Vector &state1) -{ - state0.load(&Vc::RandomState[0]); - state1.load(&Vc::RandomState[uint_v::Size]); - (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); - uint_v(_mm256_xor_si256((state0 * 0xdeece66du + 11).data(), _mm256_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); -} - -template Vc_ALWAYS_INLINE Vector Vector::Random() -{ 
- Vector state0, state1; - _doRandomStep(state0, state1); - return state0.reinterpretCast >(); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return HT::sub(HV::or_(_cast(_mm256_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - const m256i state = VectorHelper::load(&Vc::RandomState[0], Vc::Aligned); - for (size_t k = 0; k < 8; k += 2) { - typedef unsigned long long uint64 Vc_MAY_ALIAS; - const uint64 stateX = *reinterpret_cast(&Vc::RandomState[k]); - *reinterpret_cast(&Vc::RandomState[k]) = (stateX * 0x5deece66dull + 11); - } - return (Vector(_cast(_mm256_srli_epi64(state, 12))) | One()) - One(); -} -// }}}1 -// shifted / rotated {{{1 -template struct VectorShift; -template<> struct VectorShift<32, 4, m256d, double> -{ - static Vc_INTRINSIC m256d shifted(param256d v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(double))); - case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(double))); - case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(double))); - case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(double))); - case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(double))); - case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(double))); - } - return _mm256_setzero_pd(); - } -}; -template struct VectorShift<32, 8, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return 
avx_cast(_mm256_srli_si256(avx_cast(v), 1 * sizeof(EntryType))); - case 2: return avx_cast(_mm256_srli_si256(avx_cast(v), 2 * sizeof(EntryType))); - case 3: return avx_cast(_mm256_srli_si256(avx_cast(v), 3 * sizeof(EntryType))); - case 4: return avx_cast(_mm256_srli_si256(avx_cast(v), 4 * sizeof(EntryType))); - case 5: return avx_cast(_mm256_srli_si256(avx_cast(v), 5 * sizeof(EntryType))); - case 6: return avx_cast(_mm256_srli_si256(avx_cast(v), 6 * sizeof(EntryType))); - case 7: return avx_cast(_mm256_srli_si256(avx_cast(v), 7 * sizeof(EntryType))); - case -1: return avx_cast(_mm256_slli_si256(avx_cast(v), 1 * sizeof(EntryType))); - case -2: return avx_cast(_mm256_slli_si256(avx_cast(v), 2 * sizeof(EntryType))); - case -3: return avx_cast(_mm256_slli_si256(avx_cast(v), 3 * sizeof(EntryType))); - case -4: return avx_cast(_mm256_slli_si256(avx_cast(v), 4 * sizeof(EntryType))); - case -5: return avx_cast(_mm256_slli_si256(avx_cast(v), 5 * sizeof(EntryType))); - case -6: return avx_cast(_mm256_slli_si256(avx_cast(v), 6 * sizeof(EntryType))); - case -7: return avx_cast(_mm256_slli_si256(avx_cast(v), 7 * sizeof(EntryType))); - } - return avx_cast(_mm256_setzero_ps()); - } -}; -template struct VectorShift<16, 8, VectorType, EntryType> -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType shifted(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (amount) { - case 0: return v; - case 1: return avx_cast(_mm_srli_si128(avx_cast(v), 1 * EntryTypeSizeof)); - case 2: return avx_cast(_mm_srli_si128(avx_cast(v), 2 * EntryTypeSizeof)); - case 3: return avx_cast(_mm_srli_si128(avx_cast(v), 3 * EntryTypeSizeof)); - case 4: return avx_cast(_mm_srli_si128(avx_cast(v), 4 * EntryTypeSizeof)); - case 5: return avx_cast(_mm_srli_si128(avx_cast(v), 5 * EntryTypeSizeof)); - case 6: return avx_cast(_mm_srli_si128(avx_cast(v), 6 * EntryTypeSizeof)); - case 7: return avx_cast(_mm_srli_si128(avx_cast(v), 7 * EntryTypeSizeof)); - case -1: return 
avx_cast(_mm_slli_si128(avx_cast(v), 1 * EntryTypeSizeof)); - case -2: return avx_cast(_mm_slli_si128(avx_cast(v), 2 * EntryTypeSizeof)); - case -3: return avx_cast(_mm_slli_si128(avx_cast(v), 3 * EntryTypeSizeof)); - case -4: return avx_cast(_mm_slli_si128(avx_cast(v), 4 * EntryTypeSizeof)); - case -5: return avx_cast(_mm_slli_si128(avx_cast(v), 5 * EntryTypeSizeof)); - case -6: return avx_cast(_mm_slli_si128(avx_cast(v), 6 * EntryTypeSizeof)); - case -7: return avx_cast(_mm_slli_si128(avx_cast(v), 7 * EntryTypeSizeof)); - } - return _mm_setzero_si128(); - } -}; -template Vc_INTRINSIC Vector Vector::shifted(int amount) const -{ - return VectorShift::shifted(d.v(), amount); -} -template struct VectorRotate; -template struct VectorRotate<32, 4, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - const m128i vLo = avx_cast(lo128(v)); - const m128i vHi = avx_cast(hi128(v)); - switch (static_cast(amount) % 4) { - case 0: return v; - case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof))); - case 2: return Mem::permute128(v); - case 3: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); - } - return _mm256_setzero_pd(); - } -}; -template struct VectorRotate<32, 8, VectorType, EntryType> -{ - typedef typename SseVectorType::Type SmallV; - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - const m128i vLo = avx_cast(lo128(v)); - const m128i vHi = avx_cast(hi128(v)); - switch (static_cast(amount) % 8) { - case 0: return v; - case 1: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * 
EntryTypeSizeof))); - case 2: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof))); - case 3: return concat(avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof))); - case 4: return Mem::permute128(v); - case 5: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 1 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 1 * EntryTypeSizeof))); - case 6: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 2 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 2 * EntryTypeSizeof))); - case 7: return concat(avx_cast(_mm_alignr_epi8(vLo, vHi, 3 * EntryTypeSizeof)), avx_cast(_mm_alignr_epi8(vHi, vLo, 3 * EntryTypeSizeof))); - } - return avx_cast(_mm256_setzero_ps()); - } -}; -template struct VectorRotate<16, 8, VectorType, EntryType> -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - static Vc_INTRINSIC VectorType rotated(VC_ALIGNED_PARAMETER(VectorType) v, int amount) - { - switch (static_cast(amount) % 8) { - case 0: return v; - case 1: return avx_cast(_mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); - case 2: return avx_cast(_mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); - case 3: return avx_cast(_mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); - case 4: return avx_cast(_mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); - case 5: return avx_cast(_mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); - case 6: return avx_cast(_mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); - case 7: return avx_cast(_mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); - } - return _mm_setzero_si128(); - } -}; -template Vc_INTRINSIC Vector Vector::rotated(int amount) const -{ - return VectorRotate::rotated(d.v(), amount); - /* - const m128i v0 = avx_cast(d.v()[0]); - const m128i v1 = avx_cast(d.v()[1]); - switch (static_cast(amount) % Size) { - case 0: return *this; - case 1: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 1 * 
sizeof(EntryType)))); - case 2: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); - case 3: return concat(avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); - case 4: return concat(d.v()[1], d.v()[0]); - case 5: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); - case 6: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); - case 7: return concat(avx_cast(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), avx_cast(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); - } - */ -} -// }}}1 -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/avx/vectorhelper.h b/math/vc/include/Vc/avx/vectorhelper.h deleted file mode 100644 index 11467e89a34bf..0000000000000 --- a/math/vc/include/Vc/avx/vectorhelper.h +++ /dev/null @@ -1,765 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef AVX_VECTORHELPER_H -#define AVX_VECTORHELPER_H - -#include -#include "types.h" -#include "intrinsics.h" -#include "casts.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -namespace Internal -{ -Vc_INTRINSIC Vc_CONST m256 exponent(param256 v) -{ - m128i tmp0 = _mm_srli_epi32(avx_cast(v), 23); - m128i tmp1 = _mm_srli_epi32(avx_cast(hi128(v)), 23); - tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); - tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); - return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); -} -Vc_INTRINSIC Vc_CONST m256d exponent(param256d v) -{ - m128i tmp0 = _mm_srli_epi64(avx_cast(v), 52); - m128i tmp1 = _mm_srli_epi64(avx_cast(hi128(v)), 52); - tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); - tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); - return _mm256_cvtepi32_pd(avx_cast(Mem::shuffle(avx_cast(tmp0), avx_cast(tmp1)))); -} -} // namespace Internal - -#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } -#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; } -#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; } -#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; } - - template<> struct VectorHelper - { - typedef m256 VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float 
*mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); } - - OP0(allone, _mm256_setallone_ps()) - OP0(zero, _mm256_setzero_ps()) - OP2(or_, _mm256_or_ps(a, b)) - OP2(xor_, _mm256_xor_ps(a, b)) - OP2(and_, _mm256_and_ps(a, b)) - OP2(andnot_, _mm256_andnot_ps(a, b)) - OP3(blend, _mm256_blendv_ps(a, b, c)) - }; - - template<> struct VectorHelper - { - typedef m256d VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static 
Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); } - static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); } - // aaaa bbbb cccc dddd specialized in vector.tcc - static VectorType dacb(VTArg x) { - const m128d cb = avx_cast(_mm_alignr_epi8(avx_cast(lo128(x)), - avx_cast(hi128(x)), sizeof(double))); // XXX: lo and hi swapped? - const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped? 
- return concat(cb, da); - } - - OP0(allone, _mm256_setallone_pd()) - OP0(zero, _mm256_setzero_pd()) - OP2(or_, _mm256_or_pd(a, b)) - OP2(xor_, _mm256_xor_pd(a, b)) - OP2(and_, _mm256_and_pd(a, b)) - OP2(andnot_, _mm256_andnot_pd(a, b)) - OP3(blend, _mm256_blendv_pd(a, b, c)) - }; - - template<> struct VectorHelper - { - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - template static VectorType load(const T *x, AlignedFlag) Vc_PURE; - template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; - template static void store(T *mem, VTArg x, AlignedFlag); - template static void store(T *mem, VTArg x, UnalignedFlag); - template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); - - static VectorType cdab(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 3, 0, 1))); } - static VectorType badc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 0, 3, 2))); } - static VectorType aaaa(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(0, 0, 0, 0))); } - static VectorType bbbb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(1, 1, 1, 1))); } - static VectorType cccc(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(2, 2, 2, 2))); } - static VectorType dddd(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), 
_MM_SHUFFLE(3, 3, 3, 3))); } - static VectorType dacb(VTArg x) { return avx_cast(_mm256_permute_ps(avx_cast(x), _MM_SHUFFLE(3, 0, 2, 1))); } - - OP0(allone, _mm256_setallone_si256()) - OP0(zero, _mm256_setzero_si256()) - OP2(or_, _mm256_or_si256(a, b)) - OP2(xor_, _mm256_xor_si256(a, b)) - OP2(and_, _mm256_and_si256(a, b)) - OP2(andnot_, _mm256_andnot_si256(a, b)) - OP3(blend, _mm256_blendv_epi8(a, b, c)) - }; - - template<> struct VectorHelper - { - typedef m128i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - template static VectorType load(const T *x, AlignedFlag) Vc_PURE; - template static VectorType load(const T *x, UnalignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE; - template static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE; - template static void store(T *mem, VTArg x, AlignedFlag); - template static void store(T *mem, VTArg x, UnalignedFlag); - template static void store(T *mem, VTArg x, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, StreamingAndUnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, AlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, UnalignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag); - template static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag); - - static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); } - static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); } - static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); } - static VectorType 
bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); } - static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); } - static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); } - static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); } - - OP0(allone, _mm_setallone_si128()) - OP0(zero, _mm_setzero_si128()) - OP2(or_, _mm_or_si128(a, b)) - OP2(xor_, _mm_xor_si128(a, b)) - OP2(and_, _mm_and_si128(a, b)) - OP2(andnot_, _mm_andnot_si128(a, b)) - OP3(blend, _mm_blendv_epi8(a, b, c)) - }; -#undef OP1 -#undef OP2 -#undef OP3 - -#define OP1(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); } -#define OP(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); } -#define OP_(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op , SUFFIX)(a, b); } -#define OPx(op, op2) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); } -#define OPcmp(op) \ - static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); } -#define OP_CAST_(op) \ - static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \ - _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \ - CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \ - } -#define MINMAX \ - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \ - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { 
return CAT(_mm256_max_, SUFFIX)(a, b); } - - template<> struct VectorHelper { - typedef m256d VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef double EntryType; - typedef double ConcatType; -#define SUFFIX pd - - static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); } - static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d); - } - static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); } - - static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { -#ifdef VC_IMPL_FMA4 - v1 = _mm256_macc_pd(v1, v2, v3); -#else - VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); - VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); -#if defined(VC_GCC) && VC_GCC < 0x40703 - // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot - // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 - asm("":"+x"(h1), "+x"(h2)); -#endif - const VectorType l1 = _mm256_sub_pd(v1, h1); - const VectorType l2 = _mm256_sub_pd(v2, h2); - const VectorType ll = mul(l1, l2); - const VectorType lh = add(mul(l1, h2), mul(h1, l2)); - const VectorType hh = mul(h1, h2); - // ll < lh < hh for all entries is certain - const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| - const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); - const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); - v1 = add(add(ll, b), add(c, hh)); -#endif - } - - OP(add) OP(sub) OP(mul) 
- OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) - - OP1(sqrt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { - return _mm256_div_pd(one(), sqrt(x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { - return _mm256_div_pd(one(), x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { - return _mm256_cmpunord_pd(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { - return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { - return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd()); - } - - MINMAX - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { - m128d b = _mm_min_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); - b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); - return _mm_cvtsd_f64(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { - m128d b = _mm_max_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); - b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); - return _mm_cvtsd_f64(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { - m128d b = _mm_mul_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); - b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); - return _mm_cvtsd_f64(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { - m128d b = _mm_add_pd(avx_cast(a), _mm256_extractf128_pd(a, 1)); - b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); - return _mm_cvtsd_f64(b); - } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { - return _mm256_round_pd(a, _MM_FROUND_NINT); - } - }; - - template<> struct VectorHelper { - typedef float EntryType; - typedef m256 VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef double ConcatType; -#define SUFFIX ps - - static Vc_ALWAYS_INLINE Vc_CONST VectorType 
notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, - const float e, const float f, const float g, const float h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); } - static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } - - static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { -#ifdef VC_IMPL_FMA4 - v1 = _mm256_macc_ps(v1, v2, v3); -#else - m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); - m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); - m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); - m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); - m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); - m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); - v1 = AVX::concat( - _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), - _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); -#endif - } - - OP(add) OP(sub) OP(mul) - OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) - - OP1(sqrt) OP1(rsqrt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) { - return _mm256_cmpunord_ps(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) { - return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { - return _mm256_rcp_ps(x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { - return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps()); - } - - MINMAX - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { - m128 b 
= _mm_min_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); - b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) - b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3 - return _mm_cvtss_f32(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { - m128 b = _mm_max_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); - b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) - b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3 - return _mm_cvtss_f32(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { - m128 b = _mm_mul_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); - b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); - b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); - return _mm_cvtss_f32(b); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { - m128 b = _mm_add_ps(avx_cast(a), _mm256_extractf128_ps(a, 1)); - b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); - b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); - return _mm_cvtss_f32(b); - } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { - return _mm256_round_ps(a, _MM_FROUND_NINT); - } - }; - - template<> struct VectorHelper : public VectorHelper {}; - - template<> struct VectorHelper { - typedef int EntryType; - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef long long ConcatType; -#define SUFFIX si256 - - OP_(or_) OP_(and_) OP_(xor_) - static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } -#undef SUFFIX -#define SUFFIX 
epi32 - static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } - - static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d, - const int e, const int f, const int g, const int h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm256_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm256_srai_, SUFFIX)(a, shift); - } - OP1(abs) - - MINMAX - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { - m128i b = _mm_min_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) { - m128i b = _mm_max_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { - m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { - m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = 
_mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } - - OP(add) OP(sub) - OPcmp(eq) - OPcmp(lt) - OPcmp(gt) - static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef unsigned int EntryType; - typedef m256i VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned long long ConcatType; -#define SUFFIX si256 - OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) - static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); } - -#undef SUFFIX -#define SUFFIX epu32 - static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); } - - MINMAX - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) { - m128i b = _mm_min_epu32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST 
max(VTArg a) { - m128i b = _mm_max_epu32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) { - m128i b = _mm_add_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) { - m128i b = _mm_mullo_epi32(avx_cast(a), _mm256_extractf128_si256(a, 1)); - b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2))); - b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(b); - } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); } - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } - -#undef SUFFIX -#define SUFFIX epi32 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm256_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm256_srli_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d, - const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) { - return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); } - - OP(add) OP(sub) - OPcmp(eq) - static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); } - -#ifndef 
USE_INCORRECT_UNSIGNED_COMPARE - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { - return _mm256_cmplt_epu32(a, b); - } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { - return _mm256_cmpgt_epu32(a, b); - } -#else - OPcmp(lt) - OPcmp(gt) -#endif - static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); } - static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } - -#undef SUFFIX - static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef signed short EntryType; - typedef int ConcatType; - - static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); } - -#define SUFFIX epi16 - static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm_srai_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static 
Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, - const EntryType e, const EntryType f, const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { - v1 = add(mul(v1, v2), v3); - } - - static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); } - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); } - - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { - VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { - VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = 
add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - - static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } - }; - - template<> struct VectorHelper { - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned short EntryType; - typedef unsigned int ConcatType; - - static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); } - static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return 
_mm_and_si128(_mm_castps_si128(mask), a); } - static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); } - - static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); } - -#define SUFFIX epi16 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) { - return CAT(_mm_srli_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2))); - 
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, - const EntryType d, const EntryType e, const EntryType f, - const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); } - - static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } - -#ifndef USE_INCORRECT_UNSIGNED_COMPARE - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); } -#else - static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); } - static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); } -#endif - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; } - }; 
-#undef OP1 -#undef OP -#undef OP_ -#undef OPx -#undef OPcmp - -template<> struct VectorHelper -{ - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef char EntryType; - typedef short ConcatType; -}; - -template<> struct VectorHelper -{ - typedef VectorTypeHelper::Type VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType & VTArg; -#else - typedef const VectorType VTArg; -#endif - typedef unsigned char EntryType; - typedef unsigned short ConcatType; -}; - -} // namespace AVX -} // namespace Vc -} // namespace ROOT - -#include "vectorhelper.tcc" -#include "undomacros.h" - -#endif // AVX_VECTORHELPER_H diff --git a/math/vc/include/Vc/avx/vectorhelper.tcc b/math/vc/include/Vc/avx/vectorhelper.tcc deleted file mode 100644 index ce48fad431468..0000000000000 --- a/math/vc/include/Vc/avx/vectorhelper.tcc +++ /dev/null @@ -1,270 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "casts.h" -#include - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// float_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, AlignedFlag) -{ - return _mm256_load_ps(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, UnalignedFlag) -{ - return _mm256_loadu_ps(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper::load(const float *m, StreamingAndAlignedFlag) -{ - return avx_cast(concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4]))))); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256 - VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.") -VectorHelper::load(const float *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_ps(m); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, AlignedFlag) -{ - _mm256_store_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void 
VectorHelper::store(float *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast(x), 1), _mm256_extractf128_si256(avx_cast(m), 1), reinterpret_cast(mem + 4)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// double_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, AlignedFlag) -{ - return _mm256_load_pd(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, UnalignedFlag) -{ - return _mm256_loadu_pd(m); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper::load(const double *m, StreamingAndAlignedFlag) -{ - return avx_cast(concat( - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[2]))))); -} -template<> Vc_ALWAYS_INLINE Vc_PURE m256d - VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") -VectorHelper::load(const double *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_pd(m); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, AlignedFlag) -{ - _mm256_store_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), _mm_setallone_si128(), reinterpret_cast(mem + 2)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), avx_cast(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(avx_cast(_mm256_extractf128_pd(x, 1)), avx_cast(_mm256_extractf128_pd(m, 1)), reinterpret_cast(mem + 2)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -// (u)int_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads 
-template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, AlignedFlag) -{ - return _mm256_load_si256(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, UnalignedFlag) -{ - return _mm256_loadu_si256(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper::load(const T *m, StreamingAndAlignedFlag) -{ - return concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))), - _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(&m[4])))); -} -template Vc_ALWAYS_INLINE Vc_PURE m256i - VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.") -VectorHelper::load(const T *m, StreamingAndUnalignedFlag) -{ - return _mm256_loadu_si256(reinterpret_cast(m)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) -{ - _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) -{ - _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(avx_cast(x), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm256_extractf128_si256(x, 1), _mm_setallone_si128(), reinterpret_cast(mem + 4)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag) -{ - _mm256_maskstore(mem, m, x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, 
VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast(mem)); - _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast(mem + 4)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -// (u)short_v -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// loads -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, AlignedFlag) -{ - return _mm_load_si128(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, UnalignedFlag) -{ - return _mm_loadu_si128(reinterpret_cast(m)); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper::load(const T *m, StreamingAndAlignedFlag) -{ - return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(m))); -} -template Vc_ALWAYS_INLINE Vc_PURE m128i - VC_WARN("AVX does not support streaming unaligned loads. 
Will use non-streaming unaligned load instead.") -VectorHelper::load(const T *m, StreamingAndUnalignedFlag) -{ - return _mm_loadu_si128(reinterpret_cast(m)); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// -//// stores -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, AlignedFlag) -{ - _mm_store_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, UnalignedFlag) -{ - _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndAlignedFlag) -{ - _mm_stream_si128(reinterpret_cast<__m128i *>(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, AlignedFlag align) -{ - store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, UnalignedFlag align) -{ - store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} - -} // namespace AVX -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/avx/writemaskedvector.h b/math/vc/include/Vc/avx/writemaskedvector.h deleted file mode 100644 index b6254444aea45..0000000000000 --- a/math/vc/include/Vc/avx/writemaskedvector.h +++ /dev/null @@ -1,82 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_AVX_WRITEMASKEDVECTOR_H -#define VC_AVX_WRITEMASKEDVECTOR_H - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -template -class WriteMaskedVector -{ - friend class Vector; - typedef typename VectorTypeHelper::Type VectorType; - typedef typename DetermineEntryType::Type EntryType; - enum Constants { Size = sizeof(VectorType) / sizeof(EntryType) }; - typedef typename Vc::AVX::Mask Mask; - public: - FREE_STORE_OPERATORS_ALIGNED(32) - //prefix - Vector Vc_ALWAYS_INLINE_L &operator++() Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator--() Vc_ALWAYS_INLINE_R; - //postfix - Vector Vc_ALWAYS_INLINE_L operator++(int) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L operator--(int) Vc_ALWAYS_INLINE_R; - - Vector Vc_ALWAYS_INLINE_L &operator+=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator-=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator*=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE_L &operator/=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE &operator+=(EntryType x) { return operator+=(Vector(x)); } - Vector Vc_ALWAYS_INLINE &operator-=(EntryType x) { return operator-=(Vector(x)); } - Vector Vc_ALWAYS_INLINE &operator*=(EntryType x) { return operator*=(Vector(x)); } - Vector Vc_ALWAYS_INLINE 
&operator/=(EntryType x) { return operator/=(Vector(x)); } - - Vector Vc_ALWAYS_INLINE_L &operator=(const Vector &x) Vc_ALWAYS_INLINE_R; - Vector Vc_ALWAYS_INLINE &operator=(EntryType x) { return operator=(Vector(x)); } - - template Vc_INTRINSIC void call(const F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC void call(F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC Vector apply(const F &f) const { - return vec->apply(f, mask); - } - template Vc_INTRINSIC Vector apply(F &f) const { - return vec->apply(f, mask); - } - private: - Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, const Mask &k) : vec(v), mask(k) {} - Vector *const vec; - Mask mask; -}; - -} // namespace AVX -} // namespace Vc -} // namespace ROOT -#include "writemaskedvector.tcc" -#include "undomacros.h" -#endif // VC_AVX_WRITEMASKEDVECTOR_H diff --git a/math/vc/include/Vc/avx/writemaskedvector.tcc b/math/vc/include/Vc/avx/writemaskedvector.tcc deleted file mode 100644 index d27272aa2ac71..0000000000000 --- a/math/vc/include/Vc/avx/writemaskedvector.tcc +++ /dev/null @@ -1,93 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator++() -{ - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator--() { - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator++(int) { - Vector ret(*vec); - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; -} - -template -Vc_ALWAYS_INLINE Vector WriteMaskedVector::operator--(int) { - Vector ret(*vec); - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator+=(const Vector &x) { - vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator-=(const Vector &x) { - vec->data() = VectorHelper::sub(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator*=(const Vector &x) { - vec->assign(VectorHelper::mul(vec->data(), x.data()), mask); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator/=(const Vector &x) { - vec->assign(*vec / x, mask); - return *vec; -} - -template -Vc_ALWAYS_INLINE Vector &WriteMaskedVector::operator=(const Vector &x) { - vec->assign(x, mask); - return *vec; -} - -} // namespace AVX -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/common/aliasingentryhelper.h b/math/vc/include/Vc/common/aliasingentryhelper.h deleted file mode 100644 index 
c0aa7c1e38a11..0000000000000 --- a/math/vc/include/Vc/common/aliasingentryhelper.h +++ /dev/null @@ -1,126 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_ALIASINGENTRYHELPER_H -#define VC_COMMON_ALIASINGENTRYHELPER_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -template class AliasingEntryHelper -{ - private: - typedef typename StorageType::EntryType T; -#ifdef VC_ICC - StorageType *const m_storage; - const int m_index; - public: - Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {} - Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &rhs) : m_storage(rhs.m_storage), m_index(rhs.m_index) {} - Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { - m_storage->assign(m_index, rhs); - return *this; - } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; } 
- Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { m_storage->assign(m_index, m_storage->m(m_index) + T(1)); return *this; } - Vc_ALWAYS_INLINE T operator++(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) + T(1)); return r; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator--() { m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return *this; } - Vc_ALWAYS_INLINE T operator--(int) { T r = m_storage->m(m_index); m_storage->assign(m_index, m_storage->m(m_index) - T(1)); return r; } -#define m_data m_storage->read(m_index) -#else - typedef T A Vc_MAY_ALIAS; - A &m_data; - public: - template - Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast(d)) {} - - Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {} - Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { - m_data = rhs.m_data; - return *this; - } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; } 
- Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; } - - Vc_ALWAYS_INLINE AliasingEntryHelper &operator++() { ++m_data; return *this; } - Vc_ALWAYS_INLINE T operator++(int) { T r = m_data; ++m_data; return r; } - Vc_ALWAYS_INLINE AliasingEntryHelper &operator--() { --m_data; return *this; } - Vc_ALWAYS_INLINE T operator--(int) { T r = m_data; --m_data; return r; } -#endif - - Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; } - - Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast(m_data) == x; } - Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast(m_data) != x; } - Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast(m_data) <= x; } - Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast(m_data) >= x; } - Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast(m_data) < x; } - Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast(m_data) > x; } - - Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast(m_data); } - Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast(m_data); } - Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast(m_data) + x; } - Vc_ALWAYS_INLINE 
Vc_PURE T operator-(T x) const { return static_cast(m_data) - x; } - Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast(m_data) / x; } - Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast(m_data) * x; } - Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast(m_data) | x; } - Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast(m_data) & x; } - Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast(m_data) ^ x; } - Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast(m_data) % x; } - //T operator<<(T x) const { return static_cast(m_data) << x; } - //T operator>>(T x) const { return static_cast(m_data) >> x; } -#ifdef m_data -#undef m_data -#endif -}; - -} // namespace Common -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_ALIASINGENTRYHELPER_H diff --git a/math/vc/include/Vc/common/bitscanintrinsics.h b/math/vc/include/Vc/common/bitscanintrinsics.h deleted file mode 100644 index e23a5b7919583..0000000000000 --- a/math/vc/include/Vc/common/bitscanintrinsics.h +++ /dev/null @@ -1,62 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_COMMON_BITSCANINTRINSICS_H -#define VC_COMMON_BITSCANINTRINSICS_H - -#if defined(VC_GCC) || defined(VC_CLANG) -# if VC_GCC >= 0x40500 - // GCC 4.5.0 introduced _bit_scan_forward / _bit_scan_reverse -# include -# else - // GCC <= 4.4 and clang have x86intrin.h, but not the required functions -# define _bit_scan_forward(x) __builtin_ctz(x) -#include "macros.h" -static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) { - int r; - __asm__("bsr %1,%0" : "=r"(r) : "X"(x)); - return r; -} -#include "undomacros.h" -# define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x) -# endif -#elif defined(VC_ICC) -// for all I know ICC supports the _bit_scan_* intrinsics -#elif defined(VC_OPEN64) -// TODO -#elif defined(VC_MSVC) -#include "windows_fix_intrin.h" -#pragma intrinsic(_BitScanForward) -#pragma intrinsic(_BitScanReverse) -static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) { - unsigned long index; - _BitScanForward(&index, x); - return index; -} -static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) { - unsigned long index; - _BitScanReverse(&index, x); - return index; -} -#else -// just assume the compiler can do it -#endif - - -#endif // VC_COMMON_BITSCANINTRINSICS_H diff --git a/math/vc/include/Vc/common/deinterleave.h b/math/vc/include/Vc/common/deinterleave.h deleted file mode 100644 index 1147af69b2cd0..0000000000000 --- a/math/vc/include/Vc/common/deinterleave.h +++ /dev/null @@ -1,87 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_DEINTERLEAVE_H -#define VC_COMMON_DEINTERLEAVE_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ - -/** - * \ingroup Vectors - * - * Loads two vectors of values from an interleaved array. - * - * \param a, b The vectors to load the values from memory into. - * \param memory The memory location where to read the next 2 * V::Size values from - * \param align Either pass Vc::Aligned or Vc::Unaligned. It defaults to Vc::Aligned if nothing is - * specified. - * - * If you store your data as - * \code - * struct { float x, y; } m[1000]; - * \endcode - * then the deinterleave function allows you to read \p Size concurrent x and y values like this: - * \code - * Vc::float_v x, y; - * Vc::deinterleave(&x, &y, &m[10], Vc::Unaligned); - * \endcode - * This code will load m[10], m[12], m[14], ... into \p x and m[11], m[13], m[15], ... into \p y. 
- * - * The deinterleave function supports the following type combinations: -\verbatim - V \ M | float | double | ushort | short | uint | int -=========|=======|========|========|=======|======|===== - float_v | X | | X | X | | ----------|-------|--------|--------|-------|------|----- -sfloat_v | X | | X | X | | ----------|-------|--------|--------|-------|------|----- -double_v | | X | | | | ----------|-------|--------|--------|-------|------|----- - int_v | | | | X | | X ----------|-------|--------|--------|-------|------|----- - uint_v | | | X | | X | ----------|-------|--------|--------|-------|------|----- - short_v | | | | X | | ----------|-------|--------|--------|-------|------|----- -ushort_v | | | X | | | -\endverbatim - */ -template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, - const M *memory, A align) -{ - Internal::Helper::deinterleave(*a, *b, memory, align); -} - -// documented as default for align above -template Vc_ALWAYS_INLINE void deinterleave(V *a, V *b, - const M *memory) -{ - Internal::Helper::deinterleave(*a, *b, memory, Aligned); -} - -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_DEINTERLEAVE_H diff --git a/math/vc/include/Vc/common/exponential.h b/math/vc/include/Vc/common/exponential.h deleted file mode 100644 index 09d14bdd66456..0000000000000 --- a/math/vc/include/Vc/common/exponential.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef COMMON_EXPONENTIAL_H -#define COMMON_EXPONENTIAL_H -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - - ------------------------------------------------------------------- - - The exp implementation is derived from Cephes, which carries the - following Copyright notice: - - Cephes Math Library Release 2.2: June, 1992 - Copyright 1984, 1987, 1989 by Stephen L. Moshier - Direct inquiries to 30 Frost Street, Cambridge, MA 02140 - -}}}*/ - -#ifndef VC_COMMON_EXPONENTIAL_H -#define VC_COMMON_EXPONENTIAL_H - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace Common -{ - using Vc::VC__USE_NAMESPACE::c_log; - using Vc::VC__USE_NAMESPACE::Vector; - using Vc::VC__USE_NAMESPACE::floor; - using Vc::VC__USE_NAMESPACE::ldexp; - - static const float log2_e = 1.44269504088896341f; - static const float MAXLOGF = 88.72283905206835f; - static const float MINLOGF = -103.278929903431851103f; /* log(2^-149) */ - static const float MAXNUMF = 3.4028234663852885981170418348451692544e38f; - - template struct TypenameForLdexp { typedef Vector Type; }; - template<> struct TypenameForLdexp { typedef Vector Type; }; - - template static inline Vector exp(VC_ALIGNED_PARAMETER(Vector) _x) { - typedef Vector V; - typedef typename V::Mask M; - typedef typename TypenameForLdexp::Type I; - typedef Const C; - - V x(_x); - - const M overflow = x > MAXLOGF; - const M underflow = x < MINLOGF; - - // log₂(eˣ) = x * log₂(e) * log₂(2) - // = log₂(2^(x * log₂(e))) - // => eˣ = 2^(x * log₂(e)) - // => n = ⌊x * log₂(e) + ½⌋ - // => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1 - // <=> eˣ = 2ⁿ * eʸ - V z = floor(C::log2_e() * x + 0.5f); - I n = static_cast(z); - x -= z * C::ln2_large(); - x -= z * C::ln2_small(); - - /* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. 
*/ - z = ((((( 1.9875691500E-4f * x - + 1.3981999507E-3f) * x - + 8.3334519073E-3f) * x - + 4.1665795894E-2f) * x - + 1.6666665459E-1f) * x - + 5.0000001201E-1f) * (x * x) - + x - + 1.0f; - - x = ldexp(z, n); // == z * 2ⁿ - - x(overflow) = std::numeric_limits::infinity(); - x.setZero(underflow); - - return x; - } - static inline Vector exp(Vector::AsArg _x) { - Vector x = _x; - typedef Vector V; - typedef V::Mask M; - typedef Const C; - - const M overflow = x > Vc_buildDouble( 1, 0x0006232bdd7abcd2ull, 9); // max log - const M underflow = x < Vc_buildDouble(-1, 0x0006232bdd7abcd2ull, 9); // min log - - V px = floor(C::log2_e() * x + 0.5); -#ifdef VC_IMPL_SSE - Vector n(px); - n.data() = Mem::permute(n.data()); -#elif defined(VC_IMPL_AVX) - __m128i tmp = _mm256_cvttpd_epi32(px.data()); - Vector n = AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); -#endif - x -= px * C::ln2_large(); //Vc_buildDouble(1, 0x00062e4000000000ull, -1); // ln2 - x -= px * C::ln2_small(); //Vc_buildDouble(1, 0x0007f7d1cf79abcaull, -20); // ln2 - - const double P[] = { - Vc_buildDouble(1, 0x000089cdd5e44be8ull, -13), - Vc_buildDouble(1, 0x000f06d10cca2c7eull, -6), - Vc_buildDouble(1, 0x0000000000000000ull, 0) - }; - const double Q[] = { - Vc_buildDouble(1, 0x00092eb6bc365fa0ull, -19), - Vc_buildDouble(1, 0x0004ae39b508b6c0ull, -9), - Vc_buildDouble(1, 0x000d17099887e074ull, -3), - Vc_buildDouble(1, 0x0000000000000000ull, 1) - }; - const V x2 = x * x; - px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); - x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); - x = V::One() + 2.0 * x; - - x = ldexp(x, n); // == x * 2ⁿ - - x(overflow) = std::numeric_limits::infinity(); - x.setZero(underflow); - - return x; - } -} // namespace Common -namespace VC__USE_NAMESPACE -{ - using Vc::Common::exp; -} // namespace VC__USE_NAMESPACE -} // namespace Vc -} // namespace ROOT -#include "undomacros.h" - -#endif // VC_COMMON_EXPONENTIAL_H -#endif // COMMON_EXPONENTIAL_H diff --git 
a/math/vc/include/Vc/common/fix_clang_emmintrin.h b/math/vc/include/Vc/common/fix_clang_emmintrin.h deleted file mode 100644 index bf1bc06169f2a..0000000000000 --- a/math/vc/include/Vc/common/fix_clang_emmintrin.h +++ /dev/null @@ -1,79 +0,0 @@ -/*{{{ - Copyright (C) 2013 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. 
- -}}}*/ - -#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H -#define VC_COMMON_FIX_CLANG_EMMINTRIN_H - -#include - -#ifdef VC_CLANG - -#ifdef _mm_slli_si128 -#undef _mm_slli_si128 -#define _mm_slli_si128(a, count) __extension__ ({ \ - (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); }) -#endif - -#ifdef _mm_srli_si128 -#undef _mm_srli_si128 -#define _mm_srli_si128(a, count) __extension__ ({ \ - (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); }) -#endif - -#ifdef _mm_shuffle_epi32 -#undef _mm_shuffle_epi32 -#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \ - (imm) & 0x3, ((imm) & 0xc) >> 2, \ - ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) -#endif - -#ifdef _mm_shufflelo_epi16 -#undef _mm_shufflelo_epi16 -#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ - (imm) & 0x3, ((imm) & 0xc) >> 2, \ - ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ - 4, 5, 6, 7); }) -#endif - -#ifdef _mm_shufflehi_epi16 -#undef _mm_shufflehi_epi16 -#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ - 0, 1, 2, 3, \ - 4 + (((imm) & 0x03) >> 0), \ - 4 + (((imm) & 0x0c) >> 2), \ - 4 + (((imm) & 0x30) >> 4), \ - 4 + (((imm) & 0xc0) >> 6)); }) -#endif - -#ifdef _mm_shuffle_pd -#undef _mm_shuffle_pd -#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ - __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); }) -#endif - -#endif // VC_CLANG - -#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H diff --git a/math/vc/include/Vc/common/iif.h b/math/vc/include/Vc/common/iif.h deleted file mode 100644 index d9bc96f38188d..0000000000000 --- a/math/vc/include/Vc/common/iif.h +++ /dev/null @@ -1,62 +0,0 @@ -/* This file is part of the Vc library. 
{{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_COMMON_IIF_H -#define VC_COMMON_IIF_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -/** - * Function to mimic the ternary operator '?:'. - * - * \param condition Determines which values are returned. This is analog to the first argument to - * the ternary operator. - * \param trueValue The values to return where \p condition is \c true. - * \param falseValue The values to return where \p condition is \c false. - * \return A combination of entries from \p trueValue and \p falseValue, according to \p condition. - * - * So instead of the scalar variant - * \code - * float x = a > 1.f ? 
b : b + c; - * \endcode - * you'd write - * \code - * float_v x = Vc::iif (a > 1.f, b, b + c); - * \endcode - */ -#ifndef VC_MSVC -template static Vc_ALWAYS_INLINE Vector iif (typename Vector::Mask condition, Vector trueValue, Vector falseValue) -{ -#else -template static Vc_ALWAYS_INLINE Vector iif (const typename Vector::Mask &condition, const Vector &trueValue, const Vector &_falseValue) -{ - Vector falseValue(_falseValue); -#endif - falseValue(condition) = trueValue; - return falseValue; -} -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_IIF_H diff --git a/math/vc/include/Vc/common/interleavedmemory.h b/math/vc/include/Vc/common/interleavedmemory.h deleted file mode 100644 index 5b8504a8c61bf..0000000000000 --- a/math/vc/include/Vc/common/interleavedmemory.h +++ /dev/null @@ -1,268 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_COMMON_INTERLEAVEDMEMORY_H -#define VC_COMMON_INTERLEAVEDMEMORY_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -namespace Internal -{ -template struct CopyConst { typedef B Type; }; -template struct CopyConst { typedef const B Type; }; - -template struct EnableInterleaves { typedef R Type; }; -template struct EnableInterleaves; -} // namespace Internal - -/** - * \internal - */ -template struct InterleavedMemoryAccessBase -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef typename V::AsArg VArg; - typedef T Ta Vc_MAY_ALIAS; - const I m_indexes; - Ta *const m_data; - - Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data) - : m_indexes(indexes), m_data(data) - { - } - - // implementations of the following are in {scalar,sse,avx}/interleavedmemory.tcc - void deinterleave(V &v0, V &v1) const; - void deinterleave(V &v0, V &v1, V &v2) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const; - void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const; - - void interleave(VArg v0, VArg v1); - void interleave(VArg v0, VArg v1, VArg v2); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6); - void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6, VArg v7); -}; - -/** - * \internal - */ -// delay execution of the deinterleaving gather until operator= -template struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase -{ - typedef InterleavedMemoryAccessBase Base; - typedef 
typename Base::Ta Ta; - typedef typename Base::I I; - - Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(const Ta *data, typename I::AsArg indexes) - : Base(indexes * I(StructSize), const_cast(data)) // this needs to be refactored to properly keep the constness - { - } -}; - -/** - * \internal - */ -template struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess -{ - typedef InterleavedMemoryAccessBase Base; - typedef typename Base::Ta Ta; - typedef typename Base::I I; - - Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes) - : InterleavedMemoryReadAccess(data, indexes) - { - } - -#define _VC_SCATTER_ASSIGNMENT(LENGTH, parameters) \ - Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ - { \ - VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ - this->interleave parameters ; \ - } \ - Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ - { \ - VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ - checkIndexesUnique(); \ - this->interleave parameters ; \ - } - _VC_SCATTER_ASSIGNMENT(2, (rhs.l, rhs.r)) - _VC_SCATTER_ASSIGNMENT(3, (rhs.l.l, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(4, (rhs.l.l.l, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(5, (rhs.l.l.l.l, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(6, (rhs.l.l.l.l.l, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(7, (rhs.l.l.l.l.l.l, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); - _VC_SCATTER_ASSIGNMENT(8, (rhs.l.l.l.l.l.l.l, rhs.l.l.l.l.l.l.r, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); -#undef _VC_SCATTER_ASSIGNMENT - -private: -#ifdef NDEBUG - Vc_ALWAYS_INLINE void checkIndexesUnique() const {} -#else - void checkIndexesUnique() const - { - const I test = Base::m_indexes.sorted(); - VC_ASSERT(I::Size == 1 || (test == 
test.rotated(1)).isEmpty()) - } -#endif -}; - -#ifdef DOXYGEN -} // namespace Common -// in doxygen InterleavedMemoryWrapper should appear in the Vc namespace (see the using statement -// below) -#endif - -/** - * Wraps a pointer to memory with convenience functions to access it via vectors. - * - * \param S The type of the struct. - * \param V The type of the vector to be returned when read. This should reflect the type of the - * members inside the struct. - * - * \see operator[] - * \ingroup Utilities - * \headerfile interleavedmemory.h - */ -template class InterleavedMemoryWrapper -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef typename V::AsArg VArg; - typedef typename I::AsArg IndexType; - typedef InterleavedMemoryAccess Access; - typedef InterleavedMemoryReadAccess ReadAccess; - typedef typename Internal::CopyConst::Type Ta Vc_MAY_ALIAS; - Ta *const m_data; - - VC_STATIC_ASSERT((sizeof(S) / sizeof(T)) * sizeof(T) == sizeof(S), InterleavedMemoryAccess_does_not_support_packed_structs); - -public: - /** - * Constructs the wrapper object. - * - * \param s A pointer to a C-array. - */ - Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s) - : m_data(reinterpret_cast(s)) - { - } - - /** - * Interleaved scatter/gather access. - * - * Assuming you have a struct of floats and a vector of \p indexes into the array, this function - * can be used to access the struct entries as vectors using the minimal number of store or load - * instructions. - * - * \param indexes Vector of indexes that determine the gather locations. - * - * \return A special (magic) object that executes the loads and deinterleave on assignment to a - * vector tuple. 
- * - * Example: - * \code - * struct Foo { - * float x, y, z; - * }; - * - * void fillWithBar(Foo *_data, uint_v indexes) - * { - * Vc::InterleavedMemoryWrapper data(_data); - * const float_v x = bar(1); - * const float_v y = bar(2); - * const float_v z = bar(3); - * data[indexes] = (x, y, z); - * // it's also possible to just store a subset at the front of the struct: - * data[indexes] = (x, y); - * // if you want to store a single entry, use scatter: - * z.scatter(_data, &Foo::x, indexes); - * } - * - * float_v normalizeStuff(Foo *_data, uint_v indexes) - * { - * Vc::InterleavedMemoryWrapper data(_data); - * float_v x, y, z; - * (x, y, z) = data[indexes]; - * // it is also possible to just load a subset from the front of the struct: - * // (x, y) = data[indexes]; - * return Vc::sqrt(x * x + y * y + z * z); - * } - * \endcode - * - * You may think of the gather operation (or scatter as the inverse) like this: -\verbatim - Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8} - indexes: [5, 0, 1, 7] -Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7}) -\endverbatim - * - * \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If - * \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique. 
- */ -#ifdef DOXYGEN - Vc_ALWAYS_INLINE Access operator[](IndexType indexes) -#else - // need to SFINAE disable this for objects that wrap constant data - template - Vc_ALWAYS_INLINE typename Internal::EnableInterleaves::Type operator[]( - VC_ALIGNED_PARAMETER(U) indexes) -#endif - { - return Access(m_data, indexes); - } - - /// const overload (gathers only) of the above function - Vc_ALWAYS_INLINE ReadAccess operator[](VC_ALIGNED_PARAMETER(IndexType) indexes) const - { - return ReadAccess(m_data, indexes); - } - - /// alias of the above function - Vc_ALWAYS_INLINE ReadAccess gather(VC_ALIGNED_PARAMETER(IndexType) indexes) const - { - return operator[](indexes); - } - - //Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1); -}; -#ifndef DOXYGEN -} // namespace Common - -using Common::InterleavedMemoryWrapper; -#endif - -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_INTERLEAVEDMEMORY_H diff --git a/math/vc/include/Vc/common/logarithm.h b/math/vc/include/Vc/common/logarithm.h deleted file mode 100644 index b9af61f6401a1..0000000000000 --- a/math/vc/include/Vc/common/logarithm.h +++ /dev/null @@ -1,277 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -/* The log implementations are based on code from Julien Pommier which carries the following - copyright information: - */ -/* - Inspired by Intel Approximate Math library, and based on the - corresponding algorithms of the cephes math library -*/ -/* Copyright (C) 2007 Julien Pommier - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. 
- - (this is the zlib license) -*/ - -#ifndef VC_COMMON_LOGARITHM_H -#define VC_COMMON_LOGARITHM_H - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace Common -{ -#ifdef VC__USE_NAMESPACE -using Vc::VC__USE_NAMESPACE::Const; -using Vc::VC__USE_NAMESPACE::Vector; -namespace Internal -{ - using namespace Vc::VC__USE_NAMESPACE::Internal; -} // namespace Internal -#endif -enum LogarithmBase { - BaseE, Base10, Base2 -}; - -template -struct LogImpl -{ - template static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, typename Vector::AsArg exponent) { - typedef Vector V; - typedef Const C; - // Taylor series around x = 2^exponent - // f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large - // f'(x) = x⁻¹ → x → 1 - // f''(x) = - x⁻² → -x² / 2 → C::_1_2() - // = 2!x⁻³ → x³ / 3 → C::P(8) - // = -3!x⁻⁴ → -x⁴ / 4 → C::P(7) - // = 4!x⁻⁵ → x⁵ / 5 → C::P(6) - // ... - // The high order coefficients are adjusted to reduce the error that occurs from ommission - // of higher order terms. 
- // P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹| - // The order of additions must go from smallest to largest terms - const V x2 = x * x; // 0 → 4 -#ifdef VC_LOG_ILP - V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8); - V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2); - V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5); - const V x3 = x2 * x; // 7 → 11 - const V x6 = x3 * x3; // 11 → 15 - const V x9 = x6 * x3; // 15 → 19 - V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3; -#elif defined VC_LOG_ILP2 - /* - * name start done - * movaps %xmm0, %xmm1 ; x 0 1 - * movaps %xmm0, %xmm2 ; x 0 1 - * mulps %xmm1, %xmm1 ; x2 1 5 *xmm1 - * movaps , %xmm15 ; y8 1 2 - * mulps %xmm1, %xmm2 ; x3 5 9 *xmm2 - * movaps %xmm1, %xmm3 ; x2 5 6 - * movaps %xmm1, %xmm4 ; x2 5 6 - * mulps %xmm3, %xmm3 ; x4 6 10 *xmm3 - * movaps %xmm2, %xmm5 ; x3 9 10 - * movaps %xmm2, %xmm6 ; x3 9 10 - * mulps %xmm2, %xmm4 ; x5 9 13 *xmm4 - * movaps %xmm3, %xmm7 ; x4 10 11 - * movaps %xmm3, %xmm8 ; x4 10 11 - * movaps %xmm3, %xmm9 ; x4 10 11 - * mulps %xmm5, %xmm5 ; x6 10 14 *xmm5 - * mulps %xmm3, %xmm6 ; x7 11 15 *xmm6 - * mulps %xmm7, %xmm7 ; x8 12 16 *xmm7 - * movaps %xmm4, %xmm10 ; x5 13 14 - * mulps %xmm4, %xmm8 ; x9 13 17 *xmm8 - * mulps %xmm5, %xmm10 ; x11 14 18 *xmm10 - * mulps %xmm5, %xmm9 ; x10 15 19 *xmm9 - * mulps , %xmm10 ; y0 18 22 - * mulps , %xmm9 ; y1 19 23 - * mulps , %xmm8 ; y2 20 24 - * mulps , %xmm7 ; y3 21 25 - * addps %xmm10, %xmm9 ; y 23 26 - * addps %xmm9, %xmm8 ; y 26 29 - * addps %xmm8, %xmm7 ; y 29 32 - */ - const V x3 = x2 * x; // 4 → 8 - const V x4 = x2 * x2; // 5 → 9 - const V x5 = x2 * x3; // 8 → 12 - const V x6 = x3 * x3; // 9 → 13 - const V x7 = x4 * x3; // - const V x8 = x4 * x4; - const V x9 = x5 * x4; - const V x10 = x5 * x5; - const V x11 = x5 * x6; // 13 → 17 - V y = C::P(0) * x11 + C::P(1) 
* x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7 - + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3; -#else - V y = C::P(0); - unrolled_loop16(i, 1, 9, - y = y * x + C::P(i); - ); - y *= x * x2; -#endif - switch (Base) { - case BaseE: - // ln(2) is split in two parts to increase precision (i.e. ln2_small + ln2_large = ln(2)) - y += exponent * C::ln2_small(); - y -= x2 * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent * C::ln2_large(); - break; - case Base10: - y += exponent * C::ln2_small(); - y -= x2 * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent * C::ln2_large(); - x *= C::log10_e(); - break; - case Base2: - { - const V x_ = x; - x *= C::log2_e(); - y *= C::log2_e(); - y -= x_ * x * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent; - break; - } - } - } - - static Vc_ALWAYS_INLINE void log_series(Vector &VC_RESTRICT x, Vector::AsArg exponent) { - typedef Vector V; - typedef Const C; - const V x2 = x * x; - V y = C::P(0); - V y2 = C::Q(0) + x; - unrolled_loop16(i, 1, 5, - y = y * x + C::P(i); - y2 = y2 * x + C::Q(i); - ); - y2 = x / y2; - y = y * x + C::P(5); - y = x2 * y * y2; - // TODO: refactor the following with the float implementation: - switch (Base) { - case BaseE: - // ln(2) is split in two parts to increase precision (i.e. 
ln2_small + ln2_large = ln(2)) - y += exponent * C::ln2_small(); - y -= x2 * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent * C::ln2_large(); - break; - case Base10: - y += exponent * C::ln2_small(); - y -= x2 * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent * C::ln2_large(); - x *= C::log10_e(); - break; - case Base2: - { - const V x_ = x; - x *= C::log2_e(); - y *= C::log2_e(); - y -= x_ * x * C::_1_2(); // [0, 0.25[ - x += y; - x += exponent; - break; - } - } - } - - template static inline Vector calc(VC_ALIGNED_PARAMETER(Vector) _x) { - typedef Vector V; - typedef typename V::Mask M; - typedef Const C; - - V x(_x); - - const M invalidMask = x < V::Zero(); - const M infinityMask = x == V::Zero(); - const M denormal = x <= C::min(); - - x(denormal) *= V(Vc_buildDouble(1, 0, 54)); // 2²⁵ - V exponent = Internal::exponent(x.data()); // = ⎣log₂(x)⎦ - exponent(denormal) -= 54; - - x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[ - x |= C::_1_2(); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[ - - // split calculation in two cases: - // A: x ∈ [½, √½[ - // B: x ∈ [√½, 1[ - // √½ defines the point where Δe(x) := log₂(x) - ⎣log₂(x)⎦ = ½, i.e. 
- // log₂(√½) - ⎣log₂(√½)⎦ = ½ * -1 - ⎣½ * -1⎦ = -½ + 1 = ½ - - const M smallX = x < C::_1_sqrt2(); - x(smallX) += x; // => x ∈ [√½, 1[ ∪ [1.5, 1 + √½[ - x -= V::One(); // => x ∈ [√½ - 1, 0[ ∪ [0.5, √½[ - exponent(!smallX) += V::One(); - - log_series(x, exponent); // A: (ˣ⁄₂ᵉ - 1, e) B: (ˣ⁄₂ᵉ⁺¹ - 1, e + 1) - - x.setQnan(invalidMask); // x < 0 → NaN - x(infinityMask) = C::neginf(); // x = 0 → -∞ - - return x; - } -}; - -template static Vc_ALWAYS_INLINE Vc_CONST Vector log(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); -} -template static Vc_ALWAYS_INLINE Vc_CONST Vector log10(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); -} -template static Vc_ALWAYS_INLINE Vc_CONST Vector log2(VC_ALIGNED_PARAMETER(Vector) x) { - return LogImpl::calc(x); -} -} // namespace Common -#ifdef VC__USE_NAMESPACE -namespace VC__USE_NAMESPACE -{ - using Vc::Common::log; - using Vc::Common::log10; - using Vc::Common::log2; -} // namespace VC__USE_NAMESPACE -#undef VC__USE_NAMESPACE -#endif -} // namespace Vc -} // namespace ROOT -#include "undomacros.h" - -#endif // VC_COMMON_LOGARITHM_H diff --git a/math/vc/include/Vc/common/macros.h b/math/vc/include/Vc/common/macros.h deleted file mode 100644 index 98fff72c1d25f..0000000000000 --- a/math/vc/include/Vc/common/macros.h +++ /dev/null @@ -1,384 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. 
If not, see . - -*/ - -#ifndef VC_COMMON_MACROS_H -#define VC_COMMON_MACROS_H -#undef VC_COMMON_UNDOMACROS_H - -#include - -#if defined(VC_GCC) && !defined(__OPTIMIZE__) -# if VC_GCC >= 0x40500 -# pragma GCC diagnostic push -# define Vc_POP_GCC_DIAGNOSTIC__ 1 -# endif -// GCC uses lots of old-style-casts in macros that disguise as intrinsics -# pragma GCC diagnostic ignored "-Wold-style-cast" -#endif - -#ifdef VC_MSVC -# define ALIGN(n) __declspec(align(n)) -# define STRUCT_ALIGN1(n) ALIGN(n) -# define STRUCT_ALIGN2(n) -# define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef ALIGN(n) _type_ _newType_ -#else -# define ALIGN(n) __attribute__((aligned(n))) -# define STRUCT_ALIGN1(n) -# define STRUCT_ALIGN2(n) ALIGN(n) -# define ALIGNED_TYPEDEF(n, _type_, _newType_) typedef _type_ _newType_ ALIGN(n) -#endif - -#ifdef VC_CXX11 -#define Vc_ALIGNOF(_TYPE_) alignof(_TYPE_) -#else -#define Vc_ALIGNOF(_TYPE_) __alignof(_TYPE_) -#endif - -#ifdef VC_CLANG -# define Vc_INTRINSIC_L inline -# define Vc_INTRINSIC_R __attribute__((always_inline)) -# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R -# define Vc_FLATTEN -# define Vc_CONST __attribute__((const)) -# define Vc_CONST_L -# define Vc_CONST_R Vc_CONST -# define Vc_PURE __attribute__((pure)) -# define Vc_PURE_L -# define Vc_PURE_R Vc_PURE -# define Vc_MAY_ALIAS __attribute__((may_alias)) -# define Vc_ALWAYS_INLINE_L inline -# define Vc_ALWAYS_INLINE_R __attribute__((always_inline)) -# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R -# define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) -# define VC_IS_LIKELY(x) __builtin_expect(x, 1) -# define VC_RESTRICT __restrict__ -# define VC_DEPRECATED(msg) -#elif defined(__GNUC__) -# if (defined(VC_GCC) && VC_GCC < 0x40300) || defined(VC_OPEN64) -// GCC 4.1 and 4.2 ICE on may_alias. Since Open64 uses the GCC 4.2 frontend it has the same problem. 
-# define Vc_MAY_ALIAS -# else -# define Vc_MAY_ALIAS __attribute__((__may_alias__)) -# endif -# if (defined(VC_GCC) && VC_GCC < 0x40300) -// GCC 4.1 fails with "sorry unimplemented: inlining failed" -# define Vc_INTRINSIC_R __attribute__((__flatten__)) -# elif defined(VC_OPEN64) -// the GCC 4.2 frontend doesn't know the __artificial__ attribute -# define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__)) -# else -# define Vc_INTRINSIC_R __attribute__((__flatten__, __always_inline__, __artificial__)) -# endif -# define Vc_INTRINSIC_L inline -# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R -# define Vc_FLATTEN __attribute__((__flatten__)) -# define Vc_ALWAYS_INLINE_L inline -# define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__)) -# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R -# ifdef VC_ICC - // ICC miscompiles if there are functions marked as pure or const -# define Vc_PURE -# define Vc_CONST -# else -# define Vc_PURE __attribute__((__pure__)) -# define Vc_CONST __attribute__((__const__)) -# endif -# define Vc_CONST_L -# define Vc_CONST_R Vc_CONST -# define Vc_PURE_L -# define Vc_PURE_R Vc_PURE -# define VC_IS_UNLIKELY(x) __builtin_expect(x, 0) -# define VC_IS_LIKELY(x) __builtin_expect(x, 1) -# define VC_RESTRICT __restrict__ -# define VC_DEPRECATED(msg) __attribute__((__deprecated__(msg))) -#else -# define Vc_FLATTEN -# ifdef Vc_PURE -# undef Vc_PURE -# endif -# define Vc_MAY_ALIAS -# ifdef VC_MSVC -# define Vc_ALWAYS_INLINE inline __forceinline -# define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE -# define Vc_ALWAYS_INLINE_R -# define Vc_CONST __declspec(noalias) -# define Vc_CONST_L Vc_CONST -# define Vc_CONST_R -# define Vc_PURE /*Vc_CONST*/ -# define Vc_PURE_L Vc_PURE -# define Vc_PURE_R -# define Vc_INTRINSIC inline __forceinline -# define Vc_INTRINSIC_L Vc_INTRINSIC -# define Vc_INTRINSIC_R -# else -# define Vc_ALWAYS_INLINE -# define Vc_ALWAYS_INLINE_L -# define Vc_ALWAYS_INLINE_R -# define Vc_CONST -# define Vc_CONST_L 
-# define Vc_CONST_R -# define Vc_PURE -# define Vc_PURE_L -# define Vc_PURE_R -# define Vc_INTRINSIC -# define Vc_INTRINSIC_L -# define Vc_INTRINSIC_R -# endif -# define VC_IS_UNLIKELY(x) x -# define VC_IS_LIKELY(x) x -# define VC_RESTRICT __restrict -# define VC_DEPRECATED(msg) __declspec(deprecated(msg)) -#endif - -#if __cplusplus >= 201103 /*C++11*/ -#define _VC_CONSTEXPR constexpr -#define _VC_CONSTEXPR_L _VC_CONSTEXPR -#define _VC_CONSTEXPR_R -#else -#define _VC_CONSTEXPR Vc_INTRINSIC Vc_CONST -#define _VC_CONSTEXPR_L Vc_INTRINSIC_L Vc_CONST_L -#define _VC_CONSTEXPR_R Vc_INTRINSIC_R Vc_CONST_R -#endif - -#ifdef VC_CXX11 -# define _VC_NOEXCEPT noexcept -#else -# define _VC_NOEXCEPT throw() -#endif - -#define FREE_STORE_OPERATORS_ALIGNED(alignment) \ - Vc_ALWAYS_INLINE void *operator new(size_t size) { return _mm_malloc(size, alignment); } \ - Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ - Vc_ALWAYS_INLINE void *operator new[](size_t size) { return _mm_malloc(size, alignment); } \ - Vc_ALWAYS_INLINE void *operator new[](size_t , void *p) { return p; } \ - Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { _mm_free(ptr); } \ - Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ - Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) { _mm_free(ptr); } \ - Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} - -#ifdef VC_GCC -# define VC_WARN_INLINE -# define VC_WARN(msg) __attribute__((warning("\n\t" msg))) -#else -# define VC_WARN_INLINE inline -# define VC_WARN(msg) -#endif - -#define unrolled_loop16(_it_, _start_, _end_, _code_) \ -if (_start_ + 0 < _end_) { enum { _it_ = (_start_ + 0) < _end_ ? (_start_ + 0) : _start_ }; _code_ } \ -if (_start_ + 1 < _end_) { enum { _it_ = (_start_ + 1) < _end_ ? (_start_ + 1) : _start_ }; _code_ } \ -if (_start_ + 2 < _end_) { enum { _it_ = (_start_ + 2) < _end_ ? (_start_ + 2) : _start_ }; _code_ } \ -if (_start_ + 3 < _end_) { enum { _it_ = (_start_ + 3) < _end_ ? 
(_start_ + 3) : _start_ }; _code_ } \ -if (_start_ + 4 < _end_) { enum { _it_ = (_start_ + 4) < _end_ ? (_start_ + 4) : _start_ }; _code_ } \ -if (_start_ + 5 < _end_) { enum { _it_ = (_start_ + 5) < _end_ ? (_start_ + 5) : _start_ }; _code_ } \ -if (_start_ + 6 < _end_) { enum { _it_ = (_start_ + 6) < _end_ ? (_start_ + 6) : _start_ }; _code_ } \ -if (_start_ + 7 < _end_) { enum { _it_ = (_start_ + 7) < _end_ ? (_start_ + 7) : _start_ }; _code_ } \ -if (_start_ + 8 < _end_) { enum { _it_ = (_start_ + 8) < _end_ ? (_start_ + 8) : _start_ }; _code_ } \ -if (_start_ + 9 < _end_) { enum { _it_ = (_start_ + 9) < _end_ ? (_start_ + 9) : _start_ }; _code_ } \ -if (_start_ + 10 < _end_) { enum { _it_ = (_start_ + 10) < _end_ ? (_start_ + 10) : _start_ }; _code_ } \ -if (_start_ + 11 < _end_) { enum { _it_ = (_start_ + 11) < _end_ ? (_start_ + 11) : _start_ }; _code_ } \ -if (_start_ + 12 < _end_) { enum { _it_ = (_start_ + 12) < _end_ ? (_start_ + 12) : _start_ }; _code_ } \ -if (_start_ + 13 < _end_) { enum { _it_ = (_start_ + 13) < _end_ ? (_start_ + 13) : _start_ }; _code_ } \ -if (_start_ + 14 < _end_) { enum { _it_ = (_start_ + 14) < _end_ ? (_start_ + 14) : _start_ }; _code_ } \ -if (_start_ + 15 < _end_) { enum { _it_ = (_start_ + 15) < _end_ ? 
(_start_ + 15) : _start_ }; _code_ } \ -do {} while ( false ) - -#define for_all_vector_entries(_it_, _code_) \ - unrolled_loop16(_it_, 0, Size, _code_) - -#ifdef VC_ASSERT -#define VC_EXTERNAL_ASSERT 1 -#else -#ifdef NDEBUG -#define VC_ASSERT(x) -#else -#include -#define VC_ASSERT(x) assert(x); -#endif -#endif - -#ifdef VC_CLANG -#define VC_HAS_BUILTIN(x) __has_builtin(x) -#else -#define VC_HAS_BUILTIN(x) 0 -#endif - -#ifndef VC_COMMON_MACROS_H_ONCE -#define VC_COMMON_MACROS_H_ONCE - -#define _VC_CAT_HELPER(a, b, c, d) a##b##c##d -#define _VC_CAT(a, b, c, d) _VC_CAT_HELPER(a, b, c, d) - -#if __cplusplus >= 201103 /*C++11*/ || (defined(VC_MSVC) && VC_MSVC >= 160000000) -#define VC_STATIC_ASSERT_NC(cond, msg) \ - static_assert(cond, #msg) -#define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) -#else // C++98 -namespace ROOT { -namespace Vc { - namespace { - template struct STATIC_ASSERT_FAILURE; - template<> struct STATIC_ASSERT_FAILURE {}; -}} -} // namespace ROOT - -#define VC_STATIC_ASSERT_NC(cond, msg) \ - typedef STATIC_ASSERT_FAILURE _VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg); \ - enum { \ - _VC_CAT(static_assert_failed__on_line_,__LINE__,_,msg) = sizeof(_VC_CAT(static_assert_failed_on_line_,__LINE__,_,msg)) \ - } -#define VC_STATIC_ASSERT(cond, msg) VC_STATIC_ASSERT_NC(cond, msg) -#endif // C++11/98 - - template struct exponentToMultiplier { enum Values__ { - X = exponentToMultiplier::X * ((e - center < 31) ? 2 : 1), - Value = (X == 0 ? 
1 : X) - }; }; - template struct exponentToMultiplier { enum Values__ { X = 1, Value = X }; }; - template struct exponentToMultiplier< -1, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -128, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -256, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -384, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -512, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -640, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -768, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier< -896, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToMultiplier<-1024, center> { enum Values__ { X = 0, Value = 1 }; }; - - template struct exponentToDivisor { enum Values__ { - X = exponentToDivisor::X * ((center - e < 31) ? 2 : 1), - Value = (X == 0 ? 
1 : X) - }; }; - template struct exponentToDivisor { enum Values__ { X = 1, Value = X }; }; - template struct exponentToDivisor< 1, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 128, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 256, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 384, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 512, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 640, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 768, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 896, center> { enum Values__ { X = 0, Value = 1 }; }; - template struct exponentToDivisor< 1024, center> { enum Values__ { X = 0, Value = 1 }; }; -#endif // VC_COMMON_MACROS_H_ONCE - -#define _CAT_IMPL(a, b) a##b -#define CAT(a, b) _CAT_IMPL(a, b) - -#define Vc_buildDouble(sign, mantissa, exponent) \ - ((static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / 0x0010000000000000ull) \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - * static_cast(sign)) -#define Vc_buildFloat(sign, mantissa, exponent) \ - ((static_cast((mantissa & 0x007fffffu) | 0x00800000) / 0x00800000) \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - * exponentToMultiplier::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - / exponentToDivisor::Value \ - * static_cast(sign)) - -#define _VC_APPLY_IMPL_1(macro, a, b, c, d, e) macro(a) -#define _VC_APPLY_IMPL_2(macro, a, b, c, d, e) macro(a, b) -#define 
_VC_APPLY_IMPL_3(macro, a, b, c, d, e) macro(a, b, c) -#define _VC_APPLY_IMPL_4(macro, a, b, c, d, e) macro(a, b, c, d) -#define _VC_APPLY_IMPL_5(macro, a, b, c, d, e) macro(a, b, c, d, e) - -#define VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ - size(macro, double_v, a, b, c, d) \ - size(macro, float_v, a, b, c, d) \ - size(macro, sfloat_v, a, b, c, d) -#define VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \ - size(macro, int_v, a, b, c, d) \ - size(macro, uint_v, a, b, c, d) \ - size(macro, short_v, a, b, c, d) \ - size(macro, ushort_v, a, b, c, d) -#define VC_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \ - VC_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ - VC_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) -#define VC_LIST_COMPARES(size, macro, a, b, c, d) \ - size(macro, ==, a, b, c, d) \ - size(macro, !=, a, b, c, d) \ - size(macro, <=, a, b, c, d) \ - size(macro, >=, a, b, c, d) \ - size(macro, < , a, b, c, d) \ - size(macro, > , a, b, c, d) -#define VC_LIST_LOGICAL(size, macro, a, b, c, d) \ - size(macro, &&, a, b, c, d) \ - size(macro, ||, a, b, c, d) -#define VC_LIST_BINARY(size, macro, a, b, c, d) \ - size(macro, |, a, b, c, d) \ - size(macro, &, a, b, c, d) \ - size(macro, ^, a, b, c, d) -#define VC_LIST_SHIFTS(size, macro, a, b, c, d) \ - size(macro, <<, a, b, c, d) \ - size(macro, >>, a, b, c, d) -#define VC_LIST_ARITHMETICS(size, macro, a, b, c, d) \ - size(macro, +, a, b, c, d) \ - size(macro, -, a, b, c, d) \ - size(macro, *, a, b, c, d) \ - size(macro, /, a, b, c, d) \ - size(macro, %, a, b, c, d) - -#define VC_APPLY_0(_list, macro) _list(_VC_APPLY_IMPL_1, macro, 0, 0, 0, 0) -#define VC_APPLY_1(_list, macro, a) _list(_VC_APPLY_IMPL_2, macro, a, 0, 0, 0) -#define VC_APPLY_2(_list, macro, a, b) _list(_VC_APPLY_IMPL_3, macro, a, b, 0, 0) -#define VC_APPLY_3(_list, macro, a, b, c) _list(_VC_APPLY_IMPL_4, macro, a, b, c, 0) -#define VC_APPLY_4(_list, macro, a, b, c, d) _list(_VC_APPLY_IMPL_5, macro, a, b, c, d) - -#define 
VC_ALL_COMPARES(macro) VC_APPLY_0(VC_LIST_COMPARES, macro) -#define VC_ALL_LOGICAL(macro) VC_APPLY_0(VC_LIST_LOGICAL, macro) -#define VC_ALL_BINARY(macro) VC_APPLY_0(VC_LIST_BINARY, macro) -#define VC_ALL_SHIFTS(macro) VC_APPLY_0(VC_LIST_SHIFTS, macro) -#define VC_ALL_ARITHMETICS(macro) VC_APPLY_0(VC_LIST_ARITHMETICS, macro) -#define VC_ALL_FLOAT_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_FLOAT_VECTOR_TYPES, macro) -#define VC_ALL_VECTOR_TYPES(macro) VC_APPLY_0(VC_LIST_VECTOR_TYPES, macro) - -#define VC_EXACT_TYPE(_test, _reference, _type) \ - typename EnableIf::Value, _type>::Value - -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN -#define VC_ALIGNED_PARAMETER(_Type) const _Type & -#else -#define VC_ALIGNED_PARAMETER(_Type) const _Type -#endif - -#ifndef Vc__make_unique -#define Vc__make_unique(name) _VC_CAT(Vc__,name,_,__LINE__) -#endif - -#if defined(VC_ICC) || defined(VC_CLANG) -#define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) -#elif defined(VC_GCC) && VC_GCC < 0x40500 -#define VC_OFFSETOF(Type, member) (reinterpret_cast(&reinterpret_cast(0x1000)->member) - reinterpret_cast(0x1000)) -#else -#define VC_OFFSETOF(Type, member) offsetof(Type, member) -#endif - - -#endif // VC_COMMON_MACROS_H diff --git a/math/vc/include/Vc/common/memory.h b/math/vc/include/Vc/common/memory.h deleted file mode 100644 index fade13490f2b4..0000000000000 --- a/math/vc/include/Vc/common/memory.h +++ /dev/null @@ -1,642 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MEMORY_H -#define VC_COMMON_MEMORY_H - -#include "memorybase.h" -#include -#include -#include -#include -#include "memoryfwd.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ - -/** - * Allocates memory on the Heap with alignment and padding suitable for vectorized access. - * - * Memory that was allocated with this function must be released with Vc::free! Other methods might - * work but are not portable. - * - * \param n Specifies the number of objects the allocated memory must be able to store. - * \tparam T The type of the allocated memory. Note, that the constructor is not called. - * \tparam A Determines the alignment of the memory. See \ref Vc::MallocAlignment. - * - * \return Pointer to memory of the requested type, or 0 on error. The allocated memory is padded at - * the end to be a multiple of the requested alignment \p A. Thus if you request memory for 21 - * int objects, aligned via Vc::AlignOnCacheline, you can safely read a full cacheline until the - * end of the array, without generating an out-of-bounds access. For a cacheline size of 64 Bytes - * and an int size of 4 Bytes you would thus get an array of 128 Bytes to work with. - * - * \warning - * \li The standard malloc function specifies the number of Bytes to allocate whereas this - * function specifies the number of values, thus differing in a factor of sizeof(T). - * \li This function is mainly meant for use with builtin types. If you use a custom - * type with a sizeof that is not a multiple of 2 the results might not be what you expect. - * \li The constructor of T is not called. 
You can make up for this: - * \code - * SomeType *array = new(Vc::malloc(N)) SomeType[N]; - * \endcode - * - * \see Vc::free - * - * \ingroup Utilities - * \headerfile memory.h - */ -template -Vc_ALWAYS_INLINE_L T *Vc_ALWAYS_INLINE_R malloc(size_t n) -{ - return static_cast(Internal::Helper::malloc(n * sizeof(T))); -} - -/** - * Frees memory that was allocated with Vc::malloc. - * - * \param p The pointer to the memory to be freed. - * - * \tparam T The type of the allocated memory. - * - * \warning The destructor of T is not called. If needed, you can call the destructor before calling - * free: - * \code - * for (int i = 0; i < N; ++i) { - * p[i].~T(); - * } - * Vc::free(p); - * \endcode - * - * \ingroup Utilities - * \headerfile memory.h - * - * \see Vc::malloc - */ -template -Vc_ALWAYS_INLINE void free(T *p) -{ - Internal::Helper::free(p); -} - -template struct _MemorySizeCalculation -{ - enum AlignmentCalculations { - Alignment = V::Size, - AlignmentMask = Alignment - 1, - MaskedSize = Size & AlignmentMask, - Padding = Alignment - MaskedSize, - PaddedSize = MaskedSize == 0 ? Size : Size + Padding - }; -}; - -/** - * \ingroup Utilities - * \headerfile memory.h - * - * A helper class for fixed-size two-dimensional arrays. - * - * \param V The vector type you want to operate on. (e.g. 
float_v or uint_v) - * \param Size1 Number of rows - * \param Size2 Number of columns - */ -template class Memory : public VectorAlignedBaseT, public MemoryBase, 2, Memory > -{ - public: - typedef typename V::EntryType EntryType; - private: - typedef MemoryBase, 2, Memory > Base; - friend class MemoryBase, 2, Memory >; - friend class MemoryDimensionBase, 2, Memory >; - enum InternalConstants { - PaddedSize2 = _MemorySizeCalculation::PaddedSize - }; -#if defined(VC_ICC) && defined(_WIN32) - __declspec(align(__alignof(VectorAlignedBaseT))) -#elif defined(VC_CLANG) - __attribute__((aligned(__alignof(VectorAlignedBaseT)))) -#elif defined(VC_MSVC) - VectorAlignedBaseT _force_alignment; - // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment - // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. - // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( -#endif - EntryType m_mem[Size1][PaddedSize2]; - public: - using Base::vector; - enum Constants { - RowCount = Size1, - VectorsCount = PaddedSize2 / V::Size - }; - - /** - * \return the number of rows in the array. - * - * \note This function can be eliminated by an optimizing compiler. - */ - _VC_CONSTEXPR size_t rowsCount() const { return RowCount; } - /** - * \return the number of scalar entries in the whole array. - * - * \warning Do not use this function for scalar iteration over the array since there will be - * padding between rows if \c Size2 is not divisible by \c V::Size. - * - * \note This function can be optimized into a compile-time constant. - */ - _VC_CONSTEXPR size_t entriesCount() const { return Size1 * Size2; } - /** - * \return the number of vectors in the whole array. - * - * \note This function can be optimized into a compile-time constant. - */ - _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount * Size1; } - - /** - * Copies the data from a different object. 
- * - * \param rhs The object to copy the data from. - * - * \return reference to the modified Memory object. - * - * \note Both objects must have the exact same vectorsCount(). - */ - template - Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); - return *this; - } - - Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { - Internal::copyVectors(*this, rhs); - return *this; - } - - /** - * Initialize all data with the given vector. - * - * \param v This vector will be used to initialize the memory. - * - * \return reference to the modified Memory object. - */ - inline Memory &operator=(const V &v) { - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) = v; - } - return *this; - } - } -#if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) - __attribute__((__aligned__(__alignof(VectorAlignedBaseT)))) -#endif - ; - - /** - * A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and - * scalar access. - * - * Example: - * \code - Vc::Memory array; - - // scalar access: - for (size_t i = 0; i < array.entriesCount(); ++i) { - int x = array[i]; // read - array[i] = x; // write - } - // more explicit alternative: - for (size_t i = 0; i < array.entriesCount(); ++i) { - int x = array.scalar(i); // read - array.scalar(i) = x; // write - } - - // vector access: - for (size_t i = 0; i < array.vectorsCount(); ++i) { - int_v x = array.vector(i); // read - array.vector(i) = x; // write - } - * \endcode - * This code allocates a small array and implements three equivalent loops (that do nothing useful). - * The loops show how scalar and vector read/write access is best implemented. - * - * Since the size of 11 is not a multiple of int_v::Size (unless you use the - * scalar Vc implementation) the last write access of the vector loop would normally be out of - * bounds. 
But the Memory class automatically pads the memory such that the whole array can be - * accessed with correctly aligned memory addresses. - * - * \param V The vector type you want to operate on. (e.g. float_v or uint_v) - * \param Size The number of entries of the scalar base type the memory should hold. This - * is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes - * Memory mem). - * - * \see Memory - * - * \ingroup Utilities - * \headerfile memory.h - */ - template class Memory : public VectorAlignedBaseT, public MemoryBase, 1, void> - { - public: - typedef typename V::EntryType EntryType; - private: - typedef MemoryBase, 1, void> Base; - friend class MemoryBase, 1, void>; - friend class MemoryDimensionBase, 1, void>; - enum InternalConstants { - Alignment = V::Size, - AlignmentMask = Alignment - 1, - MaskedSize = Size & AlignmentMask, - Padding = Alignment - MaskedSize, - PaddedSize = MaskedSize == 0 ? Size : Size + Padding - }; -#if defined(VC_ICC) && defined(_WIN32) - __declspec(align(__alignof(VectorAlignedBaseT))) -#elif defined(VC_CLANG) - __attribute__((aligned(__alignof(VectorAlignedBaseT)))) -#elif defined(VC_MSVC) - VectorAlignedBaseT _force_alignment; - // __declspec(align(#)) accepts only numbers not __alignof nor just VectorAlignment - // by putting VectorAlignedBaseT here _force_alignment is aligned correctly. - // the downside is that there's a lot of padding before m_mem (32 Bytes with SSE) :( -#endif - EntryType m_mem[PaddedSize]; - public: - using Base::vector; - enum Constants { - EntriesCount = Size, - VectorsCount = PaddedSize / V::Size - }; - - /** - * Wrap existing data with the Memory convenience class. - * - * This function returns a \em reference to a Memory object that you must - * capture to avoid a copy of the whole data: - * \code - * Memory &m = Memory::fromRawData(someAlignedPointerToFloat) - * \endcode - * - * \param ptr An aligned pointer to memory of type \p V::EntryType (e.g. 
\c float for - * Vc::float_v). - * \return A Memory object placed at the given location in memory. - * - * \warning The pointer \p ptr passed to this function must be aligned according to the - * alignment restrictions of \p V. - * \warning The size of the accessible memory must match \p Size. This includes the - * required padding at the end to allow the last entries to be accessed via vectors. If - * you know what you are doing you might violate this constraint. - * \warning It is your responsibility to ensure that the memory is released correctly - * (not too early/not leaked). This function simply adds convenience functions to \em - * access the memory. - */ - static Vc_ALWAYS_INLINE Vc_CONST Memory &fromRawData(EntryType *ptr) - { - // DANGER! This placement new has to use the right address. If the compiler decides - // RowMemory requires padding before the actual data then the address has to be adjusted - // accordingly - char *addr = reinterpret_cast(ptr); - typedef Memory MM; - addr -= VC_OFFSETOF(MM, m_mem); - return *new(addr) MM; - } - - /** - * \return the number of scalar entries in the whole array. - * - * \note This function can be optimized into a compile-time constant. - */ - _VC_CONSTEXPR size_t entriesCount() const { return EntriesCount; } - - /** - * \return the number of vectors in the whole array. - * - * \note This function can be optimized into a compile-time constant. 
- */ - _VC_CONSTEXPR size_t vectorsCount() const { return VectorsCount; } - -#ifdef VC_CXX11 - Vc_ALWAYS_INLINE Memory() = default; -#else - Vc_ALWAYS_INLINE Memory() {} -#endif - - inline Memory(const Memory &rhs) - { - Internal::copyVectors(*this, rhs); - } - - template inline Memory(const Memory &rhs) - { - assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); - } - - inline Memory &operator=(const Memory &rhs) - { - Internal::copyVectors(*this, rhs); - return *this; - } - - template inline Memory &operator=(const Memory &rhs) - { - assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); - return *this; - } - - Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { - std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); - return *this; - } - inline Memory &operator=(const V &v) { - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) = v; - } - return *this; - } - } -#if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32) - __attribute__((__aligned__(__alignof(VectorAlignedBaseT)) )) -#endif - ; - - /** - * A helper class that is very similar to Memory but with dynamically allocated memory and - * thus dynamic size. - * - * Example: - * \code - size_t size = 11; - Vc::Memory array(size); - - // scalar access: - for (size_t i = 0; i < array.entriesCount(); ++i) { - array[i] = i; - } - - // vector access: - for (size_t i = 0; i < array.vectorsCount(); ++i) { - array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size; - } - * \endcode - * This code allocates a small array with 11 scalar entries - * and implements two equivalent loops that initialize the memory. - * The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to - * memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the - * scalar Vc implementation) the last write access of the vector loop would normally be out of - * bounds. 
But the Memory class automatically pads the memory such that the whole array can be - * accessed with correctly aligned memory addresses. - * (Note: the scalar loop can be auto-vectorized, except for the last three assignments.) - * - * \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore - * modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use - * the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector - * address calculation and loads and stores manually. - * - * \param V The vector type you want to operate on. (e.g. float_v or uint_v) - * - * \see Memory - * - * \ingroup Utilities - * \headerfile memory.h - */ - template class Memory : public MemoryBase, 1, void> - { - public: - typedef typename V::EntryType EntryType; - private: - typedef MemoryBase, 1, void> Base; - friend class MemoryBase, 1, void>; - friend class MemoryDimensionBase, 1, void>; - enum InternalConstants { - Alignment = V::Size, - AlignmentMask = Alignment - 1 - }; - size_t m_entriesCount; - size_t m_vectorsCount; - EntryType *m_mem; - size_t calcPaddedEntriesCount(size_t x) - { - size_t masked = x & AlignmentMask; - return (masked == 0 ? x : x + (Alignment - masked)); - } - public: - using Base::vector; - - /** - * Allocate enough memory to access \p size values of type \p V::EntryType. - * - * The allocated memory is aligned and padded correctly for fully vectorized access. - * - * \param size Determines how many scalar values will fit into the allocated memory. - */ - Vc_ALWAYS_INLINE Memory(size_t size) - : m_entriesCount(size), - m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)), - m_mem(Vc::malloc(m_vectorsCount)) - { - m_vectorsCount /= V::Size; - } - - /** - * Copy the memory into a new memory area. - * - * The allocated memory is aligned and padded correctly for fully vectorized access. - * - * \param rhs The Memory object to copy from. 
- */ - template - Vc_ALWAYS_INLINE Memory(const MemoryBase &rhs) - : m_entriesCount(rhs.entriesCount()), - m_vectorsCount(rhs.vectorsCount()), - m_mem(Vc::malloc(m_vectorsCount * V::Size)) - { - Internal::copyVectors(*this, rhs); - } - - /** - * Overload of the above function. - * - * (Because C++ would otherwise not use the templated cctor and use a default-constructed cctor instead.) - * - * \param rhs The Memory object to copy from. - */ - Vc_ALWAYS_INLINE Memory(const Memory &rhs) - : m_entriesCount(rhs.entriesCount()), - m_vectorsCount(rhs.vectorsCount()), - m_mem(Vc::malloc(m_vectorsCount * V::Size)) - { - Internal::copyVectors(*this, rhs); - } - - /** - * Frees the memory which was allocated in the constructor. - */ - Vc_ALWAYS_INLINE ~Memory() - { - Vc::free(m_mem); - } - - /** - * Swap the contents and size information of two Memory objects. - * - * \param rhs The other Memory object to swap. - */ - inline void swap(Memory &rhs) { - std::swap(m_mem, rhs.m_mem); - std::swap(m_entriesCount, rhs.m_entriesCount); - std::swap(m_vectorsCount, rhs.m_vectorsCount); - } - - /** - * \return the number of scalar entries in the whole array. - */ - Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return m_entriesCount; } - - /** - * \return the number of vectors in the whole array. - */ - Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return m_vectorsCount; } - - /** - * Overwrite all entries with the values stored in \p rhs. - * - * \param rhs The object to copy the data from. - * - * \return reference to the modified Memory object. - * - * \note this function requires the vectorsCount() of both Memory objects to be equal. 
- */ - template - Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); - return *this; - } - - Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - Internal::copyVectors(*this, rhs); - return *this; - } - - /** - * Overwrite all entries with the values stored in the memory at \p rhs. - * - * \param rhs The array to copy the data from. - * - * \return reference to the modified Memory object. - * - * \note this function requires that there are entriesCount() many values accessible from \p rhs. - */ - Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) { - std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType)); - return *this; - } -}; - -/** - * Prefetch the cacheline containing \p addr for a single read access. - * - * This prefetch completely bypasses the cache, not evicting any other data. - * - * \param addr The cacheline containing \p addr will be prefetched. - * - * \ingroup Utilities - * \headerfile memory.h - */ -Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr) -{ - Internal::Helper::prefetchForOneRead(addr); -} - -/** - * Prefetch the cacheline containing \p addr for modification. - * - * This prefetch evicts data from the cache. So use it only for data you really will use. When the - * target system supports it the cacheline will be marked as modified while prefetching, saving work - * later on. - * - * \param addr The cacheline containing \p addr will be prefetched. - * - * \ingroup Utilities - * \headerfile memory.h - */ -Vc_ALWAYS_INLINE void prefetchForModify(const void *addr) -{ - Internal::Helper::prefetchForModify(addr); -} - -/** - * Prefetch the cacheline containing \p addr to L1 cache. - * - * This prefetch evicts data from the cache. So use it only for data you really will use. - * - * \param addr The cacheline containing \p addr will be prefetched. 
- * - * \ingroup Utilities - * \headerfile memory.h - */ -Vc_ALWAYS_INLINE void prefetchClose(const void *addr) -{ - Internal::Helper::prefetchClose(addr); -} - -/** - * Prefetch the cacheline containing \p addr to L2 cache. - * - * This prefetch evicts data from the cache. So use it only for data you really will use. - * - * \param addr The cacheline containing \p addr will be prefetched. - * - * \ingroup Utilities - * \headerfile memory.h - */ -Vc_ALWAYS_INLINE void prefetchMid(const void *addr) -{ - Internal::Helper::prefetchMid(addr); -} - -/** - * Prefetch the cacheline containing \p addr to L3 cache. - * - * This prefetch evicts data from the cache. So use it only for data you really will use. - * - * \param addr The cacheline containing \p addr will be prefetched. - * - * \ingroup Utilities - * \headerfile memory.h - */ -Vc_ALWAYS_INLINE void prefetchFar(const void *addr) -{ - Internal::Helper::prefetchFar(addr); -} - -} // namespace Vc -} // namespace ROOT - -namespace std -{ - template Vc_ALWAYS_INLINE void swap(Vc::Memory &a, Vc::Memory &b) { a.swap(b); } -} // namespace std - -#include "undomacros.h" - -#endif // VC_COMMON_MEMORY_H diff --git a/math/vc/include/Vc/common/memorybase.h b/math/vc/include/Vc/common/memorybase.h deleted file mode 100644 index 620010acb5071..0000000000000 --- a/math/vc/include/Vc/common/memorybase.h +++ /dev/null @@ -1,603 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MEMORYBASE_H -#define VC_COMMON_MEMORYBASE_H - -#include -#include "macros.h" - -namespace ROOT { -namespace Vc -{ - -#if __cplusplus >= 201103 || defined(VC_MSVC) -#define VC_DECLTYPE(T1, op, T2) decltype(T1() op T2()) -#elif defined(VC_OPEN64) || (defined(VC_GCC) && VC_GCC < 0x40300) -#define VC_DECLTYPE(T1, op, T2) T1 -#else -namespace -{ - struct one { char x; }; - struct two { one x, y; }; - template struct DecltypeHelper - { - static one test(const T1 &) { return one(); } - static two test(const T2 &) { return two(); } - //static void test(...) {} - }; - template struct DecltypeHelper - { - static one test(const T1 &) { return one(); } - //static void test(...) {} - }; - template struct Decltype { typedef T1 Value; }; - template struct Decltype { typedef T1 Value; }; - template struct Decltype { typedef T2 Value; }; -#ifdef VC_CLANG - // this special case is only necessary to silence a warning (which is rather a note that clang - // did the expected optimization): - // warning: variable 'SOME_PTR' is not needed and will not be emitted [-Wunneeded-internal-declaration] - // Then again, I don't remember why the SOME_PTR hack was necessary in the first place - some - // strange compiler quirk... -#define VC_DECLTYPE(T1, op, T2) typename Decltype::test(T1() op T2()))>::Value -#else - static const void *SOME_PTR; -#define VC_DECLTYPE(T1, op, T2) typename Decltype::test(*static_cast(SOME_PTR) op *static_cast(SOME_PTR)))>::Value -#endif -} // anonymous namespace -#endif - -#define VC_MEM_OPERATOR_EQ(op) \ - template \ - Vc_ALWAYS_INLINE VectorPointerHelper &operator op##=(const T &x) { \ - const V result = V(m_ptr, Internal::FlagObject::the()) op x; \ - result.store(m_ptr, Internal::FlagObject::the()); \ - return *this; \ - } - -/** - * Helper class for the Memory::vector(size_t) class of functions. 
- * - * You will never need to directly make use of this class. It is an implementation detail of the - * Memory API. - * - * \headerfile memorybase.h - */ -template class VectorPointerHelperConst -{ - typedef typename V::EntryType EntryType; - typedef typename V::Mask Mask; - public: - const EntryType *const m_ptr; - - explicit VectorPointerHelperConst(const EntryType *ptr) : m_ptr(ptr) {} - - /** - * Cast to \p V operator. - * - * This function allows to assign this object to any object of type \p V. - */ - Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } -}; - -/** - * Helper class for the Memory::vector(size_t) class of functions. - * - * You will never need to directly make use of this class. It is an implementation detail of the - * Memory API. - * - * \headerfile memorybase.h - */ -template class VectorPointerHelper -{ - typedef typename V::EntryType EntryType; - typedef typename V::Mask Mask; - public: - EntryType *const m_ptr; - - explicit VectorPointerHelper(EntryType *ptr) : m_ptr(ptr) {} - - /** - * Cast to \p V operator. - * - * This function allows to assign this object to any object of type \p V. 
- */ - Vc_ALWAYS_INLINE Vc_PURE operator const V() const { return V(m_ptr, Internal::FlagObject::the()); } - - template - Vc_ALWAYS_INLINE VectorPointerHelper &operator=(const T &x) { - V v; - v = x; - v.store(m_ptr, Internal::FlagObject::the()); - return *this; - } - - VC_ALL_BINARY(VC_MEM_OPERATOR_EQ) - VC_ALL_ARITHMETICS(VC_MEM_OPERATOR_EQ) -}; -#undef VC_MEM_OPERATOR_EQ - -#define VC_VPH_OPERATOR(op) \ -template \ -VC_DECLTYPE(V1, op, V2) operator op(const VectorPointerHelper &x, const VectorPointerHelper &y) { \ - return V1(x.m_ptr, Internal::FlagObject::the()) op V2(y.m_ptr, Internal::FlagObject::the()); \ -} -VC_ALL_ARITHMETICS(VC_VPH_OPERATOR) -VC_ALL_BINARY (VC_VPH_OPERATOR) -VC_ALL_COMPARES (VC_VPH_OPERATOR) -#undef VC_VPH_OPERATOR - -template class MemoryDimensionBase; -template class MemoryDimensionBase // {{{1 -{ - private: - Parent *p() { return static_cast(this); } - const Parent *p() const { return static_cast(this); } - public: - /** - * The type of the scalar entries in the array. - */ - typedef typename V::EntryType EntryType; - - /** - * Returns a pointer to the start of the allocated memory. - */ - Vc_ALWAYS_INLINE Vc_PURE EntryType *entries() { return &p()->m_mem[0]; } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries() const { return &p()->m_mem[0]; } - - /** - * Returns the \p i-th scalar value in the memory. - */ - Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i) { return entries()[i]; } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; } - - /** - * Cast operator to the scalar type. This allows to use the object very much like a standard - * C array. - */ - Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); } - /// Const overload of the above function. 
- Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); } - - // omit operator[] because the EntryType* cast operator suffices, for dox it makes sense to - // show it, though because it helps API discoverability. -#ifdef DOXYGEN - /** - * Returns the \p i-th scalar value in the memory. - */ - inline EntryType &operator[](size_t i); - /// Const overload of the above function. - inline const EntryType &operator[](size_t i) const; -#endif - - /** - * Uses a vector gather to combine the entries at the indexes in \p i into the returned - * vector object. - * - * \param i An integer vector. It determines the entries to be gathered. - * \returns A vector object. Modification of this object will not modify the values in - * memory. - * - * \warning The API of this function might change in future versions of Vc to additionally - * support scatters. - */ - template Vc_ALWAYS_INLINE Vc_PURE V operator[](Vector i) const - { - return V(entries(), i); - } -}; -template class MemoryDimensionBase // {{{1 -{ - private: - Parent *p() { return static_cast(this); } - const Parent *p() const { return static_cast(this); } - public: - /** - * The type of the scalar entries in the array. - */ - typedef typename V::EntryType EntryType; - - static _VC_CONSTEXPR size_t rowCount() { return Parent::RowCount; } - - /** - * Returns a pointer to the start of the allocated memory. - */ - Vc_ALWAYS_INLINE Vc_PURE EntryType *entries(size_t x = 0) { return &p()->m_mem[x][0]; } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries(size_t x = 0) const { return &p()->m_mem[x][0]; } - - /** - * Returns the \p i,j-th scalar value in the memory. - */ - Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i, size_t j) { return entries(i)[j]; } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i, size_t j) const { return entries(i)[j]; } - - /** - * Returns the \p i-th row in the memory. 
- */ - Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) { - return RowMemory::fromRawData(entries(i)); - } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const { - return RowMemory::fromRawData(const_cast(entries(i))); - } - - /** - * \return the number of rows in the array. - * - * \note This function can be eliminated by an optimizing compiler. - */ - Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); } -}; - -//{{{1 -/** - * \headerfile memorybase.h - * - * Common interface to all Memory classes, independent of allocation on the stack or heap. - * - * \param V The vector type you want to operate on. (e.g. float_v or uint_v) - * \param Parent This type is the complete type of the class that derives from MemoryBase. - * \param Dimension The number of dimensions the implementation provides. - * \param RowMemory Class to be used to work on a single row. - */ -template class MemoryBase : public MemoryDimensionBase //{{{1 -{ - private: - Parent *p() { return static_cast(this); } - const Parent *p() const { return static_cast(this); } - public: - /** - * The type of the scalar entries in the array. - */ - typedef typename V::EntryType EntryType; - - /** - * \return the number of scalar entries in the array. This function is optimized away - * if a constant size array is used. - */ - Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return p()->entriesCount(); } - /** - * \return the number of vector entries that span the array. This function is optimized away - * if a constant size array is used. - */ - Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return p()->vectorsCount(); } - - using MemoryDimensionBase::entries; - using MemoryDimensionBase::scalar; - - /** - * \param i Selects the offset, where the vector should be read. - * - * \return a smart object to wrap the \p i-th vector in the memory. - * - * The return value can be used as any other vector object. 
I.e. you can substitute - * something like - * \code - * float_v a = ..., b = ...; - * a += b; - * \endcode - * with - * \code - * mem.vector(i) += b; - * \endcode - * - * This function ensures that only \em aligned loads and stores are used. Thus it only allows to - * access memory at fixed strides. If access to known offsets from the aligned vectors is - * needed the vector(size_t, int) function can be used. - */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i) { - return VectorPointerHelper(&entries()[i * V::Size]); - } - /** \brief Const overload of the above function - * - * \param i Selects the offset, where the vector should be read. - * - * \return a smart object to wrap the \p i-th vector in the memory. - */ - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i) const { - return VectorPointerHelperConst(&entries()[i * V::Size]); - } - - /** - * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. - * - * Example: - * \code - * Memory mem; - * mem.setZero(); - * for (int i = 0; i < mem.entriesCount(); i += float_v::Size) { - * mem.vectorAt(i) += b; - * } - * \endcode - * - * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the - * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. - * - * \param align You must take care to determine whether an unaligned load/store is - * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size - * you must pass Vc::Unaligned here. - */ -#ifdef DOXYGEN - template inline VectorPointerHelper vectorAt(size_t i, A align = Vc::Aligned); - /** \brief Const overload of the above function - * - * \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory. - * - * \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. 
the - * values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten. - * - * \param align You must take care to determine whether an unaligned load/store is - * required. Per default an aligned load/store is used. If \p i is not a multiple of \c V::Size - * you must pass Vc::Unaligned here. - */ - template inline const VectorPointerHelperConst vectorAt(size_t i, A align = Vc::Aligned) const; -#else - template - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i, A) { - return VectorPointerHelper(&entries()[i]); - } - template - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i, A) const { - return VectorPointerHelperConst(&entries()[i]); - } - - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vectorAt(size_t i) { - return VectorPointerHelper(&entries()[i]); - } - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vectorAt(size_t i) const { - return VectorPointerHelperConst(&entries()[i]); - } -#endif - - /** - * \return a smart object to wrap the \p i-th vector + \p shift in the memory. - * - * This function ensures that only \em unaligned loads and stores are used. - * It allows to access memory at any location aligned to the entry type. - * - * \param i Selects the memory location of the i-th vector. Thus if \p V::Size == 4 and - * \p i is set to 3 the base address for the load/store will be the 12th entry - * (same as \p &mem[12]). - * \param shift Shifts the base address determined by parameter \p i by \p shift many - * entries. Thus \p vector(3, 1) for \p V::Size == 4 will load/store the - * 13th - 16th entries (same as \p &mem[13]). - * - * \note Any shift value is allowed as long as you make sure it stays within bounds of the - * allocated memory. Shift values that are a multiple of \p V::Size will \em not result in - * aligned loads. You have to use the above vector(size_t) function for aligned loads - * instead. 
- * - * \note Thus a simple way to access vectors randomly is to set \p i to 0 and use \p shift as the - * parameter to select the memory address: - * \code - * // don't use: - * mem.vector(i / V::Size, i % V::Size) += 1; - * // instead use: - * mem.vector(0, i) += 1; - * \endcode - */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper vector(size_t i, int shift) { - return VectorPointerHelper(&entries()[i * V::Size + shift]); - } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst vector(size_t i, int shift) const { - return VectorPointerHelperConst(&entries()[i * V::Size + shift]); - } - - /** - * \return the first vector in the allocated memory. - * - * This function is simply a shorthand for vector(0). - */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper firstVector() { - return VectorPointerHelper(entries()); - } - /// Const overload of the above function. - Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst firstVector() const { - return VectorPointerHelperConst(entries()); - } - - /** - * \return the last vector in the allocated memory. - * - * This function is simply a shorthand for vector(vectorsCount() - 1). - */ - Vc_ALWAYS_INLINE Vc_PURE VectorPointerHelper lastVector() { - return VectorPointerHelper(&entries()[vectorsCount() * V::Size - V::Size]); - } - /// Const overload of the above function. 
- Vc_ALWAYS_INLINE Vc_PURE const VectorPointerHelperConst lastVector() const { - return VectorPointerHelperConst(&entries()[vectorsCount() * V::Size - V::Size]); - } - - Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), indexes); } - Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned short *indexes) const { return V(entries(), indexes); } - Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), indexes); } - Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), indexes); } - - Vc_ALWAYS_INLINE void setZero() { - V zero(Vc::Zero); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) = zero; - } - } - - template - inline Parent &operator+=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) += rhs.vector(i); - } - return static_cast(*this); - } - template - inline Parent &operator-=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) -= rhs.vector(i); - } - return static_cast(*this); - } - template - inline Parent &operator*=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) *= rhs.vector(i); - } - return static_cast(*this); - } - template - inline Parent &operator/=(const MemoryBase &rhs) { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) /= rhs.vector(i); - } - return static_cast(*this); - } - inline Parent &operator+=(EntryType rhs) { - V v(rhs); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) += v; - } - return static_cast(*this); - } - inline Parent &operator-=(EntryType rhs) { - V v(rhs); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) -= v; - } - return static_cast(*this); - } - inline Parent &operator*=(EntryType rhs) 
{ - V v(rhs); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) *= v; - } - return static_cast(*this); - } - inline Parent &operator/=(EntryType rhs) { - V v(rhs); - for (size_t i = 0; i < vectorsCount(); ++i) { - vector(i) /= v; - } - return static_cast(*this); - } - template - inline bool operator==(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) == V(rhs.vector(i))).isFull()) { - return false; - } - } - return true; - } - template - inline bool operator!=(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) == V(rhs.vector(i))).isEmpty()) { - return false; - } - } - return true; - } - template - inline bool operator<(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) < V(rhs.vector(i))).isFull()) { - return false; - } - } - return true; - } - template - inline bool operator<=(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) <= V(rhs.vector(i))).isFull()) { - return false; - } - } - return true; - } - template - inline bool operator>(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) > V(rhs.vector(i))).isFull()) { - return false; - } - } - return true; - } - template - inline bool operator>=(const MemoryBase &rhs) const { - assert(vectorsCount() == rhs.vectorsCount()); - for (size_t i = 0; i < vectorsCount(); ++i) { - if (!(V(vector(i)) >= V(rhs.vector(i))).isFull()) { - return false; - } - } - return true; - } -}; - -namespace Internal -{ -template -inline void copyVectors(MemoryBase &dst, - const MemoryBase &src) -{ - const size_t vectorsCount = dst.vectorsCount(); - 
size_t i = 3; - for (; i < vectorsCount; i += 4) { - const V tmp3 = src.vector(i - 3); - const V tmp2 = src.vector(i - 2); - const V tmp1 = src.vector(i - 1); - const V tmp0 = src.vector(i - 0); - dst.vector(i - 3) = tmp3; - dst.vector(i - 2) = tmp2; - dst.vector(i - 1) = tmp1; - dst.vector(i - 0) = tmp0; - } - for (i -= 3; i < vectorsCount; ++i) { - dst.vector(i) = src.vector(i); - } -} -} // namespace Internal - -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_MEMORYBASE_H diff --git a/math/vc/include/Vc/common/memoryfwd.h b/math/vc/include/Vc/common/memoryfwd.h deleted file mode 100644 index 6eb5f859b41ee..0000000000000 --- a/math/vc/include/Vc/common/memoryfwd.h +++ /dev/null @@ -1,30 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_MEMORYFWD_H -#define VC_COMMON_MEMORYFWD_H - -namespace ROOT { -namespace Vc -{ - template class Memory; -} // namespace Vc -} // namespace ROOT - -#endif // VC_COMMON_MEMORYFWD_H diff --git a/math/vc/include/Vc/common/operand.h b/math/vc/include/Vc/common/operand.h deleted file mode 100644 index 8856edb369919..0000000000000 --- a/math/vc/include/Vc/common/operand.h +++ /dev/null @@ -1,56 +0,0 @@ -/* This file is part of the Vc library. 
{{{ - - Copyright (C) 2013 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_COMMON_OPERAND_H -#define VC_COMMON_OPERAND_H - -namespace ROOT { -namespace Vc -{ -template class Operand -{ - public: - Parent *parent() { return static_cast(this); } - const Parent *parent() const { return static_cast(this); } - - private: -}; - -enum BinaryOperation { - AddOp, - SubOp, - MulOp, - DivOp -}; - -template class BinaryOperation : public Operand > -{ - Left m_left; - Right m_right; - public: -#if VC_CXX11 - Vc_ALWAYS_INLINE BinaryOperation(Left &&l, Right &&r) -#endif - operator Result() -}; - -} // namespace Vc -} // namespace ROOT - -#endif // VC_COMMON_OPERAND_H diff --git a/math/vc/include/Vc/common/operators.h b/math/vc/include/Vc/common/operators.h deleted file mode 100644 index 561b484665084..0000000000000 --- a/math/vc/include/Vc/common/operators.h +++ /dev/null @@ -1,209 +0,0 @@ -#ifndef VC_ICC -// ICC ICEs if the following type-traits are in the anonymous namespace -namespace -{ -#endif -template struct EnableIfNeitherIntegerNorVector : public EnableIf::Value, T> {}; -template struct EnableIfNeitherIntegerNorVector, T>; - -template struct IsVector { enum { Value = false }; }; -template struct IsVector > { enum { Value = true }; }; - -template struct IsTypeCombinationOf -{ - enum { - Value = IsVector::Value ? (IsVector::Value ? 
( // Vec × Vec - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) - ) : ( // Vec × Scalar - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) || - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) - )) : (IsVector::Value ? ( // Scalar × Vec - ( IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || - (HasImplicitCast::Value && IsEqualType::Value && !HasImplicitCast::Value) - ) : ( // Scalar × Scalar - ( IsEqualType::Value && IsEqualType::Value) || - ( IsEqualType::Value && IsEqualType::Value) - )) - }; -}; - -template struct IsVectorOperands -{ - enum { - Value = (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) - || (HasImplicitCast::Value && !HasImplicitCast::Value && !IsEqualType::Value && IsEqualType::Value) - }; -}; -#ifndef VC_ICC -} -#endif - -// float-int arithmetic operators //{{{1 -// These operators must be very picky about the exact types they want to handle. Once (uncontrolled) -// implicit type conversions get involved, ambiguous overloads will occur. E.g. a simple int × enum -// will become ambiguous because it can convert both to a vector type, which then can execute the -// operator. We can't argue that such code should not be used - it could break existing code, not -// under control of the developer, just by putting the Vc header somewhere on top. -// -// The following type combinations are safe (always symmetric): -// 1. Vector × Vector -// 2. Vector × Scalar (int, float, enum value, ...) -// 3. Some object that has a vector cast operator × Vector -// 4. 
Some object that has a vector cast operator × Scalar -// -// Additionally there are restrictions on which types combine to what resulting type: -// 1.a. float × double_v -> double_v -// 1.b. any int × double_v -> double_v -// 2.a. (u)int_v × float_v -> float_v -// 2.b. (u)int_v × float -> float_v -// 2.c. any int × float_v -> float_v -// 3.a. (u)short_v × sfloat_v -> sfloat_v -// 3.b. (u)short_v × float -> sfloat_v -// 3.c. short × sfloat_v -> sfloat_v -// 4.a. int_v × uint_v -> uint_v -// 4.b. any int × uint_v -> uint_v -// 4.c. unsigned int × int_v -> uint_v -// 4.d. signed int × int_v -> int_v -// 5. shorts like ints - -#define VC_OPERATOR_FORWARD_(ret, op) \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - ((IsEqualType::Value || IsLikeInteger::Value) && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, double_##ret>::Value operator op(const T0 &x, const T1 &y) { return double_v(x) op double_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, float_##ret>::Value operator op(const T0 &x, const T1 &y) { return float_v(x) op float_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - IsTypeCombinationOf::Value || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, sfloat_##ret>::Value 
operator op(const T0 &x, const T1 &y) { return sfloat_v(x) op sfloat_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, uint_##ret>::Value operator op(const T0 &x, const T1 &y) { return uint_v(x) op uint_v(y); } \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, int_##ret>::Value operator op(const T0 &x, const T1 &y) { return int_v(x) op int_v(y); } \ -\ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - IsTypeCombinationOf::Value || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsUnsignedInteger::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - false, ushort_##ret>::Value operator op(const T0 &x, const T1 &y) { return ushort_v(x) op ushort_v(y); } \ -template static Vc_ALWAYS_INLINE typename EnableIf< \ - IsVectorOperands::Value || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && !HasImplicitCast::Value) || \ - (IsLikeSignedInteger::Value && !IsEqualType::Value && HasImplicitCast::Value && 
!HasImplicitCast::Value) || \ - false, short_##ret>::Value operator op(const T0 &x, const T1 &y) { return short_v(x) op short_v(y); } - - -// break incorrect combinations -#define VC_OPERATOR_INTENTIONAL_ERROR_1(V, op) \ -template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ -template static inline typename EnableIfNeitherIntegerNorVector >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } - -#define VC_OPERATOR_INTENTIONAL_ERROR_2(V1, V2, op) \ -static inline Vc::Error::invalid_operands_of_types operator op(V1::AsArg, V2::AsArg) { return Vc::Error::invalid_operands_of_types(); } \ -static inline Vc::Error::invalid_operands_of_types operator op(V2::AsArg, V1::AsArg) { return Vc::Error::invalid_operands_of_types(); } - -#define VC_OPERATOR_INTENTIONAL_ERROR_3(V, _T, op) \ -template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const V &, const T &) { return Vc::Error::invalid_operands_of_types(); } \ -template static inline typename EnableIf::Value, Vc::Error::invalid_operands_of_types >::Value operator op(const T &, const V &) { return Vc::Error::invalid_operands_of_types(); } - -//#define VC_EXTRA_CHECKING -#ifdef VC_EXTRA_CHECKING -#define VC_OPERATOR_INTENTIONAL_ERROR(op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, sfloat_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, float_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, int_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, uint_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(double_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( int_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( uint_v, ushort_v, op) \ - 
VC_APPLY_1(VC_LIST_VECTOR_TYPES, VC_OPERATOR_INTENTIONAL_ERROR_1, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, short_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2( float_v, ushort_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, float_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, int_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_2(sfloat_v, uint_v, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_3( float_v, double, op) \ - VC_OPERATOR_INTENTIONAL_ERROR_3(sfloat_v, double, op) -#else -#define VC_OPERATOR_INTENTIONAL_ERROR(op) -#endif - -#define VC_OPERATOR_FORWARD_COMMUTATIVE(ret, op, op2) \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) operator op(T x, sfloat_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return y op2 x; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return y op2 x; } \ -VC_OPERATOR_FORWARD_(ret, op) \ -VC_OPERATOR_INTENTIONAL_ERROR(op) - -#define VC_OPERATOR_FORWARD(ret, op) \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, double, double_##ret) operator op(T x, double_v::AsArg y) { return double_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, sfloat_##ret) operator op(T x, sfloat_v::AsArg y) { return sfloat_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, float, float_##ret) operator op(T x, float_v::AsArg 
y) { return float_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, int, int_##ret) operator op(T x, int_v::AsArg y) { return int_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned int, uint_##ret) operator op(T x, uint_v::AsArg y) { return uint_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, short, short_##ret) operator op(T x, short_v::AsArg y) { return short_v(x) op y; } \ -template static Vc_ALWAYS_INLINE VC_EXACT_TYPE(T, unsigned short, ushort_##ret) operator op(T x, ushort_v::AsArg y) { return ushort_v(x) op y; } \ -VC_OPERATOR_FORWARD_(ret, op) \ -VC_OPERATOR_INTENTIONAL_ERROR(op) - -VC_OPERATOR_FORWARD_COMMUTATIVE(v, *, *) -VC_OPERATOR_FORWARD(v, /) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, +, +) -VC_OPERATOR_FORWARD(v, -) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, |, |) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, &, &) -VC_OPERATOR_FORWARD_COMMUTATIVE(v, ^, ^) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, <, >) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, >, <) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, <=, >=) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, >=, <=) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, ==, ==) -VC_OPERATOR_FORWARD_COMMUTATIVE(m, !=, !=) - -#undef VC_OPERATOR_FORWARD_ -#undef VC_OPERATOR_INTENTIONAL_ERROR_1 -#undef VC_OPERATOR_INTENTIONAL_ERROR_2 -#undef VC_OPERATOR_INTENTIONAL_ERROR -#undef VC_OPERATOR_FORWARD_COMMUTATIVE -#undef VC_OPERATOR_FORWARD - -// }}}1 diff --git a/math/vc/include/Vc/common/storage.h b/math/vc/include/Vc/common/storage.h deleted file mode 100644 index 9c8769154e4ba..0000000000000 --- a/math/vc/include/Vc/common/storage.h +++ /dev/null @@ -1,130 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_STORAGE_H -#define VC_COMMON_STORAGE_H - -#include "aliasingentryhelper.h" -#include "macros.h" -#include "types.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -template class VectorMemoryUnion -{ - public: - typedef _VectorType VectorType; - typedef _EntryType EntryType; - typedef EntryType AliasingEntryType Vc_MAY_ALIAS; - Vc_ALWAYS_INLINE VectorMemoryUnion() { assertCorrectAlignment(&v()); } -#if defined VC_ICC || defined VC_MSVC - Vc_ALWAYS_INLINE VectorMemoryUnion(const VectorType &x) { data.v = x; assertCorrectAlignment(&data.v); } - Vc_ALWAYS_INLINE VectorMemoryUnion &operator=(const VectorType &x) { - data.v = x; return *this; - } - - Vc_ALWAYS_INLINE Vc_PURE VectorType &v() { return reinterpret_cast(data.v); } - Vc_ALWAYS_INLINE Vc_PURE const VectorType &v() const { return reinterpret_cast(data.v); } - -#if defined VC_ICC - Vc_ALWAYS_INLINE Vc_PURE AliasingEntryHelper m(size_t index) { - return AliasingEntryHelper(this, index); - } - Vc_ALWAYS_INLINE void assign(size_t index, EntryType x) { - data.m[index] = x; - } - Vc_ALWAYS_INLINE Vc_PURE EntryType read(size_t index) const { - return data.m[index]; - } -#else - Vc_ALWAYS_INLINE Vc_PURE EntryType &m(size_t index) { - return data.m[index]; - } -#endif - - Vc_ALWAYS_INLINE Vc_PURE EntryType m(size_t index) const { - return data.m[index]; - } - -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - private: - union VectorScalarUnion { - VectorTypeBase v; - EntryType m[sizeof(VectorTypeBase)/sizeof(EntryType)]; - } data; -#else - Vc_ALWAYS_INLINE VectorMemoryUnion(VectorType x) : data(x) { assertCorrectAlignment(&data); } - 
Vc_ALWAYS_INLINE VectorMemoryUnion &operator=(VectorType x) { - data = x; return *this; - } - - Vc_ALWAYS_INLINE Vc_PURE VectorType &v() { return data; } - Vc_ALWAYS_INLINE Vc_PURE const VectorType &v() const { return data; } - - Vc_ALWAYS_INLINE Vc_PURE AliasingEntryType &m(size_t index) { - return reinterpret_cast(&data)[index]; - } - - Vc_ALWAYS_INLINE Vc_PURE EntryType m(size_t index) const { - return reinterpret_cast(&data)[index]; - } - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - VectorType data; -#endif -}; - -#if VC_GCC == 0x40700 || (VC_GCC >= 0x40600 && VC_GCC <= 0x40603) -// workaround bug 52736 in GCC -template static Vc_ALWAYS_INLINE Vc_CONST T &vectorMemoryUnionAliasedMember(V *data, size_t index) { - if (__builtin_constant_p(index) && index == 0) { - T *ret; - asm("mov %1,%0" : "=r"(ret) : "r"(data)); - return *ret; - } else { - return reinterpret_cast(data)[index]; - } -} -template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128d, double>::AliasingEntryType &VectorMemoryUnion<__m128d, double>::m(size_t index) { - return vectorMemoryUnionAliasedMember(&data, index); -} -template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128i, long long>::AliasingEntryType &VectorMemoryUnion<__m128i, long long>::m(size_t index) { - return vectorMemoryUnionAliasedMember(&data, index); -} -template<> Vc_ALWAYS_INLINE Vc_PURE VectorMemoryUnion<__m128i, unsigned long long>::AliasingEntryType &VectorMemoryUnion<__m128i, unsigned long long>::m(size_t index) { - return vectorMemoryUnionAliasedMember(&data, index); -} -#endif - -} // namespace Common -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_STORAGE_H diff --git a/math/vc/include/Vc/common/support.h b/math/vc/include/Vc/common/support.h deleted file mode 100644 index fd771e37a076a..0000000000000 --- a/math/vc/include/Vc/common/support.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef VC_DEPRECATED_COMMON_SUPPORT_H -#define VC_DEPRECATED_COMMON_SUPPORT_H -#ifdef 
__GNUC__ -#warning "the header is deprecated. Use instead." -#endif -#include -#endif // VC_DEPRECATED_COMMON_SUPPORT_H diff --git a/math/vc/include/Vc/common/trigonometric.h b/math/vc/include/Vc/common/trigonometric.h deleted file mode 100644 index 5c81df2cb4bf4..0000000000000 --- a/math/vc/include/Vc/common/trigonometric.h +++ /dev/null @@ -1,83 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_COMMON_TRIGONOMETRIC_H -#define VC_COMMON_TRIGONOMETRIC_H - -#ifndef VC__USE_NAMESPACE -#error "Do not include Vc/common/trigonometric.h outside of Vc itself" -#endif - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace -{ - using Vc::VC__USE_NAMESPACE::Vector; -} // namespace - -namespace Internal -{ -template struct MapImpl { enum Dummy { Value = Impl }; }; -template<> struct MapImpl { enum Dummy { Value = MapImpl::Value }; }; -typedef ImplementationT::Value -#if defined(VC_IMPL_XOP) && defined(VC_IMPL_FMA4) - + Vc::XopInstructions - + Vc::Fma4Instructions -#endif - > TrigonometricImplementation; -} // namespace Internal - -template struct Trigonometric -{ - template static Vector sin(const Vector &_x); - template static Vector cos(const Vector &_x); - template static void sincos(const Vector &_x, Vector *_sin, Vector *_cos); - template static Vector asin (const Vector &_x); - template static Vector atan (const Vector &_x); - template static Vector atan2(const Vector &y, const Vector &x); -}; -namespace VC__USE_NAMESPACE -#undef VC__USE_NAMESPACE -{ - template static Vc_ALWAYS_INLINE Vc_PURE Vector sin(const Vector &_x) { - return Vc::Trigonometric::sin(_x); - } - template static Vc_ALWAYS_INLINE Vc_PURE Vector cos(const Vector &_x) { - return Vc::Trigonometric::cos(_x); - } - template static Vc_ALWAYS_INLINE void sincos(const Vector &_x, Vector *_sin, Vector *_cos) { - Vc::Trigonometric::sincos(_x, _sin, _cos); - } - template static Vc_ALWAYS_INLINE Vc_PURE Vector asin (const Vector &_x) { - return Vc::Trigonometric::asin(_x); - } - template static Vc_ALWAYS_INLINE Vc_PURE Vector atan (const Vector &_x) { - return Vc::Trigonometric::atan(_x); - } - template static Vc_ALWAYS_INLINE Vc_PURE Vector atan2(const Vector &y, const Vector &x) { - return Vc::Trigonometric::atan2(y, x); - } -} // namespace VC__USE_NAMESPACE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" -#endif // VC_COMMON_TRIGONOMETRIC_H diff --git 
a/math/vc/include/Vc/common/types.h b/math/vc/include/Vc/common/types.h deleted file mode 100644 index 73565c403adb1..0000000000000 --- a/math/vc/include/Vc/common/types.h +++ /dev/null @@ -1,225 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_COMMON_TYPES_H -#define VC_COMMON_TYPES_H - -#ifdef VC_CHECK_ALIGNMENT -#include -#include -#endif - -namespace ROOT { -namespace Vc -{ - -// helper type to implement sfloat_v (Vector) -struct sfloat {}; - -template struct DetermineEntryType { typedef T Type; }; -template<> struct DetermineEntryType { typedef float Type; }; - -template struct NegateTypeHelper { typedef T Type; }; -template<> struct NegateTypeHelper { typedef char Type; }; -template<> struct NegateTypeHelper { typedef short Type; }; -template<> struct NegateTypeHelper { typedef int Type; }; - -namespace VectorSpecialInitializerZero { enum ZEnum { Zero = 0 }; } -namespace VectorSpecialInitializerOne { enum OEnum { One = 1 }; } -namespace VectorSpecialInitializerIndexesFromZero { enum IEnum { IndexesFromZero }; } - -template class Memory; -#ifdef VC_MSVC -# if defined(VC_IMPL_Scalar) -namespace Scalar { - template class Vector; - template class Mask; -} -#define _Vector Vc::Scalar::Vector -# elif defined(VC_IMPL_SSE) -namespace SSE { - template class Vector; - template class Mask; - class Float8Mask; -} -#define 
_Vector Vc::SSE::Vector -# elif defined(VC_IMPL_AVX) -namespace AVX { - template class Vector; - template class Mask; -} -#define _Vector Vc::AVX::Vector -# else -# error "Sorry, MSVC is a nasty compiler and needs extra care. Please help." -# endif -#endif -namespace -{ - template struct EnableIf { typedef T Value; }; - template struct EnableIf {}; - - template struct IsSignedInteger { enum { Value = 0 }; }; - template<> struct IsSignedInteger { enum { Value = 1 }; }; - template<> struct IsSignedInteger { enum { Value = 1 }; }; - template<> struct IsSignedInteger { enum { Value = 1 }; }; - template<> struct IsSignedInteger { enum { Value = 1 }; }; - template<> struct IsSignedInteger { enum { Value = 1 }; }; - - template struct IsUnsignedInteger { enum { Value = 0 }; }; - template<> struct IsUnsignedInteger { enum { Value = 1 }; }; - template<> struct IsUnsignedInteger { enum { Value = 1 }; }; - template<> struct IsUnsignedInteger { enum { Value = 1 }; }; - template<> struct IsUnsignedInteger { enum { Value = 1 }; }; - template<> struct IsUnsignedInteger { enum { Value = 1 }; }; - - template struct IsInteger { enum { Value = IsSignedInteger::Value | IsUnsignedInteger::Value }; }; - - template struct IsReal { enum { Value = 0 }; }; - template<> struct IsReal { enum { Value = 1 }; }; - template<> struct IsReal { enum { Value = 1 }; }; - - template struct IsEqualType { enum { Value = 0 }; }; - template struct IsEqualType { enum { Value = 1 }; }; - - template - struct IsInTypelist { enum { Value = false }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - template struct IsInTypelist { enum { Value = true }; }; - - template struct IsCombinationOf { enum { Value = 
false }; }; - template struct IsCombinationOf { enum { Value = true }; }; - template struct IsCombinationOf { enum { Value = true }; }; - - namespace - { - struct yes { char x; }; - struct no { yes x, y; }; - } // anonymous namespace - - template struct HasImplicitCast - { -#ifdef VC_MSVC - // MSVC can't compile this code if we pass a type that has large alignment restrictions by - // value - // clang OTOH warns about this code if we pass a null-reference, thus we ifdef the const-ref - // for MSVC only - static yes test(const To &) { return yes(); } -#else - static yes test( To) { return yes(); } -#endif - static no test(...) { return no(); } - enum { -#ifdef VC_MSVC - // I want to test whether implicit cast works. If it works MSVC thinks it should give a warning. Wrong. Shut up. -#pragma warning(suppress : 4257 4267) -#endif - Value = !!(sizeof(test(*static_cast(0))) == sizeof(yes)) - }; - }; -#if defined(VC_GCC) && VC_GCC < 0x40300 - // GCC 4.1 is very noisy because of the float->int and double->int type trait tests. We get - // around this noise with a little specialization. - template<> struct HasImplicitCast { enum { Value = true }; }; - template<> struct HasImplicitCast { enum { Value = true }; }; -#endif - -#ifdef VC_MSVC - // MSVC is such a broken compiler :'( - // HasImplicitCast breaks if From has an __declspec(align(#)) modifier and has no implicit cast - // to To. That's because it'll call test(...) as test(From) and not test(const From &). - // This results in C2718. And MSVC is too stupid to see that it should just shut up and - // everybody would be happy. - // - // Because the HasImplicitCast specializations can only be implemented after the Vector class - // was declared we have to write some nasty hacks. 
- template struct HasImplicitCast<_Vector, T2> { enum { Value = false }; }; -#if defined(VC_IMPL_Scalar) - template struct HasImplicitCast, T2> { enum { Value = false }; }; - template struct HasImplicitCast, Vc::Scalar::Mask > { enum { Value = true }; }; -#elif defined(VC_IMPL_SSE) - template struct HasImplicitCast, T2> { enum { Value = false }; }; - template struct HasImplicitCast, Vc::SSE::Mask > { enum { Value = true }; }; - template struct HasImplicitCast { enum { Value = false }; }; - template<> struct HasImplicitCast { enum { Value = true }; }; -#elif defined(VC_IMPL_AVX) - template struct HasImplicitCast, T2> { enum { Value = false }; }; - template struct HasImplicitCast, Vc::AVX::Mask > { enum { Value = true }; }; -#endif - template struct HasImplicitCast<_Vector, _Vector > { enum { Value = true }; }; - //template<> struct HasImplicitCast<_Vector< int>, _Vector< unsigned int>> { enum { Value = true }; }; - //template<> struct HasImplicitCast<_Vector< unsigned int>, _Vector< int>> { enum { Value = true }; }; - //template<> struct HasImplicitCast<_Vector< short>, _Vector> { enum { Value = true }; }; - //template<> struct HasImplicitCast<_Vector, _Vector< short>> { enum { Value = true }; }; - template struct HasImplicitCast, T2> { enum { Value = false }; }; - template struct HasImplicitCast, Vc::Memory > { enum { Value = true }; }; -#undef _Vector -#endif - - template struct CanConvertToInt : public HasImplicitCast {}; - template<> struct CanConvertToInt { enum { Value = 0 }; }; - //template<> struct CanConvertToInt { enum { Value = 0 }; }; - //template<> struct CanConvertToInt { enum { Value = 0 }; }; - - enum TestEnum {}; - VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); - VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); - VC_STATIC_ASSERT(CanConvertToInt::Value == 0, CanConvertToInt_is_broken); - VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); - 
VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); - VC_STATIC_ASSERT(CanConvertToInt::Value == 0, CanConvertToInt_is_broken); - VC_STATIC_ASSERT(CanConvertToInt::Value == 1, CanConvertToInt_is_broken); - - typedef HasImplicitCast HasImplicitCastTest0; - typedef HasImplicitCast HasImplicitCastTest1; - typedef HasImplicitCast HasImplicitCastTest2; - typedef HasImplicitCast HasImplicitCastTest3; - typedef HasImplicitCast HasImplicitCastTest4; - - VC_STATIC_ASSERT(HasImplicitCastTest0::Value == true, HasImplicitCast0_is_broken); - VC_STATIC_ASSERT(HasImplicitCastTest1::Value == true, HasImplicitCast1_is_broken); - VC_STATIC_ASSERT(HasImplicitCastTest2::Value == true, HasImplicitCast2_is_broken); - VC_STATIC_ASSERT(HasImplicitCastTest3::Value == true, HasImplicitCast3_is_broken); - VC_STATIC_ASSERT(HasImplicitCastTest4::Value == false, HasImplicitCast4_is_broken); - - template struct IsLikeInteger { enum { Value = !IsReal::Value && CanConvertToInt::Value }; }; - template struct IsLikeSignedInteger { enum { Value = IsLikeInteger::Value && !IsUnsignedInteger::Value }; }; -} // anonymous namespace - -#ifndef VC_CHECK_ALIGNMENT -template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){} -#else -template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr) -{ - const size_t s = Vc_ALIGNOF(_T); - if((reinterpret_cast(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) { - fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n"); - abort(); - } -} -#endif - -} // namespace Vc -} // namespace ROOT - -#endif // VC_COMMON_TYPES_H diff --git a/math/vc/include/Vc/common/undomacros.h b/math/vc/include/Vc/common/undomacros.h deleted file mode 100644 index 4f62379298bb8..0000000000000 --- a/math/vc/include/Vc/common/undomacros.h +++ /dev/null @@ -1,110 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_UNDOMACROS_H -#define VC_COMMON_UNDOMACROS_H -#undef VC_COMMON_MACROS_H - -#undef Vc_ALIGNOF - -#undef Vc_INTRINSIC -#undef Vc_INTRINSIC_L -#undef Vc_INTRINSIC_R -#undef Vc_CONST -#undef Vc_CONST_L -#undef Vc_CONST_R -#undef Vc_PURE -#undef Vc_PURE_L -#undef Vc_PURE_R -#undef Vc_MAY_ALIAS -#undef Vc_ALWAYS_INLINE -#undef Vc_ALWAYS_INLINE_L -#undef Vc_ALWAYS_INLINE_R -#undef VC_IS_UNLIKELY -#undef VC_IS_LIKELY -#undef VC_RESTRICT -#undef VC_DEPRECATED -#undef _VC_CONSTEXPR -#undef _VC_CONSTEXPR_L -#undef _VC_CONSTEXPR_R -#undef _VC_NOEXCEPT - -#undef ALIGN -#undef STRUCT_ALIGN1 -#undef STRUCT_ALIGN2 -#undef ALIGNED_TYPEDEF -#undef _CAT_IMPL -#undef CAT -#undef unrolled_loop16 -#undef for_all_vector_entries -#undef FREE_STORE_OPERATORS_ALIGNED - -#undef VC_WARN_INLINE -#undef VC_WARN - -#ifdef VC_EXTERNAL_ASSERT -#undef VC_EXTERNAL_ASSERT -#else -#undef VC_ASSERT -#endif - -#undef VC_HAS_BUILTIN - -#undef Vc_buildDouble -#undef Vc_buildFloat - -#undef _VC_APPLY_IMPL_1 -#undef _VC_APPLY_IMPL_2 -#undef _VC_APPLY_IMPL_3 -#undef _VC_APPLY_IMPL_4 -#undef _VC_APPLY_IMPL_5 - -#undef VC_LIST_FLOAT_VECTOR_TYPES -#undef VC_LIST_INT_VECTOR_TYPES -#undef VC_LIST_VECTOR_TYPES -#undef VC_LIST_COMPARES -#undef VC_LIST_LOGICAL -#undef VC_LIST_BINARY -#undef VC_LIST_SHIFTS -#undef VC_LIST_ARITHMETICS - -#undef VC_APPLY_0 -#undef 
VC_APPLY_1 -#undef VC_APPLY_2 -#undef VC_APPLY_3 -#undef VC_APPLY_4 - -#undef VC_ALL_COMPARES -#undef VC_ALL_LOGICAL -#undef VC_ALL_BINARY -#undef VC_ALL_SHIFTS -#undef VC_ALL_ARITHMETICS -#undef VC_ALL_FLOAT_VECTOR_TYPES -#undef VC_ALL_VECTOR_TYPES - -#undef VC_EXACT_TYPE -#undef VC_ALIGNED_PARAMETER -#undef VC_OFFSETOF - -#ifdef Vc_POP_GCC_DIAGNOSTIC__ -#pragma GCC diagnostic pop -#undef Vc_POP_GCC_DIAGNOSTIC__ -#endif - -#endif // VC_COMMON_UNDOMACROS_H diff --git a/math/vc/include/Vc/common/vectortuple.h b/math/vc/include/Vc/common/vectortuple.h deleted file mode 100644 index b4eafae222428..0000000000000 --- a/math/vc/include/Vc/common/vectortuple.h +++ /dev/null @@ -1,160 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_COMMON_VECTORTUPLE_H -#define VC_COMMON_VECTORTUPLE_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -template struct InterleavedMemoryReadAccess; - -template struct VectorTuple; -template struct VectorTuple<2, V> -{ - typedef typename V::EntryType T; - typedef V &VC_RESTRICT Reference; - Reference l, r; - - Vc_ALWAYS_INLINE VectorTuple(Reference a, Reference b) - : l(a), r(b) - { - } - - Vc_ALWAYS_INLINE VectorTuple<3, V> operator,(V &a) const - { - return VectorTuple<3, V>(*this, a); - } - - Vc_ALWAYS_INLINE VectorTuple<3, const V> operator,(const V &a) const - { - return VectorTuple<3, const V>(*this, a); - } - - template - Vc_ALWAYS_INLINE void operator=(const InterleavedMemoryReadAccess &access) const - { - VC_STATIC_ASSERT(2 <= StructSize, You_are_trying_to_extract_more_data_from_the_struct_than_it_has); - access.deinterleave(l, r); - } -}; - -template struct VectorTuple<2, const V> -{ - typedef typename V::EntryType T; - typedef const V &VC_RESTRICT Reference; - Reference l, r; - - Vc_ALWAYS_INLINE VectorTuple(Reference a, Reference b) - : l(a), r(b) - { - } - - Vc_ALWAYS_INLINE VectorTuple<3, const V> operator,(const V &a) const - { - return VectorTuple<3, const V>(*this, a); - } -}; - -#define _VC_VECTORTUPLE_SPECIALIZATION(LENGTH, parameters) \ -template struct VectorTuple \ -{ \ - typedef typename V::EntryType T; \ - typedef V &VC_RESTRICT Reference; \ - const VectorTuple &l; \ - Reference r; \ - \ - Vc_ALWAYS_INLINE VectorTuple(const VectorTuple &tuple, Reference a) \ - : l(tuple), r(a) \ - { \ - } \ - \ - Vc_ALWAYS_INLINE VectorTuple operator,(V &a) const \ - { \ - return VectorTuple(*this, a); \ - } \ - \ - template \ - Vc_ALWAYS_INLINE void operator=(const InterleavedMemoryReadAccess &access) const \ - { \ - VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_extract_more_data_from_the_struct_than_it_has); \ - access.deinterleave parameters; \ - } \ -}; \ -template struct VectorTuple \ 
-{ \ - typedef typename V::EntryType T; \ - typedef const V &VC_RESTRICT Reference; \ - const VectorTuple &l; \ - Reference r; \ - \ - Vc_ALWAYS_INLINE VectorTuple(const VectorTuple &tuple, Reference a) \ - : l(tuple), r(a) \ - { \ - } \ - \ - Vc_ALWAYS_INLINE VectorTuple operator,(const V &a) const \ - { \ - return VectorTuple(*this, a); \ - } \ -} -_VC_VECTORTUPLE_SPECIALIZATION(3, (l.l, l.r, r)); -_VC_VECTORTUPLE_SPECIALIZATION(4, (l.l.l, l.l.r, l.r, r)); -_VC_VECTORTUPLE_SPECIALIZATION(5, (l.l.l.l, l.l.l.r, l.l.r, l.r, r)); -_VC_VECTORTUPLE_SPECIALIZATION(6, (l.l.l.l.l, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); -_VC_VECTORTUPLE_SPECIALIZATION(7, (l.l.l.l.l.l, l.l.l.l.l.r, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); -_VC_VECTORTUPLE_SPECIALIZATION(8, (l.l.l.l.l.l.l, l.l.l.l.l.l.r, l.l.l.l.l.r, l.l.l.l.r, l.l.l.r, l.l.r, l.r, r)); -// VC_STATIC_ASSERT(false, You_are_gathering_too_many_vectors__This_is_not_implemented); - -} // namespace Common - -#ifdef VC_IMPL_Scalar -namespace Scalar -#elif defined VC_IMPL_SSE -namespace SSE -#elif defined VC_IMPL_AVX -namespace AVX -#endif -{ - -template -Vc_ALWAYS_INLINE Common::VectorTuple<2, Vc::Vector > operator,(Vc::Vector &a, Vc::Vector &b) -{ - return Common::VectorTuple<2, Vc::Vector >(a, b); -} - -template -Vc_ALWAYS_INLINE Common::VectorTuple<2, const Vc::Vector > operator,(const Vc::Vector &a, const Vc::Vector &b) -{ - return Common::VectorTuple<2, const Vc::Vector >(a, b); -} - -} // namespace Scalar/SSE/AVX - -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_COMMON_VECTORTUPLE_H diff --git a/math/vc/include/Vc/common/windows_fix_intrin.h b/math/vc/include/Vc/common/windows_fix_intrin.h deleted file mode 100644 index c64d6247c7ec6..0000000000000 --- a/math/vc/include/Vc/common/windows_fix_intrin.h +++ /dev/null @@ -1,300 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_WINDOWS_FIX_INTRIN_H -#define VC_COMMON_WINDOWS_FIX_INTRIN_H - -#if defined(VC_MSVC) && !defined(__midl) -// MSVC sucks. If you include intrin.h you get all SSE and AVX intrinsics -// declared. This is a problem because we need to implement the intrinsics -// that are not supported in hardware ourselves. -// Something always includes intrin.h even if you don't -// do it explicitly. Therefore we try to be the first to include it -// but with __midl defined, in which case it is basically empty. -#ifdef __INTRIN_H_ -#error "intrin.h was already included, polluting the namespace. Please fix your code to include the Vc headers before anything that includes intrin.h. 
(Vc will declare the relevant intrinsics as they are required by some system headers.)" -#endif -#define __midl -#include -#undef __midl -#include -#include -#include -extern "C" { - -#ifdef _WIN64 -_CRTIMP double ceil(_In_ double); -__int64 _InterlockedDecrement64(__int64 volatile *); -__int64 _InterlockedExchange64(__int64 volatile *, __int64); -void * _InterlockedExchangePointer(void * volatile *, void *); -__int64 _InterlockedExchangeAdd64(__int64 volatile *, __int64); -void *_InterlockedCompareExchangePointer (void * volatile *, void *, void *); -__int64 _InterlockedIncrement64(__int64 volatile *); -int __cdecl _setjmpex(jmp_buf); -void __faststorefence(void); -__int64 __mulh(__int64,__int64); -unsigned __int64 __umulh(unsigned __int64,unsigned __int64); -unsigned __int64 __readcr0(void); -unsigned __int64 __readcr2(void); -unsigned __int64 __readcr3(void); -unsigned __int64 __readcr4(void); -unsigned __int64 __readcr8(void); -void __writecr0(unsigned __int64); -void __writecr3(unsigned __int64); -void __writecr4(unsigned __int64); -void __writecr8(unsigned __int64); -unsigned __int64 __readdr(unsigned int); -void __writedr(unsigned int, unsigned __int64); -unsigned __int64 __readeflags(void); -void __writeeflags(unsigned __int64); -void __movsq(unsigned long long *, unsigned long long const *, size_t); -unsigned char __readgsbyte(unsigned long Offset); -unsigned short __readgsword(unsigned long Offset); -unsigned long __readgsdword(unsigned long Offset); -unsigned __int64 __readgsqword(unsigned long Offset); -void __writegsbyte(unsigned long Offset, unsigned char Data); -void __writegsword(unsigned long Offset, unsigned short Data); -void __writegsdword(unsigned long Offset, unsigned long Data); -void __writegsqword(unsigned long Offset, unsigned __int64 Data); -void __addgsbyte(unsigned long Offset, unsigned char Data); -void __addgsword(unsigned long Offset, unsigned short Data); -void __addgsdword(unsigned long Offset, unsigned long Data); -void 
__addgsqword(unsigned long Offset, unsigned __int64 Data); -void __incgsbyte(unsigned long Offset); -void __incgsword(unsigned long Offset); -void __incgsdword(unsigned long Offset); -void __incgsqword(unsigned long Offset); -unsigned char __vmx_vmclear(unsigned __int64*); -unsigned char __vmx_vmlaunch(void); -unsigned char __vmx_vmptrld(unsigned __int64*); -unsigned char __vmx_vmread(size_t, size_t*); -unsigned char __vmx_vmresume(void); -unsigned char __vmx_vmwrite(size_t, size_t); -unsigned char __vmx_on(unsigned __int64*); -void __stosq(unsigned __int64 *, unsigned __int64, size_t); -unsigned char _interlockedbittestandset64(__int64 volatile *a, __int64 b); -unsigned char _interlockedbittestandreset64(__int64 volatile *a, __int64 b); -short _InterlockedCompareExchange16_np(short volatile *Destination, short Exchange, short Comparand); -long _InterlockedCompareExchange_np (long volatile *, long, long); -__int64 _InterlockedCompareExchange64_np(__int64 volatile *, __int64, __int64); -void *_InterlockedCompareExchangePointer_np (void * volatile *, void *, void *); -unsigned char _InterlockedCompareExchange128(__int64 volatile *, __int64, __int64, __int64 *); -unsigned char _InterlockedCompareExchange128_np(__int64 volatile *, __int64, __int64, __int64 *); -long _InterlockedAnd_np(long volatile *, long); -char _InterlockedAnd8_np(char volatile *, char); -short _InterlockedAnd16_np(short volatile *, short); -__int64 _InterlockedAnd64_np(__int64 volatile *, __int64); -long _InterlockedOr_np(long volatile *, long); -char _InterlockedOr8_np(char volatile *, char); -short _InterlockedOr16_np(short volatile *, short); -__int64 _InterlockedOr64_np(__int64 volatile *, __int64); -long _InterlockedXor_np(long volatile *, long); -char _InterlockedXor8_np(char volatile *, char); -short _InterlockedXor16_np(short volatile *, short); -__int64 _InterlockedXor64_np(__int64 volatile *, __int64); -unsigned __int64 __lzcnt64(unsigned __int64); -unsigned __int64 __popcnt64(unsigned 
__int64); -__int64 _InterlockedOr64(__int64 volatile *, __int64); -__int64 _InterlockedXor64(__int64 volatile *, __int64); -__int64 _InterlockedAnd64(__int64 volatile *, __int64); -unsigned char _bittest64(__int64 const *a, __int64 b); -unsigned char _bittestandset64(__int64 *a, __int64 b); -unsigned char _bittestandreset64(__int64 *a, __int64 b); -unsigned char _bittestandcomplement64(__int64 *a, __int64 b); -unsigned char _BitScanForward64(unsigned long* Index, unsigned __int64 Mask); -unsigned char _BitScanReverse64(unsigned long* Index, unsigned __int64 Mask); -unsigned __int64 __shiftleft128(unsigned __int64 LowPart, unsigned __int64 HighPart, unsigned char Shift); -unsigned __int64 __shiftright128(unsigned __int64 LowPart, unsigned __int64 HighPart, unsigned char Shift); -unsigned __int64 _umul128(unsigned __int64 multiplier, unsigned __int64 multiplicand, unsigned __int64 *highproduct); -__int64 _mul128(__int64 multiplier, __int64 multiplicand, __int64 *highproduct); -#endif - -long _InterlockedOr(long volatile *, long); -char _InterlockedOr8(char volatile *, char); -short _InterlockedOr16(short volatile *, short); -long _InterlockedXor(long volatile *, long); -char _InterlockedXor8(char volatile *, char); -short _InterlockedXor16(short volatile *, short); -long _InterlockedAnd(long volatile *, long); -char _InterlockedAnd8(char volatile *, char); -short _InterlockedAnd16(short volatile *, short); -unsigned char _bittest(long const *a, long b); -unsigned char _bittestandset(long *a, long b); -unsigned char _bittestandreset(long *a, long b); -unsigned char _bittestandcomplement(long *a, long b); -unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask); -unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask); -_CRTIMP wchar_t * __cdecl wcscat( _Pre_cap_for_(_Source) _Prepost_z_ wchar_t *, _In_z_ const wchar_t * _Source); -_Check_return_ _CRTIMP int __cdecl wcscmp(_In_z_ const wchar_t *,_In_z_ const wchar_t *); -_CRTIMP wchar_t 
* __cdecl wcscpy(_Pre_cap_for_(_Source) _Post_z_ wchar_t *, _In_z_ const wchar_t * _Source); -_Check_return_ _CRTIMP size_t __cdecl wcslen(_In_z_ const wchar_t *); -#pragma warning(suppress: 4985) -_CRTIMP wchar_t * __cdecl _wcsset(_Inout_z_ wchar_t *, wchar_t); -void _ReadBarrier(void); -unsigned char _rotr8(unsigned char value, unsigned char shift); -unsigned short _rotr16(unsigned short value, unsigned char shift); -unsigned char _rotl8(unsigned char value, unsigned char shift); -unsigned short _rotl16(unsigned short value, unsigned char shift); -short _InterlockedIncrement16(short volatile *Addend); -short _InterlockedDecrement16(short volatile *Addend); -short _InterlockedCompareExchange16(short volatile *Destination, short Exchange, short Comparand); -void __nvreg_save_fence(void); -void __nvreg_restore_fence(void); - -#ifdef _M_IX86 -unsigned long __readcr0(void); -unsigned long __readcr2(void); -unsigned long __readcr3(void); -unsigned long __readcr4(void); -unsigned long __readcr8(void); -void __writecr0(unsigned); -void __writecr3(unsigned); -void __writecr4(unsigned); -void __writecr8(unsigned); -unsigned __readdr(unsigned int); -void __writedr(unsigned int, unsigned); -unsigned __readeflags(void); -void __writeeflags(unsigned); -void __addfsbyte(unsigned long Offset, unsigned char Data); -void __addfsword(unsigned long Offset, unsigned short Data); -void __addfsdword(unsigned long Offset, unsigned long Data); -void __incfsbyte(unsigned long Offset); -void __incfsword(unsigned long Offset); -void __incfsdword(unsigned long Offset); -unsigned char __readfsbyte(unsigned long Offset); -unsigned short __readfsword(unsigned long Offset); -unsigned long __readfsdword(unsigned long Offset); -unsigned __int64 __readfsqword(unsigned long Offset); -void __writefsbyte(unsigned long Offset, unsigned char Data); -void __writefsword(unsigned long Offset, unsigned short Data); -void __writefsdword(unsigned long Offset, unsigned long Data); -void __writefsqword(unsigned 
long Offset, unsigned __int64 Data); -long _InterlockedAddLargeStatistic(__int64 volatile *, long); -#endif - -_Ret_bytecap_(_Size) void * __cdecl _alloca(size_t _Size); -int __cdecl abs(_In_ int); -_Check_return_ unsigned short __cdecl _byteswap_ushort(_In_ unsigned short value); -_Check_return_ unsigned long __cdecl _byteswap_ulong(_In_ unsigned long value); -_Check_return_ unsigned __int64 __cdecl _byteswap_uint64(_In_ unsigned __int64 value); -void __cdecl __debugbreak(void); -void __cdecl _disable(void); -__int64 __emul(int,int); -unsigned __int64 __emulu(unsigned int,unsigned int); -void __cdecl _enable(void); -long __cdecl _InterlockedDecrement(long volatile *); -long _InterlockedExchange(long volatile *, long); -short _InterlockedExchange16(short volatile *, short); -char _InterlockedExchange8(char volatile *, char); -long _InterlockedExchangeAdd(long volatile *, long); -short _InterlockedExchangeAdd16(short volatile *, short); -char _InterlockedExchangeAdd8(char volatile *, char); -long _InterlockedCompareExchange (long volatile *, long, long); -__int64 _InterlockedCompareExchange64(__int64 volatile *, __int64, __int64); -long __cdecl _InterlockedIncrement(long volatile *); -int __cdecl _inp(unsigned short); -int __cdecl inp(unsigned short); -unsigned long __cdecl _inpd(unsigned short); -unsigned long __cdecl inpd(unsigned short); -unsigned short __cdecl _inpw(unsigned short); -unsigned short __cdecl inpw(unsigned short); -long __cdecl labs(_In_ long); -_Check_return_ unsigned long __cdecl _lrotl(_In_ unsigned long,_In_ int); -_Check_return_ unsigned long __cdecl _lrotr(_In_ unsigned long,_In_ int); -unsigned __int64 __ll_lshift(unsigned __int64,int); -__int64 __ll_rshift(__int64,int); -_Check_return_ int __cdecl memcmp(_In_opt_bytecount_(_Size) const void *,_In_opt_bytecount_(_Size) const void *,_In_ size_t _Size); -void * __cdecl memcpy(_Out_opt_bytecapcount_(_Size) void *,_In_opt_bytecount_(_Size) const void *,_In_ size_t _Size); -void * __cdecl 
memset(_Out_opt_bytecapcount_(_Size) void *,_In_ int,_In_ size_t _Size); -int __cdecl _outp(unsigned short,int); -int __cdecl outp(unsigned short,int); -unsigned long __cdecl _outpd(unsigned short,unsigned long); -unsigned long __cdecl outpd(unsigned short,unsigned long); -unsigned short __cdecl _outpw(unsigned short,unsigned short); -unsigned short __cdecl outpw(unsigned short,unsigned short); -void * _ReturnAddress(void); -_Check_return_ unsigned int __cdecl _rotl(_In_ unsigned int,_In_ int); -_Check_return_ unsigned int __cdecl _rotr(_In_ unsigned int,_In_ int); -int __cdecl _setjmp(jmp_buf); -_Check_return_ int __cdecl strcmp(_In_z_ const char *,_In_z_ const char *); -_Check_return_ size_t __cdecl strlen(_In_z_ const char *); -char * __cdecl strset(_Inout_z_ char *,_In_ int); -unsigned __int64 __ull_rshift(unsigned __int64,int); -void * _AddressOfReturnAddress(void); - -void _WriteBarrier(void); -void _ReadWriteBarrier(void); -void __wbinvd(void); -void __invlpg(void*); -unsigned __int64 __readmsr(unsigned long); -void __writemsr(unsigned long, unsigned __int64); -unsigned __int64 __rdtsc(void); -void __movsb(unsigned char *, unsigned char const *, size_t); -void __movsw(unsigned short *, unsigned short const *, size_t); -void __movsd(unsigned long *, unsigned long const *, size_t); -unsigned char __inbyte(unsigned short Port); -unsigned short __inword(unsigned short Port); -unsigned long __indword(unsigned short Port); -void __outbyte(unsigned short Port, unsigned char Data); -void __outword(unsigned short Port, unsigned short Data); -void __outdword(unsigned short Port, unsigned long Data); -void __inbytestring(unsigned short Port, unsigned char *Buffer, unsigned long Count); -void __inwordstring(unsigned short Port, unsigned short *Buffer, unsigned long Count); -void __indwordstring(unsigned short Port, unsigned long *Buffer, unsigned long Count); -void __outbytestring(unsigned short Port, unsigned char *Buffer, unsigned long Count); -void 
__outwordstring(unsigned short Port, unsigned short *Buffer, unsigned long Count); -void __outdwordstring(unsigned short Port, unsigned long *Buffer, unsigned long Count); -unsigned int __getcallerseflags(); -void __vmx_vmptrst(unsigned __int64 *); -void __vmx_off(void); -void __svm_clgi(void); -void __svm_invlpga(void*, int); -void __svm_skinit(int); -void __svm_stgi(void); -void __svm_vmload(size_t); -void __svm_vmrun(size_t); -void __svm_vmsave(size_t); -void __halt(void); -void __sidt(void*); -void __lidt(void*); -void __ud2(void); -void __nop(void); -void __stosb(unsigned char *, unsigned char, size_t); -void __stosw(unsigned short *, unsigned short, size_t); -void __stosd(unsigned long *, unsigned long, size_t); -unsigned char _interlockedbittestandset(long volatile *a, long b); -unsigned char _interlockedbittestandreset(long volatile *a, long b); -void __cpuid(int a[4], int b); -void __cpuidex(int a[4], int b, int c); -unsigned __int64 __readpmc(unsigned long a); -unsigned long __segmentlimit(unsigned long a); -_Check_return_ unsigned __int64 __cdecl _rotl64(_In_ unsigned __int64,_In_ int); -_Check_return_ unsigned __int64 __cdecl _rotr64(_In_ unsigned __int64,_In_ int); -__int64 __cdecl _abs64(__int64); -void __int2c(void); -char _InterlockedCompareExchange8(char volatile *Destination, char Exchange, char Comparand); -unsigned short __lzcnt16(unsigned short); -unsigned int __lzcnt(unsigned int); -unsigned short __popcnt16(unsigned short); -unsigned int __popcnt(unsigned int); -unsigned __int64 __rdtscp(unsigned int*); -} -#endif - -#endif // VC_COMMON_WINDOWS_FIX_INTRIN_H diff --git a/math/vc/include/Vc/cpuid.h b/math/vc/include/Vc/cpuid.h deleted file mode 100644 index d215a39d0a427..0000000000000 --- a/math/vc/include/Vc/cpuid.h +++ /dev/null @@ -1,214 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef CPUID_H -#define CPUID_H - -namespace ROOT { -namespace Vc -{ - -/** - * \ingroup Utilities - * \headerfile cpuid.h - * - * This class is available for x86 / AMD64 systems to read and interpret information about the CPU's - * capabilities. - * - * Before any of the getter functions may be called, the init() function must have been called. It - * will be called automatically, but for any function executing before main, you better call - * \c CpuId::init() first. - * - * %Vc users will most likely not need this class directly, but rely on the - * isImplementationSupported, bestImplementationSupported, extraInstructionsSupported, and - * currentImplementationSupported functions. - */ -class CpuId -{ - typedef unsigned char uchar; - typedef unsigned short ushort; - typedef unsigned int uint; - - public: - enum ProcessorType { - OriginalOemProcessor = 0, - IntelOverDriveProcessor = 1, - DualProcessor = 2, - IntelReserved = 3 - }; - - /** - * Reads the CPU capabilities and stores them for faster subsequent access. - * - * Will be executed automatically before main, but not necessarily before other functions - * executing before main. - */ - static void init(); - - //! Return the cache line size in bits. - static inline ushort cacheLineSize() { return static_cast(s_cacheLineSize) * 8u; } - //! Return the ProcessorType. 
- static inline ProcessorType processorType() { return s_processorType; } - //! Return the family number of the processor (vendor dependent). - static inline uint processorFamily() { return s_processorFamily; } - //! Return the model number of the processor (vendor dependent). - static inline uint processorModel() { return s_processorModel; } - //! Return the number of logical processors. - static inline uint logicalProcessors() { return s_logicalProcessors; } - //! Return whether the CPU vendor is AMD. - static inline bool isAmd () { return s_ecx0 == 0x444D4163; } - //! Return whether the CPU vendor is Intel. - static inline bool isIntel () { return s_ecx0 == 0x6C65746E; } - //! Return whether the CPU supports SSE3. - static inline bool hasSse3 () { return s_processorFeaturesC & (1 << 0); } - //! Return whether the CPU supports the PCLMULQDQ instruction. - static inline bool hasPclmulqdq() { return (s_processorFeaturesC & (1 << 1)) != 0; } - //! Return whether the CPU supports the MONITOR/MWAIT instructions. - static inline bool hasMonitor() { return (s_processorFeaturesC & (1 << 3)) != 0; } - //! Return whether the CPU supports the Virtual Machine Extensions. - static inline bool hasVmx () { return (s_processorFeaturesC & (1 << 5)) != 0; } - //! Return whether the CPU supports the Safer Mode Extensions. - static inline bool hasSmx () { return (s_processorFeaturesC & (1 << 6)) != 0; } - //! Return whether the CPU supports the Enhanced Intel SpeedStep technology. - static inline bool hasEist () { return (s_processorFeaturesC & (1 << 7)) != 0; } - //! Return whether the CPU supports Thermal Monitor 2. - static inline bool hasTm2 () { return (s_processorFeaturesC & (1 << 8)) != 0; } - //! Return whether the CPU supports SSSE3. - static inline bool hasSsse3() { return (s_processorFeaturesC & (1 << 9)) != 0; } - //! Return whether the CPU supports FMA extensions using YMM state. - static inline bool hasFma () { return (s_processorFeaturesC & (1 << 12)) != 0; } - //! 
Return whether the CPU supports CMPXCHG16B. - static inline bool hasCmpXchg16b() { return (s_processorFeaturesC & (1 << 13)) != 0; } - //! Return whether the CPU supports the Perfmon and Debug Capability. - static inline bool hasPdcm () { return (s_processorFeaturesC & (1 << 15)) != 0; } - //! Return whether the CPU supports Direct Cache Access: prefetch data from a memory mapped device. - static inline bool hasDca() { return (s_processorFeaturesC & (1 << 18)) != 0; } - //! Return whether the CPU supports SSE 4.1 - static inline bool hasSse41() { return (s_processorFeaturesC & (1 << 19)) != 0; } - //! Return whether the CPU supports SSE 4.2 - static inline bool hasSse42() { return (s_processorFeaturesC & (1 << 20)) != 0; } - //! Return whether the CPU supports the MOVBE instruction. - static inline bool hasMovbe() { return (s_processorFeaturesC & (1 << 22)) != 0; } - //! Return whether the CPU supports the POPCNT instruction. - static inline bool hasPopcnt(){ return (s_processorFeaturesC & (1 << 23)) != 0; } - //static inline bool hasTscDeadline() { return (s_processorFeaturesC & (1 << 24)) != 0; } - //! Return whether the CPU supports the AESNI instructions. - static inline bool hasAes () { return (s_processorFeaturesC & (1 << 25)) != 0; } - //static inline bool hasXsave() { return (s_processorFeaturesC & (1 << 26)) != 0; } - //! Return whether the CPU and OS support the XSETBV/XGETBV instructions. - static inline bool hasOsxsave() { return (s_processorFeaturesC & (1 << 27)) != 0; } - //! Return whether the CPU supports AVX. - static inline bool hasAvx () { return (s_processorFeaturesC & (1 << 28)) != 0; } - //! Return whether the CPU supports 16-bit floating-point conversion instructions. - static inline bool hasF16c () { return (s_processorFeaturesC & (1 << 29)) != 0; } - //! Return whether the CPU supports the RDRAND instruction. - static inline bool hasRdrand(){ return (s_processorFeaturesC & (1 << 30)) != 0; } - //! Return whether the CPU contains an x87 FPU. 
- static inline bool hasFpu () { return (s_processorFeaturesD & (1 << 0)) != 0; } - static inline bool hasVme () { return (s_processorFeaturesD & (1 << 1)) != 0; } - //! Return whether the CPU contains Debugging Extensions. - static inline bool hasDe () { return (s_processorFeaturesD & (1 << 2)) != 0; } - //! Return whether the CPU contains Page Size Extensions. - static inline bool hasPse () { return (s_processorFeaturesD & (1 << 3)) != 0; } - //! Return whether the CPU supports the RDTSC instruction. - static inline bool hasTsc () { return (s_processorFeaturesD & (1 << 4)) != 0; } - //! Return whether the CPU supports the Model Specific Registers instructions. - static inline bool hasMsr () { return (s_processorFeaturesD & (1 << 5)) != 0; } - //! Return whether the CPU supports the Physical Address Extension. - static inline bool hasPae () { return (s_processorFeaturesD & (1 << 6)) != 0; } - //! Return whether the CPU supports the CMPXCHG8B instruction. - static inline bool hasCx8 () { return (s_processorFeaturesD & (1 << 8)) != 0; } - //! Return whether the CPU supports Memory Type Range Registers. - static inline bool hasMtrr () { return (s_processorFeaturesD & (1 << 12)) != 0; } - //! Return whether the CPU supports CMOV instructions. - static inline bool hasCmov () { return (s_processorFeaturesD & (1 << 15)) != 0; } - //! Return whether the CPU supports the CLFLUSH instruction. - static inline bool hasClfsh() { return (s_processorFeaturesD & (1 << 19)) != 0; } - //! Return whether the CPU supports ACPI. - static inline bool hasAcpi () { return (s_processorFeaturesD & (1 << 22)) != 0; } - //! Return whether the CPU supports MMX. - static inline bool hasMmx () { return (s_processorFeaturesD & (1 << 23)) != 0; } - //! Return whether the CPU supports SSE. - static inline bool hasSse () { return (s_processorFeaturesD & (1 << 25)) != 0; } - //! Return whether the CPU supports SSE2. 
- static inline bool hasSse2 () { return (s_processorFeaturesD & (1 << 26)) != 0; } - static inline bool hasHtt () { return (s_processorFeaturesD & (1 << 28)) != 0; } - //! Return whether the CPU supports SSE4a. - static inline bool hasSse4a() { return (s_processorFeatures8C & (1 << 6)) != 0; } - //! Return whether the CPU supports misaligned SSE instructions. - static inline bool hasMisAlignSse() { return (s_processorFeatures8C & (1 << 7)) != 0; } - //! Return whether the CPU supports the AMD prefetchw instruction. - static inline bool hasAmdPrefetch() { return (s_processorFeatures8C & (1 << 8)) != 0; } - //! Return whether the CPU supports the XOP instructions. - static inline bool hasXop () { return (s_processorFeatures8C & (1 << 11)) != 0; } - //! Return whether the CPU supports the FMA4 instructions. - static inline bool hasFma4 () { return (s_processorFeatures8C & (1 << 16)) != 0; } - //! Return whether the CPU supports the RDTSCP instruction. - static inline bool hasRdtscp() { return (s_processorFeatures8D & (1 << 27)) != 0; } - static inline bool has3DNow() { return (s_processorFeatures8D & (1u << 31)) != 0; } - static inline bool has3DNowExt() { return (s_processorFeatures8D & (1 << 30)) != 0; } - //! Return the size of the L1 instruction cache. - static inline uint L1Instruction() { return s_L1Instruction; } - //! Return the size of the L1 data cache. - static inline uint L1Data() { return s_L1Data; } - //! Return the size of the L2 cache. - static inline uint L2Data() { return s_L2Data; } - //! Return the size of the L3 cache. 
- static inline uint L3Data() { return s_L3Data; } - static inline ushort L1InstructionLineSize() { return s_L1InstructionLineSize; } - static inline ushort L1DataLineSize() { return s_L1DataLineSize; } - static inline ushort L2DataLineSize() { return s_L2DataLineSize; } - static inline ushort L3DataLineSize() { return s_L3DataLineSize; } - static inline uint L1Associativity() { return s_L1Associativity; } - static inline uint L2Associativity() { return s_L2Associativity; } - static inline uint L3Associativity() { return s_L3Associativity; } - static inline ushort prefetch() { return s_prefetch; } - - private: - static void interpret(uchar byte, bool *checkLeaf4); - - static uint s_ecx0; - static uint s_logicalProcessors; - static uint s_processorFeaturesC; - static uint s_processorFeaturesD; - static uint s_processorFeatures8C; - static uint s_processorFeatures8D; - static uint s_L1Instruction; - static uint s_L1Data; - static uint s_L2Data; - static uint s_L3Data; - static ushort s_L1InstructionLineSize; - static ushort s_L1DataLineSize; - static ushort s_L2DataLineSize; - static ushort s_L3DataLineSize; - static uint s_L1Associativity; - static uint s_L2Associativity; - static uint s_L3Associativity; - static ushort s_prefetch; - static uchar s_brandIndex; - static uchar s_cacheLineSize; - static uchar s_processorModel; - static uchar s_processorFamily; - static ProcessorType s_processorType; - static bool s_noL2orL3; -}; -} // namespace Vc -} // namespace ROOT - -#endif // CPUID_H diff --git a/math/vc/include/Vc/double_v b/math/vc/include/Vc/double_v deleted file mode 100644 index 354482be55ec5..0000000000000 --- a/math/vc/include/Vc/double_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/double_v header is deprecated. The header file will be removed in a future version of Vc." 
-#endif diff --git a/math/vc/include/Vc/float_v b/math/vc/include/Vc/float_v deleted file mode 100644 index c6da32b4e562d..0000000000000 --- a/math/vc/include/Vc/float_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/float_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/global.h b/math/vc/include/Vc/global.h deleted file mode 100644 index 82089a862133c..0000000000000 --- a/math/vc/include/Vc/global.h +++ /dev/null @@ -1,509 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_GLOBAL_H -#define VC_GLOBAL_H - -#ifndef DOXYGEN - -// Compiler defines -#ifdef __INTEL_COMPILER -#define VC_ICC __INTEL_COMPILER_BUILD_DATE -#elif defined(__OPENCC__) -#define VC_OPEN64 1 -#elif defined(__clang__) -#define VC_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) -#elif defined(__GNUC__) -#define VC_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__) -#elif defined(_MSC_VER) -#define VC_MSVC _MSC_FULL_VER -#else -#define VC_UNSUPPORTED_COMPILER 1 -#endif - -// Features/Quirks defines -#if defined VC_MSVC && defined _WIN32 -// the Win32 ABI can't handle function parameters with alignment >= 16 -#define VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN 1 -#endif -#if defined(__GNUC__) && !defined(VC_NO_INLINE_ASM) -#define VC_GNU_ASM 1 -#endif -#if defined(VC_GCC) && (VC_GCC <= 0x40405 || (VC_GCC >= 0x40500 && VC_GCC <= 0x40502)) && !(VC_GCC == 0x40502 && defined(__GNUC_UBUNTU_VERSION__) && __GNUC_UBUNTU_VERSION__ == 0xb0408) -// GCC 4.6.0 / 4.5.3 / 4.4.6 switched to the interface as defined by ICC -// (Ubuntu 11.04 ships a GCC 4.5.2 with the new interface) -#define VC_MM256_MASKSTORE_WRONG_MASK_TYPE 1 -#endif -#if defined(VC_GCC) && VC_GCC >= 0x40300 -#define VC_HAVE_ATTRIBUTE_ERROR 1 -#define VC_HAVE_ATTRIBUTE_WARNING 1 -#endif - -#if (defined(__GXX_EXPERIMENTAL_CXX0X__) && VC_GCC >= 0x40600) || __cplusplus >= 201103 -# define VC_CXX11 1 -# ifdef VC_GCC -# if VC_GCC >= 0x40700 // && VC_GCC < 0x408000) -// ::max_align_t was introduced with GCC 4.7. std::max_align_t took a bit longer. -# define VC_HAVE_MAX_ALIGN_T 1 -# endif -# elif defined(VC_ICC) -# define VC_HAVE_MAX_ALIGN_T 1 -# elif !defined(VC_CLANG) -// Clang doesn't provide max_align_t at all -# define VC_HAVE_STD_MAX_ALIGN_T 1 -# endif -#endif - -// ICC ships the AVX2 intrinsics inside the AVX1 header. 
-// FIXME: the number 20120731 is too large, but I don't know which one is the right one -#if (defined(VC_ICC) && VC_ICC >= 20120731) || (defined(VC_MSVC) && VC_MSVC >= 170000000) -#define VC_UNCONDITIONAL_AVX2_INTRINSICS 1 -#endif - -/* Define the following strings to a unique integer, which is the only type the preprocessor can - * compare. This allows to use -DVC_IMPL=SSE3. The preprocessor will then consider VC_IMPL and SSE3 - * to be equal. Of course, it is important to undefine the strings later on! - */ -#define Scalar 0x00100000 -#define SSE 0x00200000 -#define SSE2 0x00300000 -#define SSE3 0x00400000 -#define SSSE3 0x00500000 -#define SSE4_1 0x00600000 -#define SSE4_2 0x00700000 -#define AVX 0x00800000 - -#define XOP 0x00000001 -#define FMA4 0x00000002 -#define F16C 0x00000004 -#define POPCNT 0x00000008 -#define SSE4a 0x00000010 -#define FMA 0x00000020 - -#define IMPL_MASK 0xFFF00000 -#define EXT_MASK 0x000FFFFF - -#ifdef VC_MSVC -# ifdef _M_IX86_FP -# if _M_IX86_FP >= 1 -# ifndef __SSE__ -# define __SSE__ 1 -# endif -# endif -# if _M_IX86_FP >= 2 -# ifndef __SSE2__ -# define __SSE2__ 1 -# endif -# endif -# elif defined(_M_AMD64) -// If the target is x86_64 then SSE2 is guaranteed -# ifndef __SSE__ -# define __SSE__ 1 -# endif -# ifndef __SSE2__ -# define __SSE2__ 1 -# endif -# endif -#endif - -#ifndef VC_IMPL - -# if defined(__AVX__) -# define VC_IMPL_AVX 1 -# else -# if defined(__SSE4_2__) -# define VC_IMPL_SSE 1 -# define VC_IMPL_SSE4_2 1 -# endif -# if defined(__SSE4_1__) -# define VC_IMPL_SSE 1 -# define VC_IMPL_SSE4_1 1 -# endif -# if defined(__SSE3__) -# define VC_IMPL_SSE 1 -# define VC_IMPL_SSE3 1 -# endif -# if defined(__SSSE3__) -# define VC_IMPL_SSE 1 -# define VC_IMPL_SSSE3 1 -# endif -# if defined(__SSE2__) -# define VC_IMPL_SSE 1 -# define VC_IMPL_SSE2 1 -# endif - -# if defined(VC_IMPL_SSE) - // nothing -# else -# define VC_IMPL_Scalar 1 -# endif -# endif -# if defined(VC_IMPL_AVX) || defined(VC_IMPL_SSE) -# ifdef __FMA4__ -# define 
VC_IMPL_FMA4 1 -# endif -# ifdef __XOP__ -# define VC_IMPL_XOP 1 -# endif -# ifdef __F16C__ -# define VC_IMPL_F16C 1 -# endif -# ifdef __POPCNT__ -# define VC_IMPL_POPCNT 1 -# endif -# ifdef __SSE4A__ -# define VC_IMPL_SSE4a 1 -# endif -# ifdef __FMA__ -# define VC_IMPL_FMA 1 -# endif -# endif - -#else // VC_IMPL - -# if (VC_IMPL & IMPL_MASK) == AVX // AVX supersedes SSE -# define VC_IMPL_AVX 1 -# elif (VC_IMPL & IMPL_MASK) == Scalar -# define VC_IMPL_Scalar 1 -# elif (VC_IMPL & IMPL_MASK) == SSE4_2 -# define VC_IMPL_SSE4_2 1 -# define VC_IMPL_SSE4_1 1 -# define VC_IMPL_SSSE3 1 -# define VC_IMPL_SSE3 1 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# elif (VC_IMPL & IMPL_MASK) == SSE4_1 -# define VC_IMPL_SSE4_1 1 -# define VC_IMPL_SSSE3 1 -# define VC_IMPL_SSE3 1 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# elif (VC_IMPL & IMPL_MASK) == SSSE3 -# define VC_IMPL_SSSE3 1 -# define VC_IMPL_SSE3 1 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# elif (VC_IMPL & IMPL_MASK) == SSE3 -# define VC_IMPL_SSE3 1 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# elif (VC_IMPL & IMPL_MASK) == SSE2 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# elif (VC_IMPL & IMPL_MASK) == SSE -# define VC_IMPL_SSE 1 -# if defined(__SSE4_2__) -# define VC_IMPL_SSE4_2 1 -# endif -# if defined(__SSE4_1__) -# define VC_IMPL_SSE4_1 1 -# endif -# if defined(__SSE3__) -# define VC_IMPL_SSE3 1 -# endif -# if defined(__SSSE3__) -# define VC_IMPL_SSSE3 1 -# endif -# if defined(__SSE2__) -# define VC_IMPL_SSE2 1 -# endif -# elif (VC_IMPL & IMPL_MASK) == 0 && (VC_IMPL & SSE4a) - // this is for backward compatibility only where SSE4a was included in the main - // line of available SIMD instruction sets -# define VC_IMPL_SSE3 1 -# define VC_IMPL_SSE2 1 -# define VC_IMPL_SSE 1 -# endif -# if (VC_IMPL & XOP) -# define VC_IMPL_XOP 1 -# endif -# if (VC_IMPL & FMA4) -# define VC_IMPL_FMA4 1 -# endif -# if (VC_IMPL & F16C) -# define VC_IMPL_F16C 1 -# endif -# if (VC_IMPL & POPCNT) -# define 
VC_IMPL_POPCNT 1 -# endif -# if (VC_IMPL & SSE4a) -# define VC_IMPL_SSE4a 1 -# endif -# if (VC_IMPL & FMA) -# define VC_IMPL_FMA 1 -# endif -# undef VC_IMPL - -#endif // VC_IMPL - -// If AVX is enabled in the compiler it will use VEX coding for the SIMD instructions. -#ifdef __AVX__ -# define VC_USE_VEX_CODING 1 -#endif - -#if defined(VC_GCC) && VC_GCC < 0x40300 && !defined(VC_IMPL_Scalar) -# ifndef VC_DONT_WARN_OLD_GCC -# warning "GCC < 4.3 does not have full support for SSE2 intrinsics. Using scalar types/operations only. Define VC_DONT_WARN_OLD_GCC to silence this warning." -# endif -# undef VC_IMPL_SSE -# undef VC_IMPL_SSE2 -# undef VC_IMPL_SSE3 -# undef VC_IMPL_SSE4_1 -# undef VC_IMPL_SSE4_2 -# undef VC_IMPL_SSSE3 -# undef VC_IMPL_AVX -# undef VC_IMPL_FMA4 -# undef VC_IMPL_XOP -# undef VC_IMPL_F16C -# undef VC_IMPL_POPCNT -# undef VC_IMPL_SSE4a -# undef VC_IMPL_FMA -# undef VC_USE_VEX_CODING -# define VC_IMPL_Scalar 1 -#endif - -# if !defined(VC_IMPL_Scalar) && !defined(VC_IMPL_SSE) && !defined(VC_IMPL_AVX) -# error "No suitable Vc implementation was selected! Probably VC_IMPL was set to an invalid value." -# elif defined(VC_IMPL_SSE) && !defined(VC_IMPL_SSE2) -# error "SSE requested but no SSE2 support. Vc needs at least SSE2!" -# endif - -#undef Scalar -#undef SSE -#undef SSE2 -#undef SSE3 -#undef SSSE3 -#undef SSE4_1 -#undef SSE4_2 -#undef AVX - -#undef XOP -#undef FMA4 -#undef F16C -#undef POPCNT -#undef SSE4a -#undef FMA - -#undef IMPL_MASK -#undef EXT_MASK - -namespace ROOT { -namespace Vc { -enum AlignedFlag { - Aligned = 0 -}; -enum UnalignedFlag { - Unaligned = 1 -}; -enum StreamingAndAlignedFlag { // implies Aligned - Streaming = 2 -}; -enum StreamingAndUnalignedFlag { - StreamingAndUnaligned = 3 -}; -#endif // DOXYGEN - -/** - * \ingroup Utilities - * - * Enum that specifies the alignment and padding restrictions to use for memory allocation with - * Vc::malloc. - */ -enum MallocAlignment { - /** - * Align on boundary of vector sizes (e.g. 
16 Bytes on SSE platforms) and pad to allow - * vector access to the end. Thus the allocated memory contains a multiple of - * VectorAlignment bytes. - */ - AlignOnVector, - /** - * Align on boundary of cache line sizes (e.g. 64 Bytes on x86) and pad to allow - * full cache line access to the end. Thus the allocated memory contains a multiple of - * 64 bytes. - */ - AlignOnCacheline, - /** - * Align on boundary of page sizes (e.g. 4096 Bytes on x86) and pad to allow - * full page access to the end. Thus the allocated memory contains a multiple of - * 4096 bytes. - */ - AlignOnPage -}; - -#if __cplusplus >= 201103 /*C++11*/ -#define Vc_CONSTEXPR constexpr -#elif defined(__GNUC__) -#define Vc_CONSTEXPR inline __attribute__((__always_inline__, __const__)) -#elif defined(VC_MSVC) -#define Vc_CONSTEXPR inline __forceinline -#else -#define Vc_CONSTEXPR inline -#endif -Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; } -Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; } -Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; } -Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; } - -Vc_CONSTEXPR StreamingAndAlignedFlag operator|(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; } -Vc_CONSTEXPR StreamingAndAlignedFlag operator|(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; } -Vc_CONSTEXPR StreamingAndAlignedFlag operator&(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; } -Vc_CONSTEXPR StreamingAndAlignedFlag operator&(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; } - -/** - * \ingroup Utilities - * - * Enum to identify a certain SIMD instruction set. - * - * You can use \ref VC_IMPL for the currently active implementation. 
- * - * \see ExtraInstructions - */ -enum Implementation { - /// uses only fundamental types - ScalarImpl, - /// x86 SSE + SSE2 - SSE2Impl, - /// x86 SSE + SSE2 + SSE3 - SSE3Impl, - /// x86 SSE + SSE2 + SSE3 + SSSE3 - SSSE3Impl, - /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 - SSE41Impl, - /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2 - SSE42Impl, - /// x86 AVX - AVXImpl, - /// x86 AVX + AVX2 - AVX2Impl, - ImplementationMask = 0xfff -}; - -/** - * \ingroup Utilities - * - * The list of available instructions is not easily described by a linear list of instruction sets. - * On x86 the following instruction sets always include their predecessors: - * SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 - * - * But there are additional instructions that are not necessarily required by this list. These are - * covered in this enum. - */ -enum ExtraInstructions { - //! Support for float16 conversions in hardware - Float16cInstructions = 0x01000, - //! Support for FMA4 instructions - Fma4Instructions = 0x02000, - //! Support for XOP instructions - XopInstructions = 0x04000, - //! Support for the population count instruction - PopcntInstructions = 0x08000, - //! Support for SSE4a instructions - Sse4aInstructions = 0x10000, - //! 
Support for FMA instructions (3 operand variant) - FmaInstructions = 0x20000, - // PclmulqdqInstructions, - // AesInstructions, - // RdrandInstructions - ExtraInstructionsMask = 0xfffff000u -}; - -#ifndef DOXYGEN - -#ifdef VC_IMPL_Scalar -#define VC_IMPL ::ROOT::Vc::ScalarImpl -#elif defined(VC_IMPL_AVX) -#define VC_IMPL ::ROOT::Vc::AVXImpl -#elif defined(VC_IMPL_SSE4_2) -#define VC_IMPL ::ROOT::Vc::SSE42Impl -#elif defined(VC_IMPL_SSE4_1) -#define VC_IMPL ::ROOT::Vc::SSE41Impl -#elif defined(VC_IMPL_SSSE3) -#define VC_IMPL ::ROOT::Vc::SSSE3Impl -#elif defined(VC_IMPL_SSE3) -#define VC_IMPL ::ROOT::Vc::SSE3Impl -#elif defined(VC_IMPL_SSE2) -#define VC_IMPL ::ROOT::Vc::SSE2Impl -#endif - -template struct ImplementationT { enum _Value { - Value = Features, - Implementation = Features & Vc::ImplementationMask, - ExtraInstructions = Features & Vc::ExtraInstructionsMask -}; }; - -typedef ImplementationT< -#ifdef VC_USE_VEX_CODING - // everything will use VEX coding, so the system has to support AVX even if VC_IMPL_AVX is not set - // but AFAIU the OSXSAVE and xgetbv tests do not have to positive (unless, of course, the - // compiler decides to insert an instruction that uses the full register size - so better be on - // the safe side) - AVXImpl -#else - VC_IMPL -#endif -#ifdef VC_IMPL_SSE4a - + Vc::Sse4aInstructions -#ifdef VC_IMPL_XOP - + Vc::XopInstructions -#ifdef VC_IMPL_FMA4 - + Vc::Fma4Instructions -#endif -#endif -#endif -#ifdef VC_IMPL_POPCNT - + Vc::PopcntInstructions -#endif -#ifdef VC_IMPL_FMA - + Vc::FmaInstructions -#endif - > CurrentImplementation; - -namespace Internal { - template struct HelperImpl; - typedef HelperImpl Helper; - - template struct FlagObject; - template<> struct FlagObject { static Vc_CONSTEXPR AlignedFlag the() { return Aligned; } }; - template<> struct FlagObject { static Vc_CONSTEXPR UnalignedFlag the() { return Unaligned; } }; - template<> struct FlagObject { static Vc_CONSTEXPR StreamingAndAlignedFlag the() { return Streaming; } }; 
- template<> struct FlagObject { static Vc_CONSTEXPR StreamingAndUnalignedFlag the() { return StreamingAndUnaligned; } }; -} // namespace Internal - -namespace Warnings -{ - void _operator_bracket_warning() -#ifdef VC_HAVE_ATTRIBUTE_WARNING - __attribute__((warning("\n\tUse of Vc::Vector::operator[] to modify scalar entries is known to miscompile with GCC 4.3.x.\n\tPlease upgrade to a more recent GCC or avoid operator[] altogether.\n\t(This warning adds an unnecessary function call to operator[] which should work around the problem at a little extra cost.)"))) -#endif - ; -} // namespace Warnings - -namespace Error -{ - template struct invalid_operands_of_types {}; -} // namespace Error - -#endif // DOXYGEN -} // namespace Vc -} // namespace ROOT - -#undef Vc_CONSTEXPR -#include "version.h" - -#endif // VC_GLOBAL_H diff --git a/math/vc/include/Vc/int_v b/math/vc/include/Vc/int_v deleted file mode 100644 index 073457b8201aa..0000000000000 --- a/math/vc/include/Vc/int_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/int_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/internal/namespace.h b/math/vc/include/Vc/internal/namespace.h deleted file mode 100644 index 7b305d61f5805..0000000000000 --- a/math/vc/include/Vc/internal/namespace.h +++ /dev/null @@ -1,28 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifdef VC_IMPL_Scalar -# define VECTOR_NAMESPACE Vc::Scalar -#elif defined(VC_IMPL_AVX) -# define VECTOR_NAMESPACE Vc::AVX -#elif defined(VC_IMPL_SSE) -# define VECTOR_NAMESPACE Vc::SSE -#else -# error "No known Vc implementation was selected. This should not happen. The logic in Vc/global.h failed." -#endif diff --git a/math/vc/include/Vc/limits b/math/vc/include/Vc/limits deleted file mode 100644 index c7e4fc4172813..0000000000000 --- a/math/vc/include/Vc/limits +++ /dev/null @@ -1,57 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef INCLUDE_VC_LIMITS -#define INCLUDE_VC_LIMITS - -#include "vector.h" -#include "common/macros.h" -#include - -namespace std -{ -template struct numeric_limits > : public numeric_limits::EntryType> -{ -private: - typedef numeric_limits::EntryType> _Base; -public: - static Vc_INTRINSIC Vc_CONST Vc::Vector max() { return Vc::Vector(_Base::max()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector min() { return Vc::Vector(_Base::min()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector lowest() { return Vc::Vector(_Base::lowest()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector epsilon() { return Vc::Vector(_Base::epsilon()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector round_error() { return Vc::Vector(_Base::round_error()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector infinity() { return Vc::Vector(_Base::infinity()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector quiet_NaN() { return Vc::Vector(_Base::quiet_NaN()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector signaling_NaN() { return Vc::Vector(_Base::signaling_NaN()); } - static Vc_INTRINSIC Vc_CONST Vc::Vector denorm_min() { return Vc::Vector(_Base::denorm_min()); } -}; -} // namespace std - -#include "common/undomacros.h" -#ifdef VC_IMPL_Scalar -# include "scalar/limits.h" -#elif defined(VC_IMPL_AVX) -# include "avx/limits.h" -#elif defined(VC_IMPL_SSE) -# include "sse/limits.h" -#endif - -#endif // INCLUDE_VC_LIMITS - -// vim: ft=cpp diff --git a/math/vc/include/Vc/scalar/helperimpl.h b/math/vc/include/Vc/scalar/helperimpl.h deleted file mode 100644 index a4a49281c2048..0000000000000 --- a/math/vc/include/Vc/scalar/helperimpl.h +++ /dev/null @@ -1,58 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_DEINTERLEAVE_H -#define VC_SCALAR_DEINTERLEAVE_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template<> struct HelperImpl -{ - template - static Vc_ALWAYS_INLINE void deinterleave(V &a, V &b, const M *mem, A) - { - a = mem[0]; - b = mem[1]; - } - - static Vc_ALWAYS_INLINE void prefetchForOneRead(const void *) {} - static Vc_ALWAYS_INLINE void prefetchForModify(const void *) {} - static Vc_ALWAYS_INLINE void prefetchClose(const void *) {} - static Vc_ALWAYS_INLINE void prefetchMid(const void *) {} - static Vc_ALWAYS_INLINE void prefetchFar(const void *) {} - - template - static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; -}; - -} // namespace Scalar -} // namespace Vc -} // namespace ROOT - -#include "helperimpl.tcc" -#include "undomacros.h" - -#endif // VC_SCALAR_DEINTERLEAVE_H diff --git a/math/vc/include/Vc/scalar/helperimpl.tcc b/math/vc/include/Vc/scalar/helperimpl.tcc deleted file mode 100644 index b8190d84e325d..0000000000000 --- a/math/vc/include/Vc/scalar/helperimpl.tcc +++ /dev/null @@ -1,86 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_HELPERIMPL_TCC -#define VC_SCALAR_HELPERIMPL_TCC - -#include -#if defined _WIN32 || defined _WIN64 -#include -#endif - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template -static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) -{ - return (value % X) > 0 ? value + X - (value % X) : value; -} - -template -Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) -{ - void *ptr = 0; - switch (A) { - case Vc::AlignOnVector: - return std::malloc(n); - case Vc::AlignOnCacheline: - // TODO: hardcoding 64 is not such a great idea -#ifdef _WIN32 -#ifdef __GNUC__ -#define _VC_ALIGNED_MALLOC __mingw_aligned_malloc -#else -#define _VC_ALIGNED_MALLOC _aligned_malloc -#endif - ptr = _VC_ALIGNED_MALLOC(nextMultipleOf<64>(n), 64); -#else - if (0 == posix_memalign(&ptr, 64, nextMultipleOf<64>(n))) { - return ptr; - } -#endif - break; - case Vc::AlignOnPage: - // TODO: hardcoding 4096 is not such a great idea -#ifdef _WIN32 - ptr = _VC_ALIGNED_MALLOC(nextMultipleOf<4096>(n), 4096); -#undef _VC_ALIGNED_MALLOC -#else - if (0 == posix_memalign(&ptr, 4096, nextMultipleOf<4096>(n))) { - return ptr; - } -#endif - break; - } - return ptr; -} - -Vc_ALWAYS_INLINE void HelperImpl::free(void *p) -{ - std::free(p); -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#endif // VC_SCALAR_HELPERIMPL_TCC diff --git a/math/vc/include/Vc/scalar/interleavedmemory.tcc b/math/vc/include/Vc/scalar/interleavedmemory.tcc deleted file mode 100644 index 95548f7452e35..0000000000000 --- a/math/vc/include/Vc/scalar/interleavedmemory.tcc +++ /dev/null @@ -1,160 +0,0 @@ -/* This file is part of the Vc 
library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_SCALAR_INTERLEAVEDMEMORY_TCC -#define VC_SCALAR_INTERLEAVEDMEMORY_TCC - -#include "macros.h" -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); - m_data[m_indexes.data() + 3] = v3.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) -{ - m_data[m_indexes.data() 
+ 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); - m_data[m_indexes.data() + 3] = v3.data(); - m_data[m_indexes.data() + 4] = v4.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); - m_data[m_indexes.data() + 3] = v3.data(); - m_data[m_indexes.data() + 4] = v4.data(); - m_data[m_indexes.data() + 5] = v5.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); - m_data[m_indexes.data() + 3] = v3.data(); - m_data[m_indexes.data() + 4] = v4.data(); - m_data[m_indexes.data() + 5] = v5.data(); - m_data[m_indexes.data() + 6] = v6.data(); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) -{ - m_data[m_indexes.data() + 0] = v0.data(); - m_data[m_indexes.data() + 1] = v1.data(); - m_data[m_indexes.data() + 2] = v2.data(); - m_data[m_indexes.data() + 3] = v3.data(); - m_data[m_indexes.data() + 4] = v4.data(); - m_data[m_indexes.data() + 5] = v5.data(); - m_data[m_indexes.data() + 6] = v6.data(); - m_data[m_indexes.data() + 7] = v7.data(); 
-}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; - v3.data() = m_data[m_indexes.data() + 3]; -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; - v3.data() = m_data[m_indexes.data() + 3]; - v4.data() = m_data[m_indexes.data() + 4]; -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; - v3.data() = m_data[m_indexes.data() + 3]; - v4.data() = m_data[m_indexes.data() + 4]; - v5.data() = m_data[m_indexes.data() + 5]; -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; - v3.data() = m_data[m_indexes.data() + 3]; - v4.data() = m_data[m_indexes.data() + 4]; - v5.data() = m_data[m_indexes.data() + 5]; - v6.data() = m_data[m_indexes.data() + 6]; -}/*}}}*/ -template 
Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const/*{{{*/ -{ - v0.data() = m_data[m_indexes.data() + 0]; - v1.data() = m_data[m_indexes.data() + 1]; - v2.data() = m_data[m_indexes.data() + 2]; - v3.data() = m_data[m_indexes.data() + 3]; - v4.data() = m_data[m_indexes.data() + 4]; - v5.data() = m_data[m_indexes.data() + 5]; - v6.data() = m_data[m_indexes.data() + 6]; - v7.data() = m_data[m_indexes.data() + 7]; -}/*}}}*/ - -} // namespace Common -} // namespace Vc -} // namespace ROOT -#include "undomacros.h" - -#endif // VC_SCALAR_INTERLEAVEDMEMORY_TCC - -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/scalar/limits.h b/math/vc/include/Vc/scalar/limits.h deleted file mode 100644 index 49543fe6e95e4..0000000000000 --- a/math/vc/include/Vc/scalar/limits.h +++ /dev/null @@ -1,24 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_LIMITS_H -#define VC_SCALAR_LIMITS_H - - -#endif // VC_SCALAR_LIMITS_H diff --git a/math/vc/include/Vc/scalar/macros.h b/math/vc/include/Vc/scalar/macros.h deleted file mode 100644 index 29db3f8194103..0000000000000 --- a/math/vc/include/Vc/scalar/macros.h +++ /dev/null @@ -1,25 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "../common/macros.h" - -#ifndef VC_SCALAR_MACROS_H -#define VC_SCALAR_MACROS_H - -#endif // VC_SCALAR_MACROS_H diff --git a/math/vc/include/Vc/scalar/mask.h b/math/vc/include/Vc/scalar/mask.h deleted file mode 100644 index 48f864b869472..0000000000000 --- a/math/vc/include/Vc/scalar/mask.h +++ /dev/null @@ -1,103 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SCALAR_MASK_H -#define VC_SCALAR_MASK_H - -#include "types.h" - -namespace ROOT { -namespace Vc -{ -namespace Scalar -{ -template class Mask -{ - public: - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : m(b) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : m(false) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : m(true) {} - Vc_ALWAYS_INLINE Mask(const Mask *a) : m(a[0].m) {} - - Vc_ALWAYS_INLINE Mask &operator=(const Mask &rhs) { m = rhs.m; return *this; } - Vc_ALWAYS_INLINE Mask &operator=(bool rhs) { m = rhs; return *this; } - - Vc_ALWAYS_INLINE void expand(Mask *x) { x[0].m = m; } - - Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return Mask(m == rhs.m); } - Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return Mask(m != rhs.m); } - - Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); } - Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); } - Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); } - Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); } - Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); } - Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; } - Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; } - - Vc_ALWAYS_INLINE bool isFull () const { return m; } - Vc_ALWAYS_INLINE bool isEmpty() const { return !m; } - Vc_ALWAYS_INLINE bool isMix () const { return false; } - - Vc_ALWAYS_INLINE bool data () const { return m; } - Vc_ALWAYS_INLINE bool dataI() const { return m; } - Vc_ALWAYS_INLINE bool dataD() const { return m; } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE 
operator bool() const { return isFull(); } -#endif - - template - Vc_ALWAYS_INLINE Mask cast() const { return *this; } - - Vc_ALWAYS_INLINE bool operator[](int) const { return m; } - - Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; } - - /** - * Returns the index of the first one in the mask. - * - * The return value is undefined if the mask is empty. - */ - Vc_ALWAYS_INLINE int firstOne() const { return 0; } - - private: - bool m; -}; - -struct ForeachHelper -{ - bool continu; - Vc_ALWAYS_INLINE ForeachHelper(bool mask) : continu(mask) {} - Vc_ALWAYS_INLINE void next() { continu = false; } -}; - -#define Vc_foreach_bit(_it_, _mask_) \ - for (Vc::Scalar::ForeachHelper Vc__make_unique(foreach_bit_obj)(_mask_); Vc__make_unique(foreach_bit_obj).continu; Vc__make_unique(foreach_bit_obj).next()) \ - for (_it_ = 0; Vc__make_unique(foreach_bit_obj).continu; Vc__make_unique(foreach_bit_obj).next()) - -} // namespace Scalar -} // namespace Vc -} // namespace ROOT - -#endif // VC_SCALAR_MASK_H diff --git a/math/vc/include/Vc/scalar/math.h b/math/vc/include/Vc/scalar/math.h deleted file mode 100644 index f142b0be9741f..0000000000000 --- a/math/vc/include/Vc/scalar/math.h +++ /dev/null @@ -1,253 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SCALAR_MATH_H -#define VC_SCALAR_MATH_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Scalar -{ - -#define VC_MINMAX(V) \ -static Vc_ALWAYS_INLINE V min(const V &x, const V &y) { return V(std::min(x.data(), y.data())); } \ -static Vc_ALWAYS_INLINE V max(const V &x, const V &y) { return V(std::max(x.data(), y.data())); } -VC_ALL_VECTOR_TYPES(VC_MINMAX) -#undef VC_MINMAX - -template static Vc_ALWAYS_INLINE Vector sqrt (const Vector &x) -{ - return Vector(std::sqrt(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector rsqrt(const Vector &x) -{ - const typename Vector::EntryType one = 1; return Vector(one / std::sqrt(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector abs (const Vector &x) -{ - return Vector(std::abs(x.data())); -} - -template<> Vc_ALWAYS_INLINE int_v abs(const int_v &x) { return x < 0 ? -x : x; } -template<> Vc_ALWAYS_INLINE uint_v abs(const uint_v &x) { return x; } -template<> Vc_ALWAYS_INLINE short_v abs(const short_v &x) { return x < 0 ? 
-x : x; } -template<> Vc_ALWAYS_INLINE ushort_v abs(const ushort_v &x) { return x; } - -template static Vc_ALWAYS_INLINE void sincos(const Vector &x, Vector *sin, Vector *cos) -{ -#if (defined(VC_CLANG) && VC_HAS_BUILTIN(__builtin_sincosf)) || (!defined(VC_CLANG) && defined(__GNUC__) && !defined(_WIN32)) - __builtin_sincosf(x.data(), &sin->data(), &cos->data()); -#elif defined(_GNU_SOURCE) - sincosf(x.data(), &sin->data(), &cos->data()); -#else - sin->data() = std::sin(x.data()); - cos->data() = std::cos(x.data()); -#endif -} - -template<> Vc_ALWAYS_INLINE void sincos(const Vector &x, Vector *sin, Vector *cos) -{ -#if (defined(VC_CLANG) && VC_HAS_BUILTIN(__builtin_sincos)) || (!defined(VC_CLANG) && defined(__GNUC__) && !defined(_WIN32)) - __builtin_sincos(x.data(), &sin->data(), &cos->data()); -#elif defined(_GNU_SOURCE) - ::sincos(x.data(), &sin->data(), &cos->data()); -#else - sin->data() = std::sin(x.data()); - cos->data() = std::cos(x.data()); -#endif -} - -template static Vc_ALWAYS_INLINE Vector sin (const Vector &x) -{ - return Vector(std::sin(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector asin (const Vector &x) -{ - return Vector(std::asin(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector cos (const Vector &x) -{ - return Vector(std::cos(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector log (const Vector &x) -{ - return Vector(std::log(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector log10(const Vector &x) -{ - return Vector(std::log10(x.data())); -} - -#if (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600) || defined(_ISOC99_SOURCE) || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) -static Vc_ALWAYS_INLINE double_v log2(double_v::AsArg x) { return double_v(::log2 (x.data())); } -static Vc_ALWAYS_INLINE sfloat_v log2(sfloat_v::AsArg x) { return sfloat_v(::log2f(x.data())); } -static Vc_ALWAYS_INLINE float_v log2( float_v::AsArg x) { return float_v(::log2f(x.data())); } -#else -namespace { -template static 
_VC_CONSTEXPR T c_ln2() { return Vc_buildFloat(1, 0x317218, -1); } // .693147182464599609375 -template<> _VC_CONSTEXPR double c_ln2() { return Vc_buildDouble(1, 0x62E42FEFA39EFull, -1); } // .69314718055994528622676398299518041312694549560546875 -} -#define VC_LOG2(V) \ -static Vc_ALWAYS_INLINE V log2(const V &x) \ -{ \ - return V(std::log(x.data()) / c_ln2()); \ -} -VC_ALL_FLOAT_VECTOR_TYPES(VC_LOG2) -#undef VC_LOG2 -#endif - -template static Vc_ALWAYS_INLINE Vector exp (const Vector &x) -{ - return Vector(std::exp(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector atan (const Vector &x) -{ - return Vector(std::atan( x.data() )); -} - -template static Vc_ALWAYS_INLINE Vector atan2(const Vector &x, const Vector &y) -{ - return Vector(std::atan2( x.data(), y.data() )); -} - -template static Vc_ALWAYS_INLINE Vector trunc(const Vector &x) -{ -#if __cplusplus >= 201103 /*C++11*/ - return std::trunc(x.data()); -#else - return x.data() > 0 ? std::floor(x.data()) : std::ceil(x.data()); -#endif -} - -template static Vc_ALWAYS_INLINE Vector floor(const Vector &x) -{ - return Vector(std::floor(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector ceil(const Vector &x) -{ - return Vector(std::ceil(x.data())); -} - -template static Vc_ALWAYS_INLINE Vector round(const Vector &x) -{ - return x; -} - -namespace -{ - template bool _realIsEvenHalf(T x) { - const T two = 2; - const T half = 0.5; - const T f = std::floor(x * half) * two; - return (x - f) == half; - } -} // namespace -template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) -{ - return float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f)); -} - -template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) -{ - return sfloat_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f)); -} - -template<> Vc_ALWAYS_INLINE Vector round(const Vector &x) -{ - return double_v(std::floor(x.data() + 0.5 ) - (_realIsEvenHalf(x.data()) ? 1. : 0. 
)); -} - -template static Vc_ALWAYS_INLINE Vector reciprocal(const Vector &x) -{ - const typename Vector::EntryType one = 1; return Vector(one / x.data()); -} - -#ifdef isfinite -#undef isfinite -#endif -#ifdef isnan -#undef isnan -#endif -template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Vector &x) -{ - return typename Vector::Mask( -#ifdef _MSC_VER - !!_finite(x.data()) -#elif defined(__INTEL_COMPILER) - ::isfinite(x.data()) -#else - std::isfinite(x.data()) -#endif - ); -} - -template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Vector &x) -{ - return typename Vector::Mask( -#ifdef _MSC_VER - !!_isnan(x.data()) -#elif defined(__INTEL_COMPILER) - ::isnan(x.data()) -#else - std::isnan(x.data()) -#endif - ); -} - -Vc_ALWAYS_INLINE Vector frexp(Vector x, Vector *e) { - return float_v(::frexpf(x.data(), &e->data())); -} -Vc_ALWAYS_INLINE Vector frexp(Vector x, Vector *e) { - return double_v(::frexp(x.data(), &e->data())); -} -Vc_ALWAYS_INLINE sfloat_v frexp(sfloat_v x, short_v *e) { - int ee; - const float r = ::frexpf(x.data(), &ee); - e->data() = ee; - return sfloat_v(r); -} - -Vc_ALWAYS_INLINE Vector ldexp(Vector x, Vector e) { - return float_v(::ldexpf(x.data(), e.data())); -} -Vc_ALWAYS_INLINE Vector ldexp(Vector x, Vector e) { - return double_v(::ldexp(x.data(), e.data())); -} -Vc_ALWAYS_INLINE sfloat_v ldexp(sfloat_v x, short_v e) { - return sfloat_v(::ldexpf(x.data(), e.data())); -} - -} // namespace Scalar -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_SCALAR_MATH_H diff --git a/math/vc/include/Vc/scalar/types.h b/math/vc/include/Vc/scalar/types.h deleted file mode 100644 index f4beb74143b27..0000000000000 --- a/math/vc/include/Vc/scalar/types.h +++ /dev/null @@ -1,44 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_TYPES_H -#define VC_SCALAR_TYPES_H - -#define VC_DOUBLE_V_SIZE 1 -#define VC_FLOAT_V_SIZE 1 -#define VC_SFLOAT_V_SIZE 1 -#define VC_INT_V_SIZE 1 -#define VC_UINT_V_SIZE 1 -#define VC_SHORT_V_SIZE 1 -#define VC_USHORT_V_SIZE 1 - -#include "../common/types.h" - -namespace ROOT { -namespace Vc -{ - namespace Scalar - { - template class VectorAlignedBaseT {}; - template class Vector; - } // namespace Scalar -} // namespace Vc -} // namespace ROOT - -#endif // VC_SCALAR_TYPES_H diff --git a/math/vc/include/Vc/scalar/undomacros.h b/math/vc/include/Vc/scalar/undomacros.h deleted file mode 100644 index f0de7afa9e788..0000000000000 --- a/math/vc/include/Vc/scalar/undomacros.h +++ /dev/null @@ -1,25 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_UNDOMACROS_H -#define VC_SCALAR_UNDOMACROS_H - -#endif // VC_SCALAR_UNDOMACROS_H - -#include "../common/undomacros.h" diff --git a/math/vc/include/Vc/scalar/vector.h b/math/vc/include/Vc/scalar/vector.h deleted file mode 100644 index d1700346fc1c4..0000000000000 --- a/math/vc/include/Vc/scalar/vector.h +++ /dev/null @@ -1,480 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef SCALAR_VECTOR_H -#define SCALAR_VECTOR_H - -#include -#include -#include - -#ifdef _MSC_VER -#include -#endif - -#include "../common/memoryfwd.h" -#include "macros.h" -#include "types.h" -#include "mask.h" -#include "writemaskedvector.h" - -namespace ROOT { -namespace Vc -{ -namespace Scalar -{ - enum VectorAlignmentEnum { VectorAlignment = 4 }; - -template -class Vector -{ - friend class WriteMaskedVector; - public: - typedef typename DetermineEntryType::Type EntryType; - protected: - EntryType m_data; - public: - typedef Vc::Memory, 1> Memory; - typedef Vector IndexType; - typedef Scalar::Mask<1u> Mask; - typedef Vector AsArg; - - Vc_ALWAYS_INLINE EntryType &data() { return m_data; } - Vc_ALWAYS_INLINE EntryType data() const { return m_data; } - - enum Constants { Size = 1 }; - - /////////////////////////////////////////////////////////////////////////////////////////// - // uninitialized - Vc_ALWAYS_INLINE Vector() {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // constants - Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerZero::ZEnum) : m_data(0) {} - Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerOne::OEnum) : m_data(1) {} - Vc_ALWAYS_INLINE Vector(VectorSpecialInitializerIndexesFromZero::IEnum) : m_data(0) {} - static Vc_ALWAYS_INLINE Vector Zero() { Vector r; r.m_data = 0; return r; } - static Vc_ALWAYS_INLINE Vector One() { Vector r; r.m_data = 1; return r; } - static Vc_ALWAYS_INLINE Vector IndexesFromZero() { return Zero(); } - static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // static_cast / copy ctor - template explicit Vc_ALWAYS_INLINE Vector(const Vector &x) : m_data(static_cast(x.data())) {} - - // implicit cast - template Vc_ALWAYS_INLINE_L Vector &operator=(const Vector &x) Vc_ALWAYS_INLINE_R; - - // copy assignment - Vc_ALWAYS_INLINE Vector &operator=(Vector v) { m_data = 
v.data(); return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // broadcast - explicit Vc_ALWAYS_INLINE Vector(EntryType x) : m_data(x) {} - template Vc_ALWAYS_INLINE Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : m_data(x) {} - Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { m_data = a; return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // load ctors - explicit Vc_ALWAYS_INLINE Vector(const EntryType *x) : m_data(x[0]) {} - template Vc_ALWAYS_INLINE Vector(const EntryType *x, A) : m_data(x[0]) {} - template explicit Vc_ALWAYS_INLINE Vector(const Other *x) : m_data(x[0]) {} - template Vc_ALWAYS_INLINE Vector(const Other *x, A) : m_data(x[0]) {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // expand 1 float_v to 2 double_v XXX rationale? remove it for release? XXX - template Vc_ALWAYS_INLINE void expand(Vector *x) const { x->data() = static_cast(m_data); } - template explicit Vc_ALWAYS_INLINE Vector(const Vector *a) : m_data(static_cast(a->data())) {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // zeroing - Vc_ALWAYS_INLINE void setZero() { m_data = 0; } - Vc_ALWAYS_INLINE void setZero(Mask k) { if (k) m_data = 0; } - - Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // load member functions - template Vc_ALWAYS_INLINE void load(const Other *mem) { m_data = mem[0]; } - template Vc_ALWAYS_INLINE void load(const Other *mem, A) { m_data = mem[0]; } - template Vc_ALWAYS_INLINE void load(const Other *mem, Mask m) { if (m.data()) m_data = mem[0]; } - - Vc_ALWAYS_INLINE void load(const EntryType *mem) { m_data = mem[0]; } - template Vc_ALWAYS_INLINE void load(const EntryType *mem, A) { 
m_data = mem[0]; } - Vc_ALWAYS_INLINE void load(const EntryType *mem, Mask m) { if (m.data()) m_data = mem[0]; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // stores - Vc_ALWAYS_INLINE void store(EntryType *mem) const { mem[0] = m_data; } - Vc_ALWAYS_INLINE void store(EntryType *mem, Mask m) const { if (m.data()) mem[0] = m_data; } - template Vc_ALWAYS_INLINE void store(EntryType *mem, A) const { store(mem); } - template Vc_ALWAYS_INLINE void store(EntryType *mem, Mask m, A) const { store(mem, m); } - - /////////////////////////////////////////////////////////////////////////////////////////// - // swizzles - Vc_INTRINSIC const Vector &abcd() const { return *this; } - Vc_INTRINSIC const Vector cdab() const { return *this; } - Vc_INTRINSIC const Vector badc() const { return *this; } - Vc_INTRINSIC const Vector aaaa() const { return *this; } - Vc_INTRINSIC const Vector bbbb() const { return *this; } - Vc_INTRINSIC const Vector cccc() const { return *this; } - Vc_INTRINSIC const Vector dddd() const { return *this; } - Vc_INTRINSIC const Vector bcad() const { return *this; } - Vc_INTRINSIC const Vector bcda() const { return *this; } - Vc_INTRINSIC const Vector dabc() const { return *this; } - Vc_INTRINSIC const Vector acbd() const { return *this; } - Vc_INTRINSIC const Vector dbca() const { return *this; } - Vc_INTRINSIC const Vector dcba() const { return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // gathers - template Vc_ALWAYS_INLINE Vector(const EntryType *array, const IndexT *indexes) : m_data(array[indexes[0]]) {} - template Vc_ALWAYS_INLINE Vector(const EntryType *array, Vector indexes) : m_data(array[indexes[0]]) {} - template Vc_ALWAYS_INLINE Vector(const EntryType *array, IndexT indexes, Mask m) : m_data(m.data() ? 
array[indexes[0]] : 0) {} - template Vc_ALWAYS_INLINE Vector(const S1 *array, const EntryType S1::* member1, IT indexes, Mask mask = Mask(true)) - : m_data(mask.data() ? (&array[indexes[0]])->*(member1) : 0) {} - template Vc_ALWAYS_INLINE Vector(const S1 *array, const S2 S1::* member1, - const EntryType S2::* member2, IT indexes, Mask mask = Mask(true)) - : m_data(mask.data() ? array[indexes[0]].*(member1).*(member2) : 0) {} - template Vc_ALWAYS_INLINE Vector(const S1 *array, const EntryType *const S1::* ptrMember1, - IT1 outerIndex, IT2 innerIndex, Mask mask = Mask(true)) - : m_data(mask.data() ? (array[outerIndex[0]].*(ptrMember1))[innerIndex[0]] : 0) {} - - template Vc_ALWAYS_INLINE void gather(const EntryType *array, IT indexes, Mask mask = Mask(true)) - { if (mask.data()) m_data = array[indexes[0]]; } - template Vc_ALWAYS_INLINE void gather(const S1 *array, const EntryType S1::* member1, IT indexes, Mask mask = Mask(true)) - { if (mask.data()) m_data = (&array[indexes[0]])->*(member1); } - template Vc_ALWAYS_INLINE void gather(const S1 *array, const S2 S1::* member1, - const EntryType S2::* member2, IT indexes, Mask mask = Mask(true)) - { if (mask.data()) m_data = array[indexes[0]].*(member1).*(member2); } - template Vc_ALWAYS_INLINE void gather(const S1 *array, const EntryType *const S1::* ptrMember1, - IT1 outerIndex, IT2 innerIndex, Mask mask = Mask(true)) - { if (mask.data()) m_data = (array[outerIndex[0]].*(ptrMember1))[innerIndex[0]]; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // scatters - Vc_ALWAYS_INLINE void scatter(EntryType *array, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]] = m_data; } - template Vc_ALWAYS_INLINE void scatter(S1 *array, EntryType S1::* member, const Vector &indexes, Mask m = Mask(true)) const { - if (m.data()) array[indexes[0]].*(member) = m_data; - } - template Vc_ALWAYS_INLINE void scatter(S1 *array, S2 S1::* member1, EntryType S2::* 
member2, - const Vector &indexes, Mask m = Mask(true)) const { - if (m.data()) array[indexes[0]].*(member1).*(member2) = m_data; - } - - Vc_ALWAYS_INLINE void scatter(EntryType *array, const Vector &indexes, Mask m = Mask(true)) const { if (m.data()) array[indexes[0]] = m_data; } - template Vc_ALWAYS_INLINE void scatter(S1 *array, EntryType S1::* member, const Vector &indexes, Mask m = Mask(true)) const { - if (m.data()) array[indexes[0]].*(member) = m_data; - } - template Vc_ALWAYS_INLINE void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, - const Vector &indexes, Mask m = Mask(true)) const { - if (m.data()) array[indexes[0]].*(member1).*(member2) = m_data; - } - - //prefix - Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; } - Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; } - //postfix - Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; } - Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; } - - Vc_ALWAYS_INLINE EntryType &operator[](size_t index) { - assert(index == 0); if(index) {} - return m_data; - } - - Vc_ALWAYS_INLINE EntryType operator[](size_t index) const { - assert(index == 0); if(index) {} - return m_data; - } - - Vc_ALWAYS_INLINE Vector operator~() const { return Vector(~m_data); } - Vc_ALWAYS_INLINE Vector::Type> operator-() const { return Vector::Type>(-m_data); } - Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; } - -#define OPshift(symbol) \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(const Vector &x) { m_data symbol##= x.m_data; return *this; } \ - Vc_ALWAYS_INLINE Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ - Vc_ALWAYS_INLINE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); } -#define OPshift_int(symbol) \ - Vc_ALWAYS_INLINE Vector operator symbol(int x) const { return Vector(m_data symbol x); } -#define OP(symbol) \ - OPshift(symbol) \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, 
EntryType, Vector) operator symbol(TT x) const { return operator symbol(Vector(x)); } -#define OPcmp(symbol) \ - Vc_ALWAYS_INLINE Mask operator symbol(const Vector &x) const { return Mask(m_data symbol x.m_data); } \ - template Vc_ALWAYS_INLINE VC_EXACT_TYPE(TT, EntryType, Mask) operator symbol(TT x) const { return Mask(m_data symbol x); } - - VC_ALL_ARITHMETICS(OP) - VC_ALL_BINARY(OP) - VC_ALL_SHIFTS(OPshift) - VC_ALL_SHIFTS(OPshift_int) - VC_ALL_COMPARES(OPcmp) -#undef OP -#undef OPcmp -#undef OPshift -#undef OPshift_int - Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; - - Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { - m_data = m_data * factor.data() + summand.data(); - } - - Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) { - if (m.data()) m_data = v.m_data; - } - - template Vc_ALWAYS_INLINE V2 staticCast() const { return V2(static_cast(m_data)); } - template Vc_ALWAYS_INLINE V2 reinterpretCast() const { - typedef typename V2::EntryType AliasT2 Vc_MAY_ALIAS; - return V2(*reinterpret_cast(&m_data)); - } - - Vc_ALWAYS_INLINE WriteMaskedVector operator()(Mask m) { return WriteMaskedVector(this, m); } - - Vc_ALWAYS_INLINE bool pack(Mask &m1, Vector &v2, Mask &m2) { - if (!m1.data() && m2.data()) { - m_data = v2.m_data; - m1 = true; - m2 = false; - return true; - } - return m1; - } - - Vc_ALWAYS_INLINE EntryType min() const { return m_data; } - Vc_ALWAYS_INLINE EntryType max() const { return m_data; } - Vc_ALWAYS_INLINE EntryType product() const { return m_data; } - Vc_ALWAYS_INLINE EntryType sum() const { return m_data; } - Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; } - Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; } - Vc_ALWAYS_INLINE EntryType product(Mask) const { return m_data; } - Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m) return m_data; return static_cast(0); } - - Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? 
*this : Zero(); } - Vc_INTRINSIC Vector rotated(int) const { return *this; } - Vector sorted() const { return *this; } - - template void callWithValuesSorted(F &f) { - f(m_data); - } - - template Vc_INTRINSIC void call(const F &f) const { - f(m_data); - } - template Vc_INTRINSIC void call(F &f) const { - f(m_data); - } - - template Vc_INTRINSIC void call(const F &f, Mask mask) const { - if (mask) { - f(m_data); - } - } - template Vc_INTRINSIC void call(F &f, Mask mask) const { - if (mask) { - f(m_data); - } - } - - template Vc_INTRINSIC Vector apply(const F &f) const { - return Vector(f(m_data)); - } - template Vc_INTRINSIC Vector apply(F &f) const { - return Vector(f(m_data)); - } - - template Vc_INTRINSIC Vector apply(const F &f, Mask mask) const { - if (mask) { - return Vector(f(m_data)); - } else { - return *this; - } - } - template Vc_INTRINSIC Vector apply(F &f, Mask mask) const { - if (mask) { - return Vector(f(m_data)); - } else { - return *this; - } - } - - template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { - m_data = f(0); - } - Vc_INTRINSIC void fill(EntryType (&f)()) { - m_data = f(); - } - - Vc_INTRINSIC_L Vector copySign(Vector reference) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; -}; - -typedef Vector double_v; -typedef Vector float_v; -typedef Vector sfloat_v; -typedef Vector int_v; -typedef Vector uint_v; -typedef Vector short_v; -typedef Vector ushort_v; -typedef double_v::Mask double_m; -typedef float_v::Mask float_m; -typedef sfloat_v::Mask sfloat_m; -typedef int_v::Mask int_m; -typedef uint_v::Mask uint_m; -typedef short_v::Mask short_m; -typedef ushort_v::Mask ushort_m; - -template class SwizzledVector : public Vector {}; - -#ifdef _MSC_VER - template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &) { - } -#else - template static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x01) { - __asm__ __volatile__(""::"r"(x01.data())); - } - template<> Vc_ALWAYS_INLINE void 
forceToRegisters(const Vector &x01) { - __asm__ __volatile__(""::"x"(x01.data())); - } - template<> Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x01) { - __asm__ __volatile__(""::"x"(x01.data())); - } -#endif - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &x01, const Vector &x02) { - forceToRegisters(x01); - forceToRegisters(x02); - } - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, const Vector &) {} - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const 
Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template - static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &) {} - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &) {} - template static Vc_ALWAYS_INLINE void forceToRegisters( - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &, - const Vector &, const Vector &) {} - -} // namespace Scalar -} // namespace Vc -} // namespace ROOT - -#include "vector.tcc" -#include "math.h" -#include "undomacros.h" - -#endif // SCALAR_VECTOR_H diff --git a/math/vc/include/Vc/scalar/vector.tcc b/math/vc/include/Vc/scalar/vector.tcc deleted file mode 100644 index ef2c05d25a5c8..0000000000000 --- a/math/vc/include/Vc/scalar/vector.tcc +++ /dev/null @@ -1,244 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -namespace ROOT { -namespace Vc -{ -ALIGN(64) extern unsigned int RandomState[16]; - -namespace Scalar -{ - -// conversion/casts {{{1 -template<> template<> Vc_INTRINSIC short_v &Vector::operator=(const ushort_v &x) { - data() = static_cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC ushort_v &Vector::operator=(const short_v &x) { - data() = static_cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC int_v &Vector::operator=(const uint_v &x) { - data() = static_cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC uint_v &Vector::operator=(const int_v &x) { - data() = static_cast(x.data()); return *this; -} - -// copySign ///////////////////////////////////////////////////////////////////////// {{{1 -template<> Vc_INTRINSIC Vector Vector::copySign(Vector reference) const -{ - union { - float f; - unsigned int i; - } value, sign; - value.f = data(); - sign.f = reference.data(); - value.i = (sign.i & 0x80000000u) | (value.i & 0x7fffffffu); - return float_v(value.f); -} -template<> Vc_INTRINSIC sfloat_v Vector::copySign(sfloat_v reference) const -{ - return sfloat_v(float_v(m_data).copySign(float_v(reference.data())).data()); -} -template<> Vc_INTRINSIC Vector Vector::copySign(Vector reference) const -{ - union { - double f; - unsigned long long i; - } value, sign; - value.f = data(); 
- sign.f = reference.data(); - value.i = (sign.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull); - return double_v(value.f); -} // }}}1 -// bitwise operators {{{1 -#define VC_CAST_OPERATOR_FORWARD(op, IntT, VecT) \ -template<> Vc_ALWAYS_INLINE VecT &VecT::operator op##=(const VecT &x) { \ - typedef IntT uinta Vc_MAY_ALIAS; \ - uinta *left = reinterpret_cast(&m_data); \ - const uinta *right = reinterpret_cast(&x.m_data); \ - *left op##= *right; \ - return *this; \ -} \ -template<> Vc_ALWAYS_INLINE Vc_PURE VecT VecT::operator op(const VecT &x) const { \ - VecT ret = *this; \ - return VecT(ret op##= x); \ -} -#define VC_CAST_OPERATOR_FORWARD_FLOAT(op) VC_CAST_OPERATOR_FORWARD(op, unsigned int, Vector) -#define VC_CAST_OPERATOR_FORWARD_SFLOAT(op) VC_CAST_OPERATOR_FORWARD(op, unsigned int, Vector) -#define VC_CAST_OPERATOR_FORWARD_DOUBLE(op) VC_CAST_OPERATOR_FORWARD(op, unsigned long long, Vector) -VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_FLOAT) -VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_SFLOAT) -VC_ALL_BINARY(VC_CAST_OPERATOR_FORWARD_DOUBLE) -#undef VC_CAST_OPERATOR_FORWARD -#undef VC_CAST_OPERATOR_FORWARD_FLOAT -#undef VC_CAST_OPERATOR_FORWARD_SFLOAT -#undef VC_CAST_OPERATOR_FORWARD_DOUBLE -// }}}1 -// operators {{{1 -#include "../common/operators.h" -// }}}1 -// exponent {{{1 -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT(m_data >= 0.f); - union { float f; int i; } value; - value.f = m_data; - return float_v(static_cast((value.i >> 23) - 0x7f)); -} -template<> Vc_INTRINSIC sfloat_v Vector::exponent() const -{ - return sfloat_v(float_v(m_data).exponent().data()); -} -template<> Vc_INTRINSIC Vector Vector::exponent() const -{ - VC_ASSERT(m_data >= 0.); - union { double f; long long i; } value; - value.f = m_data; - return double_v(static_cast((value.i >> 52) - 0x3ff)); -} -// }}}1 -// FMA {{{1 -static Vc_ALWAYS_INLINE float highBits(float x) -{ - union { - float f; - unsigned int i; - } y; - y.f = x; - y.i &= 0xfffff000u; - return y.f; 
-} -static Vc_ALWAYS_INLINE double highBits(double x) -{ - union { - double f; - unsigned long long i; - } y; - y.f = x; - y.i &= 0xfffffffff8000000ull; - return y.f; -} -template Vc_ALWAYS_INLINE T _fusedMultiplyAdd(T a, T b, T c) -{ - const T h1 = highBits(a); - const T l1 = a - h1; - const T h2 = highBits(b); - const T l2 = b - h2; - const T ll = l1 * l2; - const T lh = l1 * h2 + h1 * l2; - const T hh = h1 * h2; - if (std::abs(c) < std::abs(lh)) { - return (ll + c) + (lh + hh); - } else { - return (ll + lh) + (c + hh); - } -} -template<> Vc_ALWAYS_INLINE void float_v::fusedMultiplyAdd(const float_v &f, const float_v &s) -{ - data() = _fusedMultiplyAdd(data(), f.data(), s.data()); -} -template<> Vc_ALWAYS_INLINE void sfloat_v::fusedMultiplyAdd(const sfloat_v &f, const sfloat_v &s) -{ - data() = _fusedMultiplyAdd(data(), f.data(), s.data()); -} -template<> Vc_ALWAYS_INLINE void double_v::fusedMultiplyAdd(const double_v &f, const double_v &s) -{ - data() = _fusedMultiplyAdd(data(), f.data(), s.data()); -} -// Random {{{1 -static Vc_ALWAYS_INLINE void _doRandomStep(Vector &state0, - Vector &state1) -{ - state0.load(&Vc::RandomState[0]); - state1.load(&Vc::RandomState[uint_v::Size]); - (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); - uint_v((state0 * 0xdeece66du + 11).data() ^ (state1.data() >> 16)).store(&Vc::RandomState[0]); -} - -template Vc_INTRINSIC Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return Vector(static_cast(state0.data())); -} -template<> Vc_INTRINSIC Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - union { unsigned int i; float f; } x; - x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u; - return float_v(x.f - 1.f); -} -template<> Vc_INTRINSIC sfloat_v Vector::Random() -{ - return sfloat_v(Vector::Random().data()); -} -template<> Vc_INTRINSIC Vector Vector::Random() -{ - typedef unsigned long long uint64 Vc_MAY_ALIAS; - uint64 state0 = 
*reinterpret_cast(&Vc::RandomState[8]); - state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull; - *reinterpret_cast(&Vc::RandomState[8]) = state0; - union { unsigned long long i; double f; } x; - x.i = state0 | 0x3ff0000000000000ull; - return double_v(x.f - 1.); -} -// isNegative {{{1 -template Vc_INTRINSIC Vc_PURE typename Vector::Mask Vector::isNegative() const -{ - union { float f; unsigned int i; } u; - u.f = m_data; - return Mask(0u != (u.i & 0x80000000u)); -} -template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const -{ - union { double d; unsigned long long l; } u; - u.d = m_data; - return double_m(0ull != (u.l & 0x8000000000000000ull)); -} -// setQnan {{{1 -template Vc_INTRINSIC void Vector::setQnan() -{ - union { float f; unsigned int i; } u; - u.i = 0xffffffffu; - m_data = u.f; -} -template<> Vc_INTRINSIC void double_v::setQnan() -{ - union { double d; unsigned long long l; } u; - u.l = 0xffffffffffffffffull; - m_data = u.d; -} -template Vc_INTRINSIC void Vector::setQnan(Mask m) -{ - if (m) { - setQnan(); - } -} -template<> Vc_INTRINSIC void double_v::setQnan(Mask m) -{ - if (m) { - setQnan(); - } -} -// }}}1 -} // namespace Scalar -} // namespace Vc -} // namespace ROOT -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/scalar/writemaskedvector.h b/math/vc/include/Vc/scalar/writemaskedvector.h deleted file mode 100644 index a674a17d298d1..0000000000000 --- a/math/vc/include/Vc/scalar/writemaskedvector.h +++ /dev/null @@ -1,91 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SCALAR_WRITEMASKEDVECTOR_H -#define VC_SCALAR_WRITEMASKEDVECTOR_H - -namespace ROOT { -namespace Vc -{ -namespace Scalar -{ - -template class WriteMaskedVector -{ - friend class Vector; - typedef typename Vector::Mask Mask; - typedef typename Vector::EntryType EntryType; - public: - //prefix - Vc_ALWAYS_INLINE Vector &operator++() { if (mask) ++vec->m_data; return *vec; } - Vc_ALWAYS_INLINE Vector &operator--() { if (mask) --vec->m_data; return *vec; } - //postfix - Vc_ALWAYS_INLINE Vector operator++(int) { if (mask) vec->m_data++; return *vec; } - Vc_ALWAYS_INLINE Vector operator--(int) { if (mask) vec->m_data--; return *vec; } - - Vc_ALWAYS_INLINE Vector &operator+=(Vector x) { if (mask) vec->m_data += x.m_data; return *vec; } - Vc_ALWAYS_INLINE Vector &operator-=(Vector x) { if (mask) vec->m_data -= x.m_data; return *vec; } - Vc_ALWAYS_INLINE Vector &operator*=(Vector x) { if (mask) vec->m_data *= x.m_data; return *vec; } - Vc_ALWAYS_INLINE Vector &operator/=(Vector x) { if (mask) vec->m_data /= x.m_data; return *vec; } - - Vc_ALWAYS_INLINE Vector &operator=(Vector x) { - vec->assign(x, mask); - return *vec; - } - - Vc_ALWAYS_INLINE Vector &operator+=(EntryType x) { if (mask) vec->m_data += x; return *vec; } - Vc_ALWAYS_INLINE Vector &operator-=(EntryType x) { if (mask) vec->m_data -= x; return *vec; } - Vc_ALWAYS_INLINE Vector &operator*=(EntryType x) { if (mask) vec->m_data *= x; return *vec; } - Vc_ALWAYS_INLINE Vector &operator/=(EntryType x) { if (mask) vec->m_data /= x; return *vec; } - - Vc_ALWAYS_INLINE Vector &operator=(EntryType x) { - vec->assign(Vector(x), mask); - return *vec; - } - - 
template Vc_ALWAYS_INLINE void call(const F &f) const { - vec->call(f, mask); - } - template Vc_ALWAYS_INLINE void call(F &f) const { - vec->call(f, mask); - } - template Vc_ALWAYS_INLINE Vector apply(const F &f) const { - if (mask) { - return Vector(f(vec->m_data)); - } else { - return *vec; - } - } - template Vc_ALWAYS_INLINE Vector apply(F &f) const { - if (mask) { - return Vector(f(vec->m_data)); - } else { - return *vec; - } - } - private: - Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, Mask k) : vec(v), mask(k) {} - Vector *const vec; - Mask mask; -}; - -} // namespace Scalar -} // namespace Vc -} // namespace ROOT -#endif // VC_SCALAR_WRITEMASKEDVECTOR_H diff --git a/math/vc/include/Vc/sfloat_v b/math/vc/include/Vc/sfloat_v deleted file mode 100644 index afe84a4c9851d..0000000000000 --- a/math/vc/include/Vc/sfloat_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/sfloat_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/short_v b/math/vc/include/Vc/short_v deleted file mode 100644 index 31cabdcedb535..0000000000000 --- a/math/vc/include/Vc/short_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/short_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/sse/casts.h b/math/vc/include/Vc/sse/casts.h deleted file mode 100644 index 38a8c2ddff4a3..0000000000000 --- a/math/vc/include/Vc/sse/casts.h +++ /dev/null @@ -1,104 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef SSE_CASTS_H -#define SSE_CASTS_H - -#include "intrinsics.h" -#include "types.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - template static Vc_ALWAYS_INLINE To Vc_CONST mm128_reinterpret_cast(VC_ALIGNED_PARAMETER(From) v) { return v; } - template<> Vc_ALWAYS_INLINE _M128I Vc_CONST mm128_reinterpret_cast<_M128I, _M128 >(VC_ALIGNED_PARAMETER(_M128 ) v) { return _mm_castps_si128(v); } - template<> Vc_ALWAYS_INLINE _M128I Vc_CONST mm128_reinterpret_cast<_M128I, _M128D>(VC_ALIGNED_PARAMETER(_M128D) v) { return _mm_castpd_si128(v); } - template<> Vc_ALWAYS_INLINE _M128 Vc_CONST mm128_reinterpret_cast<_M128 , _M128D>(VC_ALIGNED_PARAMETER(_M128D) v) { return _mm_castpd_ps(v); } - template<> Vc_ALWAYS_INLINE _M128 Vc_CONST mm128_reinterpret_cast<_M128 , _M128I>(VC_ALIGNED_PARAMETER(_M128I) v) { return _mm_castsi128_ps(v); } - template<> Vc_ALWAYS_INLINE _M128D Vc_CONST mm128_reinterpret_cast<_M128D, _M128I>(VC_ALIGNED_PARAMETER(_M128I) v) { return _mm_castsi128_pd(v); } - template<> Vc_ALWAYS_INLINE _M128D Vc_CONST mm128_reinterpret_cast<_M128D, _M128 >(VC_ALIGNED_PARAMETER(_M128 ) v) { return _mm_castps_pd(v); } - template static Vc_ALWAYS_INLINE To Vc_CONST sse_cast(VC_ALIGNED_PARAMETER(From) v) { return mm128_reinterpret_cast(v); } - - template struct StaticCastHelper {}; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_cvttps_epi32(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128D &v) { return _mm_cvttpd_epi32(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const 
_M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { - return _mm_castps_si128(mm_blendv_ps( - _mm_castsi128_ps(_mm_cvttps_epi32(v)), - _mm_castsi128_ps(_mm_add_epi32(_mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))), _mm_set1_epi32(1 << 31))), - _mm_cmpge_ps(v, _mm_set1_ps(1u << 31)) - )); - - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128D &v) { return _mm_cvttpd_epi32(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128 &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128D &v) { return _mm_cvtpd_ps(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128I &v) { return _mm_cvtepi32_ps(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128 cast(const _M128I &v) { - return mm_blendv_ps( - _mm_cvtepi32_ps(v), - _mm_add_ps(_mm_cvtepi32_ps(_mm_sub_epi32(v, _mm_set1_epi32(1 << 31))), _mm_set1_ps(1u << 31)), - _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())) - ); - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128 &v) { return _mm_cvtps_pd(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128D &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128I &v) { return _mm_cvtepi32_pd(v); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128D cast(const _M128I &v) { return _mm_cvtepi32_pd(v); } }; - - template<> struct 
StaticCastHelper { static Vc_ALWAYS_INLINE M256 cast(const _M128I &v) { - return M256::create(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v, _mm_setzero_si128())), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, _mm_setzero_si128()))); - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE M256 cast(const _M128I &v) { - const _M128I neg = _mm_cmplt_epi16(v, _mm_setzero_si128()); - return M256::create(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v, neg)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, neg))); - } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v[0]), _mm_cvttps_epi32(v[1])); } }; -#ifdef VC_IMPL_SSE4_1 - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { return _mm_packus_epi32(_mm_cvttps_epi32(v[0]), _mm_cvttps_epi32(v[1])); } }; -#else - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const M256 &v) { - return _mm_add_epi16(_mm_set1_epi16(-32768), - _mm_packs_epi32( - _mm_add_epi32(_mm_set1_epi32(-32768), _mm_cvttps_epi32(v[0])), - _mm_add_epi32(_mm_set1_epi32(-32768), _mm_cvttps_epi32(v[1])) - ) - ); - } }; -#endif - - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128 &v) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; - template<> struct StaticCastHelper { static Vc_ALWAYS_INLINE _M128I cast(const _M128I &v) { return v; } }; -} // namespace SSE -} // namespace Vc -} // 
namespace ROOT - -#endif // SSE_CASTS_H diff --git a/math/vc/include/Vc/sse/const.h b/math/vc/include/Vc/sse/const.h deleted file mode 100644 index b03af2d5ae585..0000000000000 --- a/math/vc/include/Vc/sse/const.h +++ /dev/null @@ -1,108 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SSE_CONST_H -#define VC_SSE_CONST_H - -#include "const_data.h" -#include "vector.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - template class Vector; - - template struct Const - { - typedef Vector<_T> V; - typedef typename V::EntryType T; - typedef typename V::Mask M; - enum Constants { Stride = 16 / sizeof(T) }; - - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return load(&c_trig::data[0 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return load(&c_trig::data[1 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return load(&c_trig::data[2 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return load(&c_trig::data[3 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return load(&c_trig::data[4 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _16() { return load(&c_trig::data[5 * Stride]); } - - static Vc_ALWAYS_INLINE Vc_CONST V cosCoeff(int i) { return load(&c_trig::data[( 8 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V sinCoeff(int 
i) { return load(&c_trig::data[(14 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return load(&c_trig::data[(24 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return load(&c_trig::data[(29 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return load(&c_trig::data[34 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return load(&c_trig::data[35 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return load(&c_trig::data[36 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig::data[20 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return load(&c_trig::data[21 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return load(&c_trig::data[22 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return load(&c_trig::data[23 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig::data[(40 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig::data[(45 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig::data[(49 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig::data[(55 + i) * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return load(&c_trig::data[37 * Stride]); } - static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig::data[38 * Stride]); } - - static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log::d(1)).data()); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log::d(18)); } - static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log::d(15)); } - static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log::d(2 + i)); } - static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log::d(8 + i)); } - static Vc_ALWAYS_INLINE Vc_CONST V min() { return 
load(c_log::d(14)); } - static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log::d(17)); } - static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log::d(16)); } - static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log::d(13)); } - static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log::d(19)); } - static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log::d(20)); } - - static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; - static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R; - private: - static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R; - }; - template Vc_ALWAYS_INLINE Vc_CONST Vector Const::load(const T *mem) { return V(mem); } - template<> Vc_ALWAYS_INLINE Vc_CONST sfloat_v Const::load(const float *mem) { return M256::dup(float_v(mem).data()); } - - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector(reinterpret_cast(&c_general::highMaskFloat)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector(reinterpret_cast(&c_general::highMaskDouble)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits)); } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { - return M256::dup(Const::highMask(bits).data()); - } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::P(int i) { - return M256::dup(Const::P(i).data()); - } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector Const::Q(int i) { - return M256::dup(Const::Q(i).data()); - } - template<> Vc_ALWAYS_INLINE Vc_CONST Vector::Mask Const::exponentMask() { - return M256::dup(Const::exponentMask().data()); - } -} // namespace SSE -} // namespace Vc -} // namespace ROOT - 
-#include "undomacros.h" - -#endif // VC_SSE_CONST_H diff --git a/math/vc/include/Vc/sse/const_data.h b/math/vc/include/Vc/sse/const_data.h deleted file mode 100644 index ba21ece303f77..0000000000000 --- a/math/vc/include/Vc/sse/const_data.h +++ /dev/null @@ -1,77 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_SSE_CONST_DATA_H -#define VC_SSE_CONST_DATA_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -ALIGN(16) extern const unsigned int _IndexesFromZero4[4]; -ALIGN(16) extern const unsigned short _IndexesFromZero8[8]; -ALIGN(16) extern const unsigned char _IndexesFromZero16[16]; - -struct c_general -{ - ALIGN(64) static const unsigned int allone[4]; - ALIGN(16) static const unsigned short one16[8]; - ALIGN(16) static const unsigned int one32[4]; - ALIGN(16) static const float oneFloat[4]; - ALIGN(16) static const double oneDouble[2]; - ALIGN(16) static const int absMaskFloat[4]; - ALIGN(16) static const long long absMaskDouble[2]; - ALIGN(16) static const unsigned int signMaskFloat[4]; - ALIGN(16) static const unsigned int highMaskFloat[4]; - ALIGN(16) static const unsigned long long signMaskDouble[2]; - ALIGN(16) static const unsigned long long highMaskDouble[2]; - ALIGN(16) static const short minShort[8]; - ALIGN(16) static const unsigned long long frexpMask[2]; -}; - -template 
struct c_trig -{ - ALIGN(64) static const T data[]; -}; - -template struct c_log -{ - enum VectorSize { Size = 16 / sizeof(T) }; - static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast(&data[i * Size]); } - ALIGN(64) static const unsigned int data[]; -}; - -template<> struct c_log -{ - enum VectorSize { Size = 16 / sizeof(double) }; - static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast(&data[i * Size]); } - ALIGN(64) static const unsigned long long data[]; -}; - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_SSE_CONST_DATA_H diff --git a/math/vc/include/Vc/sse/debug.h b/math/vc/include/Vc/sse/debug.h deleted file mode 100644 index 92e51a1dbcb11..0000000000000 --- a/math/vc/include/Vc/sse/debug.h +++ /dev/null @@ -1,90 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SSE_DEBUG_H -#define VC_SSE_DEBUG_H - -#ifndef NDEBUG -#include "types.h" -#include -#include -#endif - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -#ifdef NDEBUG -class DebugStream -{ - public: - DebugStream(const char *, const char *, int) {} - template inline DebugStream &operator<<(const T &) { return *this; } -}; -#else -class DebugStream -{ - private: - template static void printVector(V _x) - { - enum { Size = sizeof(V) / sizeof(T) }; - union { V v; T m[Size]; } x = { _x }; - std::cerr << '[' << std::setprecision(24) << x.m[0]; - for (int i = 1; i < Size; ++i) { - std::cerr << ", " << std::setprecision(24) << x.m[i]; - } - std::cerr << ']'; - } - public: - DebugStream(const char *func, const char *file, int line) - { - std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' '; - } - - template DebugStream &operator<<(const T &x) { std::cerr << x; return *this; } - - DebugStream &operator<<(__m128 x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m128d x) { - printVector(x); - return *this; - } - DebugStream &operator<<(__m128i x) { - printVector(x); - return *this; - } - - ~DebugStream() - { - std::cerr << "\033[0m" << std::endl; - } -}; -#endif - -#define VC_DEBUG ::ROOT::Vc::SSE::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__) - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#endif // VC_SSE_DEBUG_H diff --git a/math/vc/include/Vc/sse/deinterleave.tcc b/math/vc/include/Vc/sse/deinterleave.tcc deleted file mode 100644 index 1204c0d80bb18..0000000000000 --- a/math/vc/include/Vc/sse/deinterleave.tcc +++ /dev/null @@ -1,237 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -inline void deinterleave(Vector &a, Vector &b) -{ - const _M128 tmp0 = _mm_unpacklo_ps(a.data(), b.data()); - const _M128 tmp1 = _mm_unpackhi_ps(a.data(), b.data()); - a.data() = _mm_unpacklo_ps(tmp0, tmp1); - b.data() = _mm_unpackhi_ps(tmp0, tmp1); -} - -inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) -{ - a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16)); - b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16)); -} - -inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) -{ - a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16)); - b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16)); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - _M128 tmp0 = _mm_unpacklo_ps(a.data()[0], a.data()[1]); - _M128 tmp1 = _mm_unpackhi_ps(a.data()[0], a.data()[1]); - _M128 tmp2 = _mm_unpacklo_ps(b.data()[0], b.data()[1]); - _M128 tmp3 = _mm_unpackhi_ps(b.data()[0], b.data()[1]); - a.data()[0] = _mm_unpacklo_ps(tmp0, tmp1); - b.data()[0] = _mm_unpackhi_ps(tmp0, tmp1); - a.data()[1] = _mm_unpacklo_ps(tmp2, tmp3); - b.data()[1] = _mm_unpackhi_ps(tmp2, tmp3); -} - -inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp0, Vector::AsArg tmp1) -{ - a.data()[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp0.data(), 16), 16)); - b.data()[0] = _mm_cvtepi32_ps(_mm_srai_epi32(tmp0.data(), 16)); - a.data()[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp1.data(), 16), 16)); - b.data()[1] = _mm_cvtepi32_ps(_mm_srai_epi32(tmp1.data(), 16)); -} - -inline void 
deinterleave(Vector &a, Vector &b, Vector::AsArg tmp0, Vector::AsArg tmp1) -{ - a.data()[0] = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp0.data(), 16), 16)); - b.data()[0] = _mm_cvtepi32_ps(_mm_srli_epi32(tmp0.data(), 16)); - a.data()[1] = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp1.data(), 16), 16)); - b.data()[1] = _mm_cvtepi32_ps(_mm_srli_epi32(tmp1.data(), 16)); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - _M128D tmp = _mm_unpacklo_pd(a.data(), b.data()); - b.data() = _mm_unpackhi_pd(a.data(), b.data()); - a.data() = tmp; -} - -inline void deinterleave(Vector &a, Vector &b) -{ - const _M128I tmp0 = _mm_unpacklo_epi32(a.data(), b.data()); - const _M128I tmp1 = _mm_unpackhi_epi32(a.data(), b.data()); - a.data() = _mm_unpacklo_epi32(tmp0, tmp1); - b.data() = _mm_unpackhi_epi32(tmp0, tmp1); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - const _M128I tmp0 = _mm_unpacklo_epi32(a.data(), b.data()); - const _M128I tmp1 = _mm_unpackhi_epi32(a.data(), b.data()); - a.data() = _mm_unpacklo_epi32(tmp0, tmp1); - b.data() = _mm_unpackhi_epi32(tmp0, tmp1); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - _M128I tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - _M128I tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - _M128I tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - _M128I tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = _mm_unpackhi_epi16(tmp2, tmp3); -} - -inline void deinterleave(Vector &a, Vector &b) -{ - _M128I tmp0 = _mm_unpacklo_epi16(a.data(), b.data()); // a0 a4 b0 b4 a1 a5 b1 b5 - _M128I tmp1 = _mm_unpackhi_epi16(a.data(), b.data()); // a2 a6 b2 b6 a3 a7 b3 b7 - _M128I tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a2 a4 a6 b0 b2 b4 b6 - _M128I tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // a1 a3 a5 a7 b1 b3 b5 b7 - a.data() = _mm_unpacklo_epi16(tmp2, tmp3); - b.data() = 
_mm_unpackhi_epi16(tmp2, tmp3); -} - -inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) -{ - a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16); - b.data() = _mm_srai_epi32(tmp.data(), 16); -} - -inline void deinterleave(Vector &a, Vector &b, Vector::AsArg tmp) -{ - a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16); - b.data() = _mm_srli_epi32(tmp.data(), 16); -} - -} // namespace SSE - - -namespace Internal -{ - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const float *m, A align) -{ - a.load(m, align); - b.load(m + float_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const short *m, A align) -{ - short_v tmp(m, align); - Vc::SSE::deinterleave(a, b, tmp); -} - -template inline void HelperImpl::deinterleave( - float_v &a, float_v &b, const unsigned short *m, A align) -{ - ushort_v tmp(m, align); - Vc::SSE::deinterleave(a, b, tmp); -} - -template inline void HelperImpl::deinterleave( - sfloat_v &a, sfloat_v &b, const float *m, A align) -{ - a.load(m, align); - b.load(m + sfloat_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - sfloat_v &a, sfloat_v &b, const short *m, A align) -{ - short_v tmp0(m, align); - short_v tmp1(m + short_v::Size, align); - Vc::SSE::deinterleave(a, b, tmp0, tmp1); -} - -template inline void HelperImpl::deinterleave( - sfloat_v &a, sfloat_v &b, const unsigned short *m, A align) -{ - ushort_v tmp0(m, align); - ushort_v tmp1(m + short_v::Size, align); - Vc::SSE::deinterleave(a, b, tmp0, tmp1); -} - -template inline void HelperImpl::deinterleave( - double_v &a, double_v &b, const double *m, A align) -{ - a.load(m, align); - b.load(m + double_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const int *m, A align) -{ - a.load(m, align); - b.load(m + int_v::Size, align); - 
Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - int_v &a, int_v &b, const short *m, A align) -{ - short_v tmp(m, align); - Vc::SSE::deinterleave(a, b, tmp); -} - -template inline void HelperImpl::deinterleave( - uint_v &a, uint_v &b, const unsigned int *m, A align) -{ - a.load(m, align); - b.load(m + uint_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - uint_v &a, uint_v &b, const unsigned short *m, A align) -{ - ushort_v tmp(m, align); - Vc::SSE::deinterleave(a, b, tmp); -} - -template inline void HelperImpl::deinterleave( - short_v &a, short_v &b, const short *m, A align) -{ - a.load(m, align); - b.load(m + short_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -template inline void HelperImpl::deinterleave( - ushort_v &a, ushort_v &b, const unsigned short *m, A align) -{ - a.load(m, align); - b.load(m + ushort_v::Size, align); - Vc::SSE::deinterleave(a, b); -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/sse/forceToRegisters.tcc b/math/vc/include/Vc/sse/forceToRegisters.tcc deleted file mode 100644 index a151731767fe6..0000000000000 --- a/math/vc/include/Vc/sse/forceToRegisters.tcc +++ /dev/null @@ -1,141 +0,0 @@ -#ifdef VC_GNU_ASM -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { - __asm__ __volatile__(""::"x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x1) { - __asm__ __volatile__("":"+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ 
__volatile__(""::"x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x7.data()), "x"(x6.data()), 
"x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x8, const Vector &x7, const Vector &x6, const Vector &x5, const Vector &x4, const Vector &x3, const Vector &x2, const Vector &x1) { - __asm__ __volatile__(""::"x"(x8.data()), "x"(x7.data()), "x"(x6.data()), "x"(x5.data()), "x"(x4.data()), "x"(x3.data()), "x"(x2.data()), "x"(x1.data())); -} -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &x8, Vector &x7, Vector &x6, Vector &x5, Vector &x4, Vector &x3, Vector &x2, Vector &x1) { - __asm__ __volatile__("":"+x"(x8.data()), "+x"(x7.data()), "+x"(x6.data()), "+x"(x5.data()), "+x"(x4.data()), "+x"(x3.data()), "+x"(x2.data()), "+x"(x1.data())); -} -#elif defined(VC_MSVC) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template 
-static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegistersDirty(Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x8*/, const Vector &/*x7*/, const Vector &/*x6*/, const Vector &/*x5*/, const Vector &/*x4*/, const Vector &/*x3*/, const Vector &/*x2*/, const Vector &/*x1*/) { -} -#pragma optimize("g", off) -template -static Vc_ALWAYS_INLINE void 
forceToRegistersDirty(Vector &/*x8*/, Vector &/*x7*/, Vector &/*x6*/, Vector &/*x5*/, Vector &/*x4*/, Vector &/*x3*/, Vector &/*x2*/, Vector &/*x1*/) { -} -#pragma optimize("g", on) -#else -#error "forceToRegisters unsupported on this compiler" -#endif diff --git a/math/vc/include/Vc/sse/helperimpl.h b/math/vc/include/Vc/sse/helperimpl.h deleted file mode 100644 index 8acbb035b80f9..0000000000000 --- a/math/vc/include/Vc/sse/helperimpl.h +++ /dev/null @@ -1,87 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SSE_DEINTERLEAVE_H -#define VC_SSE_DEINTERLEAVE_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template<> struct HelperImpl -{ - typedef SSE::Vector float_v; - typedef SSE::Vector sfloat_v; - typedef SSE::Vector double_v; - typedef SSE::Vector int_v; - typedef SSE::Vector uint_v; - typedef SSE::Vector short_v; - typedef SSE::Vector ushort_v; - - template static void deinterleave(float_v &, float_v &, const float *, A); - template static void deinterleave(float_v &, float_v &, const short *, A); - template static void deinterleave(float_v &, float_v &, const unsigned short *, A); - - template static void deinterleave(sfloat_v &, sfloat_v &, const float *, A); - template static void deinterleave(sfloat_v &, sfloat_v &, const short *, A); - template static void deinterleave(sfloat_v &, sfloat_v &, const unsigned short *, A); - - template static void deinterleave(double_v &, double_v &, const double *, A); - - template static void deinterleave(int_v &, int_v &, const int *, A); - template static void deinterleave(int_v &, int_v &, const short *, A); - - template static void deinterleave(uint_v &, uint_v &, const unsigned int *, A); - template static void deinterleave(uint_v &, uint_v &, const unsigned short *, A); - - template static void deinterleave(short_v &, short_v &, const short *, A); - - template static void deinterleave(ushort_v &, ushort_v &, const unsigned short *, A); - - static Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr) Vc_ALWAYS_INLINE_R; - - template - static Vc_ALWAYS_INLINE_L void *malloc(size_t n) Vc_ALWAYS_INLINE_R; - static 
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; -}; - -template<> struct HelperImpl : public HelperImpl {}; -template<> struct HelperImpl : public HelperImpl {}; -template<> struct HelperImpl : public HelperImpl {}; -template<> struct HelperImpl : public HelperImpl {}; - - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#include "deinterleave.tcc" -#include "prefetches.tcc" -#include "helperimpl.tcc" -#include "undomacros.h" - -#endif // VC_SSE_DEINTERLEAVE_H diff --git a/math/vc/include/Vc/sse/helperimpl.tcc b/math/vc/include/Vc/sse/helperimpl.tcc deleted file mode 100644 index 8ebed60a4c733..0000000000000 --- a/math/vc/include/Vc/sse/helperimpl.tcc +++ /dev/null @@ -1,66 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SSE_HELPERIMPL_TCC -#define VC_SSE_HELPERIMPL_TCC - -#include - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -template -static _VC_CONSTEXPR size_t nextMultipleOf(size_t value) -{ - return (value % X) > 0 ? 
value + X - (value % X) : value; -} - -template -Vc_ALWAYS_INLINE void *HelperImpl::malloc(size_t n) -{ - switch (A) { - case Vc::AlignOnVector: - return _mm_malloc(nextMultipleOf(n), Vc::SSE::VectorAlignment); - case Vc::AlignOnCacheline: - // TODO: hardcoding 64 is not such a great idea - return _mm_malloc(nextMultipleOf<64>(n), 64); - case Vc::AlignOnPage: - // TODO: hardcoding 4096 is not such a great idea - return _mm_malloc(nextMultipleOf<4096>(n), 4096); - default: -#ifndef NDEBUG - abort(); -#endif - return _mm_malloc(n, 8); - } -} - -Vc_ALWAYS_INLINE void HelperImpl::free(void *p) -{ - _mm_free(p); -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#endif // VC_SSE_HELPERIMPL_TCC diff --git a/math/vc/include/Vc/sse/interleavedmemory.tcc b/math/vc/include/Vc/sse/interleavedmemory.tcc deleted file mode 100644 index 15ad70f7e7942..0000000000000 --- a/math/vc/include/Vc/sse/interleavedmemory.tcc +++ /dev/null @@ -1,1014 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef VC_SSE_INTERLEAVEDMEMORY_TCC -#define VC_SSE_INTERLEAVEDMEMORY_TCC - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace Common -{ - -namespace -{ -template struct InterleaveImpl; -template<> struct InterleaveImpl { - static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ - const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1) - { - const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp2 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]); - const __m128 tmp3 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); - - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), tmp2); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), tmp2); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), tmp3); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), tmp3); - }/*}}}*/ - static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ - const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, const SSE::sfloat_v::AsArg v2) - { -#ifdef VC_USE_MASKMOV_SCATTER - const __m128i mask = _mm_set_epi32(0, -1, -1, -1); - - const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v2.data()[0]); - const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v2.data()[0]); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast(&data[i[0]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast(&data[i[1]])); - 
_mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast(&data[i[2]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast(&data[i[3]])); - - const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]); - const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); - const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v2.data()[1]); - const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v2.data()[1]); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp8, tmp10)), mask, reinterpret_cast(&data[i[4]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp10, tmp8)), mask, reinterpret_cast(&data[i[5]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp9, tmp11)), mask, reinterpret_cast(&data[i[6]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp11, tmp9)), mask, reinterpret_cast(&data[i[7]])); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/ - const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, - const SSE::sfloat_v::AsArg v2, const SSE::sfloat_v::AsArg v3) - { - const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]); - const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v3.data()[0]); - const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v3.data()[0]); - _mm_storeu_ps(&data[i[0]], _mm_movelh_ps(tmp0, tmp2)); - _mm_storeu_ps(&data[i[1]], _mm_movehl_ps(tmp2, tmp0)); - _mm_storeu_ps(&data[i[2]], _mm_movelh_ps(tmp1, tmp3)); - _mm_storeu_ps(&data[i[3]], _mm_movehl_ps(tmp3, tmp1)); - - const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]); - const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]); - const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v3.data()[1]); - const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v3.data()[1]); - 
_mm_storeu_ps(&data[i[4]], _mm_movelh_ps(tmp8, tmp10)); - _mm_storeu_ps(&data[i[5]], _mm_movehl_ps(tmp10, tmp8)); - _mm_storeu_ps(&data[i[6]], _mm_movelh_ps(tmp9, tmp11)); - _mm_storeu_ps(&data[i[7]], _mm_movehl_ps(tmp11, tmp9)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); - const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); -#ifdef __x86_64__ - const long long tmp00 = _mm_cvtsi128_si64(tmp0); - const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); - const long long tmp10 = _mm_cvtsi128_si64(tmp1); - const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); - *reinterpret_cast(&data[i[0]]) = tmp00; - *reinterpret_cast(&data[i[1]]) = tmp00 >> 32; - *reinterpret_cast(&data[i[2]]) = tmp01; - *reinterpret_cast(&data[i[3]]) = tmp01 >> 32; - *reinterpret_cast(&data[i[4]]) = tmp10; - *reinterpret_cast(&data[i[5]]) = tmp10 >> 32; - *reinterpret_cast(&data[i[6]]) = tmp11; - *reinterpret_cast(&data[i[7]]) = tmp11 >> 32; -#elif defined(VC_IMPL_SSE4_1) - *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); - *reinterpret_cast(&data[i[1]]) = _mm_extract_epi32(tmp0, 1); - *reinterpret_cast(&data[i[2]]) = _mm_extract_epi32(tmp0, 2); - *reinterpret_cast(&data[i[3]]) = _mm_extract_epi32(tmp0, 3); - *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); - *reinterpret_cast(&data[i[5]]) = _mm_extract_epi32(tmp1, 1); - *reinterpret_cast(&data[i[6]]) = _mm_extract_epi32(tmp1, 2); - *reinterpret_cast(&data[i[7]]) = _mm_extract_epi32(tmp1, 3); -#else - *reinterpret_cast(&data[i[0]]) = _mm_cvtsi128_si32(tmp0); - *reinterpret_cast(&data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4)); - *reinterpret_cast(&data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8)); - *reinterpret_cast(&data[i[3]]) 
= _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12)); - *reinterpret_cast(&data[i[4]]) = _mm_cvtsi128_si32(tmp1); - *reinterpret_cast(&data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4)); - *reinterpret_cast(&data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8)); - *reinterpret_cast(&data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12)); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { -#ifdef VC_USE_MASKMOV_SCATTER - const __m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); - const __m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0); - typename V::EntryType *const dataHi = data - 4; - const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); - const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); - const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); - - const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast(&data[i[0]])); - _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast(&dataHi[i[1]])); - _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast(&data[i[2]])); - _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast(&dataHi[i[3]])); - _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast(&data[i[4]])); - _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast(&dataHi[i[5]])); - _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast(&data[i[6]])); - _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast(&dataHi[i[7]])); -#else - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ 
- const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); - const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); - const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); - const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); - - const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); - const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); - const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); - const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); - - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); - _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - 
const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { -#ifdef VC_USE_MASKMOV_SCATTER - const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); - const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); - const __m128i mask = _mm_set_epi32(0, -1, -1, -1); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast(&data[i[0]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast(&data[i[1]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast(&data[i[2]])); - _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast(&data[i[3]])); -#else - const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); - _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); - _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); - v2.scatter(data + 2, i); -#endif - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); - const __m128 tmp2 = 
_mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); - const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); - _mm_storeu_ps(reinterpret_cast(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2)); - _mm_storeu_ps(reinterpret_cast(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0)); - _mm_storeu_ps(reinterpret_cast(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3)); - _mm_storeu_ps(reinterpret_cast(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1)); - }/*}}}*/ -}; -template struct InterleaveImpl { - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1) - { - const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data()); - const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data()); - _mm_storeu_pd(&data[i[0]], tmp0); - _mm_storeu_pd(&data[i[1]], tmp1); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) - { - interleave(data, i, v0, v1); - v2.scatter(data + 2, i); - }/*}}}*/ - static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/ - const typename V::AsArg v0, const typename V::AsArg v1, - const typename V::AsArg v2, const typename V::AsArg v3) - { - interleave(data, i, v0, v1); - interleave(data + 2, i, v2, v3); - }/*}}}*/ -}; -} // anonymous namespace - -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2); -}/*}}}*/ -template Vc_ALWAYS_INLINE void 
InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) -{ - InterleaveImpl::interleave(m_data, m_indexes, v0, v1, v2, v3); - v4.scatter(m_data + 4, m_indexes); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5) -{ - InterleaveImpl::interleave(m_data , m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6); -}/*}}}*/ -template Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::interleave(const typename V::AsArg v0,/*{{{*/ - const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, - const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) -{ - InterleaveImpl::interleave(m_data + 0, m_indexes, v0, v1, v2, v3); - InterleaveImpl::interleave(m_data + 4, m_indexes, v4, v5, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1) const/*{{{*/ -{ - const __m128 a = 
_mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0]]))); - const __m128 b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1]]))); - const __m128 c = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2]]))); - const __m128 d = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3]]))); - - const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - - v0.data() = _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/ -{ - const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); - - const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX] - const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX] - - v0.data() = _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); - v2.data() = _mm_movelh_ps(tmp2, tmp3); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/ -{ - const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); - - const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] - const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] - - v0.data() = _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); - v2.data() = 
_mm_movelh_ps(tmp2, tmp3); - v3.data() = _mm_movehl_ps(tmp3, tmp2); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/ -{ - v4.gather(m_data, m_indexes + I(4)); - deinterleave(v0, v1, v2, v3); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/ -{ - const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); - const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); - - const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] - const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] - - const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); - const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); - - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - v0.data() = _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); - - const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] - v2.data() = _mm_movelh_ps(tmp2, tmp3); - v3.data() = _mm_movehl_ps(tmp3, tmp2); - - const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] - v4.data() = _mm_movelh_ps(tmp4, tmp5); - v5.data() = _mm_movehl_ps(tmp5, tmp4); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/ -{ - const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); - const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); - - const __m128 
tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] - const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] - const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1] - - const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); - const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); - - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - v0.data() = _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); - - const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] - v2.data() = _mm_movelh_ps(tmp2, tmp3); - v3.data() = _mm_movehl_ps(tmp3, tmp2); - - const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] - v4.data() = _mm_movelh_ps(tmp4, tmp5); - v5.data() = _mm_movehl_ps(tmp5, tmp4); - - const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3] - v6.data() = _mm_movelh_ps(tmp6, tmp7); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/ -{ - const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]); - const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]); - - const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1] - const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1] - const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1] - const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1] - - const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]); - const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]); - - const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3] - v0.data() 
= _mm_movelh_ps(tmp0, tmp1); - v1.data() = _mm_movehl_ps(tmp1, tmp0); - - const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3] - v2.data() = _mm_movelh_ps(tmp2, tmp3); - v3.data() = _mm_movehl_ps(tmp3, tmp2); - - const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3] - v4.data() = _mm_movelh_ps(tmp4, tmp5); - v5.data() = _mm_movehl_ps(tmp5, tmp4); - - const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3] - v6.data() = _mm_movelh_ps(tmp6, tmp7); - v7.data() = _mm_movehl_ps(tmp7, tmp6); -}/*}}}*/ - -static inline void _sse_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/ -{ - const __m128d a = _mm_loadu_pd(&data[indexes[0]]); - const __m128d b = _mm_loadu_pd(&data[indexes[1]]); - - v0.data() = _mm_unpacklo_pd(a, b); - v1.data() = _mm_unpackhi_pd(a, b); -}/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1) const {/*{{{*/ - _sse_deinterleave_double(m_data, m_indexes, v0, v1); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v &v2) const { - v2.gather(m_data + 2, m_indexes); - _sse_deinterleave_double(m_data, m_indexes, v0, v1); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v &v2, double_v &v3) const { - _sse_deinterleave_double(m_data , m_indexes, v0, v1); - _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v &v2, double_v &v3, double_v &v4) const { - v4.gather(m_data + 4, m_indexes); - _sse_deinterleave_double(m_data , m_indexes, v0, v1); - _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v 
&v2, double_v &v3, double_v &v4, double_v &v5) const { - _sse_deinterleave_double(m_data , m_indexes, v0, v1); - _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const { - v6.gather(m_data + 6, m_indexes); - _sse_deinterleave_double(m_data , m_indexes, v0, v1); - _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); -} -/*}}}*/ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(double_v &v0, double_v &v1,/*{{{*/ - double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const { - _sse_deinterleave_double(m_data , m_indexes, v0, v1); - _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3); - _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5); - _sse_deinterleave_double(m_data + 6, m_indexes, v6, v7); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/ - const __m128i a = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_cvtsi32_si128(*reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 
b6 c2 c6 d2 d6 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2) const { - const __m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void 
InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3) const { - const __m128i a = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadl_epi64(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4) const { - const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = 
_mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5) const { - const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = 
_mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v 
&v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const { - const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 
h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); -}/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(short_v &v0, short_v &v1,/*{{{*/ - short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const { - const __m128i a = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[0]])); - const __m128i b = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[1]])); - const __m128i c = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[2]])); - const __m128i d = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[3]])); - const __m128i e = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[4]])); - const __m128i f = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[5]])); - const __m128i g = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[6]])); - const __m128i h = _mm_loadu_si128(reinterpret_cast(&m_data[m_indexes[7]])); - - const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 - const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5 - const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6 - const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7 - const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4 - const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5 - const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6 - const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7 - - const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 - const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7 - const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 
c6 d0 d2 d4 d6 - const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7 - const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6 - const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7 - const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6 - const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7 - - v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); - v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); - v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); - v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); - v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); - v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); - v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); - v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); -}/*}}}*/ - -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/ -{ - const __m128 i0a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0]]))); - const __m128 i1a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1]]))); - const __m128 i2a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2]]))); - const __m128 i3a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3]]))); - const __m128 i4a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[4]]))); - const __m128 i5a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[5]]))); - const __m128 i6a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[6]]))); - const __m128 i7a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[7]]))); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, 
ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i5a = 
_mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); - v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - v4.gather(m_data + float_v::Size, m_indexes); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - 
v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); - v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - const __m128 i0b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[0] + float_v::Size]))); - const __m128 i1b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[1] + float_v::Size]))); - const __m128 i2b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[2] + float_v::Size]))); - const __m128 i3b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[3] + float_v::Size]))); - const __m128 i4b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[4] + float_v::Size]))); - const __m128 i5b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[5] + float_v::Size]))); - const __m128 i6b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[6] + 
float_v::Size]))); - const __m128 i7b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&m_data[m_indexes[7] + float_v::Size]))); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); - v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); - - const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] - const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] - const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] - const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] - v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); - v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 
i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]); - const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); - v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); - - const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] - const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] - const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] - const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] - v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); - v5.data() = 
Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); - - const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1] - const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3] - const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5] - const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7] - v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67)); -} -/*}}}*/ -template<> inline void InterleavedMemoryAccessBase::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/ -{ - const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]); - const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]); - const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]); - const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]); - const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]); - const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]); - const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]); - const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]); - const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]); - const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]); - const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]); - const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]); - const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]); - const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]); - const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]); - const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]); - - const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1] - const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3] - const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5] - const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7] - v0.data() = 
Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67)); - v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45)); - - const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1] - const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3] - const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5] - const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7] - v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67)); - v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45)); - - const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1] - const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3] - const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5] - const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7] - v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67)); - v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45)); - - const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1] - const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3] - const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5] - const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7] - v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67)); - v7.data() = Vc::SSE::M256::create(_mm_movehl_ps(gh23, gh01), _mm_movehl_ps(gh67, gh45)); -}/*}}}*/ - -// forward types of equal size - ugly, but it works/*{{{*/ -#define _forward(V, V2) \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ 
- reinterpret_cast(v2)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6)); \ -} \ -template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase::deinterleave(V &v0, V &v1, V &v2, V &v3, \ - V &v4, V &v5, V &v6, V &v7) const { \ - reinterpret_cast *>(this)->deinterleave(reinterpret_cast(v0), reinterpret_cast(v1), \ - reinterpret_cast(v2), reinterpret_cast(v3), reinterpret_cast(v4), \ - reinterpret_cast(v5), reinterpret_cast(v6), reinterpret_cast(v7)); \ -} -_forward( int_v, float_v) -_forward(uint_v, float_v) -_forward(ushort_v, short_v) -#undef _forward/*}}}*/ - -} // namespace Common -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_SSE_INTERLEAVEDMEMORY_TCC - -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/sse/intrinsics.h b/math/vc/include/Vc/sse/intrinsics.h deleted file 
mode 100644 index 6ea08df9e2cd3..0000000000000 --- a/math/vc/include/Vc/sse/intrinsics.h +++ /dev/null @@ -1,602 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef SSE_INTRINSICS_H -#define SSE_INTRINSICS_H - -#include "../common/windows_fix_intrin.h" - -// The GCC xxxintrin.h headers do not make sure that the intrinsics have C linkage. This not really -// a problem, unless there is another place where the exact same functions are declared. Then the -// linkage must be the same, otherwise it won't compile. Such a case occurs on Windows, where the -// intrin.h header (included indirectly via unistd.h) declares many SSE intrinsics again. 
-extern "C" { -// MMX -#include -// SSE -#include -// SSE2 -#include -} - -#include "../common/fix_clang_emmintrin.h" - -#include "const_data.h" -#include -#include "macros.h" - -#ifdef __3dNOW__ -extern "C" { -#include -} -#endif - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - enum VectorAlignmentEnum { VectorAlignment = 16 }; - -#if defined(VC_GCC) && VC_GCC < 0x40600 && !defined(VC_DONT_FIX_SSE_SHIFT) - static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; } - static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; } - static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; } - static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; } - static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; } - static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; } -#endif - -#ifdef VC_GCC - // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin - // functions. This way the fp-contraction optimization step kicks in and creates FMAs! 
:) - static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); } - static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); } - static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); } - static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); } - static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); } - static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); } -#endif - -#if defined(VC_GNU_ASM) && !defined(NVALGRIND) - static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r; __asm__("pcmpeqb %0,%0":"=x"(r)); return r; } -#else - static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone() { __m128i r = _mm_setzero_si128(); return _mm_cmpeq_epi8(r, r); } -#endif - static Vc_INTRINSIC __m128i Vc_CONST _mm_setallone_si128() { return _mm_setallone(); } - static Vc_INTRINSIC __m128d Vc_CONST _mm_setallone_pd() { return _mm_castsi128_pd(_mm_setallone()); } - static Vc_INTRINSIC __m128 Vc_CONST _mm_setallone_ps() { return _mm_castsi128_ps(_mm_setallone()); } - - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_set1_epi8(1); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast(c_general::one16)); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return 
_mm_load_si128(reinterpret_cast(c_general::one32)); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } - - static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); } - static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); } - - static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast(c_general::absMaskDouble)); } - static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast(c_general::absMaskFloat)); } - static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast(c_general::signMaskDouble)); } - static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast(c_general::signMaskFloat)); } - - //X static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi8 () { return _mm_slli_epi8 (_mm_setallone_si128(), 7); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi16() { return _mm_load_si128(reinterpret_cast(c_general::minShort)); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_setmin_epi32() { return _mm_load_si128(reinterpret_cast(c_general::signMaskFloat)); } - - //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu8 (__m128i a, __m128i b) { return _mm_cmplt_epi8 ( - //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } - //X static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu8 (__m128i a, __m128i b) { return _mm_cmpgt_epi8 ( - //X _mm_xor_si128(a, _mm_setmin_epi8 ()), _mm_xor_si128(b, _mm_setmin_epi8 ())); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16( - _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16( - _mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } - static Vc_INTRINSIC 
__m128i Vc_CONST _mm_cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32( - _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } - static Vc_INTRINSIC __m128i Vc_CONST _mm_cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32( - _mm_xor_si128(a, _mm_setmin_epi32()), _mm_xor_si128(b, _mm_setmin_epi32())); } -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -// SSE3 -#ifdef VC_IMPL_SSE3 -extern "C" { -#include -} -#endif -// SSSE3 -#ifdef VC_IMPL_SSSE3 -extern "C" { -#include -} -#define mm_abs_epi8 _mm_abs_epi8 -#define mm_abs_epi16 _mm_abs_epi16 -#define mm_abs_epi32 _mm_abs_epi32 -#define mm_alignr_epi8 _mm_alignr_epi8 -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - - // not overriding _mm_set1_epi8 because this one should only be used for non-constants - static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) { -#if defined(VC_GCC) && VC_GCC < 0x40500 - return _mm_shuffle_epi8(_mm_cvtsi32_si128(a), _mm_setzero_si128()); -#else - // GCC 4.5 nows about the pshufb improvement - return _mm_set1_epi8(a); -#endif - } - -} // namespace SSE -} // namespace Vc -} // namespace ROOT -#else -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi8 (__m128i a) { - __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128()); - return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_setone_epi8())); - } - // positive value: - // negative == 0 - // a unchanged after xor - // 0 >> 31 -> 0 - // a + 0 -> a - // negative value: - // negative == -1 - // a xor -1 -> -a - 1 - // -1 >> 31 -> 1 - // -a - 1 + 1 -> -a - static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi16(__m128i a) { - __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128()); - return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_abs_epi32(__m128i a) { - __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128()); - return 
_mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31)); - } - static Vc_INTRINSIC __m128i Vc_CONST set1_epi8(int a) { - return _mm_set1_epi8(a); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_alignr_epi8(__m128i a, __m128i b, const int s) { - switch (s) { - case 0: return b; - case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1)); - case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2)); - case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3)); - case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4)); - case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5)); - case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6)); - case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7)); - case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8)); - case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9)); - case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10)); - case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11)); - case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12)); - case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13)); - case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14)); - case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15)); - case 16: return a; - case 17: return _mm_srli_si128(a, 1); - case 18: return _mm_srli_si128(a, 2); - case 19: return _mm_srli_si128(a, 3); - case 20: return _mm_srli_si128(a, 4); - case 21: return _mm_srli_si128(a, 5); - case 22: return _mm_srli_si128(a, 6); - case 23: return _mm_srli_si128(a, 7); - case 24: return _mm_srli_si128(a, 8); - case 25: return _mm_srli_si128(a, 9); - case 26: return _mm_srli_si128(a, 10); - case 27: return _mm_srli_si128(a, 11); - case 28: return _mm_srli_si128(a, 12); - case 29: return _mm_srli_si128(a, 13); - case 30: return 
_mm_srli_si128(a, 14); - case 31: return _mm_srli_si128(a, 15); - } - return _mm_setzero_si128(); - } - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#endif - -// SSE4.1 -#ifdef VC_IMPL_SSE4_1 -extern "C" { -#include -} -namespace ROOT { -namespace Vc -{ -namespace SSE -{ -#define mm_blendv_pd _mm_blendv_pd -#define mm_blendv_ps _mm_blendv_ps -#define mm_blendv_epi8 _mm_blendv_epi8 -#define mm_blend_epi16 _mm_blend_epi16 -#define mm_blend_ps _mm_blend_ps -#define mm_blend_pd _mm_blend_pd - -#define mm_min_epi32 _mm_min_epi32 -#define mm_max_epi32 _mm_max_epi32 -#define mm_min_epu32 _mm_min_epu32 -#define mm_max_epu32 _mm_max_epu32 -//#define mm_min_epi16 _mm_min_epi16 -//#define mm_max_epi16 _mm_max_epi16 -#define mm_min_epu16 _mm_min_epu16 -#define mm_max_epu16 _mm_max_epu16 -#define mm_min_epi8 _mm_min_epi8 -#define mm_max_epi8 _mm_max_epi8 - -#define mm_cvtepu16_epi32 _mm_cvtepu16_epi32 -#define mm_cvtepu8_epi16 _mm_cvtepu8_epi16 -#define mm_cvtepi8_epi16 _mm_cvtepi8_epi16 -#define mm_cvtepu16_epi32 _mm_cvtepu16_epi32 -#define mm_cvtepi16_epi32 _mm_cvtepi16_epi32 -#define mm_cvtepu8_epi32 _mm_cvtepu8_epi32 -#define mm_cvtepi8_epi32 _mm_cvtepi8_epi32 -#define mm_stream_load_si128 _mm_stream_load_si128 -// TODO -} // namespace SSE -} // namespace Vc -} // namespace ROOT -#else -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - static Vc_INTRINSIC __m128d mm_blendv_pd(__m128d a, __m128d b, __m128d c) { - return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b)); - } - static Vc_INTRINSIC __m128 mm_blendv_ps(__m128 a, __m128 b, __m128 c) { - return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b)); - } - static Vc_INTRINSIC __m128i mm_blendv_epi8(__m128i a, __m128i b, __m128i c) { - return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); - } - - // only use the following blend functions with immediates as mask and, of course, compiling - // with optimization - static Vc_INTRINSIC __m128d mm_blend_pd(__m128d a, __m128d b, const int mask) { 
- switch (mask) { - case 0x0: - return a; - case 0x1: - return _mm_shuffle_pd(b, a, 2); - case 0x2: - return _mm_shuffle_pd(a, b, 2); - case 0x3: - return b; - default: - abort(); - return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value' - } - } - static Vc_INTRINSIC __m128 mm_blend_ps(__m128 a, __m128 b, const int mask) { - __m128i c; - switch (mask) { - case 0x0: - return a; - case 0x1: - c = _mm_srli_si128(_mm_setallone_si128(), 12); - break; - case 0x2: - c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4); - break; - case 0x3: - c = _mm_srli_si128(_mm_setallone_si128(), 8); - break; - case 0x4: - c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8); - break; - case 0x5: - c = _mm_set_epi32(0, -1, 0, -1); - break; - case 0x6: - c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4); - break; - case 0x7: - c = _mm_srli_si128(_mm_setallone_si128(), 4); - break; - case 0x8: - c = _mm_slli_si128(_mm_setallone_si128(), 12); - break; - case 0x9: - c = _mm_set_epi32(-1, 0, 0, -1); - break; - case 0xa: - c = _mm_set_epi32(-1, 0, -1, 0); - break; - case 0xb: - c = _mm_set_epi32(-1, 0, -1, -1); - break; - case 0xc: - c = _mm_slli_si128(_mm_setallone_si128(), 8); - break; - case 0xd: - c = _mm_set_epi32(-1, -1, 0, -1); - break; - case 0xe: - c = _mm_slli_si128(_mm_setallone_si128(), 4); - break; - case 0xf: - return b; - default: // may not happen - abort(); - c = _mm_setzero_si128(); - break; - } - __m128 _c = _mm_castsi128_ps(c); - return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b)); - } - static Vc_INTRINSIC __m128i mm_blend_epi16(__m128i a, __m128i b, const int mask) { - __m128i c; - switch (mask) { - case 0x00: - return a; - case 0x01: - c = _mm_srli_si128(_mm_setallone_si128(), 14); - break; - case 0x03: - c = _mm_srli_si128(_mm_setallone_si128(), 12); - break; - case 0x07: - c = _mm_srli_si128(_mm_setallone_si128(), 10); - break; - case 0x0f: - return 
_mm_unpackhi_epi64(_mm_slli_si128(b, 8), a); - case 0x1f: - c = _mm_srli_si128(_mm_setallone_si128(), 6); - break; - case 0x3f: - c = _mm_srli_si128(_mm_setallone_si128(), 4); - break; - case 0x7f: - c = _mm_srli_si128(_mm_setallone_si128(), 2); - break; - case 0x80: - c = _mm_slli_si128(_mm_setallone_si128(), 14); - break; - case 0xc0: - c = _mm_slli_si128(_mm_setallone_si128(), 12); - break; - case 0xe0: - c = _mm_slli_si128(_mm_setallone_si128(), 10); - break; - case 0xf0: - c = _mm_slli_si128(_mm_setallone_si128(), 8); - break; - case 0xf8: - c = _mm_slli_si128(_mm_setallone_si128(), 6); - break; - case 0xfc: - c = _mm_slli_si128(_mm_setallone_si128(), 4); - break; - case 0xfe: - c = _mm_slli_si128(_mm_setallone_si128(), 2); - break; - case 0xff: - return b; - case 0xcc: - return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1))); - case 0x33: - return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1))); - default: - const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff); - c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15); - break; - } - return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); - } - - static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi8 (__m128i a, __m128i b) { - return mm_blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_max_epi32(__m128i a, __m128i b) { - return mm_blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); - } -//X static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu8 (__m128i a, __m128i b) { -//X return mm_blendv_epi8(b, a, _mm_cmpgt_epu8 (a, b)); -//X } - static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu16(__m128i a, __m128i b) { - return mm_blendv_epi8(b, a, _mm_cmpgt_epu16(a, b)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_max_epu32(__m128i a, __m128i b) { - return mm_blendv_epi8(b, a, _mm_cmpgt_epu32(a, b)); - } 
-//X static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu8 (__m128i a, __m128i b) { -//X return mm_blendv_epi8(a, b, _mm_cmpgt_epu8 (a, b)); -//X } - static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu16(__m128i a, __m128i b) { - return mm_blendv_epi8(a, b, _mm_cmpgt_epu16(a, b)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_min_epu32(__m128i a, __m128i b) { - return mm_blendv_epi8(a, b, _mm_cmpgt_epu32(a, b)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi8 (__m128i a, __m128i b) { - return mm_blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); - } - static Vc_INTRINSIC __m128i Vc_CONST mm_min_epi32(__m128i a, __m128i b) { - return mm_blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi16(__m128i epu8) { - return _mm_unpacklo_epi8(epu8, _mm_setzero_si128()); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi16(__m128i epi8) { - return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128())); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu16_epi32(__m128i epu16) { - return _mm_unpacklo_epi16(epu16, _mm_setzero_si128()); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi16_epi32(__m128i epu16) { - return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128())); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepu8_epi32(__m128i epu8) { - return mm_cvtepu16_epi32(mm_cvtepu8_epi16(epu8)); - } - static Vc_INTRINSIC Vc_CONST __m128i mm_cvtepi8_epi32(__m128i epi8) { - const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128()); - const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg); - return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg)); - } - static Vc_INTRINSIC Vc_PURE __m128i mm_stream_load_si128(__m128i *mem) { - return _mm_load_si128(mem); - } - -} // namespace SSE -} // namespace Vc -} // namespace ROOT -#endif - -#ifdef VC_IMPL_POPCNT -#include -#endif - -// SSE4.2 -#ifdef VC_IMPL_SSE4_2 -extern "C" { -#include -} -#endif - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - 
static Vc_INTRINSIC Vc_CONST float extract_float_imm(const __m128 v, const size_t i) { - float f = 0.; - switch (i) { - case 0: - f = _mm_cvtss_f32(v); - break; -#if defined VC_IMPL_SSE4_1 && !defined VC_MSVC - default: -#ifdef VC_GCC - f = __builtin_ia32_vec_ext_v4sf(static_cast<__v4sf>(v), (i)); -#else - // MSVC fails to compile this because it can't optimize i to an immediate - _MM_EXTRACT_FLOAT(f, v, i); -#endif - break; -#else - case 1: - f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 4))); - break; - case 2: - f = _mm_cvtss_f32(_mm_movehl_ps(v, v)); - break; - case 3: - f = _mm_cvtss_f32(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v), 12))); - break; -#endif - } - return f; - } - static Vc_INTRINSIC Vc_CONST double extract_double_imm(const __m128d v, const size_t i) { - if (i == 0) { - return _mm_cvtsd_f64(v); - } - return _mm_cvtsd_f64(_mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(v), _mm_castpd_ps(v)))); - } - static Vc_INTRINSIC Vc_CONST float extract_float(const __m128 v, const size_t i) { -#ifdef VC_GCC - if (__builtin_constant_p(i)) { - return extract_float_imm(v, i); -//X if (index <= 1) { -//X unsigned long long tmp = _mm_cvtsi128_si64(_mm_castps_si128(v)); -//X if (index == 0) tmp &= 0xFFFFFFFFull; -//X if (index == 1) tmp >>= 32; -//X return Common::AliasingEntryHelper(tmp); -//X } - } else { - typedef float float4[4] Vc_MAY_ALIAS; - const float4 &data = reinterpret_cast(v); - return data[i]; - } -#else - union { __m128 v; float m[4]; } u; - u.v = v; - return u.m[i]; -#endif - } - - static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) { -#ifdef VC_IMPL_SSE4_1 - return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); -#else - return _mm_load_ps(mem); -#endif - } - static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) { -#ifdef VC_IMPL_SSE4_1 - return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); -#else - return 
_mm_load_pd(mem); -#endif - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) { -#ifdef VC_IMPL_SSE4_1 - return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem))); -#else - return _mm_load_si128(reinterpret_cast(mem)); -#endif - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) { - return _mm_stream_load(reinterpret_cast(mem)); - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) { - return _mm_stream_load(reinterpret_cast(mem)); - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) { - return _mm_stream_load(reinterpret_cast(mem)); - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) { - return _mm_stream_load(reinterpret_cast(mem)); - } - static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) { - return _mm_stream_load(reinterpret_cast(mem)); - } -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -// XOP / FMA4 -#if defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4) -extern "C" { -#include -} -#endif - -#include "undomacros.h" -#include "shuffle.h" - -#endif // SSE_INTRINSICS_H diff --git a/math/vc/include/Vc/sse/limits.h b/math/vc/include/Vc/sse/limits.h deleted file mode 100644 index b1802080d17d0..0000000000000 --- a/math/vc/include/Vc/sse/limits.h +++ /dev/null @@ -1,81 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SSE_LIMITS_H -#define VC_SSE_LIMITS_H - -#include "intrinsics.h" -#include "types.h" -#include "macros.h" - -namespace std -{ -template<> struct numeric_limits< ::ROOT::Vc::SSE::ushort_v> : public numeric_limits -{ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v max() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::_mm_setallone_si128(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v lowest() _VC_NOEXCEPT { return min(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v epsilon() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v round_error() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v infinity() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v quiet_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v signaling_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::ushort_v denorm_min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::ushort_v::Zero(); } -}; -template<> struct numeric_limits< ::ROOT::Vc::SSE::short_v> : public numeric_limits -{ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v max() _VC_NOEXCEPT { return _mm_srli_epi16(::ROOT::Vc::SSE::_mm_setallone_si128(), 1); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::_mm_setmin_epi16(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v lowest() _VC_NOEXCEPT { return min(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v epsilon() _VC_NOEXCEPT { 
return ::ROOT::Vc::SSE::short_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v round_error() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::short_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v infinity() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::short_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v quiet_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::short_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v signaling_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::short_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::short_v denorm_min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::short_v::Zero(); } -}; -template<> struct numeric_limits< ::ROOT::Vc::SSE::uint_v> : public numeric_limits -{ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v max() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::_mm_setallone_si128(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v lowest() _VC_NOEXCEPT { return min(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v epsilon() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v round_error() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v infinity() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v quiet_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v signaling_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::uint_v denorm_min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::uint_v::Zero(); } -}; -template<> struct numeric_limits< ::ROOT::Vc::SSE::int_v> : public numeric_limits -{ - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v 
max() _VC_NOEXCEPT { return _mm_srli_epi32(::ROOT::Vc::SSE::_mm_setallone_si128(), 1); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::_mm_setmin_epi32(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v lowest() _VC_NOEXCEPT { return min(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v epsilon() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v round_error() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v infinity() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v quiet_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v signaling_NaN() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } - static Vc_INTRINSIC Vc_CONST ::ROOT::Vc::SSE::int_v denorm_min() _VC_NOEXCEPT { return ::ROOT::Vc::SSE::int_v::Zero(); } -}; -} // namespace std - -#include "undomacros.h" - -#endif // VC_SSE_LIMITS_H diff --git a/math/vc/include/Vc/sse/macros.h b/math/vc/include/Vc/sse/macros.h deleted file mode 100644 index b68b229beb166..0000000000000 --- a/math/vc/include/Vc/sse/macros.h +++ /dev/null @@ -1,47 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. 
If not, see . - -*/ - -#include "../common/macros.h" - -#ifndef VC_SSE_MACROS_H -#define VC_SSE_MACROS_H -#undef VC_SSE_UNDOMACROS_H - -#ifndef _M128 -# define _M128 __m128 -#endif - -#ifndef _M128I -# define _M128I __m128i -#endif - -#ifndef _M128D -# define _M128D __m128d -#endif - -#define STORE_VECTOR(type, name, vec) \ - union { __m128i p; type v[16 / sizeof(type)]; } CAT(u, __LINE__); \ - _mm_store_si128(&CAT(u, __LINE__).p, vec); \ - const type *const name = &CAT(u, __LINE__).v[0] - -#if defined(VC_IMPL_SSE4_1) && !defined(VC_DISABLE_PTEST) -#define VC_USE_PTEST -#endif - -#endif // VC_SSE_MACROS_H diff --git a/math/vc/include/Vc/sse/mask.h b/math/vc/include/Vc/sse/mask.h deleted file mode 100644 index 5876b7de8f8eb..0000000000000 --- a/math/vc/include/Vc/sse/mask.h +++ /dev/null @@ -1,578 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef SSE_MASK_H -#define SSE_MASK_H - -#include "intrinsics.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -template struct MaskHelper; -template<> struct MaskHelper<2> { - static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2)); } - static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2)); } -}; -template<> struct MaskHelper<4> { - static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_ps(k1) == _mm_movemask_ps(k2); } - static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_ps(k1) != _mm_movemask_ps(k2); } -}; -template<> struct MaskHelper<8> { - static Vc_ALWAYS_INLINE Vc_CONST bool cmpeq (_M128 k1, _M128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) == _mm_movemask_epi8(_mm_castps_si128(k2)); } - static Vc_ALWAYS_INLINE Vc_CONST bool cmpneq(_M128 k1, _M128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) != _mm_movemask_epi8(_mm_castps_si128(k2)); } -}; - -class Float8Mask; -template class Mask -{ - friend class Mask<2u>; - friend class Mask<4u>; - friend class Mask<8u>; - friend class Mask<16u>; - friend class Float8Mask; - public: - FREE_STORE_OPERATORS_ALIGNED(16) - - // abstracts the way Masks are passed to functions, it can easily be changed to const ref here - // Also Float8Mask requires const ref on MSVC 32bit. 
-#if defined VC_MSVC && defined _WIN32 - typedef const Mask &Argument; -#else - typedef Mask Argument; -#endif - - Vc_ALWAYS_INLINE Mask() {} - Vc_ALWAYS_INLINE Mask(const __m128 &x) : k(x) {} - Vc_ALWAYS_INLINE Mask(const __m128d &x) : k(_mm_castpd_ps(x)) {} - Vc_ALWAYS_INLINE Mask(const __m128i &x) : k(_mm_castsi128_ps(x)) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {} - Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? _mm_setallone_ps() : _mm_setzero_ps()) {} - Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {} - Vc_ALWAYS_INLINE Mask(const Mask *a) - : k(_mm_castsi128_ps(_mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {} - Vc_ALWAYS_INLINE explicit Mask(const Float8Mask &m); - - template Vc_ALWAYS_INLINE_L explicit Mask(const Mask &x) Vc_ALWAYS_INLINE_R; -//X { -//X _M128I tmp = x.dataI(); -//X if (OtherSize < VectorSize) { -//X tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); -//X if (VectorSize / OtherSize >= 4u) { tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); } -//X if (VectorSize / OtherSize >= 8u) { tmp = _mm_packs_epi16(tmp, _mm_setzero_si128()); } -//X } else if (OtherSize > VectorSize) { -//X tmp = _mm_unpacklo_epi8(tmp, tmp); -//X if (OtherSize / VectorSize >= 4u) { tmp = _mm_unpacklo_epi8(tmp, tmp); } -//X if (OtherSize / VectorSize >= 8u) { tmp = _mm_unpacklo_epi8(tmp, tmp); } -//X } -//X k = _mm_castsi128_ps(tmp); -//X } - - inline void expand(Mask *x) const; - - Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const { return MaskHelper::cmpeq (k, rhs.k); } - Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const { return MaskHelper::cmpneq(k, rhs.k); } - - Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const { return _mm_andnot_si128(dataI(), _mm_setallone_si128()); } - - Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask 
&operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; } - Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; } - - Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return -#ifdef VC_USE_PTEST - _mm_testc_si128(dataI(), _mm_setallone_si128()); // return 1 if (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) == (~0 & k) -#else - _mm_movemask_epi8(dataI()) == 0xffff; -#endif - } - Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return -#ifdef VC_USE_PTEST - _mm_testz_si128(dataI(), dataI()); // return 1 if (0, 0, 0, 0) == (k & k) -#else - _mm_movemask_epi8(dataI()) == 0x0000; -#endif - } - Vc_ALWAYS_INLINE Vc_PURE bool isMix() const { -#ifdef VC_USE_PTEST - return _mm_test_mix_ones_zeros(dataI(), _mm_setallone_si128()); -#else - const int tmp = _mm_movemask_epi8(dataI()); - return tmp != 0 && (tmp ^ 0xffff) != 0; -#endif - } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE Vc_PURE operator bool() const { return isFull(); } -#endif - - Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE Vc_PURE _M128 data () const { return k; } - Vc_ALWAYS_INLINE Vc_PURE _M128I dataI() const { return _mm_castps_si128(k); } - Vc_ALWAYS_INLINE Vc_PURE _M128D dataD() const { return _mm_castps_pd(k); } - - template Vc_ALWAYS_INLINE Vc_PURE Mask cast() const { return Mask(k); } - - Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - /** - * Returns the index of the first one in the mask. - * - * The return value is undefined if the mask is empty. 
- */ - Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - _M128 k; -}; - -struct ForeachHelper -{ - _long mask; - bool brk; - bool outerBreak; - Vc_ALWAYS_INLINE ForeachHelper(_long _mask) : mask(_mask), brk(false), outerBreak(false) {} - Vc_ALWAYS_INLINE bool outer() const { return (mask != 0) && !outerBreak; } - Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); } - Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; } - Vc_ALWAYS_INLINE _long next() { - outerBreak = true; -#ifdef VC_GNU_ASM - const _long bit = __builtin_ctzl(mask); - __asm__("btr %1,%0" : "+r"(mask) : "r"(bit)); -#elif defined(_WIN64) - unsigned long bit; - _BitScanForward64(&bit, mask); - _bittestandreset64(&mask, bit); -#elif defined(_WIN32) - unsigned long bit; - _BitScanForward(&bit, mask); - _bittestandreset(&mask, bit); -#else -#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de" -#endif - return bit; - } -}; - -#define Vc_foreach_bit(_it_, _mask_) \ - for (Vc::SSE::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \ - for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak()) - -template Vc_ALWAYS_INLINE Vc_PURE int Mask::shiftMask() const -{ - return _mm_movemask_epi8(dataI()); -} - -template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<4> &x) { - k = _mm_unpacklo_ps(x.data(), x.data()); -} -template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<8> &x) { - _M128I tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); - k = _mm_castsi128_ps(_mm_unpacklo_epi32(tmp, tmp)); -} -template<> template<> Vc_ALWAYS_INLINE Mask<2>::Mask(const Mask<16> &x) { - _M128I tmp = _mm_unpacklo_epi8(x.dataI(), x.dataI()); - tmp = _mm_unpacklo_epi16(tmp, tmp); - k = _mm_castsi128_ps(_mm_unpacklo_epi32(tmp, tmp)); -} -template<> template<> 
Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<2> &x) { - k = _mm_castsi128_ps(_mm_packs_epi16(x.dataI(), _mm_setzero_si128())); -} -template<> template<> Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<8> &x) { - k = _mm_castsi128_ps(_mm_unpacklo_epi16(x.dataI(), x.dataI())); -} -template<> template<> Vc_ALWAYS_INLINE Mask<4>::Mask(const Mask<16> &x) { - _M128I tmp = _mm_unpacklo_epi8(x.dataI(), x.dataI()); - k = _mm_castsi128_ps(_mm_unpacklo_epi16(tmp, tmp)); -} -template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<2> &x) { - _M128I tmp = _mm_packs_epi16(x.dataI(), x.dataI()); - k = _mm_castsi128_ps(_mm_packs_epi16(tmp, tmp)); -} -template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<4> &x) { - k = _mm_castsi128_ps(_mm_packs_epi16(x.dataI(), x.dataI())); -} -template<> template<> Vc_ALWAYS_INLINE Mask<8>::Mask(const Mask<16> &x) { - k = _mm_castsi128_ps(_mm_unpacklo_epi8(x.dataI(), x.dataI())); -} - -template<> inline void Mask< 4>::expand(Mask<2> *x) const { - x[0].k = _mm_unpacklo_ps(data(), data()); - x[1].k = _mm_unpackhi_ps(data(), data()); -} -template<> inline void Mask< 8>::expand(Mask<4> *x) const { - x[0].k = _mm_castsi128_ps(_mm_unpacklo_epi16(dataI(), dataI())); - x[1].k = _mm_castsi128_ps(_mm_unpackhi_epi16(dataI(), dataI())); -} -template<> inline void Mask<16>::expand(Mask<8> *x) const { - x[0].k = _mm_castsi128_ps(_mm_unpacklo_epi8 (dataI(), dataI())); - x[1].k = _mm_castsi128_ps(_mm_unpackhi_epi8 (dataI(), dataI())); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 2>::toInt() const { return _mm_movemask_pd(dataD()); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 4>::toInt() const { return _mm_movemask_ps(data ()); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask< 8>::toInt() const { return _mm_movemask_epi8(_mm_packs_epi16(dataI(), _mm_setzero_si128())); } -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16>::toInt() const { return _mm_movemask_epi8(dataI()); } - -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 2>::operator[](int 
index) const { return toInt() & (1 << index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 4>::operator[](int index) const { return toInt() & (1 << index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask< 8>::operator[](int index) const { return shiftMask() & (1 << 2 * index); } -template<> Vc_ALWAYS_INLINE Vc_PURE bool Mask<16>::operator[](int index) const { return toInt() & (1 << index); } - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<2>::count() const -{ - int mask = _mm_movemask_pd(dataD()); - return (mask & 1) + (mask >> 1); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<4>::count() const -{ -#ifdef VC_IMPL_POPCNT - return _mm_popcnt_u32(_mm_movemask_ps(data())); -//X tmp = (tmp & 5) + ((tmp >> 1) & 5); -//X return (tmp & 3) + ((tmp >> 2) & 3); -#else - _M128I x = _mm_srli_epi32(dataI(), 31); - x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); - x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(x); -#endif -} - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<8>::count() const -{ -#ifdef VC_IMPL_POPCNT - return _mm_popcnt_u32(_mm_movemask_epi8(dataI())) / 2; -#else -//X int tmp = _mm_movemask_epi8(dataI()); -//X tmp = (tmp & 0x1111) + ((tmp >> 2) & 0x1111); -//X tmp = (tmp & 0x0303) + ((tmp >> 4) & 0x0303); -//X return (tmp & 0x000f) + ((tmp >> 8) & 0x000f); - _M128I x = _mm_srli_epi16(dataI(), 15); - x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); - x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3))); - x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1))); - return _mm_extract_epi16(x, 0); -#endif -} - -template<> Vc_ALWAYS_INLINE Vc_PURE int Mask<16>::count() const -{ - int tmp = _mm_movemask_epi8(dataI()); -#ifdef VC_IMPL_POPCNT - return _mm_popcnt_u32(tmp); -#else - tmp = (tmp & 0x5555) + ((tmp >> 1) & 0x5555); - tmp = (tmp & 0x3333) + ((tmp >> 2) & 0x3333); - tmp = (tmp & 0x0f0f) + ((tmp >> 4) & 0x0f0f); - return (tmp & 
0x00ff) + ((tmp >> 8) & 0x00ff); -#endif -} - - -class Float8Mask -{ - enum Constants { - PartialSize = 4, - VectorSize = 8 - }; - public: - FREE_STORE_OPERATORS_ALIGNED(16) - - // abstracts the way Masks are passed to functions, it can easily be changed to const ref here - // Also Float8Mask requires const ref on MSVC 32bit. -#if defined VC_MSVC && defined _WIN32 - typedef const Float8Mask & Argument; -#else - typedef Float8Mask Argument; -#endif - - Vc_ALWAYS_INLINE Float8Mask() {} - Vc_ALWAYS_INLINE Float8Mask(const M256 &x) : k(x) {} - Vc_ALWAYS_INLINE explicit Float8Mask(VectorSpecialInitializerZero::ZEnum) { - k[0] = _mm_setzero_ps(); - k[1] = _mm_setzero_ps(); - } - Vc_ALWAYS_INLINE explicit Float8Mask(VectorSpecialInitializerOne::OEnum) { - k[0] = _mm_setallone_ps(); - k[1] = _mm_setallone_ps(); - } - Vc_ALWAYS_INLINE explicit Float8Mask(bool b) { - const __m128 tmp = b ? _mm_setallone_ps() : _mm_setzero_ps(); - k[0] = tmp; - k[1] = tmp; - } - Vc_ALWAYS_INLINE Float8Mask(const Mask &a) { - k[0] = _mm_castsi128_ps(_mm_unpacklo_epi16(a.dataI(), a.dataI())); - k[1] = _mm_castsi128_ps(_mm_unpackhi_epi16(a.dataI(), a.dataI())); - } - - Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Float8Mask &rhs) const { - return MaskHelper::cmpeq (k[0], rhs.k[0]) - && MaskHelper::cmpeq (k[1], rhs.k[1]); - } - Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Float8Mask &rhs) const { - return MaskHelper::cmpneq(k[0], rhs.k[0]) - || MaskHelper::cmpneq(k[1], rhs.k[1]); - } - - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator&&(const Float8Mask &rhs) const { - Float8Mask r; - r.k[0] = _mm_and_ps(k[0], rhs.k[0]); - r.k[1] = _mm_and_ps(k[1], rhs.k[1]); - return r; - } - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator& (const Float8Mask &rhs) const { - Float8Mask r; - r.k[0] = _mm_and_ps(k[0], rhs.k[0]); - r.k[1] = _mm_and_ps(k[1], rhs.k[1]); - return r; - } - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator||(const Float8Mask &rhs) const { - Float8Mask r; - r.k[0] = _mm_or_ps(k[0], rhs.k[0]); - 
r.k[1] = _mm_or_ps(k[1], rhs.k[1]); - return r; - } - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator| (const Float8Mask &rhs) const { - Float8Mask r; - r.k[0] = _mm_or_ps(k[0], rhs.k[0]); - r.k[1] = _mm_or_ps(k[1], rhs.k[1]); - return r; - } - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator^ (const Float8Mask &rhs) const { - Float8Mask r; - r.k[0] = _mm_xor_ps(k[0], rhs.k[0]); - r.k[1] = _mm_xor_ps(k[1], rhs.k[1]); - return r; - } - Vc_ALWAYS_INLINE Vc_PURE Float8Mask operator!() const { - Float8Mask r; - r.k[0] = _mm_andnot_ps(k[0], _mm_setallone_ps()); - r.k[1] = _mm_andnot_ps(k[1], _mm_setallone_ps()); - return r; - } - Vc_ALWAYS_INLINE Float8Mask &operator&=(const Float8Mask &rhs) { - k[0] = _mm_and_ps(k[0], rhs.k[0]); - k[1] = _mm_and_ps(k[1], rhs.k[1]); - return *this; - } - Vc_ALWAYS_INLINE Float8Mask &operator|=(const Float8Mask &rhs) { - k[0] = _mm_or_ps (k[0], rhs.k[0]); - k[1] = _mm_or_ps (k[1], rhs.k[1]); - return *this; - } - Vc_ALWAYS_INLINE Float8Mask &operator^=(const Float8Mask &rhs) { - k[0] = _mm_xor_ps(k[0], rhs.k[0]); - k[1] = _mm_xor_ps(k[1], rhs.k[1]); - return *this; - } - - Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { - const _M128 tmp = _mm_and_ps(k[0], k[1]); -#ifdef VC_USE_PTEST - return _mm_testc_si128(_mm_castps_si128(tmp), _mm_setallone_si128()); -#else - return _mm_movemask_ps(tmp) == 0xf; - //_mm_movemask_ps(k[0]) == 0xf && - //_mm_movemask_ps(k[1]) == 0xf; -#endif - } - Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { - const _M128 tmp = _mm_or_ps(k[0], k[1]); -#ifdef VC_USE_PTEST - return _mm_testz_si128(_mm_castps_si128(tmp), _mm_castps_si128(tmp)); -#else - return _mm_movemask_ps(tmp) == 0x0; - //_mm_movemask_ps(k[0]) == 0x0 && - //_mm_movemask_ps(k[1]) == 0x0; -#endif - } - Vc_ALWAYS_INLINE Vc_PURE bool isMix() const { - // consider [1111 0000] - // solution: - // if k[0] != k[1] => return true - // if k[0] == k[1] => return k[0].isMix -#ifdef VC_USE_PTEST - __m128i tmp = _mm_castps_si128(_mm_xor_ps(k[0], k[1])); - // tmp == 0 <=> 
k[0] == k[1] - return !_mm_testz_si128(tmp, tmp) || - _mm_test_mix_ones_zeros(_mm_castps_si128(k[0]), _mm_setallone_si128()); -#else - const int tmp = _mm_movemask_ps(k[0]) + _mm_movemask_ps(k[1]); - return tmp > 0x0 && tmp < (0xf + 0xf); -#endif - } - -#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK - Vc_ALWAYS_INLINE Vc_PURE operator bool() const { return isFull(); } -#endif - - Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { - return (_mm_movemask_ps(k[1]) << 4) + _mm_movemask_ps(k[0]); - } - Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return (_mm_movemask_ps(k[1]) << 4) + _mm_movemask_ps(k[0]); } - - Vc_ALWAYS_INLINE Vc_PURE const M256 &data () const { return k; } - - Vc_ALWAYS_INLINE Vc_PURE bool operator[](int index) const { - return (toInt() & (1 << index)) != 0; - } - - Vc_ALWAYS_INLINE Vc_PURE int count() const { -#ifdef VC_IMPL_POPCNT - return _mm_popcnt_u32(toInt()); -#else -//X int tmp1 = _mm_movemask_ps(k[0]); -//X int tmp2 = _mm_movemask_ps(k[1]); -//X tmp1 = (tmp1 & 5) + ((tmp1 >> 1) & 5); -//X tmp2 = (tmp2 & 5) + ((tmp2 >> 1) & 5); -//X return (tmp1 & 3) + (tmp2 & 3) + ((tmp1 >> 2) & 3) + ((tmp2 >> 2) & 3); - _M128I x = _mm_add_epi32(_mm_srli_epi32(_mm_castps_si128(k[0]), 31), - _mm_srli_epi32(_mm_castps_si128(k[1]), 31)); - x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); - x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(x); -#endif - } - - Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; - - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - M256 k; -}; - -template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const -{ - const int mask = toInt(); -#ifdef _MSC_VER - unsigned long bit; - _BitScanForward(&bit, mask); -#else - int bit; - __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask)); -#endif - return bit; -} -Vc_ALWAYS_INLINE Vc_PURE int Float8Mask::firstOne() const -{ - const int mask = toInt(); -#ifdef _MSC_VER - unsigned long bit; - 
_BitScanForward(&bit, mask); -#else - int bit; - __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask)); -#endif - return bit; -} - -template -Vc_ALWAYS_INLINE Mask::Mask(const Float8Mask &m) - : k(_mm_castsi128_ps(_mm_packs_epi32(_mm_castps_si128(m.data()[0]), _mm_castps_si128(m.data()[1])))) {} - -class Float8GatherMask -{ - public: - Float8GatherMask(const Mask<8u> &k) : mask(k.toInt()) {} - Float8GatherMask(const Float8Mask &k) : mask(k.toInt()) {} - int toInt() const { return mask; } - private: - const int mask; -}; - -/** - * Loop over all set bits in the mask. The iterator variable will be set to the position of the set - * bits. A mask of e.g. 00011010 would result in the loop being called with the iterator being set to - * 1, 3, and 4. - * - * This allows you to write: - * \code - * float_v a = ...; - * foreach_bit(int i, a < 0.f) { - * std::cout << a[i] << "\n"; - * } - * \endcode - * The example prints all the values in \p a that are negative, and only those. - * - * \param it The iterator variable. For example "int i". - * \param mask The mask to iterate over. You can also just write a vector operation that returns a - * mask. 
- */ -//X #define foreach_bit(it, mask) -//X for (int _sse_vector_foreach_inner = 1, ForeachScope _sse_vector_foreach_scope(mask.toInt()), int it = _sse_vector_foreach_scope.bit(); _sse_vector_foreach_inner; --_sse_vector_foreach_inner) -//X for (int _sse_vector_foreach_mask = (mask).toInt(), int _sse_vector_foreach_it = _sse_bitscan(mask.toInt()); -//X _sse_vector_foreach_it > 0; -//X _sse_vector_foreach_it = _sse_bitscan_initialized(_sse_vector_foreach_it, mask.data())) -//X for (int _sse_vector_foreach_inner = 1, it = _sse_vector_foreach_it; _sse_vector_foreach_inner; --_sse_vector_foreach_inner) - -// Operators -// let binary and/or/xor work for any combination of masks (as long as they have the same sizeof) -template Mask operator& (const Mask &lhs, const Mask &rhs) { return _mm_and_ps(lhs.data(), rhs.data()); } -template Mask operator| (const Mask &lhs, const Mask &rhs) { return _mm_or_ps (lhs.data(), rhs.data()); } -template Mask operator^ (const Mask &lhs, const Mask &rhs) { return _mm_xor_ps(lhs.data(), rhs.data()); } - -// binary and/or/xor cannot work with one operand larger than the other -template void operator& (const Mask &lhs, const Float8Mask &rhs); -template void operator| (const Mask &lhs, const Float8Mask &rhs); -template void operator^ (const Mask &lhs, const Float8Mask &rhs); -template void operator& (const Float8Mask &rhs, const Mask &lhs); -template void operator| (const Float8Mask &rhs, const Mask &lhs); -template void operator^ (const Float8Mask &rhs, const Mask &lhs); - -// disable logical and/or for incompatible masks -template void operator&&(const Mask &lhs, const Mask &rhs); -template void operator||(const Mask &lhs, const Mask &rhs); -template void operator&&(const Mask &lhs, const Float8Mask &rhs); -template void operator||(const Mask &lhs, const Float8Mask &rhs); -template void operator&&(const Float8Mask &rhs, const Mask &lhs); -template void operator||(const Float8Mask &rhs, const Mask &lhs); - -// logical and/or for compatible 
masks -template Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &lhs, const Mask &rhs) { return _mm_and_ps(lhs.data(), rhs.data()); } -template Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &lhs, const Mask &rhs) { return _mm_or_ps (lhs.data(), rhs.data()); } -Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator&&(const Float8Mask &rhs, const Mask<8> &lhs) { return static_cast >(rhs) && lhs; } -Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator||(const Float8Mask &rhs, const Mask<8> &lhs) { return static_cast >(rhs) || lhs; } -Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator&&(const Mask<8> &rhs, const Float8Mask &lhs) { return rhs && static_cast >(lhs); } -Vc_ALWAYS_INLINE Vc_PURE Mask<8> operator||(const Mask<8> &rhs, const Float8Mask &lhs) { return rhs || static_cast >(lhs); } - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // SSE_MASK_H diff --git a/math/vc/include/Vc/sse/math.h b/math/vc/include/Vc/sse/math.h deleted file mode 100644 index 071865ccb0933..0000000000000 --- a/math/vc/include/Vc/sse/math.h +++ /dev/null @@ -1,217 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SSE_MATH_H -#define VC_SSE_MATH_H - -#include "const.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - /** - * splits \p v into exponent and mantissa, the sign is kept with the mantissa - * - * The return value will be in the range [0.5, 1.0[ - * The \p e value will be an integer defining the power-of-two exponent - */ - inline double_v frexp(const double_v &v, int_v *e) { - const __m128i exponentBits = Const::exponentMask().dataI(); - const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits); - *e = _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe)); - const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits)); - double_v ret = _mm_and_pd(exponentMaximized, _mm_load_pd(reinterpret_cast(&c_general::frexpMask[0]))); - double_m zeroMask = v == double_v::Zero(); - ret(isnan(v) || !isfinite(v) || zeroMask) = v; - e->setZero(zeroMask.data()); - return ret; - } - inline float_v frexp(const float_v &v, int_v *e) { - const __m128i exponentBits = Const::exponentMask().dataI(); - const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits); - *e = _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e)); - const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits)); - float_v ret = _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))); - ret(isnan(v) || !isfinite(v) || v == float_v::Zero()) = v; - e->setZero(v == float_v::Zero()); - return ret; - } - inline sfloat_v frexp(const sfloat_v &v, short_v *e) { - const __m128i exponentBits = Const::exponentMask().dataI(); - const __m128i exponentPart0 = _mm_and_si128(_mm_castps_si128(v.data()[0]), exponentBits); - const __m128i exponentPart1 = _mm_and_si128(_mm_castps_si128(v.data()[1]), exponentBits); - *e = _mm_sub_epi16(_mm_packs_epi32(_mm_srli_epi32(exponentPart0, 23), _mm_srli_epi32(exponentPart1, 23)), - 
_mm_set1_epi16(0x7e)); - const __m128 exponentMaximized0 = _mm_or_ps(v.data()[0], _mm_castsi128_ps(exponentBits)); - const __m128 exponentMaximized1 = _mm_or_ps(v.data()[1], _mm_castsi128_ps(exponentBits)); - sfloat_v ret = M256::create( - _mm_and_ps(exponentMaximized0, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))), - _mm_and_ps(exponentMaximized1, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu))) - ); - sfloat_m zeroMask = v == sfloat_v::Zero(); - ret(isnan(v) || !isfinite(v) || zeroMask) = v; - e->setZero(static_cast(zeroMask)); - return ret; - } - - /* -> x * 2^e - * x == NaN -> NaN - * x == (-)inf -> (-)inf - */ - inline double_v ldexp(double_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero((v == double_v::Zero()).dataI()); - const __m128i exponentBits = _mm_slli_epi64(e.data(), 52); - return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits)); - } - inline float_v ldexp(float_v::AsArg v, int_v::AsArg _e) { - int_v e = _e; - e.setZero(static_cast(v == float_v::Zero())); - return (v.reinterpretCast() + (e << 23)).reinterpretCast(); - } - inline sfloat_v ldexp(sfloat_v::AsArg v, short_v::AsArg _e) { - short_v e = _e; - e.setZero(static_cast(v == sfloat_v::Zero())); - e <<= (23 - 16); - const __m128i exponentBits0 = _mm_unpacklo_epi16(_mm_setzero_si128(), e.data()); - const __m128i exponentBits1 = _mm_unpackhi_epi16(_mm_setzero_si128(), e.data()); - return M256::create(_mm_castsi128_ps(_mm_add_epi32(_mm_castps_si128(v.data()[0]), exponentBits0)), - _mm_castsi128_ps(_mm_add_epi32(_mm_castps_si128(v.data()[1]), exponentBits1))); - } - -#ifdef VC_IMPL_SSE4_1 - inline double_v trunc(double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); } - inline float_v trunc(float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); } - inline sfloat_v trunc(sfloat_v::AsArg v) { return M256::create(_mm_round_ps(v.data()[0], 0x3), - _mm_round_ps(v.data()[1], 0x3)); } - - inline double_v floor(double_v::AsArg v) { return _mm_floor_pd(v.data()); } - inline 
float_v floor(float_v::AsArg v) { return _mm_floor_ps(v.data()); } - inline sfloat_v floor(sfloat_v::AsArg v) { return M256::create(_mm_floor_ps(v.data()[0]), - _mm_floor_ps(v.data()[1])); } - - inline double_v ceil(double_v::AsArg v) { return _mm_ceil_pd(v.data()); } - inline float_v ceil(float_v::AsArg v) { return _mm_ceil_ps(v.data()); } - inline sfloat_v ceil(sfloat_v::AsArg v) { return M256::create(_mm_ceil_ps(v.data()[0]), - _mm_ceil_ps(v.data()[1])); } -#else - static inline void floor_shift(float_v &v, float_v::AsArg e) - { - int_v x = _mm_setallone_si128(); - x <<= 23; - x >>= static_cast(e); - v &= x.reinterpretCast(); - } - - static inline void floor_shift(sfloat_v &v, sfloat_v::AsArg e) - { - int_v x = _mm_setallone_si128(); - x <<= 23; - int_v y = x; - x >>= _mm_cvttps_epi32(e.data()[0]); - y >>= _mm_cvttps_epi32(e.data()[1]); - v.data()[0] = _mm_and_ps(v.data()[0], _mm_castsi128_ps(x.data())); - v.data()[1] = _mm_and_ps(v.data()[1], _mm_castsi128_ps(y.data())); - } - - static inline void floor_shift(double_v &v, double_v::AsArg e) - { - const long long initialMask = 0xfff0000000000000ull; - const uint_v shifts = static_cast(e); - union d_ll { - long long ll; - double d; - }; - d_ll mask0 = { initialMask >> shifts[0] }; - d_ll mask1 = { initialMask >> shifts[1] }; - v &= double_v(_mm_setr_pd(mask0.d, mask1.d)); - } - - template - inline Vector trunc(VC_ALIGNED_PARAMETER(Vector) _v) { - typedef Vector V; - typedef typename V::Mask M; - - V v = _v; - V e = abs(v).exponent(); - const M negativeExponent = e < 0; - e.setZero(negativeExponent); - //const M negativeInput = v < V::Zero(); - - floor_shift(v, e); - - v.setZero(negativeExponent); - //v(negativeInput && _v != v) -= V::One(); - return v; - } - - template - inline Vector floor(VC_ALIGNED_PARAMETER(Vector) _v) { - typedef Vector V; - typedef typename V::Mask M; - - V v = _v; - V e = abs(v).exponent(); - const M negativeExponent = e < 0; - e.setZero(negativeExponent); - const M negativeInput = v < 
V::Zero(); - - floor_shift(v, e); - - v.setZero(negativeExponent); - v(negativeInput && _v != v) -= V::One(); - return v; - } - - template - inline Vector ceil(VC_ALIGNED_PARAMETER(Vector) _v) { - typedef Vector V; - typedef typename V::Mask M; - - V v = _v; - V e = abs(v).exponent(); - const M negativeExponent = e < 0; - e.setZero(negativeExponent); - const M positiveInput = v > V::Zero(); - - floor_shift(v, e); - - v.setZero(negativeExponent); - v(positiveInput && _v != v) += V::One(); - return v; - } -#endif -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#define VC__USE_NAMESPACE SSE -#include "../common/trigonometric.h" -#define VC__USE_NAMESPACE SSE -#include "../common/logarithm.h" -#define VC__USE_NAMESPACE SSE -#include "../common/exponential.h" -#undef VC__USE_NAMESPACE - -#endif // VC_SSE_MATH_H diff --git a/math/vc/include/Vc/sse/prefetches.tcc b/math/vc/include/Vc/sse/prefetches.tcc deleted file mode 100644 index 0e32f04359b1c..0000000000000 --- a/math/vc/include/Vc/sse/prefetches.tcc +++ /dev/null @@ -1,58 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SSE_PREFETCHES_TCC -#define VC_SSE_PREFETCHES_TCC - -namespace ROOT { -namespace Vc -{ -namespace Internal -{ - -Vc_ALWAYS_INLINE void HelperImpl::prefetchForOneRead(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchClose(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchMid(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchFar(const void *addr) -{ - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); -} -Vc_ALWAYS_INLINE void HelperImpl::prefetchForModify(const void *addr) -{ -#if defined(__3dNOW__) && (!defined(VC_CLANG) || VC_CLANG >= 0x30200) - _m_prefetchw(const_cast(addr)); -#else - _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); -#endif -} - -} // namespace Internal -} // namespace Vc -} // namespace ROOT - -#endif // VC_SSE_PREFETCHES_TCC diff --git a/math/vc/include/Vc/sse/shuffle.h b/math/vc/include/Vc/sse/shuffle.h deleted file mode 100644 index 18132173ae424..0000000000000 --- a/math/vc/include/Vc/sse/shuffle.h +++ /dev/null @@ -1,172 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VC_SSE_SHUFFLE_H -#define VC_SSE_SHUFFLE_H - -#include "macros.h" - -namespace ROOT { -namespace Vc -{ - enum VecPos { - X0, X1, X2, X3, X4, X5, X6, X7, - Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 - }; - - namespace Mem - { - // shuffle([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] - template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); - return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); - } - - // shuffle([x0 x1], [y0 y1]) = [x1 y0] - template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range); - return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); - } - -#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX) -#define Vc_MAKE_INTRINSIC__(name__) Vc::SSE::_VC_CAT(m,m,_,name__) -#else -#define Vc_MAKE_INTRINSIC__(name__) _VC_CAT(_,mm,_,name__) -#endif - - // blend([x0 x1], [y0, y1]) = [x0 y1] - template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { - VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); - return Vc_MAKE_INTRINSIC__(blend_pd)(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2); - } - - // blend([x0 x1], [y0, y1]) = [x0 y1] - template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { - VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); - VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); - return Vc_MAKE_INTRINSIC__(blend_ps)(x, y, - (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + - (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8); - } - - template - 
static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) { - VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); - VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range); - VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range); - VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range); - VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range); - VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range); - return Vc_MAKE_INTRINSIC__(blend_epi16)(x, y, - (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + - (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + - (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + - (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 - ); - } - - // permute([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] - template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - - template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - - template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - - template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { - VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, 
Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range); - return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); - } - - template - static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range); - VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range); - if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) { - x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) { - x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); - } - return x; - } - } // namespace Mem - // The shuffles and permutes above use memory ordering. 
The ones below use register ordering: - namespace Reg - { - // shuffle([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] - template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { - return Mem::shuffle(x, y); - } - - // shuffle([x1 x0], [y1 y0]) = [y0 x1] - template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { - return Mem::shuffle(x, y); - } - - // shuffle([x3 x2 x1 x0]) = [x3 x0 x2 x1] - template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); - return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); - } - - // shuffle([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] - template static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { - VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); - VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); - } - - // blend([x1 x0], [y1, y0]) = [x1 y0] - template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { - return Mem::blend(x, y); - } - - template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { - return Mem::blend(x, y); - } - } // namespace Reg -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // VC_SSE_SHUFFLE_H diff --git a/math/vc/include/Vc/sse/types.h b/math/vc/include/Vc/sse/types.h deleted file mode 100644 index 42022da661220..0000000000000 --- a/math/vc/include/Vc/sse/types.h +++ /dev/null @@ -1,163 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef SSE_TYPES_H -#define SSE_TYPES_H - -#include "intrinsics.h" -#include "../common/storage.h" - -#define VC_DOUBLE_V_SIZE 2 -#define VC_FLOAT_V_SIZE 4 -#define VC_SFLOAT_V_SIZE 8 -#define VC_INT_V_SIZE 4 -#define VC_UINT_V_SIZE 4 -#define VC_SHORT_V_SIZE 8 -#define VC_USHORT_V_SIZE 8 - -#include "../common/types.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - template class Vector; - template class WriteMaskedVector; - - // define our own long because on Windows64 long == int while on Linux long == max. register width - // since we want to have a type that depends on 32 vs. 
64 bit we need to do some special casing on Windows -#ifdef _WIN64 - typedef __int64 _long; - typedef unsigned __int64 _ulong; -#else - typedef long _long; - typedef unsigned long _ulong; -#endif - - - class Float8Mask; - class Float8GatherMask; - template class Mask; - - /* - * Hack to create a vector object with 8 floats - */ - typedef Vc::sfloat float8; - - class M256 { - public: - //Vc_INTRINSIC M256() {} - //Vc_INTRINSIC M256(_M128 a, _M128 b) { d[0] = a; d[1] = b; } - static Vc_INTRINSIC Vc_CONST M256 dup(_M128 a) { M256 r; r.d[0] = a; r.d[1] = a; return r; } - static Vc_INTRINSIC Vc_CONST M256 create(_M128 a, _M128 b) { M256 r; r.d[0] = a; r.d[1] = b; return r; } - Vc_INTRINSIC _M128 &operator[](int i) { return d[i]; } - Vc_INTRINSIC const _M128 &operator[](int i) const { return d[i]; } - private: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - _M128 d[2]; - }; -#ifdef VC_CHECK_ALIGNMENT -static Vc_ALWAYS_INLINE void assertCorrectAlignment(const M256 *ptr) -{ - const size_t s = sizeof(__m128); - if((reinterpret_cast(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) { - fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n"); - abort(); - } -} -#endif - - template struct ParameterHelper { - typedef T ByValue; - typedef T & Reference; - typedef const T & ConstRef; - }; -#if defined VC_MSVC && !defined _WIN64 - // The calling convention on WIN32 can't guarantee alignment. - // An exception are the first three arguments, which may be passed in a register. 
- template<> struct ParameterHelper { - typedef const M256 & ByValue; - typedef M256 & Reference; - typedef const M256 & ConstRef; - }; -#endif - - template struct VectorHelper {}; - - template struct IndexTypeHelper; - template<> struct IndexTypeHelper<2u> { typedef unsigned int Type; }; - template<> struct IndexTypeHelper<4u> { typedef unsigned int Type; }; - template<> struct IndexTypeHelper<8u> { typedef unsigned short Type; }; - template<> struct IndexTypeHelper<16u>{ typedef unsigned char Type; }; - - template struct CtorTypeHelper { typedef T Type; }; - template<> struct CtorTypeHelper { typedef int Type; }; - template<> struct CtorTypeHelper { typedef unsigned int Type; }; - template<> struct CtorTypeHelper { typedef double Type; }; - - template struct ExpandTypeHelper { typedef T Type; }; - template<> struct ExpandTypeHelper { typedef int Type; }; - template<> struct ExpandTypeHelper { typedef unsigned int Type; }; - template<> struct ExpandTypeHelper { typedef double Type; }; - - template struct VectorTypeHelper { typedef __m128i Type; }; - template<> struct VectorTypeHelper { typedef __m128d Type; }; - template<> struct VectorTypeHelper< float> { typedef __m128 Type; }; - template<> struct VectorTypeHelper { typedef M256 Type; }; - - template struct DetermineMask { typedef Mask Type; }; - template<> struct DetermineMask { typedef Float8Mask Type; }; - - template struct DetermineGatherMask { typedef T Type; }; - template<> struct DetermineGatherMask { typedef Float8GatherMask Type; }; - - template struct VectorTraits - { - typedef typename VectorTypeHelper::Type VectorType; - typedef typename DetermineEntryType::Type EntryType; - enum Constants { - Size = sizeof(VectorType) / sizeof(EntryType), - HasVectorDivision = !IsInteger::Value - }; - typedef typename DetermineMask::Type MaskType; - typedef typename DetermineGatherMask::Type GatherMaskType; - typedef Vector::Type> IndexType; - typedef Common::VectorMemoryUnion StorageType; - }; - - template struct 
VectorHelperSize; - - template > - class STRUCT_ALIGN1(16) VectorAlignedBaseT - { - public: - FREE_STORE_OPERATORS_ALIGNED(16) - } STRUCT_ALIGN2(16); - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -#endif // SSE_TYPES_H diff --git a/math/vc/include/Vc/sse/undomacros.h b/math/vc/include/Vc/sse/undomacros.h deleted file mode 100644 index 0e8b08cb2e60c..0000000000000 --- a/math/vc/include/Vc/sse/undomacros.h +++ /dev/null @@ -1,32 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2010 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_SSE_UNDOMACROS_H -#define VC_SSE_UNDOMACROS_H -#undef VC_SSE_MACROS_H - -#undef STORE_VECTOR - -#ifdef VC_USE_PTEST -#undef VC_USE_PTEST -#endif - -#endif // VC_SSE_UNDOMACROS_H - -#include "../common/undomacros.h" diff --git a/math/vc/include/Vc/sse/vector.h b/math/vc/include/Vc/sse/vector.h deleted file mode 100644 index b63f49f243582..0000000000000 --- a/math/vc/include/Vc/sse/vector.h +++ /dev/null @@ -1,550 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef SSE_VECTOR_H -#define SSE_VECTOR_H - -#include "intrinsics.h" -#include "types.h" -#include "vectorhelper.h" -#include "mask.h" -#include "../common/aliasingentryhelper.h" -#include "../common/memoryfwd.h" -#include -#include - -#include "macros.h" - -#ifdef isfinite -#undef isfinite -#endif -#ifdef isnan -#undef isnan -#endif - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ -template -class WriteMaskedVector -{ - friend class Vector; - typedef typename VectorTraits::MaskType Mask; - typedef typename Vector::EntryType EntryType; - public: - FREE_STORE_OPERATORS_ALIGNED(16) - //prefix - Vc_INTRINSIC Vector &operator++() { - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; - } - Vc_INTRINSIC Vector &operator--() { - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return *vec; - } - //postfix - Vc_INTRINSIC Vector operator++(int) { - Vector ret(*vec); - vec->data() = VectorHelper::add(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; - } - Vc_INTRINSIC Vector operator--(int) { - Vector ret(*vec); - vec->data() = VectorHelper::sub(vec->data(), - VectorHelper::notMaskedToZero(VectorHelper::one(), mask.data()) - ); - return ret; - } - - Vc_INTRINSIC Vector &operator+=(const Vector &x) { - vec->data() = VectorHelper::add(vec->data(), VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; - } - Vc_INTRINSIC Vector &operator-=(const Vector &x) { - vec->data() = VectorHelper::sub(vec->data(), 
VectorHelper::notMaskedToZero(x.data(), mask.data())); - return *vec; - } - Vc_INTRINSIC Vector &operator*=(const Vector &x) { - vec->assign(VectorHelper::mul(vec->data(), x.data()), mask); - return *vec; - } - Vc_INTRINSIC Vector &operator/=(const Vector &x); - - Vc_INTRINSIC Vector &operator+=(EntryType x) { - return operator+=(Vector(x)); - } - Vc_INTRINSIC Vector &operator-=(EntryType x) { - return operator-=(Vector(x)); - } - Vc_INTRINSIC Vector &operator*=(EntryType x) { - return operator*=(Vector(x)); - } - Vc_INTRINSIC Vector &operator/=(EntryType x) { - return operator/=(Vector(x)); - } - - Vc_INTRINSIC Vector &operator=(const Vector &x) { - vec->assign(x, mask); - return *vec; - } - - Vc_INTRINSIC Vector &operator=(EntryType x) { - vec->assign(Vector(x), mask); - return *vec; - } - - template Vc_INTRINSIC void call(const F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC void call(F &f) const { - return vec->call(f, mask); - } - template Vc_INTRINSIC Vector apply(const F &f) const { - return vec->apply(f, mask); - } - template Vc_INTRINSIC Vector apply(F &f) const { - return vec->apply(f, mask); - } - - private: - Vc_ALWAYS_INLINE WriteMaskedVector(Vector *v, const Mask &k) : vec(v), mask(k) {} - Vector *const vec; - Mask mask; -}; - -template class Vector -{ - friend class WriteMaskedVector; - protected: -#ifdef VC_COMPILE_BENCHMARKS - public: -#endif - typedef typename VectorTraits::StorageType StorageType; - StorageType d; - typedef typename VectorTraits::GatherMaskType GatherMask; - typedef VectorHelper::VectorType> HV; - typedef VectorHelper HT; - public: - FREE_STORE_OPERATORS_ALIGNED(16) - - enum Constants { Size = VectorTraits::Size }; - typedef typename VectorTraits::VectorType VectorType; - typedef typename VectorTraits::EntryType EntryType; - typedef typename VectorTraits::IndexType IndexType; - typedef typename VectorTraits::MaskType Mask; - typedef typename Mask::Argument MaskArg; - typedef Vc::Memory, Size> Memory; 
-#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const Vector &AsArg; -#else - typedef const Vector AsArg; -#endif - - typedef T _T; - - /////////////////////////////////////////////////////////////////////////////////////////// - // uninitialized - Vc_ALWAYS_INLINE Vector() {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // constants - explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero::ZEnum) Vc_INTRINSIC_R; - explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne::OEnum) Vc_INTRINSIC_R; - explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero::IEnum) Vc_INTRINSIC_R; - static Vc_INTRINSIC_L Vector Zero() Vc_INTRINSIC_R; - static Vc_INTRINSIC_L Vector One() Vc_INTRINSIC_R; - static Vc_INTRINSIC_L Vector IndexesFromZero() Vc_INTRINSIC_R; - static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // internal: required to enable returning objects of VectorType - Vc_ALWAYS_INLINE Vector(const VectorType &x) : d(x) {} - - /////////////////////////////////////////////////////////////////////////////////////////// - // static_cast / copy ctor - template explicit Vc_INTRINSIC_L Vector(const Vector &x) Vc_INTRINSIC_R; - - // implicit cast - template Vc_INTRINSIC_L Vector &operator=(const Vector &x) Vc_INTRINSIC_R; - - // copy assignment - Vc_ALWAYS_INLINE Vector &operator=(AsArg v) { d.v() = v.d.v(); return *this; } - - /////////////////////////////////////////////////////////////////////////////////////////// - // broadcast - explicit Vc_INTRINSIC_L Vector(EntryType a) Vc_INTRINSIC_R; - template Vc_INTRINSIC Vector(TT x, VC_EXACT_TYPE(TT, EntryType, void *) = 0) : d(HT::set(x)) {} - static Vc_INTRINSIC Vector broadcast4(const EntryType *x) { return Vector(x); } - Vc_ALWAYS_INLINE Vector &operator=(EntryType a) { d.v() = HT::set(a); return *this; } - - 
/////////////////////////////////////////////////////////////////////////////////////////// - // load ctors - explicit Vc_INTRINSIC_L - Vector(const EntryType *x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const EntryType *x, Alignment align) Vc_INTRINSIC_R; - template explicit Vc_INTRINSIC_L - Vector(const OtherT *x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - Vector(const OtherT *x, Alignment align) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // load member functions - Vc_INTRINSIC_L - void load(const EntryType *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const EntryType *mem, Alignment align) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L - void load(const OtherT *mem, Alignment align) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // expand 1 float_v to 2 double_v XXX rationale? remove it for release? 
XXX - explicit Vc_INTRINSIC_L Vector(const Vector::Type> *a) Vc_INTRINSIC_R; - inline void expand(Vector::Type> *x) const; - - /////////////////////////////////////////////////////////////////////////////////////////// - // zeroing - Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; - - Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; - Vc_INTRINSIC_L void setQnan(typename Mask::Argument k) Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // stores - Vc_INTRINSIC_L void store(EntryType *mem) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, A align) const Vc_INTRINSIC_R; - template Vc_INTRINSIC_L void store(EntryType *mem, const Mask &mask, A align) const Vc_INTRINSIC_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // swizzles - Vc_INTRINSIC_L Vc_PURE_L const Vector &abcd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cdab() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector badc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector aaaa() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bbbb() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector cccc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dddd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcad() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector bcda() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dabc() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector acbd() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const Vector dbca() const Vc_INTRINSIC_R Vc_PURE_R; - Vc_INTRINSIC_L Vc_PURE_L const 
Vector dcba() const Vc_INTRINSIC_R Vc_PURE_R; - - /////////////////////////////////////////////////////////////////////////////////////////// - // gathers - template Vector(const EntryType *mem, const IndexT *indexes); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes); - template Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask); - template Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes); - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask); -#ifdef VC_USE_SET_GATHERS - template void gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask); -#endif - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes); - template void gather(const S1 *array, const S2 
S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes); - template void gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask); - - /////////////////////////////////////////////////////////////////////////////////////////// - // scatters - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const; - template void scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const; - template void scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const; - template void scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const; - - //prefix - Vc_INTRINSIC Vector &operator++() { data() = VectorHelper::add(data(), VectorHelper::one()); return *this; } - Vc_INTRINSIC Vector &operator--() { data() = VectorHelper::sub(data(), VectorHelper::one()); return *this; } - //postfix - Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = VectorHelper::add(data(), VectorHelper::one()); return r; } - Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = VectorHelper::sub(data(), 
VectorHelper::one()); return r; } - - Vc_INTRINSIC Common::AliasingEntryHelper operator[](size_t index) { -#if defined(VC_GCC) && VC_GCC >= 0x40300 && VC_GCC < 0x40400 - ::ROOT::Vc::Warnings::_operator_bracket_warning(); -#endif - return d.m(index); - } - Vc_INTRINSIC_L EntryType operator[](size_t index) const Vc_PURE Vc_INTRINSIC_R; - - Vc_INTRINSIC Vector Vc_PURE operator~() const { return VectorHelper::andnot_(data(), VectorHelper::allone()); } - Vc_ALWAYS_INLINE_L Vector::Type> operator-() const Vc_ALWAYS_INLINE_R; - Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; } - -#define OP(symbol, fun) \ - Vc_INTRINSIC Vector &operator symbol##=(const Vector &x) { data() = VectorHelper::fun(data(), x.data()); return *this; } \ - Vc_INTRINSIC Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ - Vc_INTRINSIC Vector Vc_PURE operator symbol(const Vector &x) const { return HT::fun(data(), x.data()); } \ - template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Vector) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } - - OP(+, add) - OP(-, sub) - OP(*, mul) -#undef OP - - Vc_INTRINSIC_L Vector &operator<<=(AsArg shift) Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector operator<< (AsArg shift) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector &operator<<=( int shift) Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector operator<< ( int shift) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector &operator>>=(AsArg shift) Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector operator>> (AsArg shift) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector &operator>>=( int shift) Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector operator>> ( int shift) const Vc_INTRINSIC_R; - - Vc_INTRINSIC_L Vector &operator/=(const Vector &x) Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector operator/ (const Vector &x) const Vc_PURE Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector &operator/=(EntryType x) Vc_INTRINSIC_R; - template Vc_INTRINSIC_L VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) 
operator/(TT x) const Vc_PURE Vc_INTRINSIC_R; - -#define OP(symbol, fun) \ - Vc_INTRINSIC_L Vector &operator symbol##=(const Vector &x) Vc_INTRINSIC_R; \ - Vc_INTRINSIC_L Vector operator symbol(const Vector &x) const Vc_PURE Vc_INTRINSIC_R; \ - Vc_INTRINSIC Vector &operator symbol##=(EntryType x) { return operator symbol##=(Vector(x)); } \ - template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Vector) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } - OP(|, or_) - OP(&, and_) - OP(^, xor_) -#undef OP -#define OPcmp(symbol, fun) \ - Vc_INTRINSIC Mask Vc_PURE operator symbol(const Vector &x) const { return VectorHelper::fun(data(), x.data()); } \ - template Vc_INTRINSIC VC_EXACT_TYPE(TT, EntryType, Mask) Vc_PURE operator symbol(TT x) const { return operator symbol(Vector(x)); } - - OPcmp(==, cmpeq) - OPcmp(!=, cmpneq) - OPcmp(>=, cmpnlt) - OPcmp(>, cmpnle) - OPcmp(<, cmplt) - OPcmp(<=, cmple) -#undef OPcmp - Vc_INTRINSIC_L Vc_PURE_L Mask isNegative() const Vc_PURE_R Vc_INTRINSIC_R; - - Vc_ALWAYS_INLINE void fusedMultiplyAdd(const Vector &factor, const Vector &summand) { - VectorHelper::fma(data(), factor.data(), summand.data()); - } - - Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { - const VectorType k = mm128_reinterpret_cast(mask.data()); - data() = VectorHelper::blend(data(), v.data(), k); - } - - template Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const { return StaticCastHelper::cast(data()); } - template Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const { return mm128_reinterpret_cast(data()); } - - Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return WriteMaskedVector(this, k); } - - /** - * \return \p true This vector was completely filled. m2 might be 0 or != 0. You still have - * to test this. - * \p false This vector was not completely filled. m2 is all 0. 
- */ - //inline bool pack(Mask &m1, Vector &v2, Mask &m2) { - //return VectorHelper::pack(data(), m1.data, v2.data(), m2.data); - //} - - Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); } - Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); } - - Vc_INTRINSIC EntryType min() const { return VectorHelper::min(data()); } - Vc_INTRINSIC EntryType max() const { return VectorHelper::max(data()); } - Vc_INTRINSIC EntryType product() const { return VectorHelper::mul(data()); } - Vc_INTRINSIC EntryType sum() const { return VectorHelper::add(data()); } - Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R; - - Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; - inline Vc_PURE Vector sorted() const { return SortHelper::sort(data()); } - - template void callWithValuesSorted(F &f) { - EntryType value = d.m(0); - f(value); - for (int i = 1; i < Size; ++i) { - if (d.m(i) != value) { - value = d.m(i); - f(value); - } - } - } - - template Vc_INTRINSIC void call(const F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); - } - template Vc_INTRINSIC void call(F &f) const { - for_all_vector_entries(i, - f(EntryType(d.m(i))); - ); - } - - template Vc_INTRINSIC void call(const F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { - f(EntryType(d.m(i))); - } - } - template Vc_INTRINSIC void call(F &f, const Mask &mask) const { - Vc_foreach_bit(size_t i, mask) { - f(EntryType(d.m(i))); - } - } - - template Vc_INTRINSIC Vector apply(const F &f) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = f(EntryType(d.m(i))); - ); - return r; - } - template Vc_INTRINSIC Vector apply(F &f) const { - Vector r; - for_all_vector_entries(i, - r.d.m(i) = 
f(EntryType(d.m(i))); - ); - return r; - } - - template Vc_INTRINSIC Vector apply(const F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); - } - return r; - } - template Vc_INTRINSIC Vector apply(F &f, const Mask &mask) const { - Vector r(*this); - Vc_foreach_bit (size_t i, mask) { - r.d.m(i) = f(EntryType(r.d.m(i))); - } - return r; - } - - template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { - for_all_vector_entries(i, - d.m(i) = f(i); - ); - } - Vc_INTRINSIC void fill(EntryType (&f)()) { - for_all_vector_entries(i, - d.m(i) = f(); - ); - } - - Vc_INTRINSIC_L Vector copySign(typename Vector::AsArg reference) const Vc_INTRINSIC_R; - Vc_INTRINSIC_L Vector exponent() const Vc_INTRINSIC_R; -}; - -typedef Vector double_v; -typedef Vector float_v; -typedef Vector sfloat_v; -typedef Vector int_v; -typedef Vector uint_v; -typedef Vector short_v; -typedef Vector ushort_v; -typedef double_v::Mask double_m; -typedef float_v::Mask float_m; -typedef sfloat_v::Mask sfloat_m; -typedef int_v::Mask int_m; -typedef uint_v::Mask uint_m; -typedef short_v::Mask short_m; -typedef ushort_v::Mask ushort_m; - -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::broadcast4(const float *x) { - const _M128 &v = VectorHelper<_M128>::load(x, Aligned); - return Vector(M256::create(v, v)); -} - -template class SwizzledVector : public Vector {}; - -static Vc_ALWAYS_INLINE Vc_PURE int_v min(const int_v &x, const int_v &y) { return mm_min_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE uint_v min(const uint_v &x, const uint_v &y) { return mm_min_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE short_v min(const short_v &x, const short_v &y) { return _mm_min_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE ushort_v min(const ushort_v &x, const ushort_v &y) { return mm_min_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE float_v min(const float_v &x, const float_v &y) { 
return _mm_min_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE double_v min(const double_v &x, const double_v &y) { return _mm_min_pd(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE int_v max(const int_v &x, const int_v &y) { return mm_max_epi32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE uint_v max(const uint_v &x, const uint_v &y) { return mm_max_epu32(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE short_v max(const short_v &x, const short_v &y) { return _mm_max_epi16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE ushort_v max(const ushort_v &x, const ushort_v &y) { return mm_max_epu16(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE float_v max(const float_v &x, const float_v &y) { return _mm_max_ps(x.data(), y.data()); } -static Vc_ALWAYS_INLINE Vc_PURE double_v max(const double_v &x, const double_v &y) { return _mm_max_pd(x.data(), y.data()); } - -static Vc_ALWAYS_INLINE Vc_PURE sfloat_v min(const sfloat_v &x, const sfloat_v &y) { - return M256::create(_mm_min_ps(x.data()[0], y.data()[0]), _mm_min_ps(x.data()[1], y.data()[1])); -} -static Vc_ALWAYS_INLINE Vc_PURE sfloat_v max(const sfloat_v &x, const sfloat_v &y) { - return M256::create(_mm_max_ps(x.data()[0], y.data()[0]), _mm_max_ps(x.data()[1], y.data()[1])); -} - - template static Vc_ALWAYS_INLINE Vc_PURE Vector sqrt (const Vector &x) { return VectorHelper::sqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vc_PURE Vector rsqrt(const Vector &x) { return VectorHelper::rsqrt(x.data()); } - template static Vc_ALWAYS_INLINE Vc_PURE Vector abs (const Vector &x) { return VectorHelper::abs(x.data()); } - template static Vc_ALWAYS_INLINE Vc_PURE Vector reciprocal(const Vector &x) { return VectorHelper::reciprocal(x.data()); } - template static Vc_ALWAYS_INLINE Vc_PURE Vector round(const Vector &x) { return VectorHelper::round(x.data()); } - - template static Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isfinite(const Vector &x) { return 
VectorHelper::isFinite(x.data()); } - template static Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isnan(const Vector &x) { return VectorHelper::isNaN(x.data()); } - -#include "forceToRegisters.tcc" -#ifdef VC_GNU_ASM -template<> -Vc_ALWAYS_INLINE void forceToRegisters(const Vector &x1) { - __asm__ __volatile__(""::"x"(x1.data()[0]), "x"(x1.data()[1])); -} -#elif defined(VC_MSVC) -#pragma optimize("g", off) -template<> -Vc_ALWAYS_INLINE void forceToRegisters(const Vector &/*x1*/) { -} -#endif -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" -#include "vector.tcc" -#include "math.h" -#endif // SSE_VECTOR_H diff --git a/math/vc/include/Vc/sse/vector.tcc b/math/vc/include/Vc/sse/vector.tcc deleted file mode 100644 index eb20122dcfed3..0000000000000 --- a/math/vc/include/Vc/sse/vector.tcc +++ /dev/null @@ -1,1545 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "limits.h" -#include "../common/bitscanintrinsics.h" -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -ALIGN(64) extern unsigned int RandomState[16]; - -namespace SSE -{ - -template static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() { - if (Size == 4) { - return reinterpret_cast(_IndexesFromZero4); - } else if (Size == 8) { - return reinterpret_cast(_IndexesFromZero8); - } else if (Size == 16) { - return reinterpret_cast(_IndexesFromZero16); - } - return 0; -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// constants {{{1 -template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero::ZEnum) - : d(VectorHelper::zero()) -{ -} - -template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne::OEnum) - : d(VectorHelper::one()) -{ -} - -template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero::IEnum) - : d(VectorHelper::load(_IndexesFromZero(), Aligned)) -{ -} - -template Vc_INTRINSIC Vc_CONST Vector Vector::Zero() -{ - return VectorHelper::zero(); -} - -template Vc_INTRINSIC Vc_CONST Vector Vector::One() -{ - return VectorHelper::one(); -} - -template Vc_INTRINSIC Vc_CONST Vector Vector::IndexesFromZero() -{ - return VectorHelper::load(_IndexesFromZero(), Aligned); -} - -// conversion/casts {{{1 -template template Vc_INTRINSIC Vector::Vector(const Vector &x) - : d(StaticCastHelper::cast(x.data())) -{ -} - -template<> template<> Vc_INTRINSIC short_v &Vector::operator=(const ushort_v &x) { - data() = StaticCastHelper::cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC ushort_v &Vector::operator=(const short_v &x) { - data() = StaticCastHelper::cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC int_v &Vector::operator=(const uint_v &x) { - data() = StaticCastHelper::cast(x.data()); return *this; -} -template<> template<> Vc_INTRINSIC uint_v &Vector::operator=(const int_v &x) { - data() = StaticCastHelper::cast(x.data()); 
return *this; -} - -// broadcasts {{{1 -template Vc_INTRINSIC Vector::Vector(EntryType a) - : d(VectorHelper::set(a)) -{ -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// load ctors {{{1 -template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *x, A a) { load(x, a); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x) { load(x); } -template template Vc_ALWAYS_INLINE Vector::Vector(const OtherT *x, A a) { load(x, a); } - -/////////////////////////////////////////////////////////////////////////////////////////// -// load member functions {{{1 -template Vc_INTRINSIC void Vector::load(const EntryType *mem) -{ - load(mem, Aligned); -} - -template template Vc_INTRINSIC void Vector::load(const EntryType *mem, A align) -{ - d.v() = VectorHelper::load(mem, align); -} - -template template Vc_INTRINSIC void Vector::load(const OtherT *mem) -{ - load(mem, Aligned); -} - -// float8: simply use the float implementation twice {{{2 -template<> template Vc_INTRINSIC void Vector::load(const OtherT *x, A a) -{ - d.v() = M256::create( - Vector(&x[0], a).data(), - Vector(&x[4], a).data() - ); -} - -// LoadHelper {{{2 -template struct LoadHelper; - -// float {{{2 -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f) - { - return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)), - _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper<__m128i>::load(mem, f)); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f) - { - return StaticCastHelper::cast(VectorHelper<__m128i>::load(mem, f)); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE 
Vc_PURE __m128 load(const unsigned short *mem, Flags f) - { - return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f) - { - return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f) - { - return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f) - { - return _mm_cvtepi32_ps(LoadHelper::load(mem, f)); - } -}; - -// int {{{2 -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f) - { - return VectorHelper<__m128i>::load(mem, f); - } -}; -// no difference between streaming and alignment, because the -// 32/64 bit loads are not available as streaming loads, and can always be unaligned -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) - { - return mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast(mem))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags) - { - return mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) - { - return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) - { - return mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); - } -}; - -// unsigned int {{{2 -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags) - { - return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE 
Vc_PURE __m128i load(const unsigned char *mem, Flags) - { - return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(mem))); - } -}; - -// short {{{2 -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f) - { - return VectorHelper<__m128i>::load(mem, f); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) - { - return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); - } -}; -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags) - { - return mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); - } -}; - -// unsigned short {{{2 -template struct LoadHelper { - static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags) - { - return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); - } -}; - -// general load, implemented via LoadHelper {{{2 -template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) -{ - d.v() = LoadHelper::load(x, f); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// expand/combine {{{1 -template Vc_INTRINSIC Vector::Vector(const Vector::Type> *a) - : d(VectorHelper::concat(a[0].data(), a[1].data())) -{ -} - -template inline void Vector::expand(Vector::Type> *x) const -{ - if (Size == 8u) { - x[0].data() = VectorHelper::expand0(data()); - x[1].data() = VectorHelper::expand1(data()); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// zeroing {{{1 -template Vc_INTRINSIC void Vector::setZero() -{ - data() = VectorHelper::zero(); -} - -template Vc_INTRINSIC void Vector::setZero(const Mask &k) -{ - data() = VectorHelper::andnot_(mm128_reinterpret_cast(k.data()), data()); -} - -template<> Vc_INTRINSIC void Vector::setQnan() -{ - data() = _mm_setallone_pd(); -} -template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) 
-{ - data() = _mm_or_pd(data(), k.dataD()); -} -template<> Vc_INTRINSIC void Vector::setQnan() -{ - data() = _mm_setallone_ps(); -} -template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) -{ - data() = _mm_or_ps(data(), k.data()); -} -template<> Vc_INTRINSIC void Vector::setQnan() -{ - d.v()[0] = _mm_setallone_ps(); - d.v()[1] = _mm_setallone_ps(); -} -template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) -{ - d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]); - d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// stores {{{1 -template Vc_INTRINSIC void Vector::store(EntryType *mem) const -{ - VectorHelper::store(mem, data(), Aligned); -} - -template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const -{ - VectorHelper::store(mem, data(), mm128_reinterpret_cast(mask.data()), Aligned); -} - -template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const -{ - VectorHelper::store(mem, data(), align); -} - -template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const -{ - HV::store(mem, data(), mm128_reinterpret_cast(mask.data()), align); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// division {{{1 -template Vc_INTRINSIC Vector &WriteMaskedVector::operator/=(const Vector &x) -{ - return operator=(*vec / x); -} -template<> Vc_INTRINSIC int_v &WriteMaskedVector::operator/=(const int_v &x) -{ - Vc_foreach_bit (int i, mask) { - vec->d.m(i) /= x.d.m(i); - } - return *vec; -} -template<> Vc_INTRINSIC uint_v &WriteMaskedVector::operator/=(const uint_v &x) -{ - Vc_foreach_bit (int i, mask) { - vec->d.m(i) /= x.d.m(i); - } - return *vec; -} -template<> Vc_INTRINSIC short_v &WriteMaskedVector::operator/=(const short_v &x) -{ - Vc_foreach_bit (int i, mask) { - vec->d.m(i) /= x.d.m(i); - } - return *vec; -} -template<> Vc_INTRINSIC ushort_v 
&WriteMaskedVector::operator/=(const ushort_v &x) -{ - Vc_foreach_bit (int i, mask) { - vec->d.m(i) /= x.d.m(i); - } - return *vec; -} - -template inline Vector &Vector::operator/=(EntryType x) -{ - if (VectorTraits::HasVectorDivision) { - return operator/=(Vector(x)); - } - for_all_vector_entries(i, - d.m(i) /= x; - ); - return *this; -} - -template template Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const -{ - if (VectorTraits::HasVectorDivision) { - return operator/(Vector(x)); - } - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x; - ); - return r; -} - -template inline Vector &Vector::operator/=(const Vector &x) -{ - for_all_vector_entries(i, - d.m(i) /= x.d.m(i); - ); - return *this; -} - -template inline Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - Vector r; - for_all_vector_entries(i, - r.d.m(i) = d.m(i) / x.d.m(i); - ); - return r; -} - -template<> inline Vector &Vector::operator/=(const Vector &x) -{ - __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); - __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); - lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); - hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); - d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); - return *this; -} - -template<> inline Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); - __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); - lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); - hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); - return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); -} - -template<> inline Vector &Vector::operator/=(const Vector &x) -{ - __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); - __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); - lo = _mm_div_ps(lo, 
_mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); - hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); - d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); - return *this; -} - -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); - __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); - lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); - hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); - return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); -} - -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v() = _mm_div_ps(d.v(), x.d.v()); - return *this; -} - -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - return _mm_div_ps(d.v(), x.d.v()); -} - -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); - d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); - return *this; -} - -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - Vector r; - r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); - r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); - return r; -} - -template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) -{ - d.v() = _mm_div_pd(d.v(), x.d.v()); - return *this; -} - -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const -{ - return _mm_div_pd(d.v(), x.d.v()); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// operator- {{{1 -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_xor_pd(d.v(), _mm_setsignmask_pd()); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return _mm_xor_ps(d.v(), _mm_setsignmask_ps()); -} -template<> Vc_ALWAYS_INLINE Vector 
Vc_PURE Vc_FLATTEN Vector::operator-() const -{ - return M256::create( - _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()), - _mm_xor_ps(d.v()[1], _mm_setsignmask_ps())); -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ -#ifdef VC_IMPL_SSSE3 - return _mm_sign_epi32(d.v(), _mm_setallone_si128()); -#else - return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); -#endif -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ -#ifdef VC_IMPL_SSSE3 - return _mm_sign_epi32(d.v(), _mm_setallone_si128()); -#else - return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); -#endif -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ -#ifdef VC_IMPL_SSSE3 - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); -#else - return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); -#endif -} -template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const -{ -#ifdef VC_IMPL_SSSE3 - return _mm_sign_epi16(d.v(), _mm_setallone_si128()); -#else - return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); -#endif -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// integer ops {{{1 -#define OP_IMPL(T, symbol, fun) \ -template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(const Vector &x) \ -{ \ - d.v() = VectorHelper::fun(d.v(), x.d.v()); \ - return *this; \ -} \ -template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(const Vector &x) const \ -{ \ - return VectorHelper::fun(d.v(), x.d.v()); \ -} -OP_IMPL(int, &, and_) -OP_IMPL(int, |, or_) -OP_IMPL(int, ^, xor_) -OP_IMPL(unsigned int, &, and_) -OP_IMPL(unsigned int, |, or_) -OP_IMPL(unsigned int, ^, xor_) -OP_IMPL(short, &, and_) -OP_IMPL(short, |, or_) -OP_IMPL(short, ^, xor_) -OP_IMPL(unsigned short, &, and_) -OP_IMPL(unsigned short, |, or_) -OP_IMPL(unsigned short, ^, xor_) -OP_IMPL(float, &, and_) 
-OP_IMPL(float, |, or_) -OP_IMPL(float, ^, xor_) -OP_IMPL(float8, &, and_) -OP_IMPL(float8, |, or_) -OP_IMPL(float8, ^, xor_) -OP_IMPL(double, &, and_) -OP_IMPL(double, |, or_) -OP_IMPL(double, ^, xor_) -#undef OP_IMPL - -#ifdef VC_IMPL_XOP -static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); } -static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); } -static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); } -static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); } -static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count ); } -static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); } -static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count ); } -static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); } - -#define _VC_OP(T, symbol, impl) \ -template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \ -{ \ - d.v() = impl(*this, shift); \ - return *this; \ -} \ -template<> Vc_INTRINSIC Vc_PURE T T::operator symbol (T::AsArg shift) const \ -{ \ - return impl(*this, shift); \ -} -VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft) -VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight) -#undef _VC_OP -#else -#if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP) -#define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak)) -#else -#define VC_WORKAROUND Vc_INTRINSIC -#endif - -#define 
OP_IMPL(T, symbol) \ -template<> VC_WORKAROUND Vector &Vector::operator symbol##=(Vector::AsArg x) \ -{ \ - for_all_vector_entries(i, \ - d.m(i) symbol##= x.d.m(i); \ - ); \ - return *this; \ -} \ -template<> inline Vc_PURE Vector Vector::operator symbol(Vector::AsArg x) const \ -{ \ - Vector r; \ - for_all_vector_entries(i, \ - r.d.m(i) = d.m(i) symbol x.d.m(i); \ - ); \ - return r; \ -} -OP_IMPL(int, <<) -OP_IMPL(int, >>) -OP_IMPL(unsigned int, <<) -OP_IMPL(unsigned int, >>) -OP_IMPL(short, <<) -OP_IMPL(short, >>) -OP_IMPL(unsigned short, <<) -OP_IMPL(unsigned short, >>) -#undef OP_IMPL -#undef VC_WORKAROUND -#endif - -template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { - d.v() = VectorHelper::shiftRight(d.v(), shift); - return *this; -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { - return VectorHelper::shiftRight(d.v(), shift); -} -template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { - d.v() = VectorHelper::shiftLeft(d.v(), shift); - return *this; -} -template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { - return VectorHelper::shiftLeft(d.v(), shift); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// swizzles {{{1 -template Vc_INTRINSIC Vc_PURE const Vector &Vector::abcd() const { return *this; } -template Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector 
Vector::bcad() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } -template Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } - -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cdab() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::badc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::aaaa() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bbbb() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cccc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dddd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcad() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcda() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dabc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::acbd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE 
const sfloat_v Vector::dbca() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } -template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dcba() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } - -#define VC_SWIZZLES_16BIT_IMPL(T) \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcad() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } \ -template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } -VC_SWIZZLES_16BIT_IMPL(short) -VC_SWIZZLES_16BIT_IMPL(unsigned short) -#undef VC_SWIZZLES_16BIT_IMPL - -// operators {{{1 -#include "../common/operators.h" -// isNegative {{{1 -template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const -{ - return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31)); -} -template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const -{ - return M256::create( - 
sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)), - sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31)) - ); -} -template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const -{ - return Mem::permute(sse_cast<__m128>( - _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31) - )); -} -// gathers {{{1 -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) -{ - gather(mem, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) -{ - gather(mem, indexes); -} - -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(mem, indexes, mask); -} - -template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(mem, indexes, mask); -} - -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - gather(array, member1, member2, indexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, member1, member2, indexes, mask); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, 
VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - gather(array, ptrMember1, outerIndexes, innerIndexes); -} -template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) - : d(HT::zero()) -{ - gather(array, ptrMember1, outerIndexes, innerIndexes, mask); -} - -template struct IndexSizeChecker { static void check() {} }; -template struct IndexSizeChecker, Size> -{ - static void check() { - VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); - } -}; -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); - d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], 
mem[indexes[3]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} -template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], - mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); -} - -#ifdef VC_USE_SET_GATHERS -template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) -{ - IndexSizeChecker, Size>::check(); - Vector indexesTmp = indexes; - indexesTmp.setZero(!static_cast::Mask>(mask)); - (*this)(mask) = Vector(mem, indexesTmp); -} -#endif - -#ifdef VC_USE_BSF_GATHERS -#define VC_MASKED_GATHER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits &= ~(1 << i); /* btr? 
*/ \ - d.m(i) = ith_value(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_GATHERS) -#define VC_MASKED_GATHER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (mask.count()) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - d.m(low) = ith_value(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - d.m(high) = ith_value(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - d.m(low) = ith_value(low); \ - case 0: \ - break; \ - } -#else -#define VC_MASKED_GATHER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) d.m(i) = ith_value(i); \ - ); -#endif - -template template -Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (mem[indexes[_i_]]) - VC_MASKED_GATHER -#undef ith_value -} - -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), 
array[indexes[2]].*(member1), - array[indexes[3]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1)); - d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), - array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), - 
array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), - array[indexes[6]].*(member1), array[indexes[7]].*(member1)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) (array[indexes[_i_]].*(member1)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), - array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); - d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = 
_mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), - array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), - array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) -{ - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), - array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), - array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) -{ - IndexSizeChecker::check(); -#define ith_value(_i_) 
(array[indexes[_i_]].*(member1).*(member2)) - VC_MASKED_GATHER -#undef ith_value -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], - (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); - d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) 
innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template<> template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); - d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], 
(array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], - (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], - (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], - (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); -} -template template -Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) -{ - IndexSizeChecker::check(); - IndexSizeChecker::check(); -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_GATHER -#undef ith_value -} -// scatters {{{1 -#undef VC_MASKED_GATHER -#ifdef VC_USE_BSF_SCATTERS -#define VC_MASKED_SCATTER \ - int bits = mask.toInt(); \ - while (bits) { \ - const int i = _bit_scan_forward(bits); \ - bits ^= (1 << i); /* btr? 
*/ \ - ith_value(i) = d.m(i); \ - } -#elif defined(VC_USE_POPCNT_BSF_SCATTERS) -#define VC_MASKED_SCATTER \ - unsigned int bits = mask.toInt(); \ - unsigned int low, high = 0; \ - switch (mask.count()) { \ - case 8: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 7: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 6: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 5: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 4: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - high = (1 << high); \ - case 3: \ - low = _bit_scan_forward(bits); \ - bits ^= high | (1 << low); \ - ith_value(low) = d.m(low); \ - case 2: \ - high = _bit_scan_reverse(bits); \ - ith_value(high) = d.m(high); \ - case 1: \ - low = _bit_scan_forward(bits); \ - ith_value(low) = d.m(low); \ - case 0: \ - break; \ - } -#else -#define VC_MASKED_SCATTER \ - if (mask.isEmpty()) { \ - return; \ - } \ - for_all_vector_entries(i, \ - if (mask[i]) ith_value(i) = d.m(i); \ - ); -#endif - -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const -{ - for_all_vector_entries(i, - mem[indexes[i]] = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const -{ -#define ith_value(_i_) mem[indexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const -{ - for_all_vector_entries(i, - array[indexes[i]].*(member1) = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) 
const -{ -#define ith_value(_i_) array[indexes[_i_]].*(member1) - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const -{ - for_all_vector_entries(i, - array[indexes[i]].*(member1).*(member2) = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const -{ -#define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) - VC_MASKED_SCATTER -#undef ith_value -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const -{ - for_all_vector_entries(i, - (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); - ); -} -template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const -{ -#define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] - VC_MASKED_SCATTER -#undef ith_value -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// operator[] {{{1 -template Vc_INTRINSIC typename Vector::EntryType Vc_PURE Vector::operator[](size_t index) const -{ - return d.m(index); -} -#ifdef VC_GCC -template<> Vc_INTRINSIC double Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { - return extract_double_imm(d.v(), index); - } - return d.m(index); -} -template<> Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const -{ - return extract_float(d.v(), index); -} -template<> Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { - if (index < 4) { - return 
extract_float_imm(d.v()[0], index); - } - return extract_float_imm(d.v()[1], index - 4); - } - return d.m(index); -} -template<> Vc_INTRINSIC int Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { -#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following -#ifdef __x86_64__ - if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; - if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; -#else - if (index == 0) return _mm_cvtsi128_si32(d.v()); -#endif -#endif -#ifdef VC_IMPL_SSE4_1 - return _mm_extract_epi32(d.v(), index); -#else - return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); -#endif - } - return d.m(index); -} -template<> Vc_INTRINSIC unsigned int Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { -#if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following -#ifdef __x86_64__ - if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; - if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; -#else - if (index == 0) return _mm_cvtsi128_si32(d.v()); -#endif -#endif -#ifdef VC_IMPL_SSE4_1 - return _mm_extract_epi32(d.v(), index); -#else - return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); -#endif - } - return d.m(index); -} -template<> Vc_INTRINSIC short Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { - return _mm_extract_epi16(d.v(), index); - } - return d.m(index); -} -template<> Vc_INTRINSIC unsigned short Vc_PURE Vector::operator[](size_t index) const -{ - if (__builtin_constant_p(index)) { - return _mm_extract_epi16(d.v(), index); - } - return d.m(index); -} -#endif // GCC -/////////////////////////////////////////////////////////////////////////////////////////// -// horizontal ops {{{1 -#ifndef VC_IMPL_SSE4_1 -// without SSE4.1 integer multiplication is slow and we rather multiply 
the scalars -template<> Vc_INTRINSIC Vc_PURE int Vector::product() const -{ - return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); -} -template<> Vc_INTRINSIC Vc_PURE unsigned int Vector::product() const -{ - return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); -} -#endif -template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::min(MaskArg m) const -{ - Vector tmp = std::numeric_limits >::max(); - tmp(m) = *this; - return tmp.min(); -} -template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::max(MaskArg m) const -{ - Vector tmp = std::numeric_limits >::min(); - tmp(m) = *this; - return tmp.max(); -} -template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::product(MaskArg m) const -{ - Vector tmp(VectorSpecialInitializerOne::One); - tmp(m) = *this; - return tmp.product(); -} -template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::sum(MaskArg m) const -{ - Vector tmp(VectorSpecialInitializerZero::Zero); - tmp(m) = *this; - return tmp.sum(); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -// copySign {{{1 -template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm_or_ps( - _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()), - _mm_and_ps(d.v(), _mm_setabsmask_ps()) - ); -} -template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const -{ - return M256::create( _mm_or_ps( - _mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()), - _mm_and_ps(d.v()[0], _mm_setabsmask_ps()) - ), _mm_or_ps( - _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()), - _mm_and_ps(d.v()[1], _mm_setabsmask_ps()) - ) - ); -} -template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const -{ - return _mm_or_pd( - _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()), - _mm_and_pd(d.v(), _mm_setabsmask_pd()) - ); -}//}}}1 -// exponent {{{1 -template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const -{ - VC_ASSERT((*this 
>= 0.f).isFull()); - return Internal::exponent(d.v()); -} -template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.f).isFull()); - return Internal::exponent(d.v()); -} -template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const -{ - VC_ASSERT((*this >= 0.).isFull()); - return Internal::exponent(d.v()); -} -// }}}1 -// Random {{{1 -static void _doRandomStep(Vector &state0, - Vector &state1) -{ - state0.load(&Vc::RandomState[0]); - state1.load(&Vc::RandomState[uint_v::Size]); - (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); - uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); -} - -template Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return state0.reinterpretCast >(); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - Vector state0, state1; - _doRandomStep(state0, state1); - state1 ^= state0 >> 16; - return M256::create( - _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper::one()), VectorHelper::one()), - _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper::one()), VectorHelper::one()) - ); -} - -template<> Vc_ALWAYS_INLINE Vector Vector::Random() -{ - typedef unsigned long long uint64 Vc_MAY_ALIAS; - uint64 state0 = *reinterpret_cast(&Vc::RandomState[8]); - uint64 state1 = *reinterpret_cast(&Vc::RandomState[10]); - const __m128i state = _mm_load_si128(reinterpret_cast(&Vc::RandomState[8])); - *reinterpret_cast(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11); - *reinterpret_cast(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11); - return 
(Vector(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One(); -} -// shifted / rotated {{{1 -template Vc_INTRINSIC Vc_PURE Vector Vector::shifted(int amount) const -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - switch (amount) { - case 0: return *this; - case 1: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); - case 2: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); - case 3: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); - case 4: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); - case 5: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); - case 6: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); - case 7: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); - case 8: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); - case -1: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); - case -2: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); - case -3: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); - case -4: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); - case -5: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); - case -6: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); - case -7: return 
mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); - case -8: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); - } - return Zero(); -} -template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - switch (amount) { - case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof))); - case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof))); - case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof))); - case -4: return M256::create(_mm_setzero_ps(), d.v()[0]); - case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof))); - case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof))); - case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof))); - case 0: return *this; - case 1: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof))); - case 2: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * 
EntryTypeSizeof))); - case 3: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof))); - case 4: return M256::create(d.v()[1], _mm_setzero_ps()); - case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)), _mm_setzero_ps()); - case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)), _mm_setzero_ps()); - case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)), _mm_setzero_ps()); - } - return Zero(); -} -template Vc_INTRINSIC Vc_PURE Vector Vector::rotated(int amount) const -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - const __m128i v = mm128_reinterpret_cast<__m128i>(d.v()); - switch (static_cast(amount) % Size) { - case 0: return *this; - case 1: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 1 * EntryTypeSizeof)); - case 2: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 2 * EntryTypeSizeof)); - case 3: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 3 * EntryTypeSizeof)); - // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake. - // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType)) - // disables the following four calls unless sizeof(EntryType) == 2. 
- case 4: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 4 * EntryTypeSizeof)); - case 5: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 5 * EntryTypeSizeof)); - case 6: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 6 * EntryTypeSizeof)); - case 7: return mm128_reinterpret_cast(mm_alignr_epi8(v, v, 7 * EntryTypeSizeof)); - } - return Zero(); -} -template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const -{ - enum { - EntryTypeSizeof = sizeof(EntryType) - }; - const __m128i v0 = sse_cast<__m128i>(d.v()[0]); - const __m128i v1 = sse_cast<__m128i>(d.v()[1]); - switch (static_cast(amount) % Size) { - case 0: return *this; - case 1: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof))); - case 2: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof))); - case 3: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof))); - case 4: return M256::create(d.v()[1], d.v()[0]); - case 5: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof))); - case 6: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof))); - case 7: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof))); - } - return Zero(); -} -// }}}1 -// sorted specializations {{{1 -template<> inline Vc_PURE uint_v uint_v::sorted() const -{ - __m128i x = data(); - __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); - __m128i l = mm_min_epu32(x, y); - __m128i h = mm_max_epu32(x, y); - x = _mm_unpacklo_epi32(l, h); - y = _mm_unpackhi_epi32(h, l); - - // 
sort quads - l = mm_min_epu32(x, y); - h = mm_max_epu32(x, y); - x = _mm_unpacklo_epi32(l, h); - y = _mm_unpackhi_epi64(x, x); - - l = mm_min_epu32(x, y); - h = mm_max_epu32(x, y); - return _mm_unpacklo_epi32(l, h); -} -template<> inline Vc_PURE ushort_v ushort_v::sorted() const -{ - __m128i lo, hi, y, x = data(); - // sort pairs - y = Mem::permute(x); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - x = mm_blend_epi16(lo, hi, 0xaa); - - // merge left and right quads - y = Mem::permute(x); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - x = mm_blend_epi16(lo, hi, 0xcc); - y = _mm_srli_si128(x, 2); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); - - // merge quads into octs - y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); - y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = mm_min_epu16(x, y); - hi = mm_max_epu16(x, y); - - return _mm_unpacklo_epi16(lo, hi); -} -// }}}1 -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "undomacros.h" - -// vim: foldmethod=marker diff --git a/math/vc/include/Vc/sse/vectorhelper.h b/math/vc/include/Vc/sse/vectorhelper.h deleted file mode 100644 index f8dc2a1a91f89..0000000000000 --- a/math/vc/include/Vc/sse/vectorhelper.h +++ /dev/null @@ -1,814 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef SSE_VECTORHELPER_H -#define SSE_VECTORHELPER_H - -#include "types.h" -#include -#include "macros.h" - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -namespace Internal -{ -Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v) -{ - __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23); - tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f)); - return _mm_cvtepi32_ps(tmp); -} -Vc_INTRINSIC Vc_CONST M256 exponent(VC_ALIGNED_PARAMETER(M256) v) -{ - __m128i tmp0 = _mm_srli_epi32(_mm_castps_si128(v[0]), 23); - __m128i tmp1 = _mm_srli_epi32(_mm_castps_si128(v[1]), 23); - tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); - tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); - return M256::create( _mm_cvtepi32_ps(tmp0), _mm_cvtepi32_ps(tmp1)); -} -Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v) -{ - __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52); - tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff)); - return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08)); -} -} // namespace Internal - - template struct SortHelper - { - static inline Vc_CONST_L VectorType sort(VectorType) Vc_CONST_R; - }; - template struct SortHelper - { - static inline Vc_PURE_L M256 sort(const M256 &) Vc_PURE_R; - }; - -#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } -#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b) { return code; } -#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VectorTypeArg a, VectorTypeArg b, VectorTypeArg c) { return code; } - template<> struct VectorHelper - { - typedef M256 VectorType; -#ifdef 
VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType &VectorTypeArg; -#else - typedef const VectorType VectorTypeArg; -#endif - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - OP0(allone, VectorType::create(_mm_setallone_ps(), _mm_setallone_ps())) - OP0(zero, VectorType::create(_mm_setzero_ps(), _mm_setzero_ps())) - OP2(or_, VectorType::create(_mm_or_ps(a[0], b[0]), _mm_or_ps(a[1], b[1]))) - OP2(xor_, VectorType::create(_mm_xor_ps(a[0], b[0]), _mm_xor_ps(a[1], b[1]))) - OP2(and_, VectorType::create(_mm_and_ps(a[0], b[0]), _mm_and_ps(a[1], b[1]))) - OP2(andnot_, VectorType::create(_mm_andnot_ps(a[0], b[0]), _mm_andnot_ps(a[1], b[1]))) - OP3(blend, VectorType::create(mm_blendv_ps(a[0], b[0], c[0]), mm_blendv_ps(a[1], b[1], c[1]))) - }; -#undef OP0 -#undef OP2 -#undef OP3 - -#define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } -#define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; } 
-#define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; } -#define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; } - - template<> struct VectorHelper<_M128> - { - typedef _M128 VectorType; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - OP0(allone, _mm_setallone_ps()) - OP0(zero, _mm_setzero_ps()) - OP2(or_, _mm_or_ps(a, b)) - OP2(xor_, _mm_xor_ps(a, b)) - OP2(and_, _mm_and_ps(a, b)) - OP2(andnot_, _mm_andnot_ps(a, b)) - OP3(blend, mm_blendv_ps(a, b, c)) - }; - - - template<> struct VectorHelper<_M128D> - { - typedef _M128D VectorType; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void 
store(double *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - static Vc_ALWAYS_INLINE_L void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - OP0(allone, _mm_setallone_pd()) - OP0(zero, _mm_setzero_pd()) - OP2(or_, _mm_or_pd(a, b)) - OP2(xor_, _mm_xor_pd(a, b)) - OP2(and_, _mm_and_pd(a, b)) - OP2(andnot_, _mm_andnot_pd(a, b)) - OP3(blend, mm_blendv_pd(a, b, c)) - }; - - template<> struct VectorHelper<_M128I> - { - typedef _M128I VectorType; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, AlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, UnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; - template static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R Vc_PURE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, AlignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, UnalignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - template 
static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, AlignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R; - template static Vc_ALWAYS_INLINE_L void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R; - - OP0(allone, _mm_setallone_si128()) - OP0(zero, _mm_setzero_si128()) - OP2(or_, _mm_or_si128(a, b)) - OP2(xor_, _mm_xor_si128(a, b)) - OP2(and_, _mm_and_si128(a, b)) - OP2(andnot_, _mm_andnot_si128(a, b)) - OP3(blend, mm_blendv_epi8(a, b, c)) - }; - -#undef OP1 -#undef OP2 -#undef OP3 - -#define OP1(op) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return CAT(_mm_##op##_, SUFFIX)(a); } -#define OP(op) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op##_ , SUFFIX)(a, b); } -#define OP_(op) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op , SUFFIX)(a, b); } -#define OPx(op, op2) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_##op2##_, SUFFIX)(a, b); } -#define OPcmp(op) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmp##op(const VectorType a, const VectorType b) { return CAT(_mm_cmp##op##_, SUFFIX)(a, b); } -#define OP_CAST_(op) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return CAT(_mm_castps_, SUFFIX)( \ - _mm_##op##ps(CAT(CAT(_mm_cast, SUFFIX), _ps)(a), \ - CAT(CAT(_mm_cast, SUFFIX), _ps)(b))); \ - } -#define MINMAX \ - static Vc_ALWAYS_INLINE Vc_CONST 
VectorType min(VectorType a, VectorType b) { return CAT(_mm_min_, SUFFIX)(a, b); } \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return CAT(_mm_max_, SUFFIX)(a, b); } - - template<> struct VectorHelper { - typedef _M128D VectorType; - typedef double EntryType; -#define SUFFIX pd - - OP_(or_) OP_(and_) OP_(xor_) - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return CAT(_mm_set_, SUFFIX)(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); } - -#ifdef VC_IMPL_FMA4 - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { - v1 = _mm_macc_pd(v1, v2, v3); - } -#else - static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { - VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); - VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); -#if defined(VC_GCC) && VC_GCC < 0x40703 - // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot - // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703 - asm("":"+x"(h1), "+x"(h2)); -#endif - const VectorType l1 = _mm_sub_pd(v1, h1); - const VectorType l2 = _mm_sub_pd(v2, h2); - const VectorType ll = mul(l1, l2); - const VectorType lh = add(mul(l1, h2), mul(h1, l2)); - const VectorType hh = mul(h1, h2); - // ll < lh < hh for all entries is certain - const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3| - const VectorType b = mm_blendv_pd(v3, lh, lh_lt_v3); - const VectorType c = mm_blendv_pd(lh, v3, 
lh_lt_v3); - v1 = add(add(ll, b), add(c, hh)); - } -#endif - - OP(add) OP(sub) OP(mul) - OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) - - OP1(sqrt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) { - return _mm_div_pd(one(), sqrt(x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { - return _mm_div_pd(one(), x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { - return _mm_cmpunord_pd(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { - return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { - return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd()); - } - - MINMAX - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - a = _mm_min_sd(a, _mm_unpackhi_pd(a, a)); - return _mm_cvtsd_f64(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - a = _mm_max_sd(a, _mm_unpackhi_pd(a, a)); - return _mm_cvtsd_f64(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); - return _mm_cvtsd_f64(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); - return _mm_cvtsd_f64(a); - } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { -#ifdef VC_IMPL_SSE4_1 - return _mm_round_pd(a, _MM_FROUND_NINT); -#else - //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a)); -#endif - } - }; - - template<> struct VectorHelper { - typedef float EntryType; - typedef _M128 VectorType; -#define SUFFIX ps - - OP_(or_) OP_(and_) OP_(xor_) - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(mask, a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm_set1_, 
SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); } - static Vc_ALWAYS_INLINE Vc_CONST _M128 concat(_M128D a, _M128D b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); } - -#ifdef VC_IMPL_FMA4 - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { - v1 = _mm_macc_ps(v1, v2, v3); - } -#else - static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { - __m128d v1_0 = _mm_cvtps_pd(v1); - __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1)); - __m128d v2_0 = _mm_cvtps_pd(v2); - __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2)); - __m128d v3_0 = _mm_cvtps_pd(v3); - __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3)); - v1 = _mm_movelh_ps( - _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)), - _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1))); - } -#endif - - OP(add) OP(sub) OP(mul) - OPcmp(eq) OPcmp(neq) - OPcmp(lt) OPcmp(nlt) - OPcmp(le) OPcmp(nle) - - OP1(sqrt) OP1(rsqrt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { - return _mm_cmpunord_ps(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { - return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { - return _mm_rcp_ps(x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { - return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps()); - } - - MINMAX - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - a = _mm_min_ps(a, _mm_movehl_ps(a, a)); // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3) - a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3 - 
return _mm_cvtss_f32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - a = _mm_max_ps(a, _mm_movehl_ps(a, a)); // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3) - a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3 - return _mm_cvtss_f32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); - a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); - return _mm_cvtss_f32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); - a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); - return _mm_cvtss_f32(a); - } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { -#ifdef VC_IMPL_SSE4_1 - return _mm_round_ps(a, _MM_FROUND_NINT); -#else - //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); -#endif - } - }; - - template<> struct VectorHelper { - typedef float EntryType; - typedef M256 VectorType; -#ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN - typedef const VectorType &VectorTypeArg; -#else - typedef const VectorType VectorTypeArg; -#endif - - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { - const _M128 x = _mm_set1_ps(a); - return VectorType::create(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { - const _M128 x = _mm_set_ps(a, b, c, d); - return VectorType::create(x, x); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, - const float e, const float f, const float g, const float h) { - return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h)); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return VectorType::create(_mm_setzero_ps(), 
_mm_setzero_ps()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return set(1.f); } - -#define REUSE_FLOAT_IMPL1(fun) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x) { \ - return VectorType::create(VectorHelper::fun(x[0]), VectorHelper::fun(x[1])); \ - } -#define REUSE_FLOAT_IMPL2(fun) \ - static Vc_ALWAYS_INLINE Vc_CONST VectorType fun(VectorTypeArg x, VectorTypeArg y) { \ - return VectorType::create(VectorHelper::fun(x[0], y[0]), VectorHelper::fun(x[1], y[1])); \ - } - REUSE_FLOAT_IMPL1(reciprocal) - REUSE_FLOAT_IMPL1(sqrt) - REUSE_FLOAT_IMPL1(rsqrt) - REUSE_FLOAT_IMPL1(isNaN) - REUSE_FLOAT_IMPL1(isFinite) - REUSE_FLOAT_IMPL1(abs) - REUSE_FLOAT_IMPL1(round) - - REUSE_FLOAT_IMPL2(and_) - REUSE_FLOAT_IMPL2(or_) - REUSE_FLOAT_IMPL2(xor_) - REUSE_FLOAT_IMPL2(notMaskedToZero) - REUSE_FLOAT_IMPL2(add) - REUSE_FLOAT_IMPL2(sub) - REUSE_FLOAT_IMPL2(mul) - REUSE_FLOAT_IMPL2(cmple) - REUSE_FLOAT_IMPL2(cmpnle) - REUSE_FLOAT_IMPL2(cmplt) - REUSE_FLOAT_IMPL2(cmpnlt) - REUSE_FLOAT_IMPL2(cmpeq) - REUSE_FLOAT_IMPL2(cmpneq) - REUSE_FLOAT_IMPL2(min) - REUSE_FLOAT_IMPL2(max) - - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorTypeArg a) { - return VectorHelper::min(VectorHelper::min(a[0], a[1])); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorTypeArg a) { - return VectorHelper::max(VectorHelper::max(a[0], a[1])); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorTypeArg a) { - return VectorHelper::mul(VectorHelper::mul(a[0], a[1])); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorTypeArg a) { - return VectorHelper::add(VectorHelper::add(a[0], a[1])); - } - - static inline void fma(VectorType &a, VectorTypeArg b, VectorTypeArg c) { - VectorHelper::fma(a[0], b[0], c[0]); - VectorHelper::fma(a[1], b[1], c[1]); - } -#undef REUSE_FLOAT_IMPL2 -#undef REUSE_FLOAT_IMPL1 - }; - - template<> struct VectorHelper { - typedef int EntryType; - typedef _M128I VectorType; -#define SUFFIX si128 - - OP_(or_) OP_(and_) OP_(xor_) - 
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } -#undef SUFFIX -#define SUFFIX epi32 - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } - - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { - return CAT(_mm_srai_, SUFFIX)(a, shift); - } - static Vc_INTRINSIC Vc_CONST VectorType abs(const VectorType a) { return mm_abs_epi32(a); } - - static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return mm_min_epi32(a, b); } - static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType a, const VectorType b) { return mm_max_epi32(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, 
_mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } -#ifdef VC_IMPL_SSE4_1 - static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } -#else - static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) { - const VectorType aShift = _mm_srli_si128(a, 4); - const VectorType ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2] - const VectorType bShift = _mm_srli_si128(b, 4); - const VectorType ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3] - return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8)); - } -#endif - - OP(add) OP(sub) - OPcmp(eq) - OPcmp(lt) - OPcmp(gt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } - }; - - template<> struct VectorHelper { - typedef unsigned int EntryType; - typedef _M128I VectorType; -#define SUFFIX si128 - OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType 
notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } - -#undef SUFFIX -#define SUFFIX epu32 - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } - - static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return mm_min_epu32(a, b); } - static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType a, const VectorType b) { return mm_max_epu32(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - // using lo_epi16 for speed here - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_cvtsi128_si32(a); - } - - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) { - return VectorHelper::mul(a, b); - } -//X template static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) { -//X switch (b) { -//X case 0: return zero(); -//X case 1: return a; -//X case 2: return _mm_slli_epi32(a, 1); -//X case 4: return _mm_slli_epi32(a, 2); -//X case 
8: return _mm_slli_epi32(a, 3); -//X case 16: return _mm_slli_epi32(a, 4); -//X case 32: return _mm_slli_epi32(a, 5); -//X case 64: return _mm_slli_epi32(a, 6); -//X case 128: return _mm_slli_epi32(a, 7); -//X case 256: return _mm_slli_epi32(a, 8); -//X case 512: return _mm_slli_epi32(a, 9); -//X case 1024: return _mm_slli_epi32(a, 10); -//X case 2048: return _mm_slli_epi32(a, 11); -//X } -//X return mul(a, set(b)); -//X } - -#undef SUFFIX -#define SUFFIX epi32 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { - return CAT(_mm_srli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return CAT(_mm_set_, SUFFIX)(a, b, c, d); } - - OP(add) OP(sub) - OPcmp(eq) - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } - -#ifndef USE_INCORRECT_UNSIGNED_COMPARE - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) { - return _mm_cmplt_epu32(a, b); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) { - return _mm_cmpgt_epu32(a, b); - } -#else - OPcmp(lt) - OPcmp(gt) -#endif - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } - -#undef 
SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } - }; - - template<> struct VectorHelper { - typedef _M128I VectorType; - typedef signed short EntryType; -#define SUFFIX si128 - - OP_(or_) OP_(and_) OP_(xor_) - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } - static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); } - static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); } - static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); } - -#undef SUFFIX -#define SUFFIX epi16 - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } - - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { - return CAT(_mm_srai_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, - const EntryType e, const EntryType f, const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { - v1 = add(mul(v1, v2), v3); } - - static Vc_INTRINSIC Vc_CONST VectorType abs(const VectorType a) { return mm_abs_epi16(a); } - - OPx(mul, mullo) - OP(min) OP(max) - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = min(a, _mm_shuffle_epi32(a, 
_MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - - OP(add) OP(sub) - OPcmp(eq) - OPcmp(lt) - OPcmp(gt) - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } - }; - - 
template<> struct VectorHelper { - typedef _M128I VectorType; - typedef unsigned short EntryType; -#define SUFFIX si128 - OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_) - static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm_setzero_, SUFFIX)(); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, _M128 mask) { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); } -#ifdef VC_IMPL_SSE4_1 - static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packus_epi32(a, b); } -#else - // XXX too bad, but this is broken without SSE 4.1 - static Vc_ALWAYS_INLINE Vc_CONST _M128I concat(_M128I a, _M128I b) { return _mm_packs_epi32(a, b); } -#endif - static Vc_ALWAYS_INLINE Vc_CONST _M128I expand0(_M128I x) { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); } - static Vc_ALWAYS_INLINE Vc_CONST _M128I expand1(_M128I x) { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); } - -#undef SUFFIX -#define SUFFIX epu16 - static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return CAT(_mm_setone_, SUFFIX)(); } - -//X template static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) { -//X switch (b) { -//X case 0: return zero(); -//X case 1: return a; -//X case 2: return _mm_slli_epi16(a, 1); -//X case 4: return _mm_slli_epi16(a, 2); -//X case 8: return _mm_slli_epi16(a, 3); -//X case 16: return _mm_slli_epi16(a, 4); -//X case 32: return _mm_slli_epi16(a, 5); -//X case 64: return _mm_slli_epi16(a, 6); -//X case 128: return _mm_slli_epi16(a, 7); -//X case 256: return _mm_slli_epi16(a, 8); -//X case 512: return _mm_slli_epi16(a, 9); -//X case 1024: return _mm_slli_epi16(a, 10); -//X case 2048: return _mm_slli_epi16(a, 11); -//X } -//X return mul(a, set(b)); -//X } -#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1 - static Vc_INTRINSIC Vc_CONST VectorType min(const VectorType a, const VectorType b) { return CAT(mm_min_, SUFFIX)(a, b); } - static Vc_INTRINSIC Vc_CONST VectorType max(const VectorType 
a, const VectorType b) { return CAT(mm_max_, SUFFIX)(a, b); } -#endif -#undef SUFFIX -#define SUFFIX epi16 - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { - return CAT(_mm_slli_, SUFFIX)(a, shift); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { - return CAT(_mm_srli_, SUFFIX)(a, shift); - } - - static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } - - OPx(mul, mullo) // should work correctly for all values -#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1) - OP(min) OP(max) // XXX breaks for values with MSB set -#endif - static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { - // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change" - a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, 
_mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); - a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtsi128_si32(a); // & 0xffff is implicit - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, - const EntryType d, const EntryType e, const EntryType f, - const EntryType g, const EntryType h) { - return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h); - } - - OP(add) OP(sub) - OPcmp(eq) - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); } - -#ifndef USE_INCORRECT_UNSIGNED_COMPARE - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmplt(const VectorType a, const VectorType b) { - return _mm_cmplt_epu16(a, b); - } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpgt(const VectorType a, const VectorType b) { - return _mm_cmpgt_epu16(a, b); - } -#else - OPcmp(lt) - OPcmp(gt) -#endif - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(const VectorType a, const VectorType b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (const VectorType a, const VectorType b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); } - static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(const VectorType a, const VectorType b) { return cmpgt(a, b); } -#undef SUFFIX - static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } - }; -#undef OP1 -#undef OP -#undef OP_ -#undef OPx -#undef OPcmp - -} // namespace SSE -} // namespace Vc -} // namespace ROOT - -#include "vectorhelper.tcc" -#include "undomacros.h" - -#endif // SSE_VECTORHELPER_H diff --git a/math/vc/include/Vc/sse/vectorhelper.tcc b/math/vc/include/Vc/sse/vectorhelper.tcc deleted file mode 100644 index dca162ff38f93..0000000000000 --- 
a/math/vc/include/Vc/sse/vectorhelper.tcc +++ /dev/null @@ -1,493 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "casts.h" -#include - -namespace ROOT { -namespace Vc -{ -namespace SSE -{ - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// float_v -template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, AlignedFlag) -{ - return _mm_load_ps(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, UnalignedFlag) -{ - return _mm_loadu_ps(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndAlignedFlag) -{ - return _mm_stream_load(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndUnalignedFlag) -{ - return load(x, Unaligned); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// stores -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, AlignedFlag) -{ - _mm_store_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, UnalignedFlag) -{ - _mm_storeu_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndAlignedFlag) -{ 
- _mm_stream_ps(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, AlignedFlag) -{ - _mm_store_ps(mem, mm_blendv_ps(_mm_load_ps(mem), x, m)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) -{ - _mm_storeu_ps(mem, mm_blendv_ps(_mm_loadu_ps(mem), x, m)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast(mem)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast(mem)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// sfloat_v -template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, AlignedFlag) -{ - return VectorType::create(_mm_load_ps(x), _mm_load_ps(x + 4)); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, UnalignedFlag) -{ - return VectorType::create(_mm_loadu_ps(x), _mm_loadu_ps(x + 4)); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, StreamingAndAlignedFlag) -{ - return VectorType::create(_mm_stream_load(&x[0]), _mm_stream_load(&x[4])); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper::load(const float *x, StreamingAndUnalignedFlag) -{ - return load(x, Unaligned); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// stores -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg 
x, AlignedFlag) -{ - _mm_store_ps(mem, x[0]); - _mm_store_ps(mem + 4, x[1]); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, UnalignedFlag) -{ - _mm_storeu_ps(mem, x[0]); - _mm_storeu_ps(mem + 4, x[1]); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) -{ - _mm_stream_ps(mem, x[0]); - _mm_stream_ps(mem + 4, x[1]); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_setallone_si128(), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_setallone_si128(), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) -{ - _mm_store_ps(mem, mm_blendv_ps(_mm_load_ps(mem), x[0], m[0])); - _mm_store_ps(mem + 4, mm_blendv_ps(_mm_load_ps(mem + 4), x[1], m[1])); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) -{ - _mm_storeu_ps(mem, mm_blendv_ps(_mm_loadu_ps(mem), x[0], m[0])); - _mm_storeu_ps(mem + 4, mm_blendv_ps(_mm_loadu_ps(mem + 4), x[1], m[1])); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast(mem + 4)); -} -Vc_ALWAYS_INLINE void VectorHelper::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast(mem)); - _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast(mem + 4)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// double_v -template<> Vc_ALWAYS_INLINE Vc_PURE _M128D 
VectorHelper<_M128D>::load(const double *x, AlignedFlag) -{ - return _mm_load_pd(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, UnalignedFlag) -{ - return _mm_loadu_pd(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndAlignedFlag) -{ - return _mm_stream_load(x); -} - -template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndUnalignedFlag) -{ - return load(x, Unaligned); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// stores -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, AlignedFlag) -{ - _mm_store_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, UnalignedFlag) -{ - _mm_storeu_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndAlignedFlag) -{ - _mm_stream_pd(mem, x); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, AlignedFlag) -{ - _mm_store_pd(mem, mm_blendv_pd(_mm_load_pd(mem), x, m)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) -{ - _mm_storeu_pd(mem, mm_blendv_pd(_mm_loadu_pd(mem), x, m)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast(mem)); -} -Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) -{ - 
_mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast(mem)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// int_v, uint_v, short_v, ushort_v -template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, AlignedFlag) -{ - return _mm_load_si128(reinterpret_cast(x)); -} - -template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, UnalignedFlag) -{ - return _mm_loadu_si128(reinterpret_cast(x)); -} - -template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndAlignedFlag) -{ - return _mm_stream_load(x); -} - -template Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndUnalignedFlag) -{ - return load(x, Unaligned); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// stores -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, AlignedFlag) -{ - _mm_store_si128(reinterpret_cast(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, UnalignedFlag) -{ - _mm_storeu_si128(reinterpret_cast(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndAlignedFlag) -{ - _mm_stream_si128(reinterpret_cast(mem), x); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, AlignedFlag align) -{ - store(mem, mm_blendv_epi8(load(mem, align), x, m), align); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, UnalignedFlag align) -{ - store(mem, mm_blendv_epi8(load(mem, align), x, m), align); -} -template 
Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} -template Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) -{ - _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); -} - - template<> inline Vc_CONST _M128I SortHelper<_M128I, 8>::sort(_M128I x) - { - _M128I lo, hi, y; - // sort pairs - y = Mem::permute(x); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = mm_blend_epi16(lo, hi, 0xaa); - - // merge left and right quads - y = Mem::permute(x); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = mm_blend_epi16(lo, hi, 0xcc); - y = _mm_srli_si128(x, 2); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); - - // merge quads into octs - y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); - y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - - return _mm_unpacklo_epi16(lo, hi); - } - template<> inline Vc_CONST _M128I SortHelper<_M128I, 4>::sort(_M128I x) - { - /* - // in 16,67% of the cases the merge can be replaced by an append - - // x = [a b c d] - // y = [c d a b] - _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); - _M128I l = mm_min_epi32(x, y); // min[ac bd ac bd] - _M128I h = mm_max_epi32(x, y); // max[ac bd ac bd] - if (IS_UNLIKELY(_mm_cvtsi128_si32(h) <= l[1])) { // l[0] < h[0] < l[1] < h[1] - return _mm_unpacklo_epi32(l, h); - } - // h[0] > l[1] - */ - - // sort pairs - _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); - _M128I l = mm_min_epi32(x, y); - _M128I h = 
mm_max_epi32(x, y); - x = _mm_unpacklo_epi32(l, h); - y = _mm_unpackhi_epi32(h, l); - - // sort quads - l = mm_min_epi32(x, y); - h = mm_max_epi32(x, y); - x = _mm_unpacklo_epi32(l, h); - y = _mm_unpackhi_epi64(x, x); - - l = mm_min_epi32(x, y); - h = mm_max_epi32(x, y); - return _mm_unpacklo_epi32(l, h); - } - template<> inline Vc_CONST _M128 SortHelper<_M128, 4>::sort(_M128 x) - { - _M128 y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); - _M128 l = _mm_min_ps(x, y); - _M128 h = _mm_max_ps(x, y); - x = _mm_unpacklo_ps(l, h); - y = _mm_unpackhi_ps(h, l); - - l = _mm_min_ps(x, y); - h = _mm_max_ps(x, y); - x = _mm_unpacklo_ps(l, h); - y = _mm_movehl_ps(x, x); - - l = _mm_min_ps(x, y); - h = _mm_max_ps(x, y); - return _mm_unpacklo_ps(l, h); -//X _M128 k = _mm_cmpgt_ps(x, y); -//X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(2, 2, 0, 0)); -//X x = mm_blendv_ps(x, y, k); -//X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)); -//X k = _mm_cmpgt_ps(x, y); -//X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(1, 0, 1, 0)); -//X x = mm_blendv_ps(x, y, k); -//X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 1, 2, 0)); -//X k = _mm_cmpgt_ps(x, y); -//X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(0, 1, 1, 0)); -//X return mm_blendv_ps(x, y, k); - } - template<> inline Vc_PURE M256 SortHelper::sort(const M256 &_x) - { - M256 x = _x; - typedef SortHelper<_M128, 4> H; - - _M128 a, b, l, h; - a = H::sort(x[0]); - b = H::sort(x[1]); - - // merge - b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)); - l = _mm_min_ps(a, b); - h = _mm_max_ps(a, b); - - a = _mm_unpacklo_ps(l, h); - b = _mm_unpackhi_ps(l, h); - l = _mm_min_ps(a, b); - h = _mm_max_ps(a, b); - - a = _mm_unpacklo_ps(l, h); - b = _mm_unpackhi_ps(l, h); - l = _mm_min_ps(a, b); - h = _mm_max_ps(a, b); - - x[0] = _mm_unpacklo_ps(l, h); - x[1] = _mm_unpackhi_ps(l, h); - return x; - } - template<> inline Vc_CONST _M128D SortHelper<_M128D, 2>::sort(_M128D x) - { - const _M128D y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1)); - return _mm_unpacklo_pd(_mm_min_sd(x, 
y), _mm_max_sd(x, y)); - } - - // can be used to multiply with a constant. For some special constants it doesn't need an extra - // vector but can use a shift instead, basically encoding the factor in the instruction. - template Vc_ALWAYS_INLINE Vc_CONST IndexType mulConst(const IndexType x) { - typedef VectorHelper H; - switch (constant) { - case 0: return H::zero(); - case 1: return x; - case 2: return H::slli(x.data(), 1); - case 4: return H::slli(x.data(), 2); - case 8: return H::slli(x.data(), 3); - case 16: return H::slli(x.data(), 4); - case 32: return H::slli(x.data(), 5); - case 64: return H::slli(x.data(), 6); - case 128: return H::slli(x.data(), 7); - case 256: return H::slli(x.data(), 8); - case 512: return H::slli(x.data(), 9); - case 1024: return H::slli(x.data(), 10); - case 2048: return H::slli(x.data(), 11); - } -#ifndef VC_IMPL_SSE4_1 - // without SSE 4.1 int multiplication is not so nice - if (sizeof(typename IndexType::EntryType) == 4) { - switch (constant) { - case 3: return H::add( x.data() , H::slli(x.data(), 1)); - case 5: return H::add( x.data() , H::slli(x.data(), 2)); - case 9: return H::add( x.data() , H::slli(x.data(), 3)); - case 17: return H::add( x.data() , H::slli(x.data(), 4)); - case 33: return H::add( x.data() , H::slli(x.data(), 5)); - case 65: return H::add( x.data() , H::slli(x.data(), 6)); - case 129: return H::add( x.data() , H::slli(x.data(), 7)); - case 257: return H::add( x.data() , H::slli(x.data(), 8)); - case 513: return H::add( x.data() , H::slli(x.data(), 9)); - case 1025: return H::add( x.data() , H::slli(x.data(), 10)); - case 2049: return H::add( x.data() , H::slli(x.data(), 11)); - case 6: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 2)); - case 10: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 3)); - case 18: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 4)); - case 34: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 5)); - case 66: return H::add(H::slli(x.data(), 1), 
H::slli(x.data(), 6)); - case 130: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 7)); - case 258: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 8)); - case 514: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 9)); - case 1026: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 10)); - case 2050: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 11)); - case 12: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 3)); - case 20: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 4)); - case 36: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 5)); - case 68: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 6)); - case 132: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 7)); - case 260: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 8)); - case 516: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 9)); - case 1028: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 10)); - case 2052: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 11)); - case 24: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 4)); - case 40: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 5)); - case 72: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 6)); - case 136: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 7)); - case 264: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 8)); - case 520: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 9)); - case 1032: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 10)); - case 2056: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 11)); - case 48: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 5)); - case 80: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 6)); - case 144: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 7)); - case 272: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 8)); - case 528: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 9)); - case 1040: return H::add(H::slli(x.data(), 4), 
H::slli(x.data(), 10)); - case 2064: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 11)); - case 96: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 6)); - case 160: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 7)); - case 288: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 8)); - case 544: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 9)); - case 1056: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 10)); - case 2080: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 11)); - case 192: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 7)); - case 320: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 8)); - case 576: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 9)); - case 1088: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 10)); - case 2112: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 11)); - case 384: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 8)); - case 640: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 9)); - case 1152: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 10)); - case 2176: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 11)); - case 768: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 9)); - case 1280: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 10)); - case 2304: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 11)); - case 1536: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 10)); - case 2560: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 11)); - case 3072: return H::add(H::slli(x.data(),10), H::slli(x.data(), 11)); - } - } -#endif - return H::mul(x.data(), H::set(constant)); - } -} // namespace SSE -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/include/Vc/support.h b/math/vc/include/Vc/support.h deleted file mode 100644 index 20e3d32eac6cd..0000000000000 --- a/math/vc/include/Vc/support.h +++ /dev/null @@ -1,150 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_COMMON_SUPPORT_H -#define VC_COMMON_SUPPORT_H - -#ifndef VC_GLOBAL_H -#error "Vc/global.h must be included first!" -#endif - -#include - -#if defined(VC_GCC) && VC_GCC >= 0x40400 -#define VC_TARGET_NO_SIMD __attribute__((target("no-sse2,no-avx"))) -#else -#define VC_TARGET_NO_SIMD -#endif - -namespace ROOT { -namespace Vc -{ - -/** - * \name Micro-Architecture Feature Tests - */ -//@{ -/** - * \ingroup Utilities - * \headerfile support.h - * Determines the extra instructions supported by the current CPU. - * - * \return A combination of flags from Vc::ExtraInstructions that the current CPU supports. - */ -VC_TARGET_NO_SIMD -unsigned int extraInstructionsSupported(); - -/** - * \ingroup Utilities - * \headerfile support.h - * - * Tests whether the given implementation is supported by the system the code is executing on. - * - * \return \c true if the OS and hardware support execution of instructions defined by \p impl. - * \return \c false otherwise - * - * \param impl The SIMD target to test for. - */ -VC_TARGET_NO_SIMD -bool isImplementationSupported(Vc::Implementation impl); - -/** - * \internal - * \ingroup Utilities - * \headerfile support.h - * - * Tests whether the given implementation is supported by the system the code is executing on. 
- * - * \code - * if (!isImplementationSupported()) { - * std::cerr << "This code was compiled with features that this system does not support.\n"; - * return EXIT_FAILURE; - * } - * \endcode - * - * \return \c true if the OS and hardware support execution of instructions defined by \p impl. - * \return \c false otherwise - * - * \tparam Impl The SIMD target to test for. - */ -template -VC_TARGET_NO_SIMD -static inline bool isImplementationSupported() -{ - return isImplementationSupported(static_cast(Impl::Implementation)) && - (extraInstructionsSupported() & Impl::ExtraInstructions) == Impl::ExtraInstructions; -} - -/** - * \ingroup Utilities - * \headerfile support.h - * - * Determines the best supported implementation for the current system. - * - * \return The enum value for the best implementation. - */ -VC_TARGET_NO_SIMD -Vc::Implementation bestImplementationSupported(); - -#ifndef VC_COMPILE_LIB -/** - * \ingroup Utilities - * \headerfile support.h - * - * Tests that the CPU and Operating System support the vector unit which was compiled for. This - * function should be called before any other Vc functionality is used. It checks whether the program - * will work. If this function returns \c false then the program should exit with a useful error - * message before the OS has to kill it because of an invalid instruction exception. - * - * If the program continues and makes use of any vector features not supported by - * hard- or software then the program will crash. - * - * Example: - * \code - * int main() - * { - * if (!Vc::currentImplementationSupported()) { - * std::cerr << "CPU or OS requirements not met for the compiled in vector unit!\n"; - * exit -1; - * } - * ... - * } - * \endcode - * - * \return \c true if the OS and hardware support execution of the currently selected SIMD - * instructions. 
- * \return \c false otherwise - */ -VC_TARGET_NO_SIMD -#ifndef DOXYGEN -static -#endif -inline bool currentImplementationSupported() -{ - return isImplementationSupported(); -} -#endif // VC_COMPILE_LIB -//@} - -} // namespace Vc -} // namespace ROOT - -#undef VC_TARGET_NO_SIMD - -#endif // VC_COMMON_SUPPORT_H diff --git a/math/vc/include/Vc/uint_v b/math/vc/include/Vc/uint_v deleted file mode 100644 index 0ff0c2722d4b5..0000000000000 --- a/math/vc/include/Vc/uint_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/uint_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/ushort_v b/math/vc/include/Vc/ushort_v deleted file mode 100644 index 2d10089f545eb..0000000000000 --- a/math/vc/include/Vc/ushort_v +++ /dev/null @@ -1,3 +0,0 @@ -#ifdef __GNUC__ -#warning "Use of the Vc/ushort_v header is deprecated. The header file will be removed in a future version of Vc." -#endif diff --git a/math/vc/include/Vc/vector.h b/math/vc/include/Vc/vector.h deleted file mode 100644 index 565626941228d..0000000000000 --- a/math/vc/include/Vc/vector.h +++ /dev/null @@ -1,151 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef VECTOR_H -#define VECTOR_H - -#include "global.h" -#include "internal/namespace.h" - -#ifdef VC_IMPL_Scalar -# include "scalar/vector.h" -# include "scalar/helperimpl.h" -#elif defined(VC_IMPL_AVX) -# include "avx/vector.h" -# include "avx/helperimpl.h" -#elif defined(VC_IMPL_SSE) -# include "sse/vector.h" -# include "sse/helperimpl.h" -#endif - -#ifdef isfinite -#undef isfinite -#endif -#ifdef isnan -#undef isnan -#endif - -namespace ROOT { -namespace Vc -{ - using VECTOR_NAMESPACE::VectorAlignment; - using VECTOR_NAMESPACE::VectorAlignedBaseT; - typedef VectorAlignedBaseT<> VectorAlignedBase; - using namespace VectorSpecialInitializerZero; - using namespace VectorSpecialInitializerOne; - using namespace VectorSpecialInitializerIndexesFromZero; - using VECTOR_NAMESPACE::min; - using VECTOR_NAMESPACE::max; - using VECTOR_NAMESPACE::sqrt; - using VECTOR_NAMESPACE::rsqrt; - using VECTOR_NAMESPACE::abs; - using VECTOR_NAMESPACE::sin; - using VECTOR_NAMESPACE::asin; - using VECTOR_NAMESPACE::cos; - using VECTOR_NAMESPACE::sincos; - using VECTOR_NAMESPACE::trunc; - using VECTOR_NAMESPACE::floor; - using VECTOR_NAMESPACE::ceil; - using VECTOR_NAMESPACE::exp; - using VECTOR_NAMESPACE::log; - using VECTOR_NAMESPACE::log2; - using VECTOR_NAMESPACE::log10; - using VECTOR_NAMESPACE::reciprocal; - using VECTOR_NAMESPACE::atan; - using VECTOR_NAMESPACE::atan2; - using VECTOR_NAMESPACE::frexp; - using VECTOR_NAMESPACE::ldexp; - using VECTOR_NAMESPACE::round; - using VECTOR_NAMESPACE::isfinite; - using VECTOR_NAMESPACE::isnan; - using VECTOR_NAMESPACE::forceToRegisters; - using VECTOR_NAMESPACE::Vector; - - typedef VECTOR_NAMESPACE::double_v double_v; - typedef double_v::Mask double_m; - typedef VECTOR_NAMESPACE::sfloat_v sfloat_v; - typedef sfloat_v::Mask sfloat_m; - typedef VECTOR_NAMESPACE::float_v float_v; - typedef float_v::Mask float_m; - typedef VECTOR_NAMESPACE::int_v int_v; - typedef int_v::Mask int_m; - typedef VECTOR_NAMESPACE::uint_v uint_v; - typedef 
uint_v::Mask uint_m; - typedef VECTOR_NAMESPACE::short_v short_v; - typedef short_v::Mask short_m; - typedef VECTOR_NAMESPACE::ushort_v ushort_v; - typedef ushort_v::Mask ushort_m; - - namespace { -#if defined(VC_IMPL_SSE) || defined(VC_IMPL_AVX) - using VECTOR_NAMESPACE::Const; -#endif - VC_STATIC_ASSERT_NC(double_v::Size == VC_DOUBLE_V_SIZE, VC_DOUBLE_V_SIZE_MACRO_WRONG); - VC_STATIC_ASSERT_NC(float_v::Size == VC_FLOAT_V_SIZE , VC_FLOAT_V_SIZE_MACRO_WRONG ); - VC_STATIC_ASSERT_NC(sfloat_v::Size == VC_SFLOAT_V_SIZE, VC_SFLOAT_V_SIZE_MACRO_WRONG); - VC_STATIC_ASSERT_NC(int_v::Size == VC_INT_V_SIZE , VC_INT_V_SIZE_MACRO_WRONG ); - VC_STATIC_ASSERT_NC(uint_v::Size == VC_UINT_V_SIZE , VC_UINT_V_SIZE_MACRO_WRONG ); - VC_STATIC_ASSERT_NC(short_v::Size == VC_SHORT_V_SIZE , VC_SHORT_V_SIZE_MACRO_WRONG ); - VC_STATIC_ASSERT_NC(ushort_v::Size == VC_USHORT_V_SIZE, VC_USHORT_V_SIZE_MACRO_WRONG); - } -} // namespace Vc -} // namespace ROOT - -#include "common/vectortuple.h" -#include "common/iif.h" - -#ifndef VC_NO_NAMESPACE_ALIAS -namespace Vc = ROOT::Vc; -#endif - -#ifndef VC_NO_STD_FUNCTIONS -namespace std -{ - using Vc::min; - using Vc::max; - - using Vc::abs; - using Vc::asin; - using Vc::atan; - using Vc::atan2; - using Vc::ceil; - using Vc::cos; - using Vc::exp; - using Vc::floor; - using Vc::frexp; - using Vc::ldexp; - using Vc::log; - using Vc::log10; - using Vc::log2; - using Vc::round; - using Vc::sin; - using Vc::sqrt; - - using Vc::isfinite; - using Vc::isnan; -} // namespace std -#endif - -#ifndef VC_CLEAN_NAMESPACE -#define foreach_bit(_it_, _mask_) Vc_foreach_bit(_it_, _mask_) -#endif - -#undef VECTOR_NAMESPACE - -#endif // VECTOR_H diff --git a/math/vc/include/Vc/version.h b/math/vc/include/Vc/version.h deleted file mode 100644 index 4ebcf7cc19659..0000000000000 --- a/math/vc/include/Vc/version.h +++ /dev/null @@ -1,53 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VC_VERSION_H -#define VC_VERSION_H - -#define VC_VERSION_STRING "0.7.4" -#define VC_VERSION_NUMBER 0x000708 -#define VC_VERSION_CHECK(major, minor, patch) ((major << 16) | (minor << 8) | (patch << 1)) -#define VC_LIBRARY_ABI_VERSION 3 - -namespace ROOT { -namespace Vc -{ - static inline const char *versionString() { - return VC_VERSION_STRING; - } - - static inline unsigned int versionNumber() { - return VC_VERSION_NUMBER; - } - -#if !defined(VC_NO_VERSION_CHECK) && !defined(VC_COMPILE_LIB) - void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *versionString); - namespace { - static struct runLibraryAbiCheck - { - runLibraryAbiCheck() { - checkLibraryAbi(VC_LIBRARY_ABI_VERSION, VC_VERSION_NUMBER, VC_VERSION_STRING); - } - } _runLibraryAbiCheck; - } -#endif -} // namespace Vc -} // namespace ROOT - -#endif // VC_VERSION_H diff --git a/math/vc/makeTest.py b/math/vc/makeTest.py deleted file mode 100644 index f92252946031f..0000000000000 --- a/math/vc/makeTest.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python -# -# scripts to replace a string in a set of a files -# - -import sys, re, os - - - - -with open("out.txt", "wt") as out: - for line in open("arithmetics.cpp"): - out.write(line.replace('main', 'arithmetics')) diff --git a/math/vc/src/avx_sorthelper.cpp 
b/math/vc/src/avx_sorthelper.cpp deleted file mode 100644 index 9b04df75dcca9..0000000000000 --- a/math/vc/src/avx_sorthelper.cpp +++ /dev/null @@ -1,427 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include -#include -#include -#include - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - -template<> m128i SortHelper::sort(VTArg _x) -{ - m128i lo, hi, y, x = _x; - // sort pairs - y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = _mm_blend_epi16(lo, hi, 0xaa); - - // merge left and right quads - y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = _mm_blend_epi16(lo, hi, 0xcc); - y = _mm_srli_si128(x, 2); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); - - // merge quads into octs - y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); - y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epi16(x, y); - hi = _mm_max_epi16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epi16(x, y); 
- hi = _mm_max_epi16(x, y); - - return _mm_unpacklo_epi16(lo, hi); -} -template<> m128i SortHelper::sort(VTArg _x) -{ - m128i lo, hi, y, x = _x; - // sort pairs - y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - x = _mm_blend_epi16(lo, hi, 0xaa); - - // merge left and right quads - y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - x = _mm_blend_epi16(lo, hi, 0xcc); - y = _mm_srli_si128(x, 2); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); - - // merge quads into octs - y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); - y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - - x = _mm_unpacklo_epi16(lo, hi); - y = _mm_srli_si128(x, 8); - lo = _mm_min_epu16(x, y); - hi = _mm_max_epu16(x, y); - - return _mm_unpacklo_epi16(lo, hi); -} - -template<> m256i SortHelper::sort(VTArg _hgfedcba) -{ - VectorType hgfedcba = _hgfedcba; - const m128i hgfe = hi128(hgfedcba); - const m128i dcba = lo128(hgfedcba); - m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea - m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea - - m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea - m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc - - l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca - h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) - - x = _mm_min_epi32(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) - y = _mm_max_epi32(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) - - m128i b = Reg::shuffle(y, x); // b3 <= b2 <= b1 <= b0 - m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= 
a0 - - // _mm_extract_epi32 from clang < 3.4 returns an unsigned int - the static_cast is free for - // conforming compilers, but fixes broken ones - if (VC_IS_UNLIKELY(static_cast(_mm_extract_epi32(x, 2)) >= static_cast(_mm_extract_epi32(y, 1)))) { - return concat(Reg::permute(b), a); - } else if (VC_IS_UNLIKELY(static_cast(_mm_extract_epi32(x, 0)) >= static_cast(_mm_extract_epi32(y, 3)))) { - return concat(a, Reg::permute(b)); - } - - // merge - l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 - h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 - - a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 - b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 - l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 - h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) - - a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 - b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 - l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 - h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) - - return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); -} - -template<> m256i SortHelper::sort(VTArg _hgfedcba) -{ - VectorType hgfedcba = _hgfedcba; - const m128i hgfe = hi128(hgfedcba); - const m128i dcba = lo128(hgfedcba); - m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea - m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea - - m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea - m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc - - l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca - h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) - - x = _mm_min_epu32(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) - y = _mm_max_epu32(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) - - m128i b = Reg::shuffle(y, x); // b3 
<= b2 <= b1 <= b0 - m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0 - - if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) { - return concat(Reg::permute(b), a); - } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) { - return concat(a, Reg::permute(b)); - } - - // merge - l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 - h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 - - a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 - b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 - l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 - h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) - - a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 - b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 - l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 - h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) - - return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); -} - -template<> m256 SortHelper::sort(VTArg _hgfedcba) -{ - VectorType hgfedcba = _hgfedcba; - const m128 hgfe = hi128(hgfedcba); - const m128 dcba = lo128(hgfedcba); - m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea - m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea - - m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea - m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc - - l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca - h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) - - x = _mm_min_ps(l, Reg::permute(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) - y = _mm_max_ps(h, Reg::permute(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) - - m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0 - m128 b = Reg::shuffle(y, x); // b3 <= b2 <= b1 <= b0 - - // merge - l = 
_mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 - h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 - - a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 - b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 - l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 - h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) - - a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 - b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 - l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 - h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) - - return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h)); -} - -template<> m256 SortHelper::sort(VTArg hgfedcba) -{ - return SortHelper::sort(hgfedcba); -} - -template<> void SortHelper::sort(m256d &VC_RESTRICT x, m256d &VC_RESTRICT y) -{ - m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0 - m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0 - x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0 - y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1 - l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0 - h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1) - x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0 - y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0 - l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0 - h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0 - m256d a = Reg::permute(Reg::permute128(h, h)); // h0 h1 h3 h2 - m256d b = Reg::permute(l); // l2 l3 l1 l0 - - // a3 >= a2 >= b1 >= b0 - // b3 <= b2 <= a1 <= a0 - - // merge - l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 - h = 
_mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 - - x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0 - y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1 - l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0 - h = _mm256_min_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1) - - x = Reg::permute128(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0 - y = Reg::permute128(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3 - l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3 - h = _mm256_min_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3) - - x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0 - y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1 -} -template<> m256d SortHelper::sort(VTArg _dcba) -{ - VectorType dcba = _dcba; - /* - * to find the second largest number find - * max(min(max(ab),max(cd)), min(max(ad),max(bc))) - * or - * max(max(min(ab),min(cd)), min(max(ab),max(cd))) - * - const m256d adcb = avx_cast(concat(_mm_alignr_epi8(avx_cast(dc), avx_cast(ba), 8), _mm_alignr_epi8(avx_cast(ba), avx_cast(dc), 8))); - const m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab) - const m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab) - // max(h3, h1) - // max(min(h0,h2), min(h3,h1)) - // min(max(l0,l2), max(l3,l1)) - // min(l3, l1) - - const m256d ll = _mm256_min_pd(h, Reg::permute128(h, h)); // min(h3h1 h2h0 h1h3 h0h2) - //const m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2); - const m256d hh = _mm256_max_pd( - Reg::permute128(_mm256_unpackhi_pd(ll, h), l), - Reg::permute128(_mm256_blend_pd(h ll, 0x1), l)); - _mm256_min_pd(hh0, hh1 - */ - - ////////////////////////////////////////////////////////////////////////////////// - // max(max(ac), max(bd)) - // max(max(min(ac),min(bd)), min(max(ac),max(bd))) - // min(max(min(ac),min(bd)), min(max(ac),max(bd))) - // min(min(ac), min(bd)) - m128d l = _mm_min_pd(lo128(dcba), 
hi128(dcba)); // min(bd) min(ac) - m128d h = _mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac) - m128d h0_l0 = _mm_unpacklo_pd(l, h); - m128d h1_l1 = _mm_unpackhi_pd(l, h); - l = _mm_min_pd(h0_l0, h1_l1); - h = _mm_max_pd(h0_l0, h1_l1); - return concat( - _mm_min_pd(l, Reg::permute(h)), - _mm_max_pd(h, Reg::permute(l)) - ); - // extract: 1 cycle - // min/max: 4 cycles - // unpacklo/hi: 2 cycles - // min/max: 4 cycles - // permute: 1 cycle - // min/max: 4 cycles - // insert: 1 cycle - // ---------------------- - // total: 17 cycles - - /* - m256d cdab = Reg::permute(dcba); - m256d l = _mm256_min_pd(dcba, cdab); - m256d h = _mm256_max_pd(dcba, cdab); - m256d maxmin_ba = Reg::permute128(l, h); - m256d maxmin_dc = Reg::permute128(l, h); - - l = _mm256_min_pd(maxmin_ba, maxmin_dc); - h = _mm256_max_pd(maxmin_ba, maxmin_dc); - - return _mm256_blend_pd(h, l, 0x55); - */ - - /* - // a b c d - // b a d c - // sort pairs - m256d y, l, h; - m128d l2, h2; - y = shuffle(x, x); - l = _mm256_min_pd(x, y); // min[ab ab cd cd] - h = _mm256_max_pd(x, y); // max[ab ab cd cd] - - // 1 of 2 is at [0] - // 1 of 4 is at [1] - // 1 of 4 is at [2] - // 1 of 2 is at [3] - - // don't be fooled by unpack here. 
It works differently for AVX pd than for SSE ps - x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd - l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd) - h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd - - // either it is: - return concat(l2, h2); - // or: - // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2)); - - // I'd like to have four useful compares - const m128d dc = hi128(dcba); - const m128d ba = lo128(dcba); - const m256d adcb = avx_cast(concat(_mm_alignr_epi8(avx_cast(dc), avx_cast(ba), 8), _mm_alignr_epi8(avx_cast(ba), avx_cast(dc), 8))); - - const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba)); - // 0x0: d <= b && c <= a - // 0x1: d <= b && c > a - // 0x2: d > b && c <= a - // 0x3: d > b && c > a - - switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) { - // impossible: 0x0, 0xf - case 0x1: // a <= b && b <= c && c <= d && d > a - // abcd - return Reg::permute(Reg::permute(dcba, dcba)); - case 0x2: // a <= b && b <= c && c > d && d <= a - // dabc - return Reg::permute(adcb); - case 0x3: // a <= b && b <= c && c > d && d > a - // a[bd]c - if (extraCmp & 2) { - // abdc - return Reg::permute(Reg::permute(dcba, dcba)); - } else { - // adbc - return Reg::permute(adcb); - } - case 0x4: // a <= b && b > c && c <= d && d <= a - // cdab; - return Reg::permute(dcba); - case 0x5: // a <= b && b > c && c <= d && d > a - // [ac] < [bd] - switch (extraCmp) { - case 0x0: // d <= b && c <= a - // cadb - return shuffle<>(dcba, bcda); - case 0x1: // d <= b && c > a - case 0x2: // d > b && c <= a - case 0x3: // d > b && c > a - } - case 0x6: // a <= b && b > c && c > d && d <= a - // d[ac]b - case 0x7: // a <= b && b > c && c > d && d > a - // adcb; - return permute(permute128(bcda, bcda)); - case 0x8: // a > b && b <= c && c <= d && d <= a - return bcda; - case 0x9: // a > b && b <= c && c <= d && d > a - // b[ac]d; - case 0xa: // a > b && b <= c && c > d && d <= a - // [ac] > [bd] - case 0xb: // a > b && b <= c && c > d && d > 
a - // badc; - return permute128(dcba); - case 0xc: // a > b && b > c && c <= d && d <= a - // c[bd]a; - case 0xd: // a > b && b > c && c <= d && d > a - // cbad; - return permute(bcda); - case 0xe: // a > b && b > c && c > d && d <= a - return dcba; - } - */ -} - -} // namespace AVX -} // namespace Vc -} // namespace ROOT diff --git a/math/vc/src/const.cpp b/math/vc/src/const.cpp deleted file mode 100644 index d6581393a5dd3..0000000000000 --- a/math/vc/src/const.cpp +++ /dev/null @@ -1,529 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#ifndef V_ALIGN -# ifdef __GNUC__ -# define V_ALIGN(n) __attribute__((aligned(n))) -# else -# define V_ALIGN(n) __declspec(align(n)) -# endif -#endif - -#include "Vc/avx/const_data.h" -#include "Vc/sse/const_data.h" -#include - -#include -#include -#include - -#include "Vc/common/macros.h" - -namespace ROOT { -namespace Vc -{ -namespace AVX -{ - // cacheline 1 - V_ALIGN(64) extern const unsigned int _IndexesFromZero32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - V_ALIGN(16) extern const unsigned short _IndexesFromZero16[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - V_ALIGN(16) extern const unsigned char _IndexesFromZero8 [16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - - template<> const double c_trig::data[] = { - // cacheline 4 - Vc_buildDouble(1, 0x921fb54442d18ull, -1), // π/4 - Vc_buildDouble(1, 0x921fb40000000ull, -1), // π/4 - 30bits precision - Vc_buildDouble(1, 0x4442d00000000ull, -25), // π/4 remainder1 - 32bits precision - Vc_buildDouble(1, 0x8469898cc5170ull, -49), // π/4 remainder2 - 0.0625, - 16., - 0., // padding - 0., // padding - // cacheline 5 - Vc_buildDouble( 1, 0x555555555554bull, -5), // ~ 1/4! - Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10), // ~-1/6! - Vc_buildDouble( 1, 0xa01a019c844f5ull, -16), // ~ 1/8! - Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22), // ~-1/10! - Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29), // ~ 1/12! - Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37), // ~-1/14! - Vc_buildDouble(-1, 0x5555555555548ull, -3), // ~-1/3! - Vc_buildDouble( 1, 0x111111110f7d0ull, -7), // ~ 1/5! - // cacheline 8 - Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13), // ~-1/7! - Vc_buildDouble( 1, 0x71de3567d48a1ull, -19), // ~ 1/9! - Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26), // ~-1/11! - Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33), // ~ 1/13! 
- 0., // padding (for alignment with float) - Vc_buildDouble(1, 0x8BE60DB939105ull, 0), // 4/π - Vc_buildDouble(1, 0x921fb54442d18ull, 0), // π/2 - Vc_buildDouble(1, 0x921fb54442d18ull, 1), // π - // cacheline 10 - Vc_buildDouble(-1, 0xc007fa1f72594ull, -1), // atan P coefficients - Vc_buildDouble(-1, 0x028545b6b807aull, 4), // atan P coefficients - Vc_buildDouble(-1, 0x2c08c36880273ull, 6), // atan P coefficients - Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6), // atan P coefficients - Vc_buildDouble(-1, 0x03669fd28ec8eull, 6), // atan P coefficients - Vc_buildDouble( 1, 0x8dbc45b14603cull, 4), // atan Q coefficients - Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7), // atan Q coefficients - Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8), // atan Q coefficients - // cacheline 12 - Vc_buildDouble( 1, 0xe563f13b049eaull, 8), // atan Q coefficients - Vc_buildDouble( 1, 0x8519efbbd62ecull, 7), // atan Q coefficients - Vc_buildDouble( 1, 0x3504f333f9de6ull, 1), // tan( 3/8 π ) - 0.66, // lower threshold for special casing in atan - Vc_buildDouble(1, 0x1A62633145C07ull, -54), // remainder of pi/2 - 1.e-8, // small asin input threshold - 0.625, // large asin input threshold - 0., // padding - // cacheline 14 - Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9), // asinCoeff0 - Vc_buildDouble(-1, 0x2079259f9290full, -1), // asinCoeff0 - Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2), // asinCoeff0 - Vc_buildDouble(-1, 0x991aaac01ab68ull, 4), // asinCoeff0 - Vc_buildDouble( 1, 0xc896240f3081dull, 4), // asinCoeff0 - Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4), // asinCoeff1 - Vc_buildDouble( 1, 0x26219af6a7f42ull, 7), // asinCoeff1 - Vc_buildDouble(-1, 0x7fe08959063eeull, 8), // asinCoeff1 - // cacheline 16 - Vc_buildDouble( 1, 0x56709b0b644beull, 8), // asinCoeff1 - Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8), // asinCoeff2 - Vc_buildDouble(-1, 0x34341333e5c16ull, -1), // asinCoeff2 - Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2), // asinCoeff2 - Vc_buildDouble(-1, 0x04331de27907bull, 4), // asinCoeff2 - 
Vc_buildDouble( 1, 0x39007da779259ull, 4), // asinCoeff2 - Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3), // asinCoeff2 - Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3), // asinCoeff3 - // cacheline 18 - Vc_buildDouble( 1, 0x19fc025fe9054ull, 6), // asinCoeff3 - Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7), // asinCoeff3 - Vc_buildDouble( 1, 0x1705684ffbf9dull, 7), // asinCoeff3 - Vc_buildDouble(-1, 0x898220a3607acull, 5), // asinCoeff3 - }; -#define _4(x) x - template<> const float c_trig::data[] = { - // cacheline - _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 - _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision - _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision - _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 - _4(0.0625f), - _4(16.f), - _4(0.f), // padding - _4(0.f), // padding - _4(4.166664568298827e-2f), // ~ 1/4! - _4(-1.388731625493765e-3f), // ~-1/6! - _4(2.443315711809948e-5f), // ~ 1/8! - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(-1.6666654611e-1f), // ~-1/3! - _4(8.3321608736e-3f), // ~ 1/5! - // cacheline - _4(-1.9515295891e-4f), // ~-1/7! 
- _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(8192.f), // loss threshold - _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π - _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 - _4(Vc_buildFloat(1, 0x490FDB, 1)), // π - _4(8.05374449538e-2f), // atan P coefficients - _4(1.38776856032e-1f), // atan P coefficients - _4(1.99777106478e-1f), // atan P coefficients - _4(3.33329491539e-1f), // atan P coefficients - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(2.414213562373095f), // tan( 3/8 π ) - _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan - _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 - _4(1.e-4f), // small asin input threshold - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(4.2163199048e-2f), // asinCoeff0 - _4(2.4181311049e-2f), // asinCoeff0 - _4(4.5470025998e-2f), // asinCoeff0 - _4(7.4953002686e-2f), // asinCoeff0 - _4(1.6666752422e-1f), // asinCoeff0 - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // 
padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - }; -#undef _4 - - const unsigned int c_general::absMaskFloat[2] = { 0xffffffffu, 0x7fffffffu }; - const unsigned int c_general::signMaskFloat[2] = { 0x0u, 0x80000000u }; - const unsigned int c_general::highMaskFloat = 0xfffff000u; - const float c_general::oneFloat = 1.f; - const unsigned short c_general::minShort[2] = { 0x8000u, 0x8000u }; - const unsigned short c_general::one16[2] = { 1, 1 }; - const float c_general::_2power31 = 1u << 31; - - // cacheline 4 - const unsigned long long c_general::highMaskDouble = 0xfffffffff8000000ull; - const double c_general::oneDouble = 1.; - const unsigned long long c_general::frexpMask = 0xbfefffffffffffffull; - - const unsigned long long c_log::data[21] = { - 0x000003ff000003ffull // bias TODO: remove - , 0x7ff0000000000000ull // exponentMask (+inf) - - , 0x3f1ab4c293c31bb0ull // P[0] - , 0x3fdfd6f53f5652f2ull // P[1] - , 0x4012d2baed926911ull // P[2] - , 0x402cff72c63eeb2eull // P[3] - , 0x4031efd6924bc84dull // P[4] - , 0x401ed5637d7edcf8ull // P[5] - - , 0x40269320ae97ef8eull // Q[0] - , 0x40469d2c4e19c033ull // Q[1] - , 0x4054bf33a326bdbdull // Q[2] - , 0x4051c9e2eb5eae21ull // Q[3] - , 0x4037200a9e1f25b2ull // Q[4] - - , 0xfff0000000000000ull // -inf - , 0x0010000000000000ull // min() - , 0x3fe6a09e667f3bcdull // 1/sqrt(2) - , 0x3fe6300000000000ull // round(ln(2) * 512) / 512 - , 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 - , 0x3fe0000000000000ull // 0.5 - , 0x3fdbcb7b1526e50eull // log10(e) - , 0x3ff71547652b82feull // log2(e) - }; - - template<> const unsigned int c_log::data[21] = { - 0x0000007fu // bias TODO: remove - , 0x7f800000u // exponentMask (+inf) - - , 0x3d9021bbu // 7.0376836292e-2f // P[0] - , 0xbdebd1b8u // -1.1514610310e-1f // P[1] - , 0x3def251au // 1.1676998740e-1f // P[2] - , 0xbdfe5d4fu // -1.2420140846e-1f // P[3] - , 0x3e11e9bfu // 1.4249322787e-1f 
// P[4] - , 0xbe2aae50u // -1.6668057665e-1f // P[5] - , 0x3e4cceacu // 2.0000714765e-1f // P[6] - , 0xbe7ffffcu // -2.4999993993e-1f // P[7] - , 0x3eaaaaaau // 3.3333331174e-1f // P[8] - , 0 // padding because of c_log - , 0 // padding because of c_log - - , 0xff800000u // -inf - , 0x00800000u // min() - , 0x3f3504f3u // 1/sqrt(2) - , 0x3f318000u // round(ln(2) * 512) / 512 - , 0xb95e8083u // ln(2) - round(ln(2) * 512) / 512 - , 0x3f000000u // 0.5 - , 0x3ede5bd9u // log10(e) - , 0x3fb8aa3bu // log2(e) - }; -} // namespace AVX - -namespace SSE -{ - // cacheline 1 - V_ALIGN(64) const int c_general::absMaskFloat[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; - V_ALIGN(16) const unsigned int c_general::signMaskFloat[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; - V_ALIGN(16) const unsigned int c_general::highMaskFloat[4] = { 0xfffff000u, 0xfffff000u, 0xfffff000u, 0xfffff000u }; - V_ALIGN(16) const short c_general::minShort[8] = { -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000 }; - V_ALIGN(16) extern const unsigned short _IndexesFromZero8[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - - // cacheline 2 - V_ALIGN(16) extern const unsigned int _IndexesFromZero4[4] = { 0, 1, 2, 3 }; - V_ALIGN(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; - V_ALIGN(16) const unsigned int c_general::one32[4] = { 1, 1, 1, 1 }; - V_ALIGN(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f }; - - // cacheline 3 - V_ALIGN(16) const unsigned long long c_general::highMaskDouble[2] = { 0xfffffffff8000000ull, 0xfffffffff8000000ull }; - V_ALIGN(16) const double c_general::oneDouble[2] = { 1., 1. 
}; - V_ALIGN(16) const long long c_general::absMaskDouble[2] = { 0x7fffffffffffffffll, 0x7fffffffffffffffll }; - V_ALIGN(16) const unsigned long long c_general::signMaskDouble[2] = { 0x8000000000000000ull, 0x8000000000000000ull }; - V_ALIGN(16) const unsigned long long c_general::frexpMask[2] = { 0xbfefffffffffffffull, 0xbfefffffffffffffull }; - -#define _2(x) x, x - template<> const double c_trig::data[] = { - // cacheline 4 - _2(Vc_buildDouble(1, 0x921fb54442d18ull, -1)), // π/4 - _2(Vc_buildDouble(1, 0x921fb40000000ull, -1)), // π/4 - 30bits precision - _2(Vc_buildDouble(1, 0x4442d00000000ull, -25)), // π/4 remainder1 - 32bits precision - _2(Vc_buildDouble(1, 0x8469898cc5170ull, -49)), // π/4 remainder2 - // cacheline 5 - _2(0.0625), - _2(16.), - _2(0.), // padding - _2(0.), // padding - // cacheline 6 - _2(Vc_buildDouble( 1, 0x555555555554bull, -5)), // ~ 1/4! - _2(Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10)), // ~-1/6! - _2(Vc_buildDouble( 1, 0xa01a019c844f5ull, -16)), // ~ 1/8! - _2(Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22)), // ~-1/10! - // cacheline 7 - _2(Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29)), // ~ 1/12! - _2(Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37)), // ~-1/14! - _2(Vc_buildDouble(-1, 0x5555555555548ull, -3)), // ~-1/3! - _2(Vc_buildDouble( 1, 0x111111110f7d0ull, -7)), // ~ 1/5! - // cacheline 8 - _2(Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13)), // ~-1/7! - _2(Vc_buildDouble( 1, 0x71de3567d48a1ull, -19)), // ~ 1/9! - _2(Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26)), // ~-1/11! - _2(Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33)), // ~ 1/13! 
- // cacheline 9 - _2(0.), // padding (for alignment with float) - _2(Vc_buildDouble(1, 0x8BE60DB939105ull, 0)), // 4/π - _2(Vc_buildDouble(1, 0x921fb54442d18ull, 0)), // π/2 - _2(Vc_buildDouble(1, 0x921fb54442d18ull, 1)), // π - // cacheline 10 - _2(Vc_buildDouble(-1, 0xc007fa1f72594ull, -1)), // atan P coefficients - _2(Vc_buildDouble(-1, 0x028545b6b807aull, 4)), // atan P coefficients - _2(Vc_buildDouble(-1, 0x2c08c36880273ull, 6)), // atan P coefficients - _2(Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6)), // atan P coefficients - // cacheline 11 - _2(Vc_buildDouble(-1, 0x03669fd28ec8eull, 6)), // atan P coefficients - _2(Vc_buildDouble( 1, 0x8dbc45b14603cull, 4)), // atan Q coefficients - _2(Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7)), // atan Q coefficients - _2(Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8)), // atan Q coefficients - // cacheline 12 - _2(Vc_buildDouble( 1, 0xe563f13b049eaull, 8)), // atan Q coefficients - _2(Vc_buildDouble( 1, 0x8519efbbd62ecull, 7)), // atan Q coefficients - _2(Vc_buildDouble( 1, 0x3504f333f9de6ull, 1)), // tan( 3/8 π ) - _2(0.66), // lower threshold for special casing in atan - // cacheline 13 - _2(Vc_buildDouble(1, 0x1A62633145C07ull, -54)), // remainder of pi/2 - _2(1.e-8), // small asin input threshold - _2(0.625), // large asin input threshold - _2(0.), // padding - // cacheline 14 - _2(Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9)), // asinCoeff0 - _2(Vc_buildDouble(-1, 0x2079259f9290full, -1)), // asinCoeff0 - _2(Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2)), // asinCoeff0 - _2(Vc_buildDouble(-1, 0x991aaac01ab68ull, 4)), // asinCoeff0 - // cacheline 15 - _2(Vc_buildDouble( 1, 0xc896240f3081dull, 4)), // asinCoeff0 - _2(Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4)), // asinCoeff1 - _2(Vc_buildDouble( 1, 0x26219af6a7f42ull, 7)), // asinCoeff1 - _2(Vc_buildDouble(-1, 0x7fe08959063eeull, 8)), // asinCoeff1 - // cacheline 16 - _2(Vc_buildDouble( 1, 0x56709b0b644beull, 8)), // asinCoeff1 - _2(Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8)), // 
asinCoeff2 - _2(Vc_buildDouble(-1, 0x34341333e5c16ull, -1)), // asinCoeff2 - _2(Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2)), // asinCoeff2 - // cacheline 17 - _2(Vc_buildDouble(-1, 0x04331de27907bull, 4)), // asinCoeff2 - _2(Vc_buildDouble( 1, 0x39007da779259ull, 4)), // asinCoeff2 - _2(Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3)), // asinCoeff2 - _2(Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3)), // asinCoeff3 - // cacheline 18 - _2(Vc_buildDouble( 1, 0x19fc025fe9054ull, 6)), // asinCoeff3 - _2(Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7)), // asinCoeff3 - _2(Vc_buildDouble( 1, 0x1705684ffbf9dull, 7)), // asinCoeff3 - _2(Vc_buildDouble(-1, 0x898220a3607acull, 5)), // asinCoeff3 - }; -#undef _2 -#define _4(x) x, x, x, x - template<> const float c_trig::data[] = { - // cacheline - _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 - _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision - _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision - _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 - // cacheline - _4(0.0625f), - _4(16.f), - _4(0.f), // padding - _4(0.f), // padding - // cacheline - _4(4.166664568298827e-2f), // ~ 1/4! - _4(-1.388731625493765e-3f), // ~-1/6! - _4(2.443315711809948e-5f), // ~ 1/8! - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(-1.6666654611e-1f), // ~-1/3! - _4(8.3321608736e-3f), // ~ 1/5! - // cacheline - _4(-1.9515295891e-4f), // ~-1/7! 
- _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(8192.f), // loss threshold - _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π - _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 - _4(Vc_buildFloat(1, 0x490FDB, 1)), // π - // cacheline - _4(8.05374449538e-2f), // atan P coefficients - _4(1.38776856032e-1f), // atan P coefficients - _4(1.99777106478e-1f), // atan P coefficients - _4(3.33329491539e-1f), // atan P coefficients - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(2.414213562373095f), // tan( 3/8 π ) - _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan - // cacheline - _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 - _4(1.e-4f), // small asin input threshold - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(4.2163199048e-2f), // asinCoeff0 - _4(2.4181311049e-2f), // asinCoeff0 - _4(4.5470025998e-2f), // asinCoeff0 - _4(7.4953002686e-2f), // asinCoeff0 - // cacheline - _4(1.6666752422e-1f), // asinCoeff0 - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), 
// padding (for alignment with double) - // cacheline - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - _4(0.f), // padding (for alignment with double) - }; -#undef _4 - - // cacheline 8 - V_ALIGN(16) extern const unsigned char _IndexesFromZero16[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - - V_ALIGN(64) const unsigned long long c_log::data[21 * 2] = { - /* 0*/ 0x000003ff000003ffull, 0x000003ff000003ffull // bias TODO: remove - /* 1*/ , 0x7ff0000000000000ull, 0x7ff0000000000000ull // exponentMask (+inf) - - /* 2*/ , 0x3f1ab4c293c31bb0ull, 0x3f1ab4c293c31bb0ull // P[0] - /* 3*/ , 0x3fdfd6f53f5652f2ull, 0x3fdfd6f53f5652f2ull // P[1] - /* 4*/ , 0x4012d2baed926911ull, 0x4012d2baed926911ull // P[2] - /* 5*/ , 0x402cff72c63eeb2eull, 0x402cff72c63eeb2eull // P[3] - /* 6*/ , 0x4031efd6924bc84dull, 0x4031efd6924bc84dull // P[4] - /* 7*/ , 0x401ed5637d7edcf8ull, 0x401ed5637d7edcf8ull // P[5] - - /* 8*/ , 0x40269320ae97ef8eull, 0x40269320ae97ef8eull // Q[0] - /* 9*/ , 0x40469d2c4e19c033ull, 0x40469d2c4e19c033ull // Q[1] - /*10*/ , 0x4054bf33a326bdbdull, 0x4054bf33a326bdbdull // Q[2] - /*11*/ , 0x4051c9e2eb5eae21ull, 0x4051c9e2eb5eae21ull // Q[3] - /*12*/ , 0x4037200a9e1f25b2ull, 0x4037200a9e1f25b2ull // Q[4] - - /*13*/ , 0xfff0000000000000ull, 0xfff0000000000000ull // -inf - /*14*/ , 0x0010000000000000ull, 0x0010000000000000ull // min() - /*15*/ , 0x3fe6a09e667f3bcdull, 0x3fe6a09e667f3bcdull // 1/sqrt(2) - /*16*/ , 0x3fe6300000000000ull, 0x3fe6300000000000ull // round(ln(2) * 512) / 512 - /*17*/ , 0xbf2bd0105c610ca8ull, 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 - /*18*/ , 0x3fe0000000000000ull, 0x3fe0000000000000ull // 0.5 - /*19*/ , 0x3fdbcb7b1526e50eull, 0x3fdbcb7b1526e50eull // log10(e) - /*20*/ , 0x3ff71547652b82feull, 0x3ff71547652b82feull // log2(e) - }; - - template<> V_ALIGN(64) const unsigned int c_log::data[21 * 4] = { - 
0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu, // bias TODO: remove - 0x7f800000u, 0x7f800000u, 0x7f800000u, 0x7f800000u, // exponentMask (+inf) - - 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, // 7.0376836292e-2f // P[0] - 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, // -1.1514610310e-1f // P[1] - 0x3def251au, 0x3def251au, 0x3def251au, 0x3def251au, // 1.1676998740e-1f // P[2] - 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, // -1.2420140846e-1f // P[3] - 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, // 1.4249322787e-1f // P[4] - 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, // -1.6668057665e-1f // P[5] - 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, // 2.0000714765e-1f // P[6] - 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, // -2.4999993993e-1f // P[7] - 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, // 3.3333331174e-1f // P[8] - 0, 0, 0, 0, // padding because of c_log - 0, 0, 0, 0, // padding because of c_log - - 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u, // -inf - 0x00800000u, 0x00800000u, 0x00800000u, 0x00800000u, // min() - 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, // 1/sqrt(2) - // ln(2) = 0x3fe62e42fefa39ef - // ln(2) = Vc_buildDouble( 1, 0x00062e42fefa39ef, -1) - // = Vc_buildFloat( 1, 0x00317217(f7d), -1) + Vc_buildFloat( 1, 0x0077d1cd, -25) - // = Vc_buildFloat( 1, 0x00318000(000), -1) + Vc_buildFloat(-1, 0x005e8083, -13) - 0x3f318000u, 0x3f318000u, 0x3f318000u, 0x3f318000u, // round(ln(2) * 512) / 512 - 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, // ln(2) - round(ln(2) * 512) / 512 - 0x3f000000u, 0x3f000000u, 0x3f000000u, 0x3f000000u, // 0.5 - 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, // log10(e) - 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, // log2(e) - // log10(2) = 0x3fd34413509f79ff - // = Vc_buildDouble( 1, 0x00034413509f79ff, -2) - // = Vc_buildFloat( 1, 0x001a209a(84fbcff8), -2) + Vc_buildFloat( 1, 0x0004fbcff(8), -26) - //Vc_buildFloat( 1, 
0x001a209a, -2), // log10(2) - //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) - //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) - //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) - }; -} // namespace SSE - -V_ALIGN(64) unsigned int RandomState[16] = { - 0x5a383a4fu, 0xc68bd45eu, 0x691d6d86u, 0xb367e14fu, - 0xd689dbaau, 0xfde442aau, 0x3d265423u, 0x1a77885cu, - 0x36ed2684u, 0xfb1f049du, 0x19e52f31u, 0x821e4dd7u, - 0x23996d25u, 0x5962725au, 0x6aced4ceu, 0xd4c610f3u -}; - -// dummy symbol to emit warnings with GCC 4.3 -namespace Warnings { - void _operator_bracket_warning() {} -} // namespace Warnings - -const char LIBRARY_VERSION[] = VC_VERSION_STRING; -const unsigned int LIBRARY_VERSION_NUMBER = VC_VERSION_NUMBER; -const unsigned int LIBRARY_ABI_VERSION = VC_LIBRARY_ABI_VERSION; - -void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *compileTimeVersion) { - if (LIBRARY_ABI_VERSION != compileTimeAbi || LIBRARY_VERSION_NUMBER < versionNumber) { - printf("The versions of libVc.a (%s) and Vc/version.h (%s) are incompatible. Aborting.\n", LIBRARY_VERSION, compileTimeVersion); - abort(); - } -} - -} // namespace Vc -} // namespace ROOT - -#undef V_ALIGN diff --git a/math/vc/src/cpuid.cpp b/math/vc/src/cpuid.cpp deleted file mode 100644 index a2e0b7797579c..0000000000000 --- a/math/vc/src/cpuid.cpp +++ /dev/null @@ -1,623 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include -#include - -namespace ROOT { -namespace Vc -{ -CpuId::uint CpuId::s_ecx0 = 0; -CpuId::uint CpuId::s_logicalProcessors = 0; -CpuId::uint CpuId::s_processorFeaturesC = 0; -CpuId::uint CpuId::s_processorFeaturesD = 0; -CpuId::uint CpuId::s_processorFeatures8C = 0; -CpuId::uint CpuId::s_processorFeatures8D = 0; -CpuId::uint CpuId::s_L1Instruction = 0; -CpuId::uint CpuId::s_L1Data = 0; -CpuId::uint CpuId::s_L2Data = 0; -CpuId::uint CpuId::s_L3Data = 0; -CpuId::ushort CpuId::s_L1InstructionLineSize = 0; -CpuId::ushort CpuId::s_L1DataLineSize = 0; -CpuId::ushort CpuId::s_L2DataLineSize = 0; -CpuId::ushort CpuId::s_L3DataLineSize = 0; -CpuId::uint CpuId::s_L1Associativity = 0; -CpuId::uint CpuId::s_L2Associativity = 0; -CpuId::uint CpuId::s_L3Associativity = 0; -CpuId::ushort CpuId::s_prefetch = 32; // The Intel ORM says that if CPUID(2) doesn't set the prefetch size it is 32 -CpuId::uchar CpuId::s_brandIndex = 0; -CpuId::uchar CpuId::s_cacheLineSize = 0; -CpuId::uchar CpuId::s_processorModel = 0; -CpuId::uchar CpuId::s_processorFamily = 0; -CpuId::ProcessorType CpuId::s_processorType = CpuId::IntelReserved; -bool CpuId::s_noL2orL3 = false; - -#ifdef VC_MSVC -} // better not include intrin.h inside the Vc namespace :) -} // namespace ROOT -#include -namespace ROOT { -namespace Vc -{ -#define CPUID(leaf) \ - do { \ - int out[4]; \ - __cpuid(out, leaf); \ - eax = out[0]; \ - ebx = out[1]; \ - ecx = out[2]; \ - edx = out[3]; \ - } while (false) -#define CPUID_C(leaf, _ecx_) \ - do { \ - int out[4]; \ - __cpuidex(out, leaf, _ecx_); \ - eax = out[0]; \ - ebx = out[1]; \ - ecx = out[2]; \ - edx = out[3]; \ - } while (false) -#elif defined(__i386__) && defined(__PIC__) -// %ebx may be the PIC register. 
-static inline void _Vc_cpuid(int leaf, unsigned int &eax, unsigned int &ebx, unsigned int &ecx, unsigned int &edx) -{ - int tmpb; - asm("mov %%ebx, %[tmpb]\n\t" - "cpuid\n\t" - "mov %%ebx, %[ebx]\n\t" - "mov %[tmpb], %%ebx\n\t" - : [tmpb]"=m"(tmpb), "=a"(eax), [ebx] "=m"(ebx), "+c"(ecx), "=d"(edx) - : [leaf] "a"(leaf) - ); -} -#define CPUID(leaf) \ - ecx = 0; \ - _Vc_cpuid(leaf, eax, ebx, ecx, edx) -#define CPUID_C(leaf, _ecx_) \ - ecx = _ecx_; \ - _Vc_cpuid(leaf, eax, ebx, ecx, edx) -#else -#define CPUID(leaf) \ - __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(leaf)) -#define CPUID_C(leaf, _ecx_) \ - __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(leaf), "c"(_ecx_)) -#endif -static unsigned int CpuIdAmdAssociativityTable(int bits) -{ - switch (bits) { - case 0x0: return 0; - case 0x1: return 1; - case 0x2: return 2; - case 0x4: return 4; - case 0x6: return 8; - case 0x8: return 16; - case 0xA: return 32; - case 0xB: return 48; - case 0xC: return 64; - case 0xD: return 96; - case 0xE: return 128; - case 0xF: return 0xff; - } - return 0xffffffffu; -} - -void CpuId::init() -{ - { - static bool done = false; - if (done) return; - done = true; - } - uint eax, ebx, ecx, edx; - - CPUID(0); - s_ecx0 = ecx; - - CPUID(1); - s_processorFeaturesC = ecx; - s_processorFeaturesD = edx; - s_processorModel = (eax & 0x000000f0) >> 4; - s_processorFamily = (eax & 0x00000f00) >> 8; - if (isAmd()) { - if (s_processorFamily >= 0xf) { - const uchar processorFamilyExt = (eax & 0x0ff00000) >> 20; - s_processorFamily += processorFamilyExt; - const uchar processorModelExt = (eax & 0x000f0000) >> 12; - s_processorModel += processorModelExt; - } - } else if (s_processorFamily == 0xf) { - const uchar processorFamilyExt = (eax & 0x0ff00000) >> 20; - s_processorFamily += processorFamilyExt; - const uchar processorModelExt = (eax & 0x000f0000) >> 12; - s_processorModel += processorModelExt; - } else if (s_processorFamily == 0x6) { - const uchar 
processorModelExt = (eax & 0x000f0000) >> 12; - s_processorModel += processorModelExt; - } - s_processorType = static_cast((eax & 0x00003000) >> 12); - - s_brandIndex = ebx & 0xff; - ebx >>= 8; - s_cacheLineSize = ebx & 0xff; - ebx >>= 8; - s_logicalProcessors = ebx & 0xff; - - CPUID(0x80000001); - s_processorFeatures8C = ecx; - s_processorFeatures8D = edx; - - if (isAmd()) { - s_prefetch = cacheLineSize(); - - CPUID(0x80000005); - s_L1DataLineSize = ecx & 0xff; - s_L1Data = (ecx >> 24) * 1024; - s_L1Associativity = (ecx >> 16) & 0xff; - s_L1InstructionLineSize = edx & 0xff; - s_L1Instruction = (edx >> 24) * 1024; - - CPUID(0x80000006); - s_L2DataLineSize = ecx & 0xff; - s_L2Data = (ecx >> 16) * 1024; - s_L2Associativity = CpuIdAmdAssociativityTable((ecx >> 12) & 0xf); - s_L3DataLineSize = edx & 0xff; - s_L3Data = (edx >> 18) * 512 * 1024; - s_L3Associativity = CpuIdAmdAssociativityTable((ecx >> 12) & 0xf); - return; - } - - // Intel only - int repeat = 0; - bool checkLeaf4 = false; - do { - CPUID(2); - if (repeat == 0) { - repeat = eax & 0xff; - } - if (0 == (0x80000000u & eax)) { - for (int i = 0; i < 3; ++i) { - eax >>= 8; - interpret(eax & 0xff, &checkLeaf4); - } - } - if (0 == (0x80000000u & ebx)) { - for (int i = 0; i < 4; ++i) { - interpret(ebx & 0xff, &checkLeaf4); - ebx >>= 8; - } - } - if (0 == (0x80000000u & ecx)) { - for (int i = 0; i < 4; ++i) { - interpret(ecx & 0xff, &checkLeaf4); - ecx >>= 8; - } - } - if (0 == (0x80000000u & edx)) { - for (int i = 0; i < 4; ++i) { - interpret(edx & 0xff, &checkLeaf4); - edx >>= 8; - } - } - } while (--repeat > 0); - if (checkLeaf4) { - s_prefetch = cacheLineSize(); - if (s_prefetch == 0) { - s_prefetch = 64; - } - eax = 1; - for (int i = 0; eax & 0x1f; ++i) { - CPUID_C(4, i); - const int cacheLevel = (eax >> 5) & 7; - //const int sharedBy = 1 + ((eax >> 14) & 0xfff); - const int linesize = 1 + (ebx & 0xfff); ebx >>= 12; - const int partitions = 1 + (ebx & 0x3ff); ebx >>= 10; - const int ways = 1 + (ebx & 0x3ff); - 
const int sets = 1 + ecx; - const int size = ways * partitions * linesize * sets; - switch (eax & 0x1f) { - case 1: // data cache - switch (cacheLevel) { - case 1: - s_L1Data = size; - s_L1DataLineSize = linesize; - s_L1Associativity = ways; - break; - case 2: - s_L2Data = size; - s_L2DataLineSize = linesize; - s_L2Associativity = ways; - break; - case 3: - s_L3Data = size; - s_L3DataLineSize = linesize; - s_L3Associativity = ways; - break; - } - break; - case 2: // instruction cache - switch (cacheLevel) { - case 1: - s_L1Instruction = size; - s_L1InstructionLineSize = linesize; - break; - } - break; - case 3: // unified cache - switch (cacheLevel) { - case 1: - s_L1Data = size;// / sharedBy; - s_L1DataLineSize = linesize; - s_L1Associativity = ways; - break; - case 2: - s_L2Data = size;// / sharedBy; - s_L2DataLineSize = linesize; - s_L2Associativity = ways; - break; - case 3: - s_L3Data = size;// / sharedBy; - s_L3DataLineSize = linesize; - s_L3Associativity = ways; - break; - } - break; - case 0: // no more caches - break; - default: // reserved - break; - } - } - } -} - -void CpuId::interpret(uchar byte, bool *checkLeaf4) -{ - switch (byte) { - case 0x06: - s_L1Instruction = 8 * 1024; - s_L1InstructionLineSize = 32; - s_L1Associativity = 4; - break; - case 0x08: - s_L1Instruction = 16 * 1024; - s_L1InstructionLineSize = 32; - s_L1Associativity = 4; - break; - case 0x09: - s_L1Instruction = 32 * 1024; - s_L1InstructionLineSize = 64; - s_L1Associativity = 4; - break; - case 0x0A: - s_L1Data = 8 * 1024; - s_L1DataLineSize = 32; - s_L1Associativity = 2; - break; - case 0x0C: - s_L1Data = 16 * 1024; - s_L1DataLineSize = 32; - s_L1Associativity = 4; - break; - case 0x0D: - s_L1Data = 16 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 4; - break; - case 0x0E: - s_L1Data = 24 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 6; - break; - case 0x21: - s_L2Data = 256 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x22: - s_L3Data 
= 512 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 4; - break; - case 0x23: - s_L3Data = 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0x25: - s_L3Data = 2 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0x29: - s_L3Data = 4 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0x2C: - s_L1Data = 32 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 8; - break; - case 0x30: - s_L1Data = 32 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 8; - break; - case 0x40: - s_noL2orL3 = true; - break; - case 0x41: - s_L2Data = 128 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 4; - break; - case 0x42: - s_L2Data = 256 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 4; - break; - case 0x43: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 4; - break; - case 0x44: - s_L2Data = 1024 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 4; - break; - case 0x45: - s_L2Data = 2 * 1024 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 4; - break; - case 0x46: - s_L3Data = 4 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 4; - break; - case 0x47: - s_L3Data = 8 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0x48: - s_L2Data = 3 * 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 12; - break; - case 0x49: - if (s_processorFamily == 0xf && s_processorModel == 0x6) { - s_L3Data = 4 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - } else { - s_L2Data = 4 * 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 16; - } - break; - case 0x4A: - s_L3Data = 6 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 12; - break; - case 0x4B: - s_L3Data = 8 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - break; - case 0x4C: - s_L3Data = 12 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 12; - break; - case 0x4D: - 
s_L3Data = 16 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - break; - case 0x4E: - s_L2Data = 6 * 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 24; - break; - case 0x60: - s_L1Data = 16 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 8; - break; - case 0x66: - s_L1Data = 8 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 4; - break; - case 0x67: - s_L1Data = 16 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 4; - break; - case 0x68: - s_L1Data = 32 * 1024; - s_L1DataLineSize = 64; - s_L1Associativity = 4; - break; - case 0x78: - s_L2Data = 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 4; - break; - case 0x79: - s_L2Data = 128 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x7A: - s_L2Data = 256 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x7B: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x7C: - s_L2Data = 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x7D: - s_L2Data = 2 * 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x7F: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 2; - break; - case 0x80: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0x82: - s_L2Data = 256 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 8; - break; - case 0x83: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 8; - break; - case 0x84: - s_L2Data = 1024 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 8; - break; - case 0x85: - s_L2Data = 2 * 1024 * 1024; - s_L2DataLineSize = 32; - s_L2Associativity = 8; - break; - case 0x86: - s_L2Data = 512 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 4; - break; - case 0x87: - s_L2Data = 1024 * 1024; - s_L2DataLineSize = 64; - s_L2Associativity = 8; - break; - case 0xD0: - s_L3Data = 512 * 1024; - 
s_L3DataLineSize = 64; - s_L3Associativity = 4; - break; - case 0xD1: - s_L3Data = 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 4; - break; - case 0xD2: - s_L3Data = 2 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 4; - break; - case 0xD6: - s_L3Data = 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0xD7: - s_L3Data = 2 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0xD8: - s_L3Data = 4 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 8; - break; - case 0xDC: - s_L3Data = 3 * 512 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 12; - break; - case 0xDD: - s_L3Data = 3 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 12; - break; - case 0xDE: - s_L3Data = 6 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 12; - break; - case 0xE2: - s_L3Data = 2 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - break; - case 0xE3: - s_L3Data = 4 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - break; - case 0xE4: - s_L3Data = 8 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 16; - break; - case 0xEA: - s_L3Data = 12 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 24; - break; - case 0xEB: - s_L3Data = 18 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 24; - break; - case 0xEC: - s_L3Data = 24 * 1024 * 1024; - s_L3DataLineSize = 64; - s_L3Associativity = 24; - break; - case 0xF0: - s_prefetch = 64; - break; - case 0xF1: - s_prefetch = 128; - break; - case 0xFF: - // we have to use CPUID(4) to find out - *checkLeaf4 = true; - break; - default: - break; - } -} -} // namespace Vc -} // namespace ROOT - -// vim: sw=4 sts=4 et tw=100 diff --git a/math/vc/src/support.cpp b/math/vc/src/support.cpp deleted file mode 100644 index 3bd8958510d2c..0000000000000 --- a/math/vc/src/support.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include -#include -#include - -#ifdef VC_MSVC -#include -#endif - -#if defined(VC_GCC) && VC_GCC >= 0x40400 -#define VC_TARGET_NO_SIMD __attribute__((target("no-sse2,no-avx"))) -#else -#define VC_TARGET_NO_SIMD -#endif - -namespace ROOT { -namespace Vc -{ - -VC_TARGET_NO_SIMD -static inline bool xgetbvCheck(unsigned int bits) -{ -#if defined(VC_MSVC) && VC_MSVC >= 160040219 // MSVC 2010 SP1 introduced _xgetbv - unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - return (xcrFeatureMask & bits) == bits; -#elif defined(VC_GNU_ASM) && !defined(VC_NO_XGETBV) - unsigned int eax; - asm("xgetbv" : "=a"(eax) : "c"(0) : "edx"); - return (eax & bits) == bits; -#else - // can't check, but if OSXSAVE is true let's assume it'll work - return bits > 0; // ignore 'warning: unused parameter' -#endif -} - -VC_TARGET_NO_SIMD -bool isImplementationSupported(Implementation impl) -{ - CpuId::init(); - - switch (impl) { - case ScalarImpl: - return true; - case SSE2Impl: - return CpuId::hasSse2(); - case SSE3Impl: - return CpuId::hasSse3(); - case SSSE3Impl: - return CpuId::hasSsse3(); - case SSE41Impl: - return CpuId::hasSse41(); - case SSE42Impl: - return CpuId::hasSse42(); - case AVXImpl: - return CpuId::hasOsxsave() && CpuId::hasAvx() && xgetbvCheck(0x6); - case AVX2Impl: - return false; - case ImplementationMask: - return 
false; - } - return false; -} - -VC_TARGET_NO_SIMD -Vc::Implementation bestImplementationSupported() -{ - CpuId::init(); - - if (!CpuId::hasSse2 ()) return Vc::ScalarImpl; - if (!CpuId::hasSse3 ()) return Vc::SSE2Impl; - if (!CpuId::hasSsse3()) return Vc::SSE3Impl; - if (!CpuId::hasSse41()) return Vc::SSSE3Impl; - if (!CpuId::hasSse42()) return Vc::SSE41Impl; - if (CpuId::hasAvx() && CpuId::hasOsxsave() && xgetbvCheck(0x6)) { - return Vc::AVXImpl; - } - return Vc::SSE42Impl; -} - -VC_TARGET_NO_SIMD -unsigned int extraInstructionsSupported() -{ - unsigned int flags = 0; - if (CpuId::hasF16c()) flags |= Vc::Float16cInstructions; - if (CpuId::hasFma4()) flags |= Vc::Fma4Instructions; - if (CpuId::hasXop ()) flags |= Vc::XopInstructions; - if (CpuId::hasPopcnt()) flags |= Vc::PopcntInstructions; - if (CpuId::hasSse4a()) flags |= Vc::Sse4aInstructions; - if (CpuId::hasFma ()) flags |= Vc::FmaInstructions; - //if (CpuId::hasPclmulqdq()) flags |= Vc::PclmulqdqInstructions; - //if (CpuId::hasAes()) flags |= Vc::AesInstructions; - //if (CpuId::hasRdrand()) flags |= Vc::RdrandInstructions; - return flags; -} - -} // namespace Vc -} // namespace ROOT - -#undef VC_TARGET_NO_SIMD - -// vim: sw=4 sts=4 et tw=100 diff --git a/math/vc/src/trigonometric.cpp b/math/vc/src/trigonometric.cpp deleted file mode 100644 index 929690e88bb86..0000000000000 --- a/math/vc/src/trigonometric.cpp +++ /dev/null @@ -1,463 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#include -#if defined(VC_IMPL_SSE) || defined(VC_IMPL_AVX) -#include - -namespace ROOT { -namespace Vc -{ -namespace -{ - using Vc::Vector; - using Vc::float_v; - using Vc::double_v; - using Vc::sfloat_v; - - template static Vc_ALWAYS_INLINE Vector cosSeries(const Vector &x) - { - typedef Const C; - const Vector x2 = x * x; - return ((C::cosCoeff(2) * x2 + - C::cosCoeff(1)) * x2 + - C::cosCoeff(0)) * (x2 * x2) - - C::_1_2() * x2 + Vector::One(); - } - static Vc_ALWAYS_INLINE double_v cosSeries(const double_v &x) - { - typedef Const C; - const double_v x2 = x * x; - return (((((C::cosCoeff(5) * x2 + - C::cosCoeff(4)) * x2 + - C::cosCoeff(3)) * x2 + - C::cosCoeff(2)) * x2 + - C::cosCoeff(1)) * x2 + - C::cosCoeff(0)) * (x2 * x2) - - C::_1_2() * x2 + double_v::One(); - } - template static Vc_ALWAYS_INLINE Vector sinSeries(const Vector &x) - { - typedef Const C; - const Vector x2 = x * x; - return ((C::sinCoeff(2) * x2 + - C::sinCoeff(1)) * x2 + - C::sinCoeff(0)) * (x2 * x) - + x; - } - static Vc_ALWAYS_INLINE double_v sinSeries(const double_v &x) - { - typedef Const C; - const double_v x2 = x * x; - return (((((C::sinCoeff(5) * x2 + - C::sinCoeff(4)) * x2 + - C::sinCoeff(3)) * x2 + - C::sinCoeff(2)) * x2 + - C::sinCoeff(1)) * x2 + - C::sinCoeff(0)) * (x2 * x) - + x; - } - template struct signed_integer { typedef int_v type; }; - template<> struct signed_integer { typedef short_v type; }; - - template static Vc_ALWAYS_INLINE Vector<_T> foldInput(const Vector<_T> &_x, IV &quadrant) - { - typedef Vector<_T> V; - typedef Const<_T> C; - - const V x = abs(_x); -#if defined(VC_IMPL_FMA4) || defined(VC_IMPL_FMA) - quadrant = static_cast(x * C::_4_pi() + V::One()); // prefer the fma here - quadrant &= ~IV::One(); -#else - quadrant = static_cast(x * C::_4_pi()); - quadrant += quadrant & IV::One(); -#endif - const V y = static_cast(quadrant); - quadrant &= 7; - 
- return ((x - y * C::_pi_4_hi()) - y * C::_pi_4_rem1()) - y * C::_pi_4_rem2(); - } - static Vc_ALWAYS_INLINE double_v foldInput(const double_v &_x, int_v &quadrant) - { - typedef double_v V; - typedef Const C; - - const V x = abs(_x); - V y = trunc(x / C::_pi_4()); // * C::_4_pi() would work, but is >twice as imprecise - V z = y - trunc(y * C::_1_16()) * C::_16(); // y modulo 16 - quadrant = static_cast(z); - int_m mask = (quadrant & int_v::One()) != int_v::Zero(); - ++quadrant(mask); - y(static_cast(mask)) += V::One(); - quadrant &= 7; - - // since y is an integer we don't need to split y into low and high parts until the integer - // requires more bits than there are zero bits at the end of _pi_4_hi (30 bits -> 1e9) - return ((x - y * C::_pi_4_hi()) - y * C::_pi_4_rem1()) - y * C::_pi_4_rem2(); - } -} // anonymous namespace - -/* - * algorithm for sine and cosine: - * - * The result can be calculated with sine or cosine depending on the π/4 section the input is - * in. - * sine ≈ x + x³ - * cosine ≈ 1 - x² - * - * sine: - * Map -x to x and invert the output - * Extend precision of x - n * π/4 by calculating - * ((x - n * p1) - n * p2) - n * p3 (p1 + p2 + p3 = π/4) - * - * Calculate Taylor series with tuned coefficients. - * Fix sign. 
- */ -template<> template Vector<_T> Trigonometric::sin(const Vector<_T> &_x) -{ - typedef Vector<_T> V; - typedef typename V::Mask M; - typedef typename signed_integer::type IV; - - IV quadrant; - const V z = foldInput(_x, quadrant); - const M sign = (_x < V::Zero()) ^ static_cast(quadrant > 3); - quadrant(quadrant > 3) -= 4; - - V y = sinSeries(z); - y(quadrant == IV::One() || quadrant == 2) = cosSeries(z); - y(sign) = -y; - return y; -} - -template<> template<> double_v Trigonometric::sin(const double_v &_x) -{ - typedef double_v V; - typedef V::Mask M; - - int_v quadrant; - M sign = _x < V::Zero(); - const V x = foldInput(_x, quadrant); - sign ^= static_cast(quadrant > 3); - quadrant(quadrant > 3) -= 4; - - V y = sinSeries(x); - y(static_cast(quadrant == int_v::One() || quadrant == 2)) = cosSeries(x); - y(sign) = -y; - return y; -} -template<> template Vector<_T> Trigonometric::cos(const Vector<_T> &_x) { - typedef Vector<_T> V; - typedef typename V::Mask M; - typedef typename signed_integer::type IV; - - IV quadrant; - const V x = foldInput(_x, quadrant); - M sign = quadrant > 3; - quadrant(quadrant > 3) -= 4; - sign ^= quadrant > IV::One(); - - V y = cosSeries(x); - y(quadrant == IV::One() || quadrant == 2) = sinSeries(x); - y(sign) = -y; - return y; -} -template<> template<> double_v Trigonometric::cos(const double_v &_x) -{ - typedef double_v V; - typedef V::Mask M; - - int_v quadrant; - const V x = foldInput(_x, quadrant); - M sign = static_cast(quadrant > 3); - quadrant(quadrant > 3) -= 4; - sign ^= static_cast(quadrant > int_v::One()); - - V y = cosSeries(x); - y(static_cast(quadrant == int_v::One() || quadrant == 2)) = sinSeries(x); - y(sign) = -y; - return y; -} -template<> template void Trigonometric::sincos(const Vector<_T> &_x, Vector<_T> *_sin, Vector<_T> *_cos) { - typedef Vector<_T> V; - typedef typename V::Mask M; - typedef typename signed_integer::type IV; - - IV quadrant; - const V x = foldInput(_x, quadrant); - M sign = static_cast(quadrant > 
3); - quadrant(quadrant > 3) -= 4; - - const V cos_s = cosSeries(x); - const V sin_s = sinSeries(x); - - V c = cos_s; - c(static_cast(quadrant == IV::One() || quadrant == 2)) = sin_s; - c(sign ^ static_cast(quadrant > IV::One())) = -c; - *_cos = c; - - V s = sin_s; - s(static_cast(quadrant == IV::One() || quadrant == 2)) = cos_s; - s(sign ^ static_cast(_x < V::Zero())) = -s; - *_sin = s; -} -template<> template<> void Trigonometric::sincos(const double_v &_x, double_v *_sin, double_v *_cos) { - typedef double_v V; - typedef V::Mask M; - - int_v quadrant; - const V x = foldInput(_x, quadrant); - M sign = static_cast(quadrant > 3); - quadrant(quadrant > 3) -= 4; - - const V cos_s = cosSeries(x); - const V sin_s = sinSeries(x); - - V c = cos_s; - c(static_cast(quadrant == int_v::One() || quadrant == 2)) = sin_s; - c(sign ^ static_cast(quadrant > int_v::One())) = -c; - *_cos = c; - - V s = sin_s; - s(static_cast(quadrant == int_v::One() || quadrant == 2)) = cos_s; - s(sign ^ static_cast(_x < V::Zero())) = -s; - *_sin = s; -} -template<> template Vector<_T> Trigonometric::asin (const Vector<_T> &_x) { - typedef Const<_T> C; - typedef Vector<_T> V; - typedef typename V::Mask M; - - const M &negative = _x < V::Zero(); - - const V &a = abs(_x); - const M outOfRange = a > V::One(); - const M &small = a < C::smallAsinInput(); - const M >_0_5 = a > C::_1_2(); - V x = a; - V z = a * a; - z(gt_0_5) = (V::One() - a) * C::_1_2(); - x(gt_0_5) = sqrt(z); - z = ((((C::asinCoeff0(0) * z - + C::asinCoeff0(1)) * z - + C::asinCoeff0(2)) * z - + C::asinCoeff0(3)) * z - + C::asinCoeff0(4)) * z * x - + x; - z(gt_0_5) = C::_pi_2() - (z + z); - z(small) = a; - z(negative) = -z; - z.setQnan(outOfRange); - - return z; -} -template<> template<> double_v Trigonometric::asin (const double_v &_x) { - typedef Const C; - typedef double_v V; - typedef V::Mask M; - - const M negative = _x < V::Zero(); - - const V a = abs(_x); - const M outOfRange = a > V::One(); - const M small = a < 
C::smallAsinInput(); - const M large = a > C::largeAsinInput(); - - V zz = V::One() - a; - const V r = (((C::asinCoeff0(0) * zz + C::asinCoeff0(1)) * zz + C::asinCoeff0(2)) * zz + - C::asinCoeff0(3)) * zz + C::asinCoeff0(4); - const V s = (((zz + C::asinCoeff1(0)) * zz + C::asinCoeff1(1)) * zz + - C::asinCoeff1(2)) * zz + C::asinCoeff1(3); - V sqrtzz = sqrt(zz + zz); - V z = C::_pi_4() - sqrtzz; - z -= sqrtzz * (zz * r / s) - C::_pi_2_rem(); - z += C::_pi_4(); - - V a2 = a * a; - const V p = ((((C::asinCoeff2(0) * a2 + C::asinCoeff2(1)) * a2 + C::asinCoeff2(2)) * a2 + - C::asinCoeff2(3)) * a2 + C::asinCoeff2(4)) * a2 + C::asinCoeff2(5); - const V q = ((((a2 + C::asinCoeff3(0)) * a2 + C::asinCoeff3(1)) * a2 + - C::asinCoeff3(2)) * a2 + C::asinCoeff3(3)) * a2 + C::asinCoeff3(4); - z(!large) = a * (a2 * p / q) + a; - - z(negative) = -z; - z(small) = _x; - z.setQnan(outOfRange); - - return z; -} -template<> template Vector<_T> Trigonometric::atan (const Vector<_T> &_x) { - typedef Const<_T> C; - typedef Vector<_T> V; - typedef typename V::Mask M; - V x = abs(_x); - const M >_tan_3pi_8 = x > C::atanThrsHi(); - const M >_tan_pi_8 = x > C::atanThrsLo() && !gt_tan_3pi_8; - V y = V::Zero(); - y(gt_tan_3pi_8) = C::_pi_2(); - y(gt_tan_pi_8) = C::_pi_4(); - x(gt_tan_3pi_8) = -V::One() / x; - x(gt_tan_pi_8) = (x - V::One()) / (x + V::One()); - const V &x2 = x * x; - y += (((C::atanP(0) * x2 - - C::atanP(1)) * x2 - + C::atanP(2)) * x2 - - C::atanP(3)) * x2 * x - + x; - y(_x < V::Zero()) = -y; - y.setQnan(isnan(_x)); - return y; -} -template<> template<> double_v Trigonometric::atan (const double_v &_x) { - typedef Const C; - typedef double_v V; - typedef V::Mask M; - - M sign = _x < V::Zero(); - V x = abs(_x); - M finite = isfinite(_x); - V ret = C::_pi_2(); - V y = V::Zero(); - const M large = x > C::atanThrsHi(); - const M gt_06 = x > C::atanThrsLo(); - V tmp = (x - V::One()) / (x + V::One()); - tmp(large) = -V::One() / x; - x(gt_06) = tmp; - y(gt_06) = C::_pi_4(); - y(large) 
= C::_pi_2(); - V z = x * x; - const V p = (((C::atanP(0) * z + C::atanP(1)) * z + C::atanP(2)) * z + C::atanP(3)) * z + C::atanP(4); - const V q = ((((z + C::atanQ(0)) * z + C::atanQ(1)) * z + C::atanQ(2)) * z + C::atanQ(3)) * z + C::atanQ(4); - z = z * p / q; - z = x * z + x; - V morebits = C::_pi_2_rem(); - morebits(!large) *= C::_1_2(); - z(gt_06) += morebits; - ret(finite) = y + z; - ret(sign) = -ret; - ret.setQnan(isnan(_x)); - return ret; -} -template<> template Vector<_T> Trigonometric::atan2(const Vector<_T> &y, const Vector<_T> &x) { - typedef Const<_T> C; - typedef Vector<_T> V; - typedef typename V::Mask M; - - const M xZero = x == V::Zero(); - const M yZero = y == V::Zero(); - const M xMinusZero = xZero && x.isNegative(); - const M yNeg = y < V::Zero(); - const M xInf = !isfinite(x); - const M yInf = !isfinite(y); - - V a = C::_pi().copySign(y); - a.setZero(x >= V::Zero()); - - // setting x to any finite value will have atan(y/x) return sign(y/x)*pi/2, just in case x is inf - V _x = x; - _x(yInf) = V::One().copySign(x); - - a += atan(y / _x); - - // if x is +0 and y is +/-0 the result is +0 - a.setZero(xZero && yZero); - - // for x = -0 we add/subtract pi to get the correct result - a(xMinusZero) += C::_pi().copySign(y); - - // atan2(-Y, +/-0) = -pi/2 - a(xZero && yNeg) = -C::_pi_2(); - - // if both inputs are inf the output is +/- (3)pi/4 - a(xInf && yInf) += C::_pi_4().copySign(x ^ ~y); - - // correct the sign of y if the result is 0 - a(a == V::Zero()) = a.copySign(y); - - // any NaN input will lead to NaN output - a.setQnan(isnan(y) || isnan(x)); - - return a; -} -template<> template<> double_v Trigonometric::atan2 (const double_v &y, const double_v &x) { - typedef Const C; - typedef double_v V; - typedef V::Mask M; - - const M xZero = x == V::Zero(); - const M yZero = y == V::Zero(); - const M xMinusZero = xZero && x.isNegative(); - const M yNeg = y < V::Zero(); - const M xInf = !isfinite(x); - const M yInf = !isfinite(y); - - V a = 
V(C::_pi()).copySign(y); - a.setZero(x >= V::Zero()); - - // setting x to any finite value will have atan(y/x) return sign(y/x)*pi/2, just in case x is inf - V _x = x; - _x(yInf) = V::One().copySign(x); - - a += atan(y / _x); - - // if x is +0 and y is +/-0 the result is +0 - a.setZero(xZero && yZero); - - // for x = -0 we add/subtract pi to get the correct result - a(xMinusZero) += C::_pi().copySign(y); - - // atan2(-Y, +/-0) = -pi/2 - a(xZero && yNeg) = -C::_pi_2(); - - // if both inputs are inf the output is +/- (3)pi/4 - a(xInf && yInf) += C::_pi_4().copySign(x ^ ~y); - - // correct the sign of y if the result is 0 - a(a == V::Zero()) = a.copySign(y); - - // any NaN input will lead to NaN output - a.setQnan(isnan(y) || isnan(x)); - - return a; -} -} // namespace Vc -} // namespace ROOT - -#include - -// instantiate the non-specialized template functions above -template Vc::float_v Vc::Trigonometric::sin(const Vc::float_v &); -template Vc::sfloat_v Vc::Trigonometric::sin(const Vc::sfloat_v &); - -template Vc::float_v Vc::Trigonometric::cos(const Vc::float_v &); -template Vc::sfloat_v Vc::Trigonometric::cos(const Vc::sfloat_v &); - -template void Vc::Trigonometric::sincos(const Vc::float_v &, Vc::float_v *, Vc::float_v *); -template void Vc::Trigonometric::sincos(const Vc::sfloat_v &, Vc::sfloat_v *, Vc::sfloat_v *); - -template Vc::float_v Vc::Trigonometric::asin(const Vc::float_v &); -template Vc::sfloat_v Vc::Trigonometric::asin(const Vc::sfloat_v &); - -template Vc::float_v Vc::Trigonometric::atan(const Vc::float_v &); -template Vc::sfloat_v Vc::Trigonometric::atan(const Vc::sfloat_v &); - -template Vc::float_v Vc::Trigonometric::atan2(const Vc::float_v &, const Vc::float_v &); -template Vc::sfloat_v Vc::Trigonometric::atan2(const Vc::sfloat_v &, const Vc::sfloat_v &); -#endif diff --git a/math/vc/tests/CMakeLists.txt b/math/vc/tests/CMakeLists.txt deleted file mode 100644 index c93fb60bfba14..0000000000000 --- a/math/vc/tests/CMakeLists.txt +++ /dev/null @@ 
-1,326 +0,0 @@ -include(AddFileDependencies) -if(CMAKE_VERSION VERSION_GREATER 3.0.0) - cmake_policy(SET CMP0042 OLD) -endif() -if(POLICY CMP0054) - cmake_policy(SET CMP0054 NEW) -endif() - -add_custom_target(build_tests ALL VERBATIM) - -add_definitions(-DCOMPILE_FOR_UNIT_TESTS) # -DVC_CHECK_ALIGNMENT) -if(Vc_COMPILER_IS_MSVC) - AddCompilerFlag("/wd4267") # Disable warning "conversion from 'size_t' to 'int', possible loss of data" - AddCompilerFlag("/wd4723") # Disable warning "potential divide by 0" (suppress doesn't work) - AddCompilerFlag("/wd4290") # Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)" -endif() - -if(DEFINED Vc_INSIDE_ROOT) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "") # Reset the ROOT default executable destination - set(Vc_TEST_TARGET_PREFIX "vc-") -else() - set(Vc_TEST_TARGET_PREFIX "") -endif() - -set(CXX11_FLAG) -if(NOT DEFINED Vc_INSIDE_ROOT) - set(_cxx11_flags "-std=c++11" "-std=c++0x") - if(Vc_COMPILER_IS_GCC AND WIN32) - # MinGW fails to compile POSIX code unless gnu++11 is used - set(_cxx11_flags "-std=gnu++11" "-std=gnu++0x") - endif() - foreach(_flag ${_cxx11_flags}) - string(REGEX REPLACE "[-+/:= ]" "_" _flag_esc "${_flag}") - check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc}) - if(check_cxx_compiler_flag_${_flag_esc}) - set(CXX11_FLAG ${_flag}) - break() - endif() - endforeach() -endif() - -macro(vc_add_run_target _target) - add_custom_target(run_${_target} - ${_target} - DEPENDS ${_target} - COMMENT "Execute ${_target} test" - VERBATIM - ) -endmacro() - -macro(vc_add_test _name) - foreach(_std cxx98 cxx11) - set(_extra_flags) - set(name ${_name}) - foreach(_arg ${ARGN}) - set(_extra_flags "${_extra_flags} -D${_arg}") - set(name "${name}_${_arg}") - endforeach() - - if("${_std}" STREQUAL "cxx11") - if(NOT CXX11_FLAG) - break() - endif() - set(SAVE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - if(CMAKE_CXX_FLAGS MATCHES " -ansi ") - string(REPLACE " -ansi " " " 
CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - endif() - set(_extra_flags "${_extra_flags} ${CXX11_FLAG}") - set(name "c++11_${name}") - endif() - - set(_target "${name}_scalar") - list(FIND disabled_targets ${_target} _disabled) - if(_disabled EQUAL -1) - file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/scalar/*.tcc" "${CMAKE_SOURCE_DIR}/scalar/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") - add_file_dependencies(${_name}.cpp "${_extra_deps}") - add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) - target_link_libraries(${_target} Vc) - add_target_property(${_target} COMPILE_FLAGS "-DVC_IMPL=Scalar ${_extra_flags}") - add_target_property(${_target} LABELS "Scalar") - add_dependencies(build_tests ${_target}) - add_dependencies(Scalar ${_target}) - add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") - set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "Scalar") - vc_add_run_target(${_target}) - endif() - - if(USE_SSE2 AND NOT Vc_SSE_INTRINSICS_BROKEN) - set(DVC_IMPL "-DVC_IMPL=SSE") - if(USE_XOP) - set(DVC_IMPL "${DVC_IMPL}+XOP") - endif() - if(USE_FMA) - set(DVC_IMPL "${DVC_IMPL}+FMA") - elseif(USE_FMA4) - set(DVC_IMPL "${DVC_IMPL}+FMA4") - endif() - set(_target "${name}_sse") - list(FIND disabled_targets ${_target} _disabled) - if(_disabled EQUAL -1) - file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/sse/*.tcc" "${CMAKE_SOURCE_DIR}/sse/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") - add_file_dependencies(${_name}.cpp "${_extra_deps}") - add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) - target_link_libraries(${_target} Vc) - add_target_property(${_target} COMPILE_FLAGS "${DVC_IMPL} ${_extra_flags}") - add_target_property(${_target} LABELS "SSE") - add_dependencies(build_tests ${_target}) - add_dependencies(SSE ${_target}) - add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") - set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "SSE") - vc_add_run_target(${_target}) - endif() - 
endif() - - if(USE_AVX) - set(DVC_IMPL "-DVC_IMPL=AVX") - if(USE_XOP) - set(DVC_IMPL "${DVC_IMPL}+XOP") - endif() - if(USE_FMA) - set(DVC_IMPL "${DVC_IMPL}+FMA") - elseif(USE_FMA4) - set(DVC_IMPL "${DVC_IMPL}+FMA4") - endif() - set(_target "${name}_avx") - list(FIND disabled_targets ${_target} _disabled) - if(_disabled EQUAL -1) - file(GLOB _extra_deps "${CMAKE_SOURCE_DIR}/avx/*.tcc" "${CMAKE_SOURCE_DIR}/avx/*.h" "${CMAKE_SOURCE_DIR}/common/*.h") - add_file_dependencies(${_name}.cpp "${_extra_deps}") - add_executable(${_target} EXCLUDE_FROM_ALL ${_name}.cpp) - target_link_libraries(${_target} Vc) - add_target_property(${_target} COMPILE_FLAGS "${DVC_IMPL} ${_extra_flags}") - add_target_property(${_target} LABELS "AVX") - add_dependencies(build_tests ${_target}) - add_dependencies(AVX ${_target}) - add_test(${Vc_TEST_TARGET_PREFIX}${_target} "${CMAKE_CURRENT_BINARY_DIR}/${_target}") - set_property(TEST ${Vc_TEST_TARGET_PREFIX}${_target} PROPERTY LABELS "AVX") - vc_add_run_target(${_target}) - endif() - endif(USE_AVX) - if("${_std}" STREQUAL "cxx11") - set(CMAKE_CXX_FLAGS "${SAVE_CXX_FLAGS}") - endif() - endforeach() -endmacro(vc_add_test) - -vc_add_test(stlcontainer) -vc_add_test(scalaraccess) -vc_add_test(memory) -vc_add_test(arithmetics) -vc_add_test(implicit_type_conversion) -vc_add_test(expandandmerge) -vc_add_test(load) -vc_add_test(store) -vc_add_test(gather) -vc_add_test(gather VC_USE_BSF_GATHERS) -vc_add_test(gather VC_USE_POPCNT_BSF_GATHERS) -vc_add_test(gather VC_USE_SET_GATHERS) -vc_add_test(scatter) -vc_add_test(scatter VC_USE_BSF_SCATTERS) -vc_add_test(scatter VC_USE_POPCNT_BSF_SCATTERS) -vc_add_test(math) -vc_add_test(math VC_LOG_ILP) -vc_add_test(math VC_LOG_ILP2) -vc_add_test(mask) -vc_add_test(utils) -vc_add_test(deinterleave) -vc_add_test(deinterleave VC_USE_MASKMOV_SCATTER) -vc_add_test(casts) -vc_add_test(swizzles) - -if(USE_SSE2 AND NOT Vc_SSE_INTRINSICS_BROKEN) - list(FIND disabled_targets sse_blend _disabled) - if(_disabled EQUAL -1) - 
add_executable(sse2_blend EXCLUDE_FROM_ALL sse_blend.cpp) - add_target_property(sse2_blend COMPILE_FLAGS "-DVC_IMPL=SSE2") - add_target_property(sse2_blend LABELS "SSE") - add_dependencies(build_tests sse2_blend) - add_dependencies(SSE sse2_blend) - add_test(${Vc_TEST_TARGET_PREFIX}sse2_blend "${CMAKE_CURRENT_BINARY_DIR}/sse2_blend") - set_property(TEST ${Vc_TEST_TARGET_PREFIX}sse2_blend PROPERTY LABELS "SSE") - target_link_libraries(sse2_blend Vc) - - if(USE_SSE4_1) - add_executable(sse4_blend EXCLUDE_FROM_ALL sse_blend.cpp) - add_target_property(sse4_blend COMPILE_FLAGS "-DVC_IMPL=SSE4_1") - add_target_property(sse4_blend LABELS "SSE") - add_dependencies(build_tests sse4_blend) - add_dependencies(SSE sse4_blend) - add_test(${Vc_TEST_TARGET_PREFIX}sse4_blend "${CMAKE_CURRENT_BINARY_DIR}/sse4_blend") - set_property(TEST ${Vc_TEST_TARGET_PREFIX}sse4_blend PROPERTY LABELS "SSE") - target_link_libraries(sse4_blend Vc) - endif() - endif() -endif() - -add_executable(supportfunctions EXCLUDE_FROM_ALL supportfunctions.cpp) -target_link_libraries(supportfunctions Vc) -add_target_property(supportfunctions LABELS "other") -add_dependencies(build_tests supportfunctions) -add_dependencies(other supportfunctions) -add_test(${Vc_TEST_TARGET_PREFIX}supportfunctions "${CMAKE_CURRENT_BINARY_DIR}/supportfunctions") -set_property(TEST ${Vc_TEST_TARGET_PREFIX}supportfunctions PROPERTY LABELS "other") -vc_add_run_target(supportfunctions) - -get_property(_incdirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) -set(incdirs) -foreach(_d ${_incdirs}) - list(APPEND incdirs "-I${_d}") -endforeach() - -separate_arguments(_flags UNIX_COMMAND ${CMAKE_CXX_FLAGS}) -foreach(_f ${_flags}) - if(_f MATCHES "^-m" OR _f MATCHES "^/arch:" OR _f MATCHES "^-x") - list(REMOVE_ITEM _flags "${_f}") - endif() -endforeach() - -set(TEST_OPERATOR_FAILURES FALSE CACHE BOOL "Run implicit type conversion operator tests.") -if(TEST_OPERATOR_FAILURES) - 
macro(vc_test_implicit_type_conversion_failures A B) - foreach(impl Scalar SSE AVX) - if("${impl}" STREQUAL "Scalar") - set(_implFlags) - elseif("${impl}" STREQUAL "SSE") - if(Vc_COMPILER_IS_MSVC) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS _implFlags) - string(STRIP "${_implFlags}" _implFlags) - elseif(Vc_COMPILER_IS_INTEL) - set(_implFlags "-xSSE2") - else() - set(_implFlags "-msse2") - endif() - elseif("${impl}" STREQUAL "AVX") - if(Vc_AVX_INTRINSICS_BROKEN) - break() - endif() - if(Vc_COMPILER_IS_MSVC) - set(_implFlags "/arch:AVX") - elseif(Vc_COMPILER_IS_INTEL) - set(_implFlags "-xAVX") - else() - set(_implFlags "-mavx") - endif() - endif() - set(type_b ${B}) - foreach(type_a ${A} ${B}) - foreach(op "^" "==" "*") # "/" "+" "-" "&" "|" "!=" "<=" ">=" "<" ">") - set(name "implicit_type_conversion_failures_${type_a}_${op}_${type_b}_${impl}") - add_test(NAME "${Vc_TEST_TARGET_PREFIX}${name}" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND - ${CMAKE_CXX_COMPILER} ${_flags} ${_implFlags} - ${incdirs} -o "${name}.tmp" - "-DTYPE_A=${type_a}" "-DTEST_OP=${op}" "-DTYPE_B=${type_b}" "-DVC_IMPL=${impl}" - ${CMAKE_CURRENT_SOURCE_DIR}/implicit_type_conversion_failures.cpp - ) - set_property(TEST "${name}" PROPERTY LABELS "${impl}") - set_tests_properties("${name}" PROPERTIES - PASS_REGULAR_EXPRESSION "invalid operands to binary expression;error: no match for .*operator\\${op};error C267[789]: binary .*\\${op}.* no (global )?operator found;error: no operator \"\\${op}\" matches these operands" - FAIL_REGULAR_EXPRESSION "no such file or directory;undefined reference to" - ) - endforeach() - set(type_b ${A}) - endforeach() - endforeach() - endmacro() - vc_test_implicit_type_conversion_failures("double_v" "float_v") - vc_test_implicit_type_conversion_failures("double_v" "short_v") - vc_test_implicit_type_conversion_failures("double_v" "ushort_v") - vc_test_implicit_type_conversion_failures("double_v" "int_v") - vc_test_implicit_type_conversion_failures("double_v" "uint_v") 
- vc_test_implicit_type_conversion_failures( "float_v" "double") - vc_test_implicit_type_conversion_failures( "float_v" "short_v") - vc_test_implicit_type_conversion_failures( "float_v" "ushort_v") - vc_test_implicit_type_conversion_failures("sfloat_v" "double_v") - vc_test_implicit_type_conversion_failures("sfloat_v" "double") - vc_test_implicit_type_conversion_failures("sfloat_v" "float_v") - vc_test_implicit_type_conversion_failures("sfloat_v" "int_v") - vc_test_implicit_type_conversion_failures("sfloat_v" "uint_v") - vc_test_implicit_type_conversion_failures( "short_v" "int_v") - vc_test_implicit_type_conversion_failures( "short_v" "uint_v") - vc_test_implicit_type_conversion_failures("ushort_v" "int_v") - vc_test_implicit_type_conversion_failures("ushort_v" "uint_v") - vc_test_implicit_type_conversion_failures("double_v" "bool") - vc_test_implicit_type_conversion_failures("sfloat_v" "bool") - vc_test_implicit_type_conversion_failures( "float_v" "bool") - vc_test_implicit_type_conversion_failures( "int_v" "bool") - vc_test_implicit_type_conversion_failures( "uint_v" "bool") - vc_test_implicit_type_conversion_failures( "short_v" "bool") - vc_test_implicit_type_conversion_failures("ushort_v" "bool") -endif() - -# compile and link test for targets that need to link lots of stuff together -add_library(linkTestLibDynamic1 SHARED EXCLUDE_FROM_ALL linkTestLib0.cpp linkTestLib1.cpp) -add_library(linkTestLibDynamic2 SHARED EXCLUDE_FROM_ALL linkTestLib0.cpp linkTestLib1.cpp) -add_library(linkTestLibStatic STATIC EXCLUDE_FROM_ALL linkTestLib2.cpp linkTestLib3.cpp) -add_executable(linkTest EXCLUDE_FROM_ALL linkTest0.cpp linkTest1.cpp) -add_dependencies(build_tests linkTest) -add_dependencies(other linkTest) -target_link_libraries(linkTestLibDynamic1 Vc) -target_link_libraries(linkTestLibDynamic2 Vc) -add_target_property(linkTestLibDynamic1 COMPILE_FLAGS "-DPOSTFIX=A") -add_target_property(linkTestLibDynamic2 COMPILE_FLAGS "-DPOSTFIX=B") 
-target_link_libraries(linkTestLibStatic Vc) -target_link_libraries(linkTest Vc linkTestLibDynamic1 linkTestLibDynamic2 linkTestLibStatic) - -# Use the following program to generate the sincos-reference-*.dat files -#add_executable(convert-sincos-reference EXCLUDE_FROM_ALL convert-sincos-reference.cpp) - -set(_deps) -foreach(fun sincos asin acos atan ln log2 log10) - foreach(filename reference-${fun}-sp.dat reference-${fun}-dp.dat) - add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${filename}" - COMMAND ${CMAKE_COMMAND} -Dfilename=${filename} -P ${CMAKE_CURRENT_SOURCE_DIR}/download.cmake - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/download.cmake - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Downloading Test Data: ${filename}" - VERBATIM - ) - list(APPEND _deps "${CMAKE_CURRENT_BINARY_DIR}/${filename}") - endforeach() -endforeach() -add_custom_target(download-testdata ALL - DEPENDS ${_deps} - ) -add_dependencies(other download-testdata) diff --git a/math/vc/tests/Makefile b/math/vc/tests/Makefile deleted file mode 100644 index 90592037696be..0000000000000 --- a/math/vc/tests/Makefile +++ /dev/null @@ -1,160 +0,0 @@ -# Makefile for the ROOT test programs. -# This Makefile shows nicely how to compile and link applications -# using the ROOT libraries on all supported platforms. -# -# Copyright (c) 2000 Rene Brun and Fons Rademakers -# -# Author: Fons Rademakers, 29/2/2000 - -#ROOTSYS = ../../../.. -#include $(ROOTSYS)/etc/Makefile.arch -#include $(ROOTSYS)/config/Makefile.config - - -RC := root-config -ifeq ($(shell which $(RC) 2>&1 | sed -ne "s@.*/$(RC)@$(RC)@p"),$(RC)) -MKARCH := $(wildcard $(shell $(RC) --etcdir)/Makefile.arch) -RCONFIG := $(wildcard $(shell $(RC) --incdir)/RConfigure.h) -endif -ifneq ($(MKARCH),) -include $(MKARCH) -else -ifeq ($(ROOTSYS),) -ROOTSYS = .. 
-endif -include $(ROOTSYS)/etc/Makefile.arch -include $(ROOTSYS)/config/Makefile.config -endif - -#------------------------------------------------------------------------------ - -# ifeq ($(PLATFORM),macosx) -# #unroll loop better on gcc > 4 -#CXXFLAGS+= -O3 -g -# endif -#AVXCXXFLAG := -mavx -#SIMDCXXFLAGS := -mavx -msse4.2 -msse4.1 -msse4a -mssse3 -msse3 -msse2 -#VCFLAGS := -fabi-version=0 -Wno-unused-function - -CXXFLAGS+= $(VCFLAGS) - -ifeq ($(NOAVX),) -CXXFLAGS+= $(AVXCXXFLAG) -LDFLAGS += $(AVXCXXFLAG) -endif - -EXTRALIBS += $(ROOTSYS)/lib/libVc.a - - -OBJS = arithmetics.o casts.o deinterleave.o expandandmerge.o gather.o implicit_type_conversion.o load.o \ - mask.o math.o memory.o scalaraccess.o scatter.o sse_blend.o stlcontainer.o store.o supportfunctions.o swizzles.o utils.o - -PROGRAMS = arithmetics casts deinterleave expandandmerge gather implicit_type_conversion load \ - mask math memory scalaraccess scatter sse_blend stlcontainer store supportfunctions swizzles utils - - -.SUFFIXES: .$(SrcSuf) .$(ObjSuf) $(ExeSuf) - - -arithmetics: arithmetics.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -casts: casts.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -deinterleave: deinterleave.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -expandandmerge: expandandmerge.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -gather: gather.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -implicit_type_conversion: implicit_type_conversion.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -# implicit_type_conversion_failures: implicit_type_conversion_failures.o -# $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ -# @echo "$@ done" - -# linkTest1: linkTest1.o -# $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ -# @echo "$@ done" - -load: load.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -mask: mask.o - $(LD) 
$(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -math: math.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -memory: memory.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -scalaraccess: scalaraccess.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -scatter: scatter.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -sse_blend: sse_blend.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -stlcontainer: stlcontainer.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -store: store.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -supportfunctions: supportfunctions.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -swizzles: swizzles.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - -utils: utils.o - $(LD) $(LDFLAGS) $^ $(EXTRALIBS) $(OutPutOpt)$@ - @echo "$@ done" - - - -all: $(PROGRAMS) - - -check: all - for prog in $(PROGRAMS); do \ - ./$$prog > $$prog.out; \ - done; - -clean: - @rm -f $(OBJS) $(PROGRAMS) - -distclean: clean - @rm -f $(PROGRAMS) - - -.SUFFIXES: .$(SrcSuf) - - -.$(SrcSuf).$(ObjSuf): - $(CXX) $(CXXFLAGS) -c $< diff --git a/math/vc/tests/arithmetics.cpp b/math/vc/tests/arithmetics.cpp deleted file mode 100644 index 53596be6ec35d..0000000000000 --- a/math/vc/tests/arithmetics.cpp +++ /dev/null @@ -1,583 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include -#include -#include -#include - -using namespace Vc; - -template void testZero() -{ - Vec a(Zero), b(Zero); - COMPARE(a, b); - Vec c, d(1); - c.setZero(); - COMPARE(a, c); - d.setZero(); - COMPARE(a, d); - d = static_cast(0); - COMPARE(a, d); - const typename Vec::EntryType zero = 0; - COMPARE(a, Vec(zero)); - COMPARE(b, Vec(zero)); - COMPARE(c, Vec(zero)); - COMPARE(d, Vec(zero)); -} - -template void testCmp() -{ - typedef typename Vec::EntryType T; - Vec a(Zero), b(Zero); - COMPARE(a, b); - if (!(a != b).isEmpty()) { - std::cerr << a << " != " << b << ", (a != b) = " << (a != b) << ", (a == b) = " << (a == b) << std::endl; - } - VERIFY((a != b).isEmpty()); - - Vec c(1); - VERIFY((a < c).isFull()); - VERIFY((c > a).isFull()); - VERIFY((a <= b).isFull()); - VERIFY((a <= c).isFull()); - VERIFY((b >= a).isFull()); - VERIFY((c >= a).isFull()); - - { - const T max = static_cast(std::numeric_limits::max() * 0.95); - const T min = 0; - const T step = max / 200; - T j = min; - VERIFY(Vec(Zero) == Vec(j)); - VERIFY(!(Vec(Zero) < Vec(j))); - VERIFY(!(Vec(Zero) > Vec(j))); - VERIFY(!(Vec(Zero) != Vec(j))); - j += step; - for (int i = 0; i < 200; ++i, j += step) { - if(Vec(Zero) >= Vec(j)) { - std::cout << j << " " << Vec(j) << " " << (Vec(Zero) >= Vec(j)) << std::endl; - } - VERIFY(Vec(Zero) < Vec(j)); - VERIFY(Vec(j) > Vec(Zero)); - VERIFY(!(Vec(Zero) >= Vec(j))); - VERIFY(!(Vec(j) <= Vec(Zero))); - VERIFY(!static_cast(Vec(Zero) >= Vec(j))); - VERIFY(!static_cast(Vec(j) <= Vec(Zero))); - } - } - if (std::numeric_limits::min() <= 0) { - const T min = static_cast(std::numeric_limits::min() * 0.95); - if (min == 0) { - return; - } - const T step = min / T(-201); - T j = min; - for (int i = 0; i < 200; ++i, j += step) { - VERIFY(Vec(j) < Vec(Zero)); - VERIFY(Vec(Zero) > 
Vec(j)); - VERIFY(!(Vec(Zero) <= Vec(j))); - VERIFY(!(Vec(j) >= Vec(Zero))); - } - } -} - -template void testIsMix() -{ - Vec a(IndexesFromZero); - Vec b(Zero); - Vec c(One); - if (Vec::Size > 1) { - VERIFY((a == b).isMix()); - VERIFY((a != b).isMix()); - VERIFY((a == c).isMix()); - VERIFY((a != c).isMix()); - VERIFY(!(a == a).isMix()); - VERIFY(!(a != a).isMix()); - } else { // masks of size 1 can never be a mix of 0 and 1 - VERIFY(!(a == b).isMix()); - VERIFY(!(a != b).isMix()); - VERIFY(!(a == c).isMix()); - VERIFY(!(a != c).isMix()); - VERIFY(!(a == a).isMix()); - VERIFY(!(a != a).isMix()); - } -} - -template void testAdd() -{ - Vec a(Zero), b(Zero); - COMPARE(a, b); - - a += 1; - Vec c(1); - COMPARE(a, c); - - COMPARE(a, b + 1); - COMPARE(a, b + c); - Vec x(Zero); -} - -template void testSub() -{ - Vec a(2), b(2); - COMPARE(a, b); - - a -= 1; - Vec c(1); - COMPARE(a, c); - - COMPARE(a, b - 1); - COMPARE(a, b - c); -} - -template void testMul() -{ - for (int i = 0; i < 10000; ++i) { - V a = V::Random(); - V b = V::Random(); - V reference = a; - for (int j = 0; j < V::Size; ++j) { - // this could overflow - but at least the compiler can't know about it so it doesn't - // matter that it's undefined behavior in C++. The only thing that matters is what the - // hardware does... 
- reference[j] *= b[j]; - } - COMPARE(a * b, reference) << a << " * " << b; - } -} - -template void testMulAdd() -{ - for (unsigned int i = 0; i < 0xffff; ++i) { - const Vec i2(i * i + 1); - Vec a(i); - - FUZZY_COMPARE(a * a + 1, i2); - } -} - -template void testMulSub() -{ - typedef typename Vec::EntryType T; - for (unsigned int i = 0; i < 0xffff; ++i) { - const T j = static_cast(i); - const Vec test(j); - - FUZZY_COMPARE(test * test - test, Vec(j * j - j)); - } -} - -template void testDiv() -{ - typedef typename Vec::EntryType T; - // If this test fails for ICC see here: - // http://software.intel.com/en-us/forums/topic/488995 - - const T stepsize = std::max(T(1), T(std::numeric_limits::max() / 1024)); - for (T divisor = 1; divisor < 5; ++divisor) { - for (T scalar = std::numeric_limits::min(); scalar < std::numeric_limits::max() - stepsize + 1; scalar += stepsize) { - Vec vector(scalar); - Vec reference(scalar / divisor); - - COMPARE(vector / divisor, reference) << '\n' << vector << " / " << divisor - << ", reference: " << scalar << " / " << divisor << " = " << scalar / divisor; - vector /= divisor; - COMPARE(vector, reference); - } - } -} - -template void testAnd() -{ - Vec a(0x7fff); - Vec b(0xf); - COMPARE((a & 0xf), b); - Vec c(IndexesFromZero); - COMPARE(c, (c & 0xf)); - const typename Vec::EntryType zero = 0; - COMPARE((c & 0x7ff0), Vec(zero)); -} - -template void testShift() -{ - typedef typename Vec::EntryType T; - const T step = std::max(1, std::numeric_limits::max() / 1000); - enum { - NShifts = sizeof(T) * 8 - }; - for (Vec x = std::numeric_limits::min() + Vec::IndexesFromZero(); - x < std::numeric_limits::max() - step; - x += step) { - for (size_t shift = 0; shift < NShifts; ++shift) { - const Vec rightShift = x >> shift; - const Vec leftShift = x << shift; - for (size_t k = 0; k < Vec::Size; ++k) { - COMPARE(rightShift[k], T(x[k] >> shift)) << ", x[k] = " << x[k] << ", shift = " << shift; - COMPARE(leftShift [k], T(x[k] << shift)) << ", x[k] = " << 
x[k] << ", shift = " << shift; - } - } - } - - Vec a(1); - Vec b(2); - - // left shifts - COMPARE((a << 1), b); - COMPARE((a << 2), (a << 2)); - COMPARE((a << 2), (b << 1)); - - Vec shifts(IndexesFromZero); - a <<= shifts; - for (typename Vec::EntryType i = 0, x = 1; i < Vec::Size; ++i, x <<= 1) { - COMPARE(a[i], x); - } - - // right shifts - a = Vec(4); - COMPARE((a >> 1), b); - COMPARE((a >> 2), (a >> 2)); - COMPARE((a >> 2), (b >> 1)); - - a = Vec(16); - a >>= shifts; - for (typename Vec::EntryType i = 0, x = 16; i < Vec::Size; ++i, x >>= 1) { - COMPARE(a[i], x); - } -} - -template void testOnesComplement() -{ - Vec a(One); - Vec b = ~a; - COMPARE(~a, b); - COMPARE(~b, a); - COMPARE(~(a + b), Vec(Zero)); -} - -template struct NegateRangeHelper -{ - typedef int Iterator; - static const Iterator Start; - static const Iterator End; -}; -template<> struct NegateRangeHelper { - typedef unsigned int Iterator; - static const Iterator Start; - static const Iterator End; -}; -template<> const int NegateRangeHelper::Start = -0xffffff; -template<> const int NegateRangeHelper::End = 0xffffff - 133; -template<> const int NegateRangeHelper::Start = -0xffffff; -template<> const int NegateRangeHelper::End = 0xffffff - 133; -template<> const int NegateRangeHelper::Start = -0x7fffffff; -template<> const int NegateRangeHelper::End = 0x7fffffff - 0xee; -const unsigned int NegateRangeHelper::Start = 0; -const unsigned int NegateRangeHelper::End = 0xffffffff - 0xee; -template<> const int NegateRangeHelper::Start = -0x7fff; -template<> const int NegateRangeHelper::End = 0x7fff - 0xee; -template<> const int NegateRangeHelper::Start = 0; -template<> const int NegateRangeHelper::End = 0xffff - 0xee; - -template void testNegate() -{ - typedef typename Vec::EntryType T; - typedef NegateRangeHelper Range; - for (typename Range::Iterator i = Range::Start; i < Range::End; i += 0xef) { - T i2 = static_cast(i); - Vec a(i2); - - COMPARE(static_cast(-a), Vec(-i2)) << " i2: " << i2; - } -} - 
-template void testMin() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - typedef typename Vec::IndexType I; - - Vec v(I::IndexesFromZero()); - - COMPARE(v.min(), static_cast(0)); - COMPARE((T(Vec::Size) - v).min(), static_cast(1)); - - int j = 0; - Mask m; - do { - m = allMasks(j++); - if (m.isEmpty()) { - break; - } - COMPARE(v.min(m), static_cast(m.firstOne())) << m << v; - } while (true); -} - -template void testMax() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - typedef typename Vec::IndexType I; - - Vec v(I::IndexesFromZero()); - - COMPARE(v.max(), static_cast(Vec::Size - 1)); - v = T(Vec::Size) - v; - COMPARE(v.max(), static_cast(Vec::Size)); - - int j = 0; - Mask m; - do { - m = allMasks(j++); - if (m.isEmpty()) { - break; - } - COMPARE(v.max(m), static_cast(Vec::Size - m.firstOne())) << m << v; - } while (true); -} - -template void testProduct() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - - for (int i = 0; i < 10; ++i) { - T x = static_cast(i); - Vec v(x); - T x2 = x; - for (int k = Vec::Size; k > 1; k /= 2) { - x2 *= x2; - } - COMPARE(v.product(), x2); - - int j = 0; - Mask m; - do { - m = allMasks(j++); - if (m.isEmpty()) { - break; - } - if (std::numeric_limits::is_exact) { - x2 = x; - for (int k = m.count(); k > 1; --k) { - x2 *= x; - } - } else { - x2 = static_cast(pow(static_cast(x), m.count())); - } - COMPARE(v.product(m), x2) << m << v; - } while (true); - } -} - -template void testSum() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - - for (int i = 0; i < 10; ++i) { - T x = static_cast(i); - Vec v(x); - COMPARE(v.sum(), x * Vec::Size); - - int j = 0; - Mask m; - do { - m = allMasks(j++); - COMPARE(v.sum(m), x * m.count()) << m << v; - } while (!m.isEmpty()); - } -} - -template void fma() -{ - for (int i = 0; i < 1000; ++i) { - V a = V::Random(); - const V b = V::Random(); - const V c = V::Random(); - const V reference = a * b + 
c; - a.fusedMultiplyAdd(b, c); - COMPARE(a, reference) << ", a = " << a << ", b = " << b << ", c = " << c; - } -} - -template<> void fma() -{ - float_v b = Vc_buildFloat(1, 0x000001, 0); - float_v c = Vc_buildFloat(1, 0x000000, -24); - float_v a = b; - /*a *= b; - a += c; - COMPARE(a, float_v(Vc_buildFloat(1, 0x000002, 0))); - a = b;*/ - a.fusedMultiplyAdd(b, c); - COMPARE(a, float_v(Vc_buildFloat(1, 0x000003, 0))); - - a = Vc_buildFloat(1, 0x000002, 0); - b = Vc_buildFloat(1, 0x000002, 0); - c = Vc_buildFloat(-1, 0x000000, 0); - /*a *= b; - a += c; - COMPARE(a, float_v(Vc_buildFloat(1, 0x000000, -21))); - a = b;*/ - a.fusedMultiplyAdd(b, c); // 1 + 2^-21 + 2^-44 - 1 == (1 + 2^-20)*2^-18 - COMPARE(a, float_v(Vc_buildFloat(1, 0x000001, -21))); -} - -template<> void fma() -{ - sfloat_v b = Vc_buildFloat(1, 0x000001, 0); - sfloat_v c = Vc_buildFloat(1, 0x000000, -24); - sfloat_v a = b; - /*a *= b; - a += c; - COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000002, 0))); - a = b;*/ - a.fusedMultiplyAdd(b, c); - COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000003, 0))); - - a = Vc_buildFloat(1, 0x000002, 0); - b = Vc_buildFloat(1, 0x000002, 0); - c = Vc_buildFloat(-1, 0x000000, 0); - /*a *= b; - a += c; - COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000000, -21))); - a = b;*/ - a.fusedMultiplyAdd(b, c); // 1 + 2^-21 + 2^-44 - 1 == (1 + 2^-20)*2^-18 - COMPARE(a, sfloat_v(Vc_buildFloat(1, 0x000001, -21))); -} - -template<> void fma() -{ - double_v b = Vc_buildDouble(1, 0x0000000000001, 0); - double_v c = Vc_buildDouble(1, 0x0000000000000, -53); - double_v a = b; - a.fusedMultiplyAdd(b, c); - COMPARE(a, double_v(Vc_buildDouble(1, 0x0000000000003, 0))); - - a = Vc_buildDouble(1, 0x0000000000002, 0); - b = Vc_buildDouble(1, 0x0000000000002, 0); - c = Vc_buildDouble(-1, 0x0000000000000, 0); - a.fusedMultiplyAdd(b, c); // 1 + 2^-50 + 2^-102 - 1 - COMPARE(a, double_v(Vc_buildDouble(1, 0x0000000000001, -50))); -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - testAllTypes(fma); - 
- runTest(testZero); - runTest(testZero); - runTest(testZero); - runTest(testZero); - runTest(testZero); - runTest(testZero); - runTest(testZero); - - runTest(testCmp); - runTest(testCmp); - runTest(testCmp); - runTest(testCmp); - runTest(testCmp); - runTest(testCmp); - runTest(testCmp); - - runTest(testIsMix); - runTest(testIsMix); - //runTest(testIsMix); - //runTest(testIsMix); - runTest(testIsMix); - runTest(testIsMix); - //runTest(testIsMix); - - runTest(testAdd); - runTest(testAdd); - runTest(testAdd); - runTest(testAdd); - runTest(testAdd); - runTest(testAdd); - runTest(testAdd); - - runTest(testSub); - runTest(testSub); - runTest(testSub); - runTest(testSub); - runTest(testSub); - runTest(testSub); - runTest(testSub); - - runTest(testMul); - runTest(testMul); - runTest(testMul); - runTest(testMul); - runTest(testMul); - runTest(testMul); - runTest(testMul); - - runTest(testDiv); - runTest(testDiv); - runTest(testDiv); - runTest(testDiv); - runTest(testDiv); - runTest(testDiv); - runTest(testDiv); - - runTest(testAnd); - runTest(testAnd); - runTest(testAnd); - runTest(testAnd); - // no operator& for float/double - - runTest(testShift); - runTest(testShift); - runTest(testShift); - runTest(testShift); - - runTest(testMulAdd); - runTest(testMulAdd); - runTest(testMulAdd); - runTest(testMulAdd); - runTest(testMulAdd); - runTest(testMulAdd); - runTest(testMulAdd); - - runTest(testMulSub); - runTest(testMulSub); - runTest(testMulSub); - runTest(testMulSub); - runTest(testMulSub); - runTest(testMulSub); - runTest(testMulSub); - - runTest(testOnesComplement); - runTest(testOnesComplement); - runTest(testOnesComplement); - runTest(testOnesComplement); - - testAllTypes(testNegate); - testAllTypes(testMin); - testAllTypes(testMax); - testAllTypes(testProduct); - testAllTypes(testSum); - - return 0; -} diff --git a/math/vc/tests/casts.cpp b/math/vc/tests/casts.cpp deleted file mode 100644 index 404a3bb29d824..0000000000000 --- a/math/vc/tests/casts.cpp +++ /dev/null @@ 
-1,144 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include -#include - -using namespace Vc; - -template void testNumber(double n) -{ - typedef typename V1::EntryType T1; - typedef typename V2::EntryType T2; - - // compare casts from T1 -> T2 with casts from V1 -> V2 - - const T1 n1 = static_cast(n); - //std::cerr << "n1 = " << n1 << ", static_cast(n1) = " << static_cast(n1) << std::endl; - COMPARE(static_cast(V1(n1)), V2(static_cast(n1))) << "\n n1: " << n1; -} - -template double maxHelper() -{ - return static_cast(std::numeric_limits::max()); -} - -template<> double maxHelper() -{ - const int intDigits = std::numeric_limits::digits; - const int floatDigits = std::numeric_limits::digits; - return static_cast(((int(1) << floatDigits) - 1) << (intDigits - floatDigits)); -} - -template<> double maxHelper() -{ - const int intDigits = std::numeric_limits::digits; - const int floatDigits = std::numeric_limits::digits; - return static_cast(((unsigned(1) << floatDigits) - 1) << (intDigits - floatDigits)); -} - -template void testCast2() -{ - typedef typename V1::EntryType T1; - typedef typename V2::EntryType T2; - - const double max = std::min(maxHelper(), maxHelper()); - const double min = std::max( - std::numeric_limits::is_integer ? 
- static_cast(std::numeric_limits::min()) : - static_cast(-std::numeric_limits::max()), - std::numeric_limits::is_integer ? - static_cast(std::numeric_limits::min()) : - static_cast(-std::numeric_limits::max()) - ); - - testNumber(0.); - testNumber(1.); - testNumber(2.); - testNumber(max); - testNumber(max / 4 + max / 2); - testNumber(max / 2); - testNumber(max / 4); - testNumber(min); -} - -template void testCast() -{ - testCast2(); -} - -#define _CONCAT(A, B) A ## _ ## B -#define CONCAT(A, B) _CONCAT(A, B) -template -struct T2Helper -{ - typedef T1 V1; - typedef T2 V2; -}; - -void testFloatIndexesFromZero() -{ - Vc::float_v test(Vc::int_v::IndexesFromZero()); - for (int i = 0; i < float_v::Size; ++i) { - COMPARE(test[i], float(i)); - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - -#define TEST(v1, v2) \ - typedef T2Helper CONCAT(v1, v2); \ - runTest(testCast) - - TEST(float_v, float_v); - TEST(float_v, int_v); - TEST(float_v, uint_v); - // needs special handling for different Size: - //TEST(float_v, double_v); - //TEST(float_v, short_v); - //TEST(float_v, ushort_v); - - TEST(int_v, float_v); - TEST(int_v, int_v); - TEST(int_v, uint_v); - - TEST(uint_v, float_v); - TEST(uint_v, int_v); - TEST(uint_v, uint_v); - - TEST(ushort_v, sfloat_v); - TEST(ushort_v, short_v); - TEST(ushort_v, ushort_v); - - TEST(short_v, sfloat_v); - TEST(short_v, short_v); - TEST(short_v, ushort_v); - - TEST(sfloat_v, sfloat_v); - TEST(sfloat_v, short_v); - TEST(sfloat_v, ushort_v); -#undef TEST - - runTest(testFloatIndexesFromZero); - - return 0; -} diff --git a/math/vc/tests/const.h b/math/vc/tests/const.h deleted file mode 100644 index b89e2fffc7254..0000000000000 --- a/math/vc/tests/const.h +++ /dev/null @@ -1,66 +0,0 @@ -/* This file is part of the Vc library. 
{{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ - -#ifndef VC_TESTS_CONST_H_ -#define VC_TESTS_CONST_H_ - -#include - -namespace ROOT { -namespace Vc -{ - template struct Math; - template<> struct Math - { - static _VC_CONSTEXPR float e() { return 2.7182818284590452353602874713526625f; } - static _VC_CONSTEXPR float log2e() { return 1.4426950408889634073599246810018921f; } - static _VC_CONSTEXPR float log10e() { return 0.4342944819032518276511289189166051f; } - static _VC_CONSTEXPR float ln2() { return Vc_buildFloat(1, 0x317218, -1); } // .693147182464599609375 - static _VC_CONSTEXPR float ln10() { return 2.3025850929940456840179914546843642f; } - static _VC_CONSTEXPR float pi() { return 3.1415926535897932384626433832795029f; } - static _VC_CONSTEXPR float pi_2() { return 1.5707963267948966192313216916397514f; } - static _VC_CONSTEXPR float pi_4() { return 0.7853981633974483096156608458198757f; } - static _VC_CONSTEXPR float _1_pi() { return 0.3183098861837906715377675267450287f; } - static _VC_CONSTEXPR float _2_pi() { return 0.6366197723675813430755350534900574f; } - static _VC_CONSTEXPR float _2_sqrtpi() { return 1.1283791670955125738961589031215452f; } - static _VC_CONSTEXPR float sqrt2() { return 1.4142135623730950488016887242096981f; } - static _VC_CONSTEXPR float sqrt1_2() { return 0.7071067811865475244008443621048490f; } - }; - template<> struct Math - { - static 
_VC_CONSTEXPR double e() { return 2.7182818284590452353602874713526625; } - static _VC_CONSTEXPR double log2e() { return 1.4426950408889634073599246810018921; } - static _VC_CONSTEXPR double log10e() { return 0.4342944819032518276511289189166051; } - static _VC_CONSTEXPR double ln2() { return Vc_buildDouble(1, 0x62E42FEFA39EFull, -1); } // .69314718055994528622676398299518041312694549560546875 - static _VC_CONSTEXPR double ln10() { return 2.3025850929940456840179914546843642; } - static _VC_CONSTEXPR double pi() { return 3.1415926535897932384626433832795029; } - static _VC_CONSTEXPR double pi_2() { return 1.5707963267948966192313216916397514; } - static _VC_CONSTEXPR double pi_4() { return 0.7853981633974483096156608458198757; } - static _VC_CONSTEXPR double _1_pi() { return 0.3183098861837906715377675267450287; } - static _VC_CONSTEXPR double _2_pi() { return 0.6366197723675813430755350534900574; } - static _VC_CONSTEXPR double _2_sqrtpi() { return 1.1283791670955125738961589031215452; } - static _VC_CONSTEXPR double sqrt2() { return 1.4142135623730950488016887242096981; } - static _VC_CONSTEXPR double sqrt1_2() { return 0.7071067811865475244008443621048490; } - }; -} // namespace Vc -} // namespace ROOT - -#include - -#endif // VC_TESTS_CONST_H_ diff --git a/math/vc/tests/convert-sincos-reference.cpp b/math/vc/tests/convert-sincos-reference.cpp deleted file mode 100644 index 8fdc32d0412e7..0000000000000 --- a/math/vc/tests/convert-sincos-reference.cpp +++ /dev/null @@ -1,126 +0,0 @@ -#include - -template struct SincosReference -{ - const T x, s, c; -}; - -template struct Reference -{ - const T x, ref; -}; - -template struct Data -{ - static const SincosReference sincosReference[]; - static const Reference asinReference[]; - static const Reference acosReference[]; - static const Reference atanReference[]; - static const Reference lnReference[]; - static const Reference log2Reference[]; - static const Reference log10Reference[]; -}; - -namespace Function { - enum 
Function { - sincos, atan, asin, acos, ln, log2, log10 - }; -} -template static inline const char *filenameOut(); -template<> inline const char *filenameOut() { return "sincos-reference-single.dat"; } -template<> inline const char *filenameOut() { return "sincos-reference-double.dat"; } -template<> inline const char *filenameOut() { return "atan-reference-single.dat"; } -template<> inline const char *filenameOut() { return "atan-reference-double.dat"; } -template<> inline const char *filenameOut() { return "asin-reference-single.dat"; } -template<> inline const char *filenameOut() { return "asin-reference-double.dat"; } -template<> inline const char *filenameOut() { return "acos-reference-single.dat"; } -template<> inline const char *filenameOut() { return "acos-reference-double.dat"; } -template<> inline const char *filenameOut() { return "reference-ln-sp.dat"; } -template<> inline const char *filenameOut() { return "reference-ln-dp.dat"; } -template<> inline const char *filenameOut() { return "reference-log2-sp.dat"; } -template<> inline const char *filenameOut() { return "reference-log2-dp.dat"; } -template<> inline const char *filenameOut() { return "reference-log10-sp.dat"; } -template<> inline const char *filenameOut() { return "reference-log10-dp.dat"; } - -template<> const SincosReference Data::sincosReference[] = { -#include "sincos-reference-single.h" -}; -template<> const SincosReference Data::sincosReference[] = { -#include "sincos-reference-double.h" -}; -template<> const Reference Data::asinReference[] = { -#include "asin-reference-single.h" -}; -template<> const Reference Data::asinReference[] = { -#include "asin-reference-double.h" -}; -template<> const Reference Data::acosReference[] = { -#include "acos-reference-single.h" -}; -template<> const Reference Data::acosReference[] = { -#include "acos-reference-double.h" -}; -template<> const Reference Data::atanReference[] = { -#include "atan-reference-single.h" -}; -template<> const Reference 
Data::atanReference[] = { -#include "atan-reference-double.h" -}; -template<> const Reference Data::lnReference[] = { -#include "reference-ln-sp.h" -}; -template<> const Reference Data::lnReference[] = { -#include "reference-ln-dp.h" -}; -template<> const Reference Data::log2Reference[] = { -#include "reference-log2-sp.h" -}; -template<> const Reference Data::log2Reference[] = { -#include "reference-log2-dp.h" -}; -template<> const Reference Data::log10Reference[] = { -#include "reference-log10-sp.h" -}; -template<> const Reference Data::log10Reference[] = { -#include "reference-log10-dp.h" -}; - -template -static void convert() -{ - FILE *file; - file = fopen(filenameOut(), "wb"); - fwrite(&Data::sincosReference[0], sizeof(SincosReference), sizeof(Data::sincosReference) / sizeof(SincosReference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::atanReference[0], sizeof(Reference), sizeof(Data::atanReference) / sizeof(Reference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::asinReference[0], sizeof(Reference), sizeof(Data::asinReference) / sizeof(Reference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::acosReference[0], sizeof(Reference), sizeof(Data::acosReference) / sizeof(Reference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::lnReference[0], sizeof(Reference), sizeof(Data::lnReference) / sizeof(Reference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::log2Reference[0], sizeof(Reference), sizeof(Data::log2Reference) / sizeof(Reference), file); - fclose(file); - - file = fopen(filenameOut(), "wb"); - fwrite(&Data::log10Reference[0], sizeof(Reference), sizeof(Data::log10Reference) / sizeof(Reference), file); - fclose(file); -} - -int main() -{ - convert(); - convert(); - return 0; -} diff --git a/math/vc/tests/deinterleave.cpp b/math/vc/tests/deinterleave.cpp deleted file mode 100644 index 
72ddec9cdf67c..0000000000000 --- a/math/vc/tests/deinterleave.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include -#include - -using namespace Vc; - - -/* - * V \ M | float | double | ushort | short | uint | int - * ---------+---------------------------------------------- - * float_v | X | | X | X | | - * sfloat_v | X | | X | X | | - * double_v | | X | | | | - * int_v | | | | X | | X - * uint_v | | | X | | X | - * short_v | | | | X | | - * ushort_v | | | X | | | - */ -template struct TPair { typedef A V; typedef B M; }; - -typedef TPair float_float; -typedef TPair float_ushort; -typedef TPair float_short; - -typedef TPair sfloat_float; -typedef TPair sfloat_ushort; -typedef TPair sfloat_short; - -typedef TPair double_double; -typedef TPair short_short; -typedef TPair ushort_ushort; - -typedef TPair int_int; -typedef TPair int_short; - -typedef TPair uint_uint; -typedef TPair uint_ushort; - -template void testDeinterleave() -{ - typedef typename Pair::V V; - typedef typename Pair::M M; - typedef typename V::IndexType I; - - const bool isSigned = std::numeric_limits::is_signed; - - const typename V::EntryType offset = isSigned ? 
-512 : 0; - const V _0246 = static_cast(I::IndexesFromZero()) * 2 + offset; - - M memory[1024]; - for (int i = 0; i < 1024; ++i) { - memory[i] = static_cast(i + offset); - } - - V a, b; - - for (int i = 0; i < 1024 - 2 * V::Size; ++i) { - // note that a 32 bit integer is certainly enough to decide on alignment... - // ... but uintptr_t is C99 but not C++ yet - // ... and GCC refuses to do the cast, even if I know what I'm doing - if (reinterpret_cast(&memory[i]) & (VectorAlignment - 1)) { - Vc::deinterleave(&a, &b, &memory[i], Unaligned); - } else { - Vc::deinterleave(&a, &b, &memory[i]); - } - COMPARE(_0246 + i, a); - COMPARE(_0246 + i + 1, b); - } -} - -template struct SomeStruct -{ - T d[N]; -}; - -template struct Types -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef typename V::AsArg VArg; - typedef typename I::AsArg IArg; - typedef SomeStruct S; - typedef const Vc::InterleavedMemoryWrapper &Wrapper; -}; -template struct TestDeinterleaveGatherCompare; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V v0, v1, v2, v3, v4, v5, v6, v7; - (v0, v1, v2, v3, v4, v5, v6, v7) = data_v[indexes]; - COMPARE(v0, reference + 0) << "N = 8"; - COMPARE(v1, reference + 1) << "N = 8"; - COMPARE(v2, reference + 2) << "N = 8"; - COMPARE(v3, reference + 3) << "N = 8"; - COMPARE(v4, reference + 4) << "N = 8"; - COMPARE(v5, reference + 5) << "N = 8"; - COMPARE(v6, reference + 6) << "N = 8"; - COMPARE(v7, reference + 7) << "N = 8"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V v0, v1, v2, v3, v4, v5, v6; - (v0, v1, v2, v3, v4, v5, v6) = data_v[indexes]; - COMPARE(v0, reference + 0) << "N = 7"; - COMPARE(v1, reference + 1) << "N = 7"; 
- COMPARE(v2, reference + 2) << "N = 7"; - COMPARE(v3, reference + 3) << "N = 7"; - COMPARE(v4, reference + 4) << "N = 7"; - COMPARE(v5, reference + 5) << "N = 7"; - COMPARE(v6, reference + 6) << "N = 7"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V v0, v1, v2, v3, v4, v5; - (v0, v1, v2, v3, v4, v5) = data_v[indexes]; - COMPARE(v0, reference + 0) << "N = 6"; - COMPARE(v1, reference + 1) << "N = 6"; - COMPARE(v2, reference + 2) << "N = 6"; - COMPARE(v3, reference + 3) << "N = 6"; - COMPARE(v4, reference + 4) << "N = 6"; - COMPARE(v5, reference + 5) << "N = 6"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V v0, v1, v2, v3, v4; - (v0, v1, v2, v3, v4) = data_v[indexes]; - COMPARE(v0, reference + 0) << "N = 5"; - COMPARE(v1, reference + 1) << "N = 5"; - COMPARE(v2, reference + 2) << "N = 5"; - COMPARE(v3, reference + 3) << "N = 5"; - COMPARE(v4, reference + 4) << "N = 5"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V a, b, c, d; - (a, b, c, d) = data_v[indexes]; - COMPARE(a, reference + 0) << "N = 4"; - COMPARE(b, reference + 1) << "N = 4"; - COMPARE(c, reference + 2) << "N = 4"; - COMPARE(d, reference + 3) << "N = 4"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V a, b, c; - 
(a, b, c) = data_v[indexes]; - COMPARE(a, reference + 0) << "N = 3"; - COMPARE(b, reference + 1) << "N = 3"; - COMPARE(c, reference + 2) << "N = 3"; - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -}; -template struct TestDeinterleaveGatherCompare { - static void test(typename Types::Wrapper data_v, typename Types::IArg indexes, const typename V::AsArg reference) - { - V a, b; - (a, b) = data_v[indexes]; - COMPARE(a, reference + 0) << "N = 2"; - COMPARE(b, reference + 1) << "N = 2"; - } -}; - -size_t createNMask(size_t N) -{ - size_t NMask = (N >> 1) | (N >> 2); - for (size_t shift = 2; shift < sizeof(size_t) * 8; shift *= 2) { - NMask |= NMask >> shift; - } - return NMask; -} - -template void testDeinterleaveGatherImpl() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef SomeStruct S; - typedef Vc::InterleavedMemoryWrapper Wrapper; - const size_t N = std::min(std::numeric_limits::max(), 1024 * 1024 / sizeof(S)); - const size_t NMask = createNMask(N); - - S *data = Vc::malloc(N); - for (size_t i = 0; i < N; ++i) { - for (size_t j = 0; j < StructSize; ++j) { - data[i].d[j] = i * StructSize + j; - } - } - const Wrapper data_v(data); - - for (int retest = 0; retest < 10000; ++retest) { - I indexes = (I::Random() >> 10) & I(NMask); - VERIFY(indexes >= 0); - VERIFY(indexes < N); - const V reference = static_cast(indexes) * V(StructSize); - - TestDeinterleaveGatherCompare::test(data_v, indexes, reference); - } -} - -template void testDeinterleaveGather() -{ - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); - testDeinterleaveGatherImpl(); -} - -template struct TestInterleavingScatterCompare; -#define _IMPL(STRUCTSIZE, _code_) \ -template struct TestInterleavingScatterCompare { \ - typedef TestInterleavingScatterCompare NextTest; \ - template static void test(Wrapper &data, const 
typename V::IndexType &i) { \ - _code_ \ - } \ -} -_IMPL(2, - const V v0 = V::Random(); - const V v1 = V::Random(); - V t0; - V t1; - data[i] = (v0, v1); - (t0, t1) = data[i]; - COMPARE(t0, v0) << 2; - COMPARE(t1, v1) << 2; - ); -_IMPL(3, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - V t0; V t1; V t2; - data[i] = (v0, v1, v2); - (t0, t1, t2) = data[i]; - COMPARE(t0, v0) << 3; - COMPARE(t1, v1) << 3; - COMPARE(t2, v2) << 3; - NextTest::test(data, i); - ); -_IMPL(4, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - const V v3 = V::Random(); - V t0; V t1; V t2; V t3; - data[i] = (v0, v1, v2, v3); - (t0, t1, t2, t3) = data[i]; - COMPARE(t0, v0) << 4; - COMPARE(t1, v1) << 4; - COMPARE(t2, v2) << 4; - COMPARE(t3, v3) << 4; - NextTest::test(data, i); - ); -_IMPL(5, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - const V v3 = V::Random(); - const V v4 = V::Random(); - V t0; V t1; V t2; V t3; V t4; - data[i] = (v0, v1, v2, v3, v4); - (t0, t1, t2, t3, t4) = data[i]; - COMPARE(t0, v0) << 5; - COMPARE(t1, v1) << 5; - COMPARE(t2, v2) << 5; - COMPARE(t3, v3) << 5; - COMPARE(t4, v4) << 5; - NextTest::test(data, i); - ); -_IMPL(6, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - const V v3 = V::Random(); - const V v4 = V::Random(); - const V v5 = V::Random(); - V t0; V t1; V t2; V t3; V t4; V t5; - data[i] = (v0, v1, v2, v3, v4, v5); - (t0, t1, t2, t3, t4, t5) = data[i]; - COMPARE(t0, v0) << 6; - COMPARE(t1, v1) << 6; - COMPARE(t2, v2) << 6; - COMPARE(t3, v3) << 6; - COMPARE(t4, v4) << 6; - COMPARE(t5, v5) << 6; - NextTest::test(data, i); - ); -_IMPL(7, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - const V v3 = V::Random(); - const V v4 = V::Random(); - const V v5 = V::Random(); - const V v6 = V::Random(); - V t0; V t1; V t2; V t3; V t4; V t5; V t6; - data[i] = (v0, v1, v2, v3, v4, v5, v6); - 
(t0, t1, t2, t3, t4, t5, t6) = data[i]; - COMPARE(t0, v0) << 7; - COMPARE(t1, v1) << 7; - COMPARE(t2, v2) << 7; - COMPARE(t3, v3) << 7; - COMPARE(t4, v4) << 7; - COMPARE(t5, v5) << 7; - COMPARE(t6, v6) << 7; - NextTest::test(data, i); - ); -_IMPL(8, - const V v0 = V::Random(); - const V v1 = V::Random(); - const V v2 = V::Random(); - const V v3 = V::Random(); - const V v4 = V::Random(); - const V v5 = V::Random(); - const V v6 = V::Random(); - const V v7 = V::Random(); - V t0; V t1; V t2; V t3; V t4; V t5; V t6; V t7; - data[i] = (v0, v1, v2, v3, v4, v5, v6, v7); - (t0, t1, t2, t3, t4, t5, t6, t7) = data[i]; - COMPARE(t0, v0) << 8; - COMPARE(t1, v1) << 8; - COMPARE(t2, v2) << 8; - COMPARE(t3, v3) << 8; - COMPARE(t4, v4) << 8; - COMPARE(t5, v5) << 8; - COMPARE(t6, v6) << 8; - COMPARE(t7, v7) << 8; - NextTest::test(data, i); - ); - -template void testInterleavingScatterImpl() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef SomeStruct S; - typedef Vc::InterleavedMemoryWrapper Wrapper; - const size_t N = std::min(std::numeric_limits::max(), 1024 * 1024 / sizeof(S)); - const size_t NMask = createNMask(N); - - S *data = Vc::malloc(N); - std::memset(data, 0, sizeof(S) * N); - Wrapper data_v(data); - - for (int retest = 0; retest < 10000; ++retest) { - I indexes = (I::Random() >> 10) & I(NMask); - if (I::Size != 1) { - // ensure the indexes are unique - while(!(indexes.sorted() == indexes.sorted().rotated(1)).isEmpty()) { - indexes = (I::Random() >> 10) & I(NMask); - } - } - VERIFY(indexes >= 0); - VERIFY(indexes < N); - - TestInterleavingScatterCompare::test(data_v, indexes); - } -} - -template void testInterleavingScatter() -{ - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); - testInterleavingScatterImpl(); -} - -int main() -{ - runTest(testDeinterleave); - runTest(testDeinterleave); - 
runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - runTest(testDeinterleave); - - testAllTypes(testDeinterleaveGather); - testAllTypes(testInterleavingScatter); -} diff --git a/math/vc/tests/download.cmake b/math/vc/tests/download.cmake deleted file mode 100644 index be627511ac8d0..0000000000000 --- a/math/vc/tests/download.cmake +++ /dev/null @@ -1 +0,0 @@ -file(DOWNLOAD "http://compeng.uni-frankfurt.de/~kretz/Vc-testdata/${filename}" "./${filename}") diff --git a/math/vc/tests/expandandmerge.cpp b/math/vc/tests/expandandmerge.cpp deleted file mode 100644 index 203ed17712c8d..0000000000000 --- a/math/vc/tests/expandandmerge.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "unittest.h" -#include - -using namespace Vc; - -enum { - VectorSizeFactor = short_v::Size / int_v::Size -}; - -void testSigned() -{ - for (int start = -32000; start < 32000; start += 5) { - int_v a[VectorSizeFactor]; - for (int i = 0; i < VectorSizeFactor; ++i) { - a[i] = int_v(IndexesFromZero) + int_v::Size * i + start; - } - short_v b(a); - COMPARE(b, short_v(IndexesFromZero) + start); - - // false positive: warning: ‘c’ is used uninitialized in this function - int_v c[VectorSizeFactor]; - b.expand(c); - for (int i = 0; i < VectorSizeFactor; ++i) { - COMPARE(c[i], int_v(IndexesFromZero) + int_v::Size * i + start); - } - } -} - -void testUnsigned() -{ -#if defined(VC_IMPL_SSE4_1) || defined(VC_IMPL_AVX) - for (unsigned int start = 0; start < 64000; start += 5) { -#else - for (unsigned int start = 0; start < 32000; start += 5) { -#endif - uint_v a[VectorSizeFactor]; - for (unsigned int i = 0; i < VectorSizeFactor; ++i) { - a[i] = uint_v(IndexesFromZero) + uint_v::Size * i + start; - } - ushort_v b(a); - COMPARE(b, ushort_v(IndexesFromZero) + start); - - // false positive: warning: ‘c’ is used uninitialized in this function - uint_v c[VectorSizeFactor]; - b.expand(c); - for (unsigned int i = 0; i < VectorSizeFactor; ++i) { - COMPARE(c[i], uint_v(IndexesFromZero) + uint_v::Size * i + start); - } - } - for (unsigned int start = 32000; start < 64000; start += 5) { - ushort_v b(IndexesFromZero); - b += start; - COMPARE(b, ushort_v(IndexesFromZero) + start); - - // false positive: warning: ‘c’ may be used uninitialized in this function - uint_v c[VectorSizeFactor]; - b.expand(c); - for (unsigned int i = 0; i < VectorSizeFactor; ++i) { - COMPARE(c[i], uint_v(IndexesFromZero) + uint_v::Size * i + start); - } - } -} - -int main() -{ - runTest(testSigned); - runTest(testUnsigned); - return 0; -} diff --git a/math/vc/tests/gather.cpp b/math/vc/tests/gather.cpp deleted file mode 100644 index b5a67f435ebe3..0000000000000 --- a/math/vc/tests/gather.cpp +++ 
/dev/null @@ -1,212 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include - -using namespace Vc; - -template void maskedGatherArray() -{ - typedef typename Vec::IndexType It; - typedef typename Vec::EntryType T; - - T mem[Vec::Size]; - for (int i = 0; i < Vec::Size; ++i) { - mem[i] = i + 1; - } - - It indexes = It::IndexesFromZero(); - for_all_masks(Vec, m) { - const Vec a(mem, indexes, m); - for (int i = 0; i < Vec::Size; ++i) { - COMPARE(a[i], m[i] ? mem[i] : 0) << " i = " << i << ", m = " << m; - } - - T x = Vec::Size + 1; - Vec b = x; - b.gather(mem, indexes, m); - for (int i = 0; i < Vec::Size; ++i) { - COMPARE(b[i], m[i] ? 
mem[i] : x) << " i = " << i << ", m = " << m; - } - } -} - -template void gatherArray() -{ - typedef typename Vec::IndexType It; - typedef typename Vec::EntryType T; - typedef typename It::Mask M; - - const int count = 39999; - T array[count]; - for (int i = 0; i < count; ++i) { - array[i] = i + 1; - } - M mask; - for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - const Vec ii(i + 1); - const typename Vec::Mask castedMask = static_cast(mask); - if (castedMask.isFull()) { - Vec a(array, i); - COMPARE(a, ii) << "\n i: " << i; - Vec b(Zero); - b.gather(array, i); - COMPARE(b, ii); - COMPARE(a, b); - } - Vec b(Zero); - b.gather(array, i, castedMask); - COMPARE(castedMask, (b == ii)) << ", b = " << b << ", ii = " << ii << ", i = " << i; - if (!castedMask.isFull()) { - COMPARE(!castedMask, b == Vec(Zero)); - } - } - - const typename Vec::Mask k(Zero); - Vec a(One); - a.gather(array, It(IndexesFromZero), k); - COMPARE(a, Vec(One)); -} - -template struct Struct -{ - T a; - char x; - T b; - short y; - T c; - char z; -}; - -template void gatherStruct() -{ - typedef typename Vec::IndexType It; - typedef typename Vec::EntryType T; - typedef Struct S; - const int count = 3999; - S array[count]; - for (int i = 0; i < count; ++i) { - array[i].a = i; - array[i].b = i + 1; - array[i].c = i + 2; - } - typename It::Mask mask; - for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - // if Vec is double_v the cast keeps only the lower two values, which is why the == - // comparison works - const Vec i0(i); - const Vec i1(i + 1); - const Vec i2(i + 2); - const typename Vec::Mask castedMask(mask); - - if (castedMask.isFull()) { - Vec a(array, &S::a, i); - COMPARE(a, i0) << "\ni: " << i; - a.gather(array, &S::b, i); - COMPARE(a, i1); - a.gather(array, &S::c, i); - COMPARE(a, i2); - } - - Vec b(Zero); - b.gather(array, &S::a, i, castedMask); - COMPARE(castedMask, (b == i0)); - if (!castedMask.isFull()) { - 
COMPARE(!castedMask, b == Vec(Zero)); - } - b.gather(array, &S::b, i, castedMask); - COMPARE(castedMask, (b == i1)); - if (!castedMask.isFull()) { - COMPARE(!castedMask, b == Vec(Zero)); - } - b.gather(array, &S::c, i, castedMask); - COMPARE(castedMask, (b == i2)); - if (!castedMask.isFull()) { - COMPARE(!castedMask, b == Vec(Zero)); - } - } -} - -template struct Row -{ - T *data; -}; - -template void gather2dim() -{ - typedef typename Vec::IndexType It; - typedef typename Vec::EntryType T; - const int count = 399; - typedef Row S; - S array[count]; - for (int i = 0; i < count; ++i) { - array[i].data = new T[count]; - for (int j = 0; j < count; ++j) { - array[i].data[j] = 2 * i + j + 1; - } - } - - typename It::Mask mask; - for (It i = It(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - for (It j = It(IndexesFromZero); !(mask &= (j < count)).isEmpty(); j += Vec::Size) { - const Vec i0(i * 2 + j + 1); - const typename Vec::Mask castedMask(mask); - - Vec a(array, &S::data, i, j, castedMask); - COMPARE(castedMask, castedMask && (a == i0)) << ", a = " << a << ", i0 = " << i0 << ", i = " << i << ", j = " << j; - - Vec b(Zero); - b.gather(array, &S::data, i, j, castedMask); - COMPARE(castedMask, (b == i0)); - if (!castedMask.isFull()) { - COMPARE(!castedMask, b == Vec(Zero)); - } else { - Vec c(array, &S::data, i, j); - VERIFY((c == i0).isFull()); - - Vec d(Zero); - d.gather(array, &S::data, i, j); - VERIFY((d == i0).isFull()); - } - } - } - for (int i = 0; i < count; ++i) { - delete[] array[i].data; - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - testAllTypes(gatherArray); - testAllTypes(maskedGatherArray); -#if defined(VC_CLANG) && VC_CLANG <= 0x030000 - // clang fails with: - // candidate template ignored: failed template argument deduction - // template inline Vector(const S1 *array, const T S1::* member1, IT indexes, Mask mask = true) -#warning "Skipping compilation of tests gatherStruct and gather2dim because of clang 
bug" -#else - testAllTypes(gatherStruct); - testAllTypes(gather2dim); -#endif - - return 0; -} diff --git a/math/vc/tests/implicit_type_conversion.cpp b/math/vc/tests/implicit_type_conversion.cpp deleted file mode 100644 index d6d8ebf4b122a..0000000000000 --- a/math/vc/tests/implicit_type_conversion.cpp +++ /dev/null @@ -1,275 +0,0 @@ -/*{{{ - Copyright (C) 2012 Matthias Kretz - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -}}}*/ - -#include "unittest.h" - -//#define QUICK 1 - -using namespace Vc; - -typedef unsigned short ushort; -typedef unsigned int uint; -typedef unsigned long ulong; -typedef long long longlong; -typedef unsigned long long ulonglong; - -#ifdef QUICK -#define _TYPE_TEST(a, b, c) -#define _TYPE_TEST_ERR(a, b) -#else -#if defined(VC_GCC) && VC_GCC == 0x40801 -// Skipping tests involving operator& because of a bug in GCC 4.8.1 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57532) -#define _TYPE_TEST(a, b, c) \ - COMPARE(typeid(a() * b()), typeid(c)); \ - COMPARE(typeid(a() / b()), typeid(c)); \ - COMPARE(typeid(a() + b()), typeid(c)); \ - COMPARE(typeid(a() - b()), typeid(c)); \ - COMPARE(typeid(a() | b()), typeid(c)); \ - COMPARE(typeid(a() ^ b()), typeid(c)); \ - COMPARE(typeid(a() == b()), typeid(c::Mask)); \ - COMPARE(typeid(a() != b()), typeid(c::Mask)); \ - COMPARE(typeid(a() <= b()), typeid(c::Mask)); \ - COMPARE(typeid(a() >= b()), typeid(c::Mask)); \ - COMPARE(typeid(a() < b()), 
typeid(c::Mask)); -#else -#define _TYPE_TEST(a, b, c) \ - COMPARE(typeid(a() * b()), typeid(c)); \ - COMPARE(typeid(a() / b()), typeid(c)); \ - COMPARE(typeid(a() + b()), typeid(c)); \ - COMPARE(typeid(a() - b()), typeid(c)); \ - COMPARE(typeid(a() & b()), typeid(c)); \ - COMPARE(typeid(a() | b()), typeid(c)); \ - COMPARE(typeid(a() ^ b()), typeid(c)); \ - COMPARE(typeid(a() == b()), typeid(c::Mask)); \ - COMPARE(typeid(a() != b()), typeid(c::Mask)); \ - COMPARE(typeid(a() <= b()), typeid(c::Mask)); \ - COMPARE(typeid(a() >= b()), typeid(c::Mask)); \ - COMPARE(typeid(a() < b()), typeid(c::Mask)); -#endif -#define _TYPE_TEST_ERR(a, b) \ - COMPARE(typeid(a() * b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() / b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() + b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() - b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() & b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() | b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() ^ b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() == b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() != b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() <= b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() >= b()), typeid(Vc::Error::invalid_operands_of_types)); \ - COMPARE(typeid(a() < b()), typeid(Vc::Error::invalid_operands_of_types)); -#endif - -#define TYPE_TEST(a, b, c) \ - _TYPE_TEST(a, b, c) \ - COMPARE(typeid(a() > b()), typeid(c::Mask)) - -template -struct TestImplicitCast { - static bool test(const T &) { return true; } - static bool test( ... 
) { return false; } -}; - -enum SomeEnum { EnumValue = 0 }; -SomeEnum Enum() { return EnumValue; } - -void testImplicitTypeConversions() -{ - VERIFY( TestImplicitCast< int>::test(double())); - VERIFY( TestImplicitCast< int>::test( float())); - VERIFY( TestImplicitCast< int>::test( Enum())); - VERIFY( TestImplicitCast< int>::test( short())); - VERIFY( TestImplicitCast< int>::test(ushort())); - VERIFY( TestImplicitCast< int>::test( char())); - VERIFY( TestImplicitCast< int>::test( uint())); - VERIFY( TestImplicitCast< int>::test( long())); - VERIFY( TestImplicitCast< int>::test( ulong())); - VERIFY( TestImplicitCast< int>::test( bool())); - VERIFY( TestImplicitCast::test(double())); - VERIFY(!TestImplicitCast::test( float())); - VERIFY(!TestImplicitCast::test( int())); - VERIFY( TestImplicitCast< float_v>::test( float())); - VERIFY( TestImplicitCast::test( float())); - VERIFY( TestImplicitCast< int_v>::test( int())); - VERIFY( TestImplicitCast< uint_v>::test( uint())); - VERIFY( TestImplicitCast< short_v>::test( short())); - VERIFY( TestImplicitCast::test(ushort())); - - TYPE_TEST( double_v, double_v, double_v); - TYPE_TEST( double_v, double, double_v); - TYPE_TEST( double_v, float, double_v); - TYPE_TEST( double_v, short, double_v); - TYPE_TEST( double_v, ushort, double_v); - TYPE_TEST( double_v, int, double_v); - TYPE_TEST( double_v, uint, double_v); - TYPE_TEST( double_v, long, double_v); - TYPE_TEST( double_v, ulong, double_v); - TYPE_TEST( double_v, longlong, double_v); - TYPE_TEST( double_v, ulonglong, double_v); - TYPE_TEST( double_v, Enum, double_v); - TYPE_TEST( double, double_v, double_v); - TYPE_TEST( float, double_v, double_v); - TYPE_TEST( short, double_v, double_v); - TYPE_TEST( ushort, double_v, double_v); - TYPE_TEST( int, double_v, double_v); - TYPE_TEST( uint, double_v, double_v); - TYPE_TEST( long, double_v, double_v); - TYPE_TEST( ulong, double_v, double_v); - TYPE_TEST( longlong, double_v, double_v); - TYPE_TEST(ulonglong, double_v, double_v); - 
// double_v done - - TYPE_TEST( float_v, float_v, float_v); - TYPE_TEST( float_v, float, float_v); - TYPE_TEST( float_v, short, float_v); - TYPE_TEST( float_v, ushort, float_v); - TYPE_TEST( float_v, int_v, float_v); - TYPE_TEST( float_v, int, float_v); - TYPE_TEST( float_v, uint_v, float_v); - TYPE_TEST( float_v, uint, float_v); - TYPE_TEST( float_v, long, float_v); - TYPE_TEST( float_v, ulong, float_v); - TYPE_TEST( float_v, longlong, float_v); - TYPE_TEST( float_v, ulonglong, float_v); - TYPE_TEST( float, float_v, float_v); - TYPE_TEST( short, float_v, float_v); - TYPE_TEST( ushort, float_v, float_v); - TYPE_TEST( int_v, float_v, float_v); - TYPE_TEST( int, float_v, float_v); - TYPE_TEST( uint_v, float_v, float_v); - TYPE_TEST( uint, float_v, float_v); - TYPE_TEST( long, float_v, float_v); - TYPE_TEST( ulong, float_v, float_v); - TYPE_TEST( longlong, float_v, float_v); - TYPE_TEST(ulonglong, float_v, float_v); - // double_v + float_v done - - TYPE_TEST( sfloat_v, sfloat_v, sfloat_v); - TYPE_TEST( sfloat_v, float, sfloat_v); - TYPE_TEST( sfloat_v, short_v, sfloat_v); - TYPE_TEST( sfloat_v, short, sfloat_v); - TYPE_TEST( sfloat_v, ushort_v, sfloat_v); - TYPE_TEST( sfloat_v, ushort, sfloat_v); - TYPE_TEST( sfloat_v, int, sfloat_v); - TYPE_TEST( sfloat_v, uint, sfloat_v); - TYPE_TEST( sfloat_v, long, sfloat_v); - TYPE_TEST( sfloat_v, ulong, sfloat_v); - TYPE_TEST( sfloat_v, longlong, sfloat_v); - TYPE_TEST( sfloat_v, ulonglong, sfloat_v); - TYPE_TEST( sfloat_v, sfloat_v, sfloat_v); - TYPE_TEST( float, sfloat_v, sfloat_v); - TYPE_TEST( short_v, sfloat_v, sfloat_v); - TYPE_TEST( short, sfloat_v, sfloat_v); - TYPE_TEST( ushort_v, sfloat_v, sfloat_v); - TYPE_TEST( ushort, sfloat_v, sfloat_v); - TYPE_TEST( int, sfloat_v, sfloat_v); - TYPE_TEST( uint, sfloat_v, sfloat_v); - TYPE_TEST( long, sfloat_v, sfloat_v); - TYPE_TEST( ulong, sfloat_v, sfloat_v); - TYPE_TEST( longlong, sfloat_v, sfloat_v); - TYPE_TEST(ulonglong, sfloat_v, sfloat_v); - // double_v + float_v + sfloat_v 
done - - TYPE_TEST( short_v, short_v, short_v); - TYPE_TEST( short_v, short, short_v); - TYPE_TEST( short_v, ushort_v, ushort_v); - TYPE_TEST( short_v, ushort, ushort_v); - TYPE_TEST( short_v, int, short_v); - TYPE_TEST( short_v, uint, ushort_v); - TYPE_TEST( short_v, long, short_v); - TYPE_TEST( short_v, ulong, ushort_v); - TYPE_TEST( short_v, longlong, short_v); - TYPE_TEST( short_v, ulonglong, ushort_v); - TYPE_TEST( short, short_v, short_v); - TYPE_TEST( ushort_v, short_v, ushort_v); - TYPE_TEST( ushort, short_v, ushort_v); - TYPE_TEST( int, short_v, short_v); - TYPE_TEST( uint, short_v, ushort_v); - TYPE_TEST( long, short_v, short_v); - TYPE_TEST( ulong, short_v, ushort_v); - TYPE_TEST( longlong, short_v, short_v); - TYPE_TEST(ulonglong, short_v, ushort_v); - // double_v + float_v + sfloat_v + short_v done - - TYPE_TEST( ushort_v, short, ushort_v); - TYPE_TEST( ushort_v, ushort_v, ushort_v); - TYPE_TEST( ushort_v, ushort, ushort_v); - TYPE_TEST( ushort_v, int, ushort_v); - TYPE_TEST( ushort_v, uint, ushort_v); - TYPE_TEST( ushort_v, long, ushort_v); - TYPE_TEST( ushort_v, ulong, ushort_v); - TYPE_TEST( ushort_v, longlong, ushort_v); - TYPE_TEST( ushort_v, ulonglong, ushort_v); - TYPE_TEST( short, ushort_v, ushort_v); - TYPE_TEST( ushort, ushort_v, ushort_v); - TYPE_TEST( int, ushort_v, ushort_v); - TYPE_TEST( uint, ushort_v, ushort_v); - TYPE_TEST( long, ushort_v, ushort_v); - TYPE_TEST( ulong, ushort_v, ushort_v); - TYPE_TEST( longlong, ushort_v, ushort_v); - TYPE_TEST(ulonglong, ushort_v, ushort_v); - // double_v + float_v + sfloat_v + short_v + ushort_v done - - TYPE_TEST( int_v, ushort, uint_v); - TYPE_TEST( int_v, short, int_v); - TYPE_TEST( int_v, int_v, int_v); - TYPE_TEST( int_v, int, int_v); - TYPE_TEST( int_v, uint_v, uint_v); - TYPE_TEST( int_v, uint, uint_v); - TYPE_TEST( int_v, long, int_v); - TYPE_TEST( int_v, ulong, uint_v); - TYPE_TEST( int_v, longlong, int_v); - TYPE_TEST( int_v, ulonglong, uint_v); - TYPE_TEST( ushort, int_v, uint_v); - 
TYPE_TEST( short, int_v, int_v); - TYPE_TEST( int, int_v, int_v); - TYPE_TEST( uint_v, int_v, uint_v); - TYPE_TEST( uint, int_v, uint_v); - TYPE_TEST( long, int_v, int_v); - TYPE_TEST( ulong, int_v, uint_v); - TYPE_TEST( longlong, int_v, int_v); - TYPE_TEST(ulonglong, int_v, uint_v); - - TYPE_TEST( uint_v, short, uint_v); - TYPE_TEST( uint_v, ushort, uint_v); - TYPE_TEST( uint_v, int_v, uint_v); - TYPE_TEST( uint_v, int, uint_v); - TYPE_TEST( uint_v, uint_v, uint_v); - TYPE_TEST( uint_v, uint, uint_v); - TYPE_TEST( uint_v, long, uint_v); - TYPE_TEST( uint_v, ulong, uint_v); - TYPE_TEST( uint_v, longlong, uint_v); - TYPE_TEST( uint_v, ulonglong, uint_v); - TYPE_TEST( short, uint_v, uint_v); - TYPE_TEST( ushort, uint_v, uint_v); - TYPE_TEST( int_v, uint_v, uint_v); - TYPE_TEST( int, uint_v, uint_v); - TYPE_TEST( uint, uint_v, uint_v); - TYPE_TEST( long, uint_v, uint_v); - TYPE_TEST( ulong, uint_v, uint_v); - TYPE_TEST( longlong, uint_v, uint_v); - TYPE_TEST(ulonglong, uint_v, uint_v); -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - runTest(testImplicitTypeConversions); - return 0; -} diff --git a/math/vc/tests/implicit_type_conversion_failures.cpp b/math/vc/tests/implicit_type_conversion_failures.cpp deleted file mode 100644 index f17541a4084db..0000000000000 --- a/math/vc/tests/implicit_type_conversion_failures.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include - -#if !defined(TYPE_A) || !defined(TEST_OP) || !defined(TYPE_B) -#error "Need to define TYPE_A, TEST_OP, and TYPE_B" -#endif - -using namespace Vc; - -int main() -{ - TYPE_A() TEST_OP TYPE_B(); - return 0; -} diff --git a/math/vc/tests/linkTest0.cpp b/math/vc/tests/linkTest0.cpp deleted file mode 100644 index 3488757237f10..0000000000000 --- a/math/vc/tests/linkTest0.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#if !(defined VC_GCC && VC_GCC < 0x40400) && !defined VC_MSVC -#include -#endif -#include -#include -#include - -using namespace Vc; -float_v foo0(float_v::AsArg a) -{ - const float_v b 
= sin(a + float_v::One()); - std::cerr << b; - return b; -} diff --git a/math/vc/tests/linkTest1.cpp b/math/vc/tests/linkTest1.cpp deleted file mode 100644 index 3039d763ae372..0000000000000 --- a/math/vc/tests/linkTest1.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include -#include - -using namespace Vc; - -float_v fooLib0A(float_v::AsArg a); -float_v fooLib1A(float_v::AsArg a); -float_v fooLib0B(float_v::AsArg a); -float_v fooLib1B(float_v::AsArg a); -float_v fooLib2(float_v::AsArg a); -float_v fooLib3(float_v::AsArg a); -float_v foo0(float_v::AsArg a); -float_v foo1(float_v::AsArg a) -{ - const float_v b = sin(a + float_v::One()); - std::cerr << b; - return b; -} - -int main() -{ - float_v x = float_v::Random(); - x = fooLib0A(fooLib0B(fooLib1A(fooLib1B(fooLib2(fooLib3(foo0(foo1(x)))))))); - return static_cast(x.sum()); -} diff --git a/math/vc/tests/linkTestLib0.cpp b/math/vc/tests/linkTestLib0.cpp deleted file mode 100644 index 8d4ccd97c3f1b..0000000000000 --- a/math/vc/tests/linkTestLib0.cpp +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include - -#define CAT(a, b) a##b -#define name(a, b) CAT(a, b) - -using namespace Vc; -float_v -#ifdef VC_MSVC -__declspec(dllexport) -#endif -name(fooLib0, POSTFIX)(float_v::AsArg a) -{ - const float_v b = sin(a + float_v::One()); - std::cerr << b; - return b; -} diff --git a/math/vc/tests/linkTestLib1.cpp b/math/vc/tests/linkTestLib1.cpp deleted file mode 100644 index fa7ed4dbe6285..0000000000000 --- a/math/vc/tests/linkTestLib1.cpp +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include - -#define CAT(a, b) a##b -#define name(a, b) CAT(a, b) - -using namespace Vc; -float_v -#ifdef VC_MSVC -__declspec(dllexport) -#endif -name(fooLib1, POSTFIX)(float_v::AsArg a) -{ - const float_v b = sin(a + float_v::One()); - std::cerr << b; - return b; -} diff --git a/math/vc/tests/linkTestLib2.cpp b/math/vc/tests/linkTestLib2.cpp deleted file mode 100644 index ab49384a7cd45..0000000000000 --- a/math/vc/tests/linkTestLib2.cpp +++ /dev/null @@ 
-1,10 +0,0 @@ -#include -#include - -using namespace Vc; -float_v fooLib2(float_v::AsArg a) -{ - const float_v b = sin(a + float_v::One()); - std::cerr << b; - return b; -} diff --git a/math/vc/tests/linkTestLib3.cpp b/math/vc/tests/linkTestLib3.cpp deleted file mode 100644 index 7ef82959da856..0000000000000 --- a/math/vc/tests/linkTestLib3.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include -#include - -using namespace Vc; -float_v fooLib3(float_v::AsArg a) -{ - const float_v b = sin(a + float_v::One()); - std::cerr << b; - return b; -} diff --git a/math/vc/tests/load.cpp b/math/vc/tests/load.cpp deleted file mode 100644 index cdf074555c468..0000000000000 --- a/math/vc/tests/load.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include - -using namespace Vc; - -template unsigned long alignmentMask() -{ - if (Vec::Size == 1) { - // on 32bit the maximal alignment is 4 Bytes, even for 8-Byte doubles. 
- return std::min(sizeof(void*), sizeof(typename Vec::EntryType)) - 1; - } - // sizeof(SSE::sfloat_v) is too large - // AVX::VectorAlignment is too large - return std::min(sizeof(Vec), VectorAlignment) - 1; -} - -template void checkAlignment() -{ - unsigned char i = 1; - Vec a[10]; - unsigned long mask = alignmentMask(); - for (i = 0; i < 10; ++i) { - VERIFY((reinterpret_cast(&a[i]) & mask) == 0) << "a = " << a << ", mask = " << mask; - } - const char *data = reinterpret_cast(&a[0]); - for (i = 0; i < 10; ++i) { - VERIFY(&data[i * Vec::Size * sizeof(typename Vec::EntryType)] == reinterpret_cast(&a[i])); - } -} - -void *hack_to_put_b_on_the_stack = 0; - -template void checkMemoryAlignment() -{ - typedef typename Vec::EntryType T; - const T *b = 0; - Vc::Memory a; - b = a; - hack_to_put_b_on_the_stack = &b; - unsigned long mask = alignmentMask(); - for (int i = 0; i < 10; ++i) { - VERIFY((reinterpret_cast(&b[i * Vec::Size]) & mask) == 0) << "b = " << b << ", mask = " << mask; - } -} - -template void loadArray() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::IndexType I; - - enum loadArrayEnum { count = 256 * 1024 / sizeof(T) }; - Vc::Memory array; - for (int i = 0; i < count; ++i) { - array[i] = i; - } - - const I indexesFromZero(IndexesFromZero); - - const Vec offsets(indexesFromZero); - for (int i = 0; i < count; i += Vec::Size) { - const T *const addr = &array[i]; - Vec ii(i); - ii += offsets; - - Vec a(addr); - COMPARE(a, ii); - - Vec b = Vec::Zero(); - b.load(addr); - COMPARE(b, ii); - } -} - -enum Enum { - loadArrayShortCount = 32 * 1024, - streamingLoadCount = 1024 -}; -template void loadArrayShort() -{ - typedef typename Vec::EntryType T; - - Vc::Memory array; - for (int i = 0; i < loadArrayShortCount; ++i) { - array[i] = i; - } - - const Vec &offsets = static_cast(ushort_v::IndexesFromZero()); - for (int i = 0; i < loadArrayShortCount; i += Vec::Size) { - const T *const addr = &array[i]; - Vec ii(i); - ii += offsets; - - Vec a(addr); - 
COMPARE(a, ii); - - Vec b = Vec::Zero(); - b.load(addr); - COMPARE(b, ii); - } -} - -template void streamingLoad() -{ - typedef typename Vec::EntryType T; - - Vc::Memory data; - data[0] = static_cast(-streamingLoadCount/2); - for (int i = 1; i < streamingLoadCount; ++i) { - data[i] = data[i - 1]; - ++data[i]; - } - - Vec ref = data.firstVector(); - for (int i = 0; i < streamingLoadCount - Vec::Size; ++i, ++ref) { - Vec v1, v2; - if (0 == i % Vec::Size) { - v1 = Vec(&data[i], Vc::Streaming | Vc::Aligned); - v2.load (&data[i], Vc::Streaming | Vc::Aligned); - } else { - v1 = Vec(&data[i], Vc::Streaming | Vc::Unaligned); - v2.load (&data[i], Vc::Streaming | Vc::Unaligned); - } - COMPARE(v1, ref); - COMPARE(v2, ref); - } -} - -template struct TypeInfo; -template<> struct TypeInfo { static const char *string() { return "double"; } }; -template<> struct TypeInfo { static const char *string() { return "float"; } }; -template<> struct TypeInfo { static const char *string() { return "int"; } }; -template<> struct TypeInfo { static const char *string() { return "uint"; } }; -template<> struct TypeInfo { static const char *string() { return "short"; } }; -template<> struct TypeInfo { static const char *string() { return "ushort"; } }; -template<> struct TypeInfo { static const char *string() { return "schar"; } }; -template<> struct TypeInfo { static const char *string() { return "uchar"; } }; -template<> struct TypeInfo { static const char *string() { return "double_v"; } }; -template<> struct TypeInfo { static const char *string() { return "float_v"; } }; -template<> struct TypeInfo { static const char *string() { return "sfloat_v"; } }; -template<> struct TypeInfo { static const char *string() { return "int_v"; } }; -template<> struct TypeInfo { static const char *string() { return "uint_v"; } }; -template<> struct TypeInfo { static const char *string() { return "short_v"; } }; -template<> struct TypeInfo { static const char *string() { return "ushort_v"; } }; - -template 
struct SupportedConversions { typedef void Next; }; -template<> struct SupportedConversions { typedef double Next; }; -template<> struct SupportedConversions { typedef int Next; }; -template<> struct SupportedConversions { typedef unsigned int Next; }; -template<> struct SupportedConversions { typedef short Next; }; -template<> struct SupportedConversions { typedef unsigned short Next; }; -template<> struct SupportedConversions { typedef signed char Next; }; -template<> struct SupportedConversions { typedef unsigned char Next; }; -template<> struct SupportedConversions { typedef void Next; }; -template<> struct SupportedConversions { typedef unsigned int Next; }; -template<> struct SupportedConversions { typedef short Next; }; -template<> struct SupportedConversions { typedef unsigned short Next; }; -template<> struct SupportedConversions { typedef signed char Next; }; -template<> struct SupportedConversions { typedef unsigned char Next; }; -template<> struct SupportedConversions { typedef void Next; }; -template<> struct SupportedConversions { typedef unsigned short Next; }; -template<> struct SupportedConversions { typedef unsigned char Next; }; -template<> struct SupportedConversions { typedef void Next; }; -template<> struct SupportedConversions { typedef unsigned char Next; }; -template<> struct SupportedConversions { typedef void Next; }; -template<> struct SupportedConversions< short, void > { typedef unsigned char Next; }; -template<> struct SupportedConversions< short, unsigned char > { typedef signed char Next; }; -template<> struct SupportedConversions< short, signed char > { typedef void Next; }; - -template struct LoadCvt { - static void test() { - typedef typename Vec::EntryType VecT; - MemT *data = Vc::malloc(128); - for (size_t i = 0; i < 128; ++i) { - data[i] = static_cast(i - 64); - } - - for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { - Vec v; - if (i % (2 * Vec::Size) == 0) { - v = Vec(&data[i]); - } else if (i % Vec::Size == 0) { - v = 
Vec(&data[i], Vc::Aligned); - } else { - v = Vec(&data[i], Vc::Unaligned); - } - for (size_t j = 0; j < Vec::Size; ++j) { - COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); - } - } - for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { - Vec v; - if (i % (2 * Vec::Size) == 0) { - v.load(&data[i]); - } else if (i % Vec::Size == 0) { - v.load(&data[i], Vc::Aligned); - } else { - v.load(&data[i], Vc::Unaligned); - } - for (size_t j = 0; j < Vec::Size; ++j) { - COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); - } - } - for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { - Vec v; - if (i % (2 * Vec::Size) == 0) { - v = Vec(&data[i], Vc::Streaming); - } else if (i % Vec::Size == 0) { - v = Vec(&data[i], Vc::Streaming | Vc::Aligned); - } else { - v = Vec(&data[i], Vc::Streaming | Vc::Unaligned); - } - for (size_t j = 0; j < Vec::Size; ++j) { - COMPARE(v[j], static_cast(data[i + j])) << " " << TypeInfo::string(); - } - } - - ADD_PASS() << "loadCvt: load " << TypeInfo::string() << "* as " << TypeInfo::string(); - LoadCvt::Next>::test(); - } -}; -template struct LoadCvt { static void test() {} }; - -template void loadCvt() -{ - typedef typename Vec::EntryType T; - LoadCvt::Next>::test(); -} - -int main() -{ - runTest(checkAlignment); - runTest(checkAlignment); - runTest(checkAlignment); - runTest(checkAlignment); - runTest(checkAlignment); - runTest(checkAlignment); - runTest(checkAlignment); - testAllTypes(checkMemoryAlignment); - runTest(loadArray); - runTest(loadArray); - runTest(loadArray); - runTest(loadArray); - runTest(loadArray); - runTest(loadArrayShort); - runTest(loadArrayShort); - - testAllTypes(streamingLoad); - - testAllTypes(loadCvt); - return 0; -} diff --git a/math/vc/tests/mask.cpp b/math/vc/tests/mask.cpp deleted file mode 100644 index 230c28470350d..0000000000000 --- a/math/vc/tests/mask.cpp +++ /dev/null @@ -1,312 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include -#include "vectormemoryhelper.h" -#include - -using namespace Vc; - -template void testInc() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i); - data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 1 : 0); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - Vec aa(a); - COMPARE(aa(m)++, a) << ", border: " << border << ", m: " << m; - COMPARE(aa, b) << ", border: " << border << ", m: " << m; - COMPARE(++a(m), b) << ", border: " << border << ", m: " << m; - COMPARE(a, b) << ", border: " << border << ", m: " << m; - } -} - -template void testDec() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i + 1); - data[i + Vec::Size] = data[i] - static_cast(data[i] < border ? 
1 : 0); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - Vec aa(a); - COMPARE(aa(m)--, a); - COMPARE(--a(m), b); - COMPARE(a, b); - COMPARE(aa, b); - } -} - -template void testPlusEq() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i + 1); - data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 2 : 0); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - COMPARE(a(m) += static_cast(2), b); - COMPARE(a, b); - } -} - -template void testMinusEq() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i + 2); - data[i + Vec::Size] = data[i] - static_cast(data[i] < border ? 2 : 0); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - COMPARE(a(m) -= static_cast(2), b); - COMPARE(a, b); - } -} - -template void testTimesEq() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i); - data[i + Vec::Size] = data[i] * static_cast(data[i] < border ? 
2 : 1); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - COMPARE(a(m) *= static_cast(2), b); - COMPARE(a, b); - } -} - -template void testDivEq() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(5 * i); - data[i + Vec::Size] = data[i] / static_cast(data[i] < border ? 3 : 1); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - COMPARE(a(m) /= static_cast(3), b); - COMPARE(a, b); - } -} - -template void testAssign() -{ - VectorMemoryHelper mem(2); - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - T *data = mem; - for (int borderI = 0; borderI < Vec::Size; ++borderI) { - const T border = static_cast(borderI); - for (int i = 0; i < Vec::Size; ++i) { - data[i] = static_cast(i); - data[i + Vec::Size] = data[i] + static_cast(data[i] < border ? 
2 : 0); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Mask m = a < border; - COMPARE(a(m) = b, b); - COMPARE(a, b); - } -} - -template void testZero() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask Mask; - typedef typename Vec::IndexType I; - - for (int cut = 0; cut < Vec::Size; ++cut) { - const Mask mask(I(Vc::IndexesFromZero) < cut); - //std::cout << mask << std::endl; - - const T aa = 4; - Vec a(aa); - Vec b(Vc::Zero); - - b(!mask) = a; - a.setZero(mask); - - COMPARE(a, b); - } -} - -template void testCount() -{ - for_all_masks(Vec, m) { - int count = 0; - for (int i = 0; i < Vec::Size; ++i) { - if (m[i]) { - ++count; - } - } - COMPARE(m.count(), count) << ", m = " << m; - } -} - -template void testFirstOne() -{ - typedef typename Vec::IndexType I; - typedef typename Vec::Mask M; - - for (int i = 0; i < Vec::Size; ++i) { - const M mask(I(Vc::IndexesFromZero) == i); - COMPARE(mask.firstOne(), i); - } -} - -template void testLogicalOperatorsImpl() -{ - VERIFY((M1(true) && M2(true)).isFull()); - VERIFY((M1(true) && M2(false)).isEmpty()); - VERIFY((M1(true) || M2(true)).isFull()); - VERIFY((M1(true) || M2(false)).isFull()); - VERIFY((M1(false) || M2(false)).isEmpty()); -} - -template void testBinaryOperatorsImpl() -{ - testLogicalOperatorsImpl(); - - VERIFY((M1(true) & M2(true)).isFull()); - VERIFY((M1(true) & M2(false)).isEmpty()); - VERIFY((M1(true) | M2(true)).isFull()); - VERIFY((M1(true) | M2(false)).isFull()); - VERIFY((M1(false) | M2(false)).isEmpty()); - VERIFY((M1(true) ^ M2(true)).isEmpty()); - VERIFY((M1(true) ^ M2(false)).isFull()); -} - -void testBinaryOperators() -{ - testLogicalOperatorsImpl< short_m, sfloat_m>(); - testLogicalOperatorsImpl(); - testLogicalOperatorsImpl(); - testLogicalOperatorsImpl(); - - testBinaryOperatorsImpl< short_m, short_m>(); - testBinaryOperatorsImpl< short_m, ushort_m>(); - testBinaryOperatorsImpl(); - testBinaryOperatorsImpl(); - testBinaryOperatorsImpl(); - - testBinaryOperatorsImpl< int_m, 
int_m>(); - testBinaryOperatorsImpl< int_m, uint_m>(); - testBinaryOperatorsImpl< int_m, float_m>(); - testBinaryOperatorsImpl< uint_m, int_m>(); - testBinaryOperatorsImpl< uint_m, uint_m>(); - testBinaryOperatorsImpl< uint_m, float_m>(); - testBinaryOperatorsImpl< float_m, int_m>(); - testBinaryOperatorsImpl< float_m, uint_m>(); - testBinaryOperatorsImpl< float_m, float_m>(); - - testBinaryOperatorsImpl(); -} - -#ifdef VC_IMPL_SSE -void testFloat8GatherMask() -{ - Memory data; - short_v::Memory andMemory; - for (int i = 0; i < short_v::Size; ++i) { - andMemory[i] = 1 << i; - } - const short_v andMask(andMemory); - - for (unsigned int i = 0; i < data.vectorsCount(); ++i) { - data.vector(i) = andMask & i; - } - - for (unsigned int i = 0; i < data.vectorsCount(); ++i) { - const short_m mask = data.vector(i) == short_v::Zero(); - - SSE::Float8GatherMask - gatherMaskA(mask), - gatherMaskB(static_cast(mask)); - COMPARE(gatherMaskA.toInt(), gatherMaskB.toInt()); - } -} -#endif - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - testAllTypes(testInc); - testAllTypes(testDec); - testAllTypes(testPlusEq); - testAllTypes(testMinusEq); - testAllTypes(testTimesEq); - testAllTypes(testDivEq); - testAllTypes(testAssign); - testAllTypes(testZero); - testAllTypes(testCount); - testAllTypes(testFirstOne); - runTest(testBinaryOperators); - -#ifdef VC_IMPL_SSE - runTest(testFloat8GatherMask); -#endif - - return 0; -} diff --git a/math/vc/tests/math.cpp b/math/vc/tests/math.cpp deleted file mode 100644 index 1ba9fa8ccb1a4..0000000000000 --- a/math/vc/tests/math.cpp +++ /dev/null @@ -1,957 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -}}}*/ -/*includes {{{*/ -#include "unittest.h" -#include -#include "vectormemoryhelper.h" -#include "const.h" -#include -#include -#include -/*}}}*/ -using namespace Vc; -/*fix isfinite and isnan{{{*/ -#ifdef isfinite -#undef isfinite -#endif -#ifdef isnan -#undef isnan -#endif -/*}}}*/ -template struct SincosReference/*{{{*/ -{ - T x, s, c; -}; -template struct Reference -{ - T x, ref; -}; - -template struct Array -{ - size_t size; - T *data; - Array() : size(0), data(0) {} -}; -template struct StaticDeleter -{ - T *ptr; - StaticDeleter(T *p) : ptr(p) {} - ~StaticDeleter() { delete[] ptr; } -}; - -enum Function { - Sincos, Atan, Asin, Acos, Log, Log2, Log10 -}; -template static inline const char *filename(); -template<> inline const char *filename() { return "reference-sincos-sp.dat"; } -template<> inline const char *filename() { return "reference-sincos-dp.dat"; } -template<> inline const char *filename() { return "reference-atan-sp.dat"; } -template<> inline const char *filename() { return "reference-atan-dp.dat"; } -template<> inline const char *filename() { return "reference-asin-sp.dat"; } -template<> inline const char *filename() { return "reference-asin-dp.dat"; } -// template<> inline const char *filename() { return "reference-acos-sp.dat"; } -// template<> inline const char *filename() { return "reference-acos-dp.dat"; } -template<> inline const char *filename() { return "reference-ln-sp.dat"; } -template<> inline const char *filename() { return "reference-ln-dp.dat"; } -template<> inline const char *filename() { return "reference-log2-sp.dat"; } -template<> inline const char *filename() { return 
"reference-log2-dp.dat"; } -template<> inline const char *filename() { return "reference-log10-sp.dat"; } -template<> inline const char *filename() { return "reference-log10-dp.dat"; } - -template -static Array > sincosReference() -{ - static Array > data; - if (data.data == 0) { - FILE *file = fopen(filename(), "rb"); - if (file) { - fseek(file, 0, SEEK_END); - const size_t size = ftell(file) / sizeof(SincosReference); - rewind(file); - data.data = new SincosReference[size]; - static StaticDeleter > _cleanup(data.data); - data.size = fread(data.data, sizeof(SincosReference), size, file); - fclose(file); - } else { - FAIL() << "the reference data " << filename() << " does not exist in the current working directory."; - } - } - return data; -} - -template -static Array > referenceData() -{ - static Array > data; - if (data.data == 0) { - FILE *file = fopen(filename(), "rb"); - if (file) { - fseek(file, 0, SEEK_END); - const size_t size = ftell(file) / sizeof(Reference); - rewind(file); - data.data = new Reference[size]; - static StaticDeleter > _cleanup(data.data); - data.size = fread(data.data, sizeof(Reference), size, file); - fclose(file); - } else { - FAIL() << "the reference data " << filename() << " does not exist in the current working directory."; - } - } - return data; -}/*}}}*/ - -template struct Denormals { static T *data; };/*{{{*/ -template<> float *Denormals::data = 0; -template<> double *Denormals::data = 0; -enum { - NDenormals = 64 -}; -/*}}}*/ -template V apply_v(VC_ALIGNED_PARAMETER(V) x, typename V::EntryType (func)(typename V::EntryType))/*{{{*/ -{ - V r; - for (size_t i = 0; i < V::Size; ++i) { - r[i] = func(x[i]); - } - return r; -} -/*}}}*/ -template void testAbs()/*{{{*/ -{ - for (int i = 0; i < 0x7fff; ++i) { - Vec a(i); - Vec b(-i); - COMPARE(a, Vc::abs(a)); - COMPARE(a, Vc::abs(b)); - } -} -/*}}}*/ -static inline float my_trunc(float x)/*{{{*/ -{ -#if __cplusplus >= 201103 /*C++11*/ - return std::trunc(x); -#elif defined(_ISOC99_SOURCE) - 
return truncf(x); -#else - return x > 0 ? std::floor(x) : std::ceil(x); -#endif -} - -static inline double my_trunc(double x) -{ -#if __cplusplus >= 201103 /*C++11*/ - return std::trunc(x); -#elif defined(_ISOC99_SOURCE) - return trunc(x); -#else - return x > 0 ? std::floor(x) : std::ceil(x); -#endif -} -/*}}}*/ -template void testTrunc()/*{{{*/ -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - for (size_t i = 0; i < 100000 / V::Size; ++i) { - V x = (V::Random() - T(0.5)) * T(100); - V reference = apply_v(x, my_trunc); - COMPARE(Vc::trunc(x), reference) << ", x = " << x << ", i = " << i; - } - V x = static_cast(I::IndexesFromZero()); - V reference = apply_v(x, my_trunc); - COMPARE(Vc::trunc(x), reference) << ", x = " << x; -} -/*}}}*/ -template void testFloor()/*{{{*/ -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - for (size_t i = 0; i < 100000 / V::Size; ++i) { - V x = (V::Random() - T(0.5)) * T(100); - V reference = apply_v(x, std::floor); - COMPARE(Vc::floor(x), reference) << ", x = " << x << ", i = " << i; - } - V x = static_cast(I::IndexesFromZero()); - V reference = apply_v(x, std::floor); - COMPARE(Vc::floor(x), reference) << ", x = " << x; -} -/*}}}*/ -template void testCeil()/*{{{*/ -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - for (size_t i = 0; i < 100000 / V::Size; ++i) { - V x = (V::Random() - T(0.5)) * T(100); - V reference = apply_v(x, std::ceil); - COMPARE(Vc::ceil(x), reference) << ", x = " << x << ", i = " << i; - } - V x = static_cast(I::IndexesFromZero()); - V reference = apply_v(x, std::ceil); - COMPARE(Vc::ceil(x), reference) << ", x = " << x; -} -/*}}}*/ -template void testExp()/*{{{*/ -{ - setFuzzyness(1); - setFuzzyness(2); - typedef typename V::EntryType T; - for (size_t i = 0; i < 100000 / V::Size; ++i) { - V x = (V::Random() - T(0.5)) * T(20); - V reference = apply_v(x, std::exp); - FUZZY_COMPARE(Vc::exp(x), reference) << ", x = " << x << ", i = " << i; - } 
- COMPARE(Vc::exp(V::Zero()), V::One()); -} -/*}}}*/ -template void testLog()/*{{{*/ -{ - setFuzzyness(1); - typedef typename V::EntryType T; - Array > reference = referenceData(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, ref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - ref[j] = reference.data[i + j].ref; - } - FUZZY_COMPARE(Vc::log(x), ref) << " x = " << x << ", i = " << i; - } - - COMPARE(Vc::log(V::Zero()), V(std::log(T(0)))); - for (int i = 0; i < NDenormals; i += V::Size) { - V x(&Denormals::data[i]); - V ref = apply_v(x, std::log); - FUZZY_COMPARE(Vc::log(x), ref) << ", x = " << x << ", i = " << i; - } -} -/*}}}*/ -#if (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600) || defined(_ISOC99_SOURCE) || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) -static inline float my_log2(float x) { return ::log2f(x); } -/* I need to make sure whether the log2 that I compare against is really precise to <0.5ulp. At - * least I get different results when I use "double log2(double)", which is somewhat unexpected. - * Well, conversion from double to float goes via truncation, so if the most significant truncated - * mantissa bit is set the resulting float is incorrect by 1 ulp - -static inline float my_log2(float x) { return ::log2(static_cast(x)); } -static inline float my_log2(float x) { - double tmp = ::log2(static_cast(x)); - int e; - frexp(tmp, &e); // frexp(0.5) -> e = 0 - return tmp + ldexp(tmp < 0 ? 
-0.5 : 0.5, e - 24); -} - */ -static inline double my_log2(double x) { return ::log2(x); } -#else -static inline float my_log2(float x) { return ::logf(x) / Vc::Math::ln2(); } -static inline double my_log2(double x) { return ::log(x) / Vc::Math::ln2(); } -#endif -/*}}}*/ -template void testLog2()/*{{{*/ -{ -#if defined(VC_LOG_ILP) || defined(VC_LOG_ILP2) - setFuzzyness(3); -#else - setFuzzyness(1); -#endif -#if (defined(VC_MSVC) || defined(__APPLE__)) && defined(VC_IMPL_Scalar) - setFuzzyness(2); -#else - setFuzzyness(1); -#endif - typedef typename V::EntryType T; - Array > reference = referenceData(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, ref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - ref[j] = reference.data[i + j].ref; - } - FUZZY_COMPARE(Vc::log2(x), ref) << " x = " << x << ", i = " << i; - } - - COMPARE(Vc::log2(V::Zero()), V(my_log2(T(0)))); - for (int i = 0; i < NDenormals; i += V::Size) { - V x(&Denormals::data[i]); - V ref = apply_v(x, my_log2); - FUZZY_COMPARE(Vc::log2(x), ref) << ", x = " << x << ", i = " << i; - } -} -/*}}}*/ -template void testLog10()/*{{{*/ -{ - setFuzzyness(2); - setFuzzyness(2); - typedef typename V::EntryType T; - Array > reference = referenceData(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, ref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - ref[j] = reference.data[i + j].ref; - } - FUZZY_COMPARE(Vc::log10(x), ref) << " x = " << x << ", i = " << i; - } - - COMPARE(Vc::log10(V::Zero()), V(std::log10(T(0)))); - for (int i = 0; i < NDenormals; i += V::Size) { - V x(&Denormals::data[i]); - V ref = apply_v(x, std::log10); - FUZZY_COMPARE(Vc::log10(x), ref) << ", x = " << x << ", i = " << i; - } -} -/*}}}*/ -template void testMax()/*{{{*/ -{ - typedef typename Vec::EntryType T; - VectorMemoryHelper mem(3); - T *data = mem; - for (int i = 0; i < Vec::Size; ++i) { - data[i] = i; - data[i + Vec::Size] = 
Vec::Size + 1 - i; - data[i + 2 * Vec::Size] = std::max(data[i], data[i + Vec::Size]); - } - Vec a(&data[0]); - Vec b(&data[Vec::Size]); - Vec c(&data[2 * Vec::Size]); - - COMPARE(Vc::max(a, b), c); -} -/*}}}*/ - /*{{{*/ -#define FillHelperMemory(code) \ - typename V::Memory data; \ - typename V::Memory reference; \ - for (int ii = 0; ii < V::Size; ++ii) { \ - const T i = static_cast(ii); \ - data[ii] = i; \ - reference[ii] = code; \ - } do {} while (false) -/*}}}*/ -template void testSqrt()/*{{{*/ -{ - typedef typename V::EntryType T; - FillHelperMemory(std::sqrt(i)); - V a(data); - V b(reference); - - FUZZY_COMPARE(Vc::sqrt(a), b); -} -/*}}}*/ -template void testRSqrt()/*{{{*/ -{ - typedef typename V::EntryType T; - for (size_t i = 0; i < 1024 / V::Size; ++i) { - const V x = V::Random() * T(1000); - // RSQRTPS is documented as having a relative error <= 1.5 * 2^-12 - VERIFY(Vc::abs(Vc::rsqrt(x) * Vc::sqrt(x) - V::One()) < static_cast(std::ldexp(1.5, -12))); - } -} -/*}}}*/ -template void testSincos()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(2); - setFuzzyness(1e7); - Array > reference = sincosReference(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, sref, cref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - sref[j] = reference.data[i + j].s; - cref[j] = reference.data[i + j].c; - } - V sin, cos; - Vc::sincos(x, &sin, &cos); - FUZZY_COMPARE(sin, sref) << " x = " << x << ", i = " << i; - FUZZY_COMPARE(cos, cref) << " x = " << x << ", i = " << i; - Vc::sincos(-x, &sin, &cos); - FUZZY_COMPARE(sin, -sref) << " x = " << -x << ", i = " << i; - FUZZY_COMPARE(cos, cref) << " x = " << -x << ", i = " << i; - } -} -/*}}}*/ -template void testSin()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(2); - setFuzzyness(1e7); - Array > reference = sincosReference(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, sref; - for (int j = 0; j < V::Size; ++j) { - x[j] 
= reference.data[i + j].x; - sref[j] = reference.data[i + j].s; - } - FUZZY_COMPARE(Vc::sin(x), sref) << " x = " << x << ", i = " << i; - FUZZY_COMPARE(Vc::sin(-x), -sref) << " x = " << x << ", i = " << i; - } -} -/*}}}*/ -template void testCos()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(2); - setFuzzyness(1e7); - Array > reference = sincosReference(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, cref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - cref[j] = reference.data[i + j].c; - } - FUZZY_COMPARE(Vc::cos(x), cref) << " x = " << x << ", i = " << i; - FUZZY_COMPARE(Vc::cos(-x), cref) << " x = " << x << ", i = " << i; - } -} -/*}}}*/ -template void testAsin()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(2); - setFuzzyness(36); - Array > reference = referenceData(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, ref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - ref[j] = reference.data[i + j].ref; - } - FUZZY_COMPARE(Vc::asin(x), ref) << " x = " << x << ", i = " << i; - FUZZY_COMPARE(Vc::asin(-x), -ref) << " -x = " << -x << ", i = " << i; - } -} -/*}}}*/ -const union { - unsigned int hex; - float value; -} INF = { 0x7f800000 }; - -#if defined(__APPLE__) && defined(VC_IMPL_Scalar) -#define ATAN_COMPARE FUZZY_COMPARE -#else -#define ATAN_COMPARE COMPARE -#endif - -template void testAtan()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(3); - setFuzzyness(2); - - { - const V Pi_2 = T(Vc_buildDouble(1, 0x921fb54442d18ull, 0)); - V nan; nan.setQnan(); - const V inf = T(INF.value); - - VERIFY(Vc::isnan(Vc::atan(nan))); - ATAN_COMPARE(Vc::atan(+inf), +Pi_2); -#ifdef VC_MSVC -#pragma warning(suppress: 4756) // overflow in constant arithmetic -#endif - ATAN_COMPARE(Vc::atan(-inf), -Pi_2); - } - - Array > reference = referenceData(); - for (size_t i = 0; i + V::Size - 1 < reference.size; i += V::Size) { - V x, 
ref; - for (int j = 0; j < V::Size; ++j) { - x[j] = reference.data[i + j].x; - ref[j] = reference.data[i + j].ref; - } - FUZZY_COMPARE(Vc::atan(x), ref) << " x = " << x << ", i = " << i; - FUZZY_COMPARE(Vc::atan(-x), -ref) << " -x = " << -x << ", i = " << i; - } -} -/*}}}*/ -template void testAtan2()/*{{{*/ -{ - typedef typename V::EntryType T; - setFuzzyness(3); - setFuzzyness(2); - - { - const V Pi = T(Vc_buildDouble(1, 0x921fb54442d18ull, 1)); - const V Pi_2 = T(Vc_buildDouble(1, 0x921fb54442d18ull, 0)); - V nan; nan.setQnan(); - const V inf = T(INF.value); - - // If y is +0 (-0) and x is less than 0, +pi (-pi) is returned. - ATAN_COMPARE(Vc::atan2(V(T(+0.)), V(T(-3.))), +Pi); - ATAN_COMPARE(Vc::atan2(V(T(-0.)), V(T(-3.))), -Pi); - // If y is +0 (-0) and x is greater than 0, +0 (-0) is returned. - COMPARE(Vc::atan2(V(T(+0.)), V(T(+3.))), V(T(+0.))); - VERIFY(!Vc::atan2(V(T(+0.)), V(T(+3.))).isNegative()); - COMPARE(Vc::atan2(V(T(-0.)), V(T(+3.))), V(T(-0.))); - VERIFY (Vc::atan2(V(T(-0.)), V(T(+3.))).isNegative()); - // If y is less than 0 and x is +0 or -0, -pi/2 is returned. - COMPARE(Vc::atan2(V(T(-3.)), V(T(+0.))), -Pi_2); - COMPARE(Vc::atan2(V(T(-3.)), V(T(-0.))), -Pi_2); - // If y is greater than 0 and x is +0 or -0, pi/2 is returned. - COMPARE(Vc::atan2(V(T(+3.)), V(T(+0.))), +Pi_2); - COMPARE(Vc::atan2(V(T(+3.)), V(T(-0.))), +Pi_2); - // If either x or y is NaN, a NaN is returned. - VERIFY(Vc::isnan(Vc::atan2(nan, V(T(3.))))); - VERIFY(Vc::isnan(Vc::atan2(V(T(3.)), nan))); - VERIFY(Vc::isnan(Vc::atan2(nan, nan))); - // If y is +0 (-0) and x is -0, +pi (-pi) is returned. - ATAN_COMPARE(Vc::atan2(V(T(+0.)), V(T(-0.))), +Pi); - ATAN_COMPARE(Vc::atan2(V(T(-0.)), V(T(-0.))), -Pi); - // If y is +0 (-0) and x is +0, +0 (-0) is returned. 
- COMPARE(Vc::atan2(V(T(+0.)), V(T(+0.))), V(T(+0.))); - COMPARE(Vc::atan2(V(T(-0.)), V(T(+0.))), V(T(-0.))); - VERIFY(!Vc::atan2(V(T(+0.)), V(T(+0.))).isNegative()); - VERIFY( Vc::atan2(V(T(-0.)), V(T(+0.))).isNegative()); - // If y is a finite value greater (less) than 0, and x is negative infinity, +pi (-pi) is returned. - ATAN_COMPARE(Vc::atan2(V(T(+1.)), -inf), +Pi); - ATAN_COMPARE(Vc::atan2(V(T(-1.)), -inf), -Pi); - // If y is a finite value greater (less) than 0, and x is positive infinity, +0 (-0) is returned. - COMPARE(Vc::atan2(V(T(+3.)), +inf), V(T(+0.))); - VERIFY(!Vc::atan2(V(T(+3.)), +inf).isNegative()); - COMPARE(Vc::atan2(V(T(-3.)), +inf), V(T(-0.))); - VERIFY (Vc::atan2(V(T(-3.)), +inf).isNegative()); - // If y is positive infinity (negative infinity), and x is finite, pi/2 (-pi/2) is returned. - COMPARE(Vc::atan2(+inf, V(T(+3.))), +Pi_2); - COMPARE(Vc::atan2(-inf, V(T(+3.))), -Pi_2); - COMPARE(Vc::atan2(+inf, V(T(-3.))), +Pi_2); - COMPARE(Vc::atan2(-inf, V(T(-3.))), -Pi_2); -#ifndef _WIN32 // the Microsoft implementation of atan2 fails this test - const V Pi_4 = T(Vc_buildDouble(1, 0x921fb54442d18ull, -1)); - // If y is positive infinity (negative infinity) and x is negative infinity, +3*pi/4 (-3*pi/4) is returned. - COMPARE(Vc::atan2(+inf, -inf), T(+3.) * Pi_4); - COMPARE(Vc::atan2(-inf, -inf), T(-3.) * Pi_4); - // If y is positive infinity (negative infinity) and x is positive infinity, +pi/4 (-pi/4) is returned. 
- COMPARE(Vc::atan2(+inf, +inf), +Pi_4); - COMPARE(Vc::atan2(-inf, +inf), -Pi_4); -#endif - } - - for (int xoffset = -100; xoffset < 54613; xoffset += 47 * V::Size) { - for (int yoffset = -100; yoffset < 54613; yoffset += 47 * V::Size) { - FillHelperMemory(std::atan2((i + xoffset) * T(0.15), (i + yoffset) * T(0.15))); - const V a(data); - const V b(reference); - - const V x = (a + xoffset) * T(0.15); - const V y = (a + yoffset) * T(0.15); - FUZZY_COMPARE(Vc::atan2(x, y), b) << ", x = " << x << ", y = " << y; - } - } -} -/*}}}*/ -template void testReciprocal()/*{{{*/ -{ - typedef typename Vec::EntryType T; - setFuzzyness(1.258295e+07); - setFuzzyness(0); - const T one = 1; - for (int offset = -1000; offset < 1000; offset += 10) { - const T scale = T(0.1); - typename Vec::Memory data; - typename Vec::Memory reference; - for (int ii = 0; ii < Vec::Size; ++ii) { - const T i = static_cast(ii); - data[ii] = i; - T tmp = (i + offset) * scale; - reference[ii] = one / tmp; - } - Vec a(data); - Vec b(reference); - - FUZZY_COMPARE(Vc::reciprocal((a + offset) * scale), b); - } -} -/*}}}*/ -template void isNegative()/*{{{*/ -{ - typedef typename V::EntryType T; - VERIFY(V::One().isNegative().isEmpty()); - VERIFY(V::Zero().isNegative().isEmpty()); - VERIFY((-V::One()).isNegative().isFull()); - VERIFY(V(T(-0.)).isNegative().isFull()); -} -/*}}}*/ -template void testInf()/*{{{*/ -{ - typedef typename Vec::EntryType T; - const T one = 1; - const Vec zero(Zero); - VERIFY(Vc::isfinite(zero)); - VERIFY(Vc::isfinite(Vec(one))); - VERIFY(!Vc::isfinite(one / zero)); -} -/*}}}*/ -template void testNaN()/*{{{*/ -{ - typedef typename Vec::EntryType T; - typedef typename Vec::IndexType I; - typedef typename Vec::Mask M; - const T one = 1; - const Vec zero(Zero); - VERIFY(!Vc::isnan(zero)); - VERIFY(!Vc::isnan(Vec(one))); - const Vec inf = one / zero; - VERIFY(Vc::isnan(Vec(inf * zero))); - Vec nan = Vec::Zero(); - const M mask(I::IndexesFromZero() == I::Zero()); - nan.setQnan(mask); - 
COMPARE(Vc::isnan(nan), mask); - nan.setQnan(); - VERIFY(Vc::isnan(nan)); -} -/*}}}*/ -template void testRound()/*{{{*/ -{ - typedef typename Vec::EntryType T; - enum { - Count = (16 + Vec::Size) / Vec::Size - }; - VectorMemoryHelper mem1(Count); - VectorMemoryHelper mem2(Count); - T *data = mem1; - T *reference = mem2; - for (int i = 0; i < Count * Vec::Size; ++i) { - data[i] = i * 0.25 - 2.0; - reference[i] = std::floor(i * 0.25 - 2.0 + 0.5); - if (i % 8 == 2) { - reference[i] -= 1.; - } - //std::cout << reference[i] << " "; - } - //std::cout << std::endl; - for (int i = 0; i < Count; ++i) { - const Vec a(&data[i * Vec::Size]); - const Vec ref(&reference[i * Vec::Size]); - //std::cout << a << ref << std::endl; - COMPARE(Vc::round(a), ref); - } -} -/*}}}*/ -template void testReduceMin()/*{{{*/ -{ - typedef typename Vec::EntryType T; - const T one = 1; - VectorMemoryHelper mem(Vec::Size); - T *data = mem; - for (int i = 0; i < Vec::Size * Vec::Size; ++i) { - data[i] = i % (Vec::Size + 1) + one; - } - for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { - const Vec a(&data[0]); - //std::cout << a << std::endl; - COMPARE(a.min(), one); - } -} -/*}}}*/ -template void testReduceMax()/*{{{*/ -{ - typedef typename Vec::EntryType T; - const T max = Vec::Size + 1; - VectorMemoryHelper mem(Vec::Size); - T *data = mem; - for (int i = 0; i < Vec::Size * Vec::Size; ++i) { - data[i] = (i + Vec::Size) % (Vec::Size + 1) + 1; - } - for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { - const Vec a(&data[0]); - //std::cout << a << std::endl; - COMPARE(a.max(), max); - } -} -/*}}}*/ -template void testReduceProduct()/*{{{*/ -{ - enum { - Max = Vec::Size > 8 ? 
Vec::Size / 2 : Vec::Size - }; - typedef typename Vec::EntryType T; - int _product = 1; - for (int i = 1; i < Vec::Size; ++i) { - _product *= (i % Max) + 1; - } - const T product = _product; - VectorMemoryHelper mem(Vec::Size); - T *data = mem; - for (int i = 0; i < Vec::Size * Vec::Size; ++i) { - data[i] = ((i + (i / Vec::Size)) % Max) + 1; - } - for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { - const Vec a(&data[0]); - //std::cout << a << std::endl; - COMPARE(a.product(), product); - } -} -/*}}}*/ -template void testReduceSum()/*{{{*/ -{ - typedef typename Vec::EntryType T; - int _sum = 1; - for (int i = 2; i <= Vec::Size; ++i) { - _sum += i; - } - const T sum = _sum; - VectorMemoryHelper mem(Vec::Size); - T *data = mem; - for (int i = 0; i < Vec::Size * Vec::Size; ++i) { - data[i] = (i + i / Vec::Size) % Vec::Size + 1; - } - for (int i = 0; i < Vec::Size; ++i, data += Vec::Size) { - const Vec a(&data[0]); - //std::cout << a << std::endl; - COMPARE(a.sum(), sum); - } -} -/*}}}*/ -template void testExponent()/*{{{*/ -{ - typedef typename V::EntryType T; - Vc::Memory input; - Vc::Memory expected; - input[ 0] = T(0.25); expected[ 0] = T(-2); - input[ 1] = T( 1); expected[ 1] = T( 0); - input[ 2] = T( 2); expected[ 2] = T( 1); - input[ 3] = T( 3); expected[ 3] = T( 1); - input[ 4] = T( 4); expected[ 4] = T( 2); - input[ 5] = T( 0.5); expected[ 5] = T(-1); - input[ 6] = T( 6); expected[ 6] = T( 2); - input[ 7] = T( 7); expected[ 7] = T( 2); - input[ 8] = T( 8); expected[ 8] = T( 3); - input[ 9] = T( 9); expected[ 9] = T( 3); - input[10] = T( 10); expected[10] = T( 3); - input[11] = T( 11); expected[11] = T( 3); - input[12] = T( 12); expected[12] = T( 3); - input[13] = T( 13); expected[13] = T( 3); - input[14] = T( 14); expected[14] = T( 3); - input[15] = T( 15); expected[15] = T( 3); - input[16] = T( 16); expected[16] = T( 4); - input[17] = T( 17); expected[17] = T( 4); - input[18] = T( 18); expected[18] = T( 4); - input[19] = T( 19); expected[19] = T( 4); - 
input[20] = T( 20); expected[20] = T( 4); - input[21] = T( 21); expected[21] = T( 4); - input[22] = T( 22); expected[22] = T( 4); - input[23] = T( 23); expected[23] = T( 4); - input[24] = T( 24); expected[24] = T( 4); - input[25] = T( 25); expected[25] = T( 4); - input[26] = T( 26); expected[26] = T( 4); - input[27] = T( 27); expected[27] = T( 4); - input[28] = T( 28); expected[28] = T( 4); - input[29] = T( 29); expected[29] = T( 4); - input[30] = T( 32); expected[30] = T( 5); - input[31] = T( 31); expected[31] = T( 4); - for (size_t i = 0; i < input.vectorsCount(); ++i) { - COMPARE(V(input.vector(i)).exponent(), V(expected.vector(i))); - } -} -/*}}}*/ -template struct _ExponentVector { typedef int_v Type; }; -template<> struct _ExponentVector { typedef short_v Type; }; - -template void testFrexp()/*{{{*/ -{ - typedef typename V::EntryType T; - typedef typename _ExponentVector::Type ExpV; - Vc::Memory input; - Vc::Memory expectedFraction; - Vc::Memory expectedExponent; - input[ 0] = T(0.25); expectedFraction[ 0] = T(.5 ); expectedExponent[ 0] = -1; - input[ 1] = T( 1); expectedFraction[ 1] = T(.5 ); expectedExponent[ 1] = 1; - input[ 2] = T( 0); expectedFraction[ 2] = T(0. ); expectedExponent[ 2] = 0; - input[ 3] = T( 3); expectedFraction[ 3] = T(.75 ); expectedExponent[ 3] = 2; - input[ 4] = T( 4); expectedFraction[ 4] = T(.5 ); expectedExponent[ 4] = 3; - input[ 5] = T( 0.5); expectedFraction[ 5] = T(.5 ); expectedExponent[ 5] = 0; - input[ 6] = T( 6); expectedFraction[ 6] = T( 6./8. ); expectedExponent[ 6] = 3; - input[ 7] = T( 7); expectedFraction[ 7] = T( 7./8. 
); expectedExponent[ 7] = 3; - input[ 8] = T( 8); expectedFraction[ 8] = T( 8./16.); expectedExponent[ 8] = 4; - input[ 9] = T( 9); expectedFraction[ 9] = T( 9./16.); expectedExponent[ 9] = 4; - input[10] = T( 10); expectedFraction[10] = T(10./16.); expectedExponent[10] = 4; - input[11] = T( 11); expectedFraction[11] = T(11./16.); expectedExponent[11] = 4; - input[12] = T( 12); expectedFraction[12] = T(12./16.); expectedExponent[12] = 4; - input[13] = T( 13); expectedFraction[13] = T(13./16.); expectedExponent[13] = 4; - input[14] = T( 14); expectedFraction[14] = T(14./16.); expectedExponent[14] = 4; - input[15] = T( 15); expectedFraction[15] = T(15./16.); expectedExponent[15] = 4; - input[16] = T( 16); expectedFraction[16] = T(16./32.); expectedExponent[16] = 5; - input[17] = T( 17); expectedFraction[17] = T(17./32.); expectedExponent[17] = 5; - input[18] = T( 18); expectedFraction[18] = T(18./32.); expectedExponent[18] = 5; - input[19] = T( 19); expectedFraction[19] = T(19./32.); expectedExponent[19] = 5; - input[20] = T( 20); expectedFraction[20] = T(20./32.); expectedExponent[20] = 5; - input[21] = T( 21); expectedFraction[21] = T(21./32.); expectedExponent[21] = 5; - input[22] = T( 22); expectedFraction[22] = T(22./32.); expectedExponent[22] = 5; - input[23] = T( 23); expectedFraction[23] = T(23./32.); expectedExponent[23] = 5; - input[24] = T( 24); expectedFraction[24] = T(24./32.); expectedExponent[24] = 5; - input[25] = T( 25); expectedFraction[25] = T(25./32.); expectedExponent[25] = 5; - input[26] = T( 26); expectedFraction[26] = T(26./32.); expectedExponent[26] = 5; - input[27] = T( 27); expectedFraction[27] = T(27./32.); expectedExponent[27] = 5; - input[28] = T( 28); expectedFraction[28] = T(28./32.); expectedExponent[28] = 5; - input[29] = T( 29); expectedFraction[29] = T(29./32.); expectedExponent[29] = 5; - input[30] = T( 32); expectedFraction[30] = T(32./64.); expectedExponent[30] = 6; - input[31] = T( 31); expectedFraction[31] = T(31./32.); 
expectedExponent[31] = 5; - for (size_t i = 0; i < input.vectorsCount(); ++i) { - const V v = input.vector(i); - ExpV exp; - COMPARE(frexp(v, &exp), V(expectedFraction.vector(i))); - if (V::Size * 2 == ExpV::Size) { - for (size_t j = 0; j < V::Size; ++j) { - COMPARE(exp[j * 2], expectedExponent[i * V::Size + j]); - } - } else { - COMPARE(exp, ExpV(expectedExponent.vector(i))); - } - } -} -/*}}}*/ -template void testLdexp()/*{{{*/ -{ - typedef typename V::EntryType T; - typedef typename _ExponentVector::Type ExpV; - for (size_t i = 0; i < 1024 / V::Size; ++i) { - const V v = (V::Random() - T(0.5)) * T(1000); - ExpV e; - const V m = frexp(v, &e); - COMPARE(ldexp(m, e), v) << ", m = " << m << ", e = " << e; - } -} -/*}}}*/ -#include "ulp.h" -template void testUlpDiff()/*{{{*/ -{ - typedef typename V::EntryType T; - - COMPARE(ulpDiffToReference(V::Zero(), V::Zero()), V::Zero()); - COMPARE(ulpDiffToReference(std::numeric_limits::min(), V::Zero()), V::One()); - COMPARE(ulpDiffToReference(V::Zero(), std::numeric_limits::min()), V::One()); - for (size_t count = 0; count < 1024 / V::Size; ++count) { - const V base = (V::Random() - T(0.5)) * T(1000); - typename _Ulp_ExponentVector::Type exp; - Vc::frexp(base, &exp); - const V eps = ldexp(V(std::numeric_limits::epsilon()), exp - 1); - //std::cout << base << ", " << exp << ", " << eps << std::endl; - for (int i = -10000; i <= 10000; ++i) { - const V i_v = V(T(i)); - const V diff = base + i_v * eps; - - // if diff and base have a different exponent then ulpDiffToReference has an uncertainty - // of +/-1 - const V ulpDifference = ulpDiffToReference(diff, base); - const V expectedDifference = Vc::abs(i_v); - const V maxUncertainty = Vc::abs(abs(diff).exponent() - abs(base).exponent()); - - VERIFY(Vc::abs(ulpDifference - expectedDifference) <= maxUncertainty) - << ", base = " << base << ", epsilon = " << eps << ", diff = " << diff; - for (int k = 0; k < V::Size; ++k) { - VERIFY(std::abs(ulpDifference[k] - expectedDifference[k]) <= 
maxUncertainty[k]); - } - } - } -}/*}}}*/ - -int main(int argc, char **argv)/*{{{*/ -{ - initTest(argc, argv); - - Denormals::data = Vc::malloc(NDenormals);/*{{{*/ - Denormals::data[0] = std::numeric_limits::denorm_min(); - for (int i = 1; i < NDenormals; ++i) { - Denormals::data[i] = Denormals::data[i - 1] * 2.173f; - } - Denormals::data = Vc::malloc(NDenormals); - Denormals::data[0] = std::numeric_limits::denorm_min(); - for (int i = 1; i < NDenormals; ++i) { - Denormals::data[i] = Denormals::data[i - 1] * 2.173; - }/*}}}*/ - - testRealTypes(isNegative); - testRealTypes(testFrexp); - testRealTypes(testLdexp); - - runTest(testAbs); - runTest(testAbs); - runTest(testAbs); - runTest(testAbs); - runTest(testAbs); - - testRealTypes(testUlpDiff); - - testRealTypes(testTrunc); - testRealTypes(testFloor); - testRealTypes(testCeil); - testRealTypes(testExp); - testRealTypes(testLog); - testRealTypes(testLog2); - testRealTypes(testLog10); - - runTest(testMax); - runTest(testMax); - runTest(testMax); - runTest(testMax); - runTest(testMax); - runTest(testMax); - runTest(testMax); - - testRealTypes(testSqrt); - testRealTypes(testRSqrt); - testRealTypes(testSin); - testRealTypes(testCos); - testRealTypes(testAsin); - testRealTypes(testAtan); - testRealTypes(testAtan2); - testRealTypes(testReciprocal); - testRealTypes(testInf); - testRealTypes(testNaN); - testRealTypes(testRound); - - runTest(testReduceMin); - runTest(testReduceMin); - runTest(testReduceMin); - runTest(testReduceMin); - runTest(testReduceMin); - runTest(testReduceMin); - runTest(testReduceMin); - - runTest(testReduceMax); - runTest(testReduceMax); - runTest(testReduceMax); - runTest(testReduceMax); - runTest(testReduceMax); - runTest(testReduceMax); - runTest(testReduceMax); - - runTest(testReduceProduct); - runTest(testReduceProduct); - runTest(testReduceProduct); - runTest(testReduceProduct); - runTest(testReduceProduct); - runTest(testReduceProduct); - runTest(testReduceProduct); - - runTest(testReduceSum); 
- runTest(testReduceSum); - runTest(testReduceSum); - runTest(testReduceSum); - runTest(testReduceSum); - runTest(testReduceSum); - runTest(testReduceSum); - - testRealTypes(testSincos); - testRealTypes(testExponent); - - return 0; -}/*}}}*/ - -// vim: foldmethod=marker diff --git a/math/vc/tests/memory.cpp b/math/vc/tests/memory.cpp deleted file mode 100644 index 9091b14966e1a..0000000000000 --- a/math/vc/tests/memory.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "unittest.h" - -using namespace Vc; - -template class TestClass> struct TestWrapper -{ - static inline void run() - { - TestWrapper::run(); - TestClass::test(); - TestClass::test(); - } -}; -template class TestClass> struct TestWrapper { - static inline void run() {} -}; - -template struct TestEntries { static void test() { - typedef typename V::EntryType T; - const T x = Size; - Memory m; - const Memory &m2 = m; - Memory m3(Size); - for (unsigned int i = 0; i < Size; ++i) { - m[i] = x; - m3[i] = x; - } - for (unsigned int i = 0; i < Size; ++i) { - COMPARE(m[i], x); - COMPARE(m2[i], x); - COMPARE(m3[i], x); - } - for (unsigned int i = 0; i < Size; ++i) { - COMPARE(m.entries()[i], x); - COMPARE(m2.entries()[i], x); - COMPARE(m3.entries()[i], x); - } - const T *ptr = m2; - for (unsigned int i = 0; i < Size; ++i) { - COMPARE(ptr[i], x); - } - ptr = m3; - for (unsigned int i = 0; i < Size; ++i) { - COMPARE(ptr[i], x); - } -}}; - -template struct TestEntries2D { static void test() { - typedef typename V::EntryType T; - const T x = Size; - Memory m; - const Memory &m2 = m; - - for (size_t i = 0; i < Size; ++i) { - for (size_t j = 0; j < Size; ++j) { - m[i][j] = x + i + j; - } - } - for (size_t i = 0; i < Size; ++i) { - for (size_t j = 0; j < Size; ++j) { - COMPARE(m[i][j], T(x + i + j)); - COMPARE(m2[i][j], T(x + i + j)); - } - } - for (size_t i = 0; i < Size; ++i) { - for (size_t j = 0; j < Size; ++j) { - COMPARE(m[i].entries()[j], T(x + i + j)); - COMPARE(m2[i].entries()[j], T(x + i + j)); - } - } - for (size_t i = 0; i < Size; ++i) { - const T *ptr = m2[i]; - for (size_t j = 0; j < Size; ++j) { - COMPARE(ptr[j], T(x + i + j)); - } - } -}}; - -template struct TestVectors { static void test() -{ - const V startX(V::IndexType::IndexesFromZero() + Size); - Memory m; - const Memory &m2 = m; - Memory m3(Size); - V x = startX; - for (unsigned int i = 0; i < m.vectorsCount(); ++i, x += V::Size) { - m.vector(i) = x; - m3.vector(i) = x; - } - x = startX; - 
unsigned int i; - for (i = 0; i + 1 < m.vectorsCount(); ++i) { - COMPARE(V(m.vector(i)), x); - COMPARE(V(m2.vector(i)), x); - COMPARE(V(m3.vector(i)), x); - for (int shift = 0; shift < V::Size; ++shift, ++x) { - COMPARE(V(m.vector(i, shift)), x); - COMPARE(V(m2.vector(i, shift)), x); - COMPARE(V(m3.vector(i, shift)), x); - } - } - COMPARE(V(m.vector(i)), x); - COMPARE(V(m2.vector(i)), x); - COMPARE(V(m3.vector(i)), x); -}}; - -template struct TestVectors2D { static void test() -{ - const V startX(V::IndexType::IndexesFromZero() + Size); - Memory m; - const Memory &m2 = m; - V x = startX; - for (size_t i = 0; i < m.rowsCount(); ++i, x += V::Size) { - Memory &mrow = m[i]; - for (size_t j = 0; j < mrow.vectorsCount(); ++j, x += V::Size) { - mrow.vector(j) = x; - } - } - x = startX; - for (size_t i = 0; i < m.rowsCount(); ++i, x += V::Size) { - Memory &mrow = m[i]; - const Memory &m2row = m2[i]; - size_t j; - for (j = 0; j < mrow.vectorsCount() - 1; ++j) { - COMPARE(V(mrow.vector(j)), x); - COMPARE(V(m2row.vector(j)), x); - for (int shift = 0; shift < V::Size; ++shift, ++x) { - COMPARE(V(mrow.vector(j, shift)), x); - COMPARE(V(m2row.vector(j, shift)), x); - } - } - COMPARE(V(mrow.vector(j)), x) << i << " " << j; - COMPARE(V(m2row.vector(j)), x); - x += V::Size; - } -}}; - -template struct TestVectorReorganization { static void test() -{ - typename V::Memory init; - for (unsigned int i = 0; i < V::Size; ++i) { - init[i] = i; - } - V x(init); - Memory m; - Memory m3(Size); - for (unsigned int i = 0; i < m.vectorsCount(); ++i) { - m.vector(i) = x; - m3.vector(i) = x; - x += V::Size; - } - /////////////////////////////////////////////////////////////////////////// - x = V(init); - for (unsigned int i = 0; i < m.vectorsCount(); ++i) { - COMPARE(V(m.vector(i)), x); - COMPARE(V(m3.vector(i)), x); - x += V::Size; - } - /////////////////////////////////////////////////////////////////////////// - x = V(init); - unsigned int indexes[Size]; - for (unsigned int i = 0; i < Size; 
++i) { - indexes[i] = i; - } - for (unsigned int i = 0; i + V::Size < Size; ++i) { - COMPARE(m.gather(&indexes[i]), x); - COMPARE(m3.gather(&indexes[i]), x); - x += 1; - } - /////////////////////////////////////////////////////////////////////////// - for (unsigned int i = 0; i < V::Size; ++i) { - init[i] = i * 2; - } - x = V(init); - for (unsigned int i = 0; i < Size; ++i) { - indexes[i] = (i * 2) % Size; - } - for (unsigned int i = 0; i + V::Size < Size; ++i) { - COMPARE(m.gather(&indexes[i]), x); - COMPARE(m3.gather(&indexes[i]), x); - x += 2; - x(x >= Size) -= Size; - } -}}; - -template void testEntries() -{ - TestWrapper::run(); -} - -template void testEntries2D() -{ - TestWrapper::run(); -} - -template void testVectors() -{ - TestWrapper::run(); -} - -template void testVectors2D() -{ - TestWrapper::run(); -} - -template void testVectorReorganization() -{ - TestWrapper::run(); -} - -template void memoryOperators() -{ - Memory m1, m2; - m1.setZero(); - m2.setZero(); - VERIFY(m1 == m2); - VERIFY(!(m1 != m2)); - VERIFY(!(m1 < m2)); - VERIFY(!(m1 > m2)); - m1 += m2; - VERIFY(m1 == m2); - VERIFY(m1 <= m2); - VERIFY(m1 >= m2); - m1 += 1; - VERIFY(m1 != m2); - VERIFY(m1 > m2); - VERIFY(m1 >= m2); - VERIFY(m2 < m1); - VERIFY(m2 <= m1); - VERIFY(!(m1 == m2)); - VERIFY(!(m1 <= m2)); - VERIFY(!(m2 >= m1)); - m2 += m1; - VERIFY(m1 == m2); - m2 *= 2; - m1 += 1; - VERIFY(m1 == m2); - m2 /= 2; - m1 -= 1; - VERIFY(m1 == m2); - m1 *= m2; - VERIFY(m1 == m2); - m1 /= m2; - VERIFY(m1 == m2); - m1 -= m2; - m2 -= m2; - VERIFY(m1 == m2); -} - -template void testCCtor() -{ - Memory m1(5); - for (size_t i = 0; i < m1.entriesCount(); ++i) { - m1[i] = i; - } - Memory m2(m1); - for (size_t i = 0; i < m1.entriesCount(); ++i) { - m1[i] += 1; - } - for (size_t i = 0; i < m1.entriesCount(); ++i) { - COMPARE(m1[i], m2[i] + 1); - } -} - -template void testCopyAssignment() -{ - typedef typename V::EntryType T; - - Memory m1; - m1.setZero(); - - Memory m2(m1); - for (size_t i = 0; i < 
m2.entriesCount(); ++i) { - COMPARE(m2[i], T(0)); - m2[i] += 1; - } - m1 = m2; - for (size_t i = 0; i < m2.entriesCount(); ++i) { - COMPARE(m1[i], T(1)); - } -} - -int main() -{ - testAllTypes(testEntries); - testAllTypes(testEntries2D); - testAllTypes(testVectors); - testAllTypes(testVectors2D); - testAllTypes(testVectorReorganization); - testAllTypes(memoryOperators); - testAllTypes(testCCtor); - testAllTypes(testCopyAssignment); - - return 0; -} diff --git a/math/vc/tests/scalaraccess.cpp b/math/vc/tests/scalaraccess.cpp deleted file mode 100644 index 6146414824095..0000000000000 --- a/math/vc/tests/scalaraccess.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2010-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "unittest.h" - -using namespace Vc; - -template void reads() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - - V a = V::Zero(); - const T zero = 0; - for (int i = 0; i < V::Size; ++i) { - const T x = a[i]; - COMPARE(x, zero); - } - a = static_cast(I::IndexesFromZero()); - for (int i = 0; i < V::Size; ++i) { - const T x = a[i]; - const T y = i; - COMPARE(x, y); - } -} - -template -inline void readsConstantIndexTest(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) -{ - typedef typename V::EntryType T; - { - const T x = a[Index]; - const T zero = 0; - COMPARE(x, zero) << Index; - }{ - const T x = b[Index]; - const T y = Index; - COMPARE(x, y) << Index; - } -} - -template -struct ReadsConstantIndex -{ - ReadsConstantIndex(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) - { - readsConstantIndexTest(a, b); - ReadsConstantIndex(a, b); - } -}; - - -template -struct ReadsConstantIndex -{ - ReadsConstantIndex(VC_ALIGNED_PARAMETER(V) a, VC_ALIGNED_PARAMETER(V) b) - { - readsConstantIndexTest(a, b); - } -}; - -template void readsConstantIndex() -{ - typedef typename V::IndexType I; - - V a = V::Zero(); - V b = static_cast(I::IndexesFromZero()); - ReadsConstantIndex(a, b); -} - -template void writes() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - - V a; - for (int i = 0; i < V::Size; ++i) { - a[i] = static_cast(i); - } - V b = static_cast(I::IndexesFromZero()); - COMPARE(a, b); - - const T one = 1; - const T two = 2; - - if (V::Size == 1) { - a(a == 0) += one; - a[0] += one; - a(a == 0) += one; - COMPARE(a, V(2)); - } else if (V::Size == 4) { - a(a == 1) += two; - a[2] += one; - a(a == 3) += one; - b(b == 1) += one; - b(b == 2) += one; - b(b == 3) += one; - COMPARE(a, b); - } else if (V::Size == 8 || V::Size == 16) { - a(a == 2) += two; - a[3] += one; - a(a == 4) += one; - b(b == 2) += one; - b(b == 3) += one; - b(b == 4) += one; - COMPARE(a, b); - } else if (V::Size == 2) { // a = [0, 1]; b = 
[0, 1] - a(a == 0) += two; // a = [2, 1] - a[1] += one; // a = [2, 2] - a(a == 2) += one; // a = [3, 3] - b(b == 0) += one; // b = [1, 1] - b(b == 1) += one; // b = [2, 2] - b(b == 2) += one; // b = [3, 3] - COMPARE(a, b); - } else { - FAIL() << "unsupported Vector::Size"; - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - testAllTypes(reads); - testAllTypes(writes); - testAllTypes(readsConstantIndex); - //testAllTypes(writesConstantIndex); - - return 0; -} diff --git a/math/vc/tests/scatter.cpp b/math/vc/tests/scatter.cpp deleted file mode 100644 index ce2191b6ad1e4..0000000000000 --- a/math/vc/tests/scatter.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ -// includes {{{1 -#include "unittest.h" -#include -#include - -using namespace Vc; - -template void maskedScatterArray() //{{{1 -{ - typedef typename Vec::IndexType It; - typedef typename Vec::EntryType T; - - T mem[Vec::Size]; - const Vec v(It::IndexesFromZero() + 1); - - for_all_masks(Vec, m) { - Vec::Zero().store(mem, Vc::Unaligned); - v.scatter(&mem[0], It::IndexesFromZero(), m); - - for (int i = 0; i < Vec::Size; ++i) { - COMPARE(mem[i], m[i] ? 
v[i] : T(0)) << " i = " << i << ", m = " << m; - } - } -} - -template void scatterArray() //{{{1 -{ - typedef typename Vec::IndexType It; - const int count = 31999; - typename Vec::EntryType array[count], out[count]; - for (int i = 0; i < count; ++i) { - array[i] = i - 100; - } - typename It::Mask mask; - for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - typename Vec::Mask castedMask(mask); - if (castedMask.isFull()) { - Vec a(array, i); - a += Vec(One); - a.scatter(out, i); - } else { - Vec a(array, i, castedMask); - a += Vec(One); - a.scatter(out, i, castedMask); - } - } - for (int i = 0; i < count; ++i) { - array[i] += 1; - COMPARE(array[i], out[i]); - } - COMPARE(0, std::memcmp(array, out, count * sizeof(typename Vec::EntryType))); -} - -template struct Struct //{{{1 -{ - T a; - char x; - T b; - short y; - T c; - char z; -}; - -template void scatterStruct() //{{{1 -{ - typedef typename Vec::IndexType It; - typedef Struct S; - const int count = 3999; - S array[count], out[count]; - memset(array, 0, count * sizeof(S)); - memset(out, 0, count * sizeof(S)); - for (int i = 0; i < count; ++i) { - array[i].a = i; - array[i].b = i + 1; - array[i].c = i + 2; - } - typename It::Mask mask; - for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - typename Vec::Mask castedMask(mask); - Vec a(array, &S::a, i, castedMask); - Vec b(array, &S::b, i, castedMask); - Vec c(array, &S::c, i, castedMask); - a.scatter(out, &S::a, i, castedMask); - b.scatter(out, &S::b, i, castedMask); - c.scatter(out, &S::c, i, castedMask); - } - VERIFY(0 == memcmp(array, out, count * sizeof(S))); -} - -template struct Struct2 //{{{1 -{ - char x; - Struct b; - short y; -}; - -template void scatterStruct2() //{{{1 -{ - typedef typename Vec::IndexType It; - typedef Struct2 S1; - typedef Struct S2; - const int count = 97; - S1 array[count], out[count]; - memset(array, 0, count * sizeof(S1)); - memset(out, 0, count * sizeof(S1)); - for (int i = 0; i 
< count; ++i) { - array[i].b.a = i + 0; - array[i].b.b = i + 1; - array[i].b.c = i + 2; - } - typename It::Mask mask; - for (It i(IndexesFromZero); !(mask = (i < count)).isEmpty(); i += Vec::Size) { - typename Vec::Mask castedMask(mask); - Vec a(array, &S1::b, &S2::a, i, castedMask); - Vec b(array, &S1::b, &S2::b, i, castedMask); - Vec c(array, &S1::b, &S2::c, i, castedMask); - a.scatter(out, &S1::b, &S2::a, i, castedMask); - b.scatter(out, &S1::b, &S2::b, i, castedMask); - c.scatter(out, &S1::b, &S2::c, i, castedMask); - } - VERIFY(0 == memcmp(array, out, count * sizeof(S1))); -} - -int main(int argc, char **argv) //{{{1 -{ - initTest(argc, argv); - - runTest(scatterArray); - runTest(scatterArray); - runTest(scatterArray); - runTest(scatterArray); - runTest(scatterArray); - runTest(scatterArray); - runTest(scatterArray); - testAllTypes(maskedScatterArray); -#if defined(VC_CLANG) && VC_CLANG <= 0x030000 - // clang fails with: - // candidate template ignored: failed template argument deduction - // template inline Vector(const S1 *array, const T S1::* - // member1, IT indexes, Mask mask = true) -#warning "Skipping compilation of tests scatterStruct and scatterStruct2 because of clang bug" -#else - runTest(scatterStruct); - runTest(scatterStruct); - runTest(scatterStruct); - runTest(scatterStruct); - runTest(scatterStruct); - runTest(scatterStruct); - runTest(scatterStruct); - testAllTypes(scatterStruct2); -#endif - return 0; -} - -// vim: foldmethod=marker diff --git a/math/vc/tests/sse_blend.cpp b/math/vc/tests/sse_blend.cpp deleted file mode 100644 index 480d77de7ed35..0000000000000 --- a/math/vc/tests/sse_blend.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* This file is part of the Vc library. 
- - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include - -namespace std -{ -ostream &operator<<(ostream &out, const __m128i &v) -{ - union { - __m128i v; - short m[8]; - } x = { v }; - - out << "[" << x.m[0]; - for (int i = 1; i < 8; ++i) { - out << ", " << x.m[i]; - } - return out << "]"; -} -} // namespace std - -template<> inline bool unittest_compareHelper<__m128i, __m128i>(const __m128i &a, const __m128i &b) -{ - return _mm_movemask_epi8(_mm_cmpeq_epi16(a, b)) == 0xffff; -} - -void blendpd() -{ -#ifdef VC_IMPL_SSE4_1 -#define blend _mm_blend_pd -#else -#define blend Vc::SSE::mm_blend_pd -#endif - __m128d a = _mm_set_pd(11, 10); - __m128d b = _mm_set_pd(21, 20); - - COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x0), a)), 0x3); - COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x1), _mm_set_pd(11, 20))), 0x3); - COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x2), _mm_set_pd(21, 10))), 0x3); - COMPARE(_mm_movemask_pd(_mm_cmpeq_pd(blend(a, b, 0x3), b)), 0x3); -#undef blend -} -void blendps() -{ -#ifdef VC_IMPL_SSE4_1 -#define blend _mm_blend_ps -#else -#define blend Vc::SSE::mm_blend_ps -#endif - __m128 a = _mm_set_ps(13, 12, 11, 10); - __m128 b = _mm_set_ps(23, 22, 21, 20); - - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x0), a)), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x1), _mm_set_ps(13, 12, 11, 20))), 0xf); - 
COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x2), _mm_set_ps(13, 12, 21, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x3), _mm_set_ps(13, 12, 21, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x4), _mm_set_ps(13, 22, 11, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x5), _mm_set_ps(13, 22, 11, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x6), _mm_set_ps(13, 22, 21, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x7), _mm_set_ps(13, 22, 21, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x8), _mm_set_ps(23, 12, 11, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0x9), _mm_set_ps(23, 12, 11, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xa), _mm_set_ps(23, 12, 21, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xb), _mm_set_ps(23, 12, 21, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xc), _mm_set_ps(23, 22, 11, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xd), _mm_set_ps(23, 22, 11, 20))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xe), _mm_set_ps(23, 22, 21, 10))), 0xf); - COMPARE(_mm_movemask_ps(_mm_cmpeq_ps(blend(a, b, 0xf), b)), 0xf); -#undef blend -} -void blendepi16() -{ -#ifdef VC_IMPL_SSE4_1 -#define blend _mm_blend_epi16 -#else -#define blend Vc::SSE::mm_blend_epi16 -#endif - __m128i a = _mm_set_epi16(17, 16, 15, 14, 13, 12, 11, 10); - __m128i b = _mm_set_epi16(27, 26, 25, 24, 23, 22, 21, 20); - -#define CALL_2(_i, code) { enum { i = _i }; code } { enum { i = _i + 1 }; code } -#define CALL_4(_i, code) CALL_2(_i, code) CALL_2(_i + 2, code) -#define CALL_8(_i, code) CALL_4(_i, code) CALL_4(_i + 4, code) -#define CALL_16(_i, code) CALL_8(_i, code) CALL_8(_i + 8, code) -#define CALL_32(_i, code) CALL_16(_i, code) CALL_16(_i + 16, code) -#define CALL_64(_i, code) CALL_32(_i, code) CALL_32(_i + 32, code) -#define CALL_128(_i, code) CALL_64(_i, 
code) CALL_64(_i + 64, code) -#define CALL_256(code) CALL_128(0, code) CALL_128(128, code) -#define CALL_100(code) CALL_64(0, code) CALL_32(64, code) CALL_4(96, code) - - CALL_256( - short r[8]; - for (int j = 0; j < 8; ++j) { - r[j] = j + ((((i >> j) & 1) == 0) ? 10 : 20); - } - __m128i reference = _mm_set_epi16(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); - COMPARE_NOEQ(blend(a, b, i), reference); - ) -#undef blend -} - -int main() -{ - runTest(blendpd); - runTest(blendps); - runTest(blendepi16); -} diff --git a/math/vc/tests/stlcontainer.cpp b/math/vc/tests/stlcontainer.cpp deleted file mode 100644 index 8c3e6037d4a14..0000000000000 --- a/math/vc/tests/stlcontainer.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/*{{{ - Copyright (C) 2012 Matthias Kretz - - Permission to use, copy, modify, and distribute this software - and its documentation for any purpose and without fee is hereby - granted, provided that the above copyright notice appear in all - copies and that both that the copyright notice and this - permission notice and warranty disclaimer appear in supporting - documentation, and that the name of the author not be used in - advertising or publicity pertaining to distribution of the - software without specific, written prior permission. - - The author disclaim all warranties with regard to this - software, including all implied warranties of merchantability - and fitness. In no event shall the author be liable for any - special, indirect or consequential damages or any damages - whatsoever resulting from loss of use, data or profits, whether - in an action of contract, negligence or other tortious action, - arising out of or in connection with the use or performance of - this software. - -}}}*/ - -#include -#include -#include "unittest.h" - -#include "Vc/common/macros.h" - -template size_t alignmentMask() -{ - if (Vec::Size == 1) { - // on 32bit the maximal alignment is 4 Bytes, even for 8-Byte doubles. 
- return std::min(sizeof(void*), sizeof(typename Vec::EntryType)) - 1; - } - // sizeof(SSE::sfloat_v) is too large - // AVX::VectorAlignment is too large - return std::min(sizeof(Vec), Vc::VectorAlignment) - 1; -} - -template struct SomeStruct { char a; T x; }; - -template void stdVectorAlignment() -{ - const size_t mask = alignmentMask(); - const char *const null = 0; - - std::vector v(11); - for (int i = 0; i < 11; ++i) { - COMPARE((reinterpret_cast(&v[i]) - null) & mask, 0u) << "&v[i] = " << &v[i] << ", mask = " << mask << ", i = " << i; - } - - std::vector, Vc::Allocator > > v2(11); - for (int i = 0; i < 11; ++i) { - COMPARE((reinterpret_cast(&v2[i]) - null) & mask, 0u) << "&v2[i] = " << &v2[i] << ", mask = " << mask << ", i = " << i; - } - - std::vector v3(v); - std::vector, Vc::Allocator > > v4(v2); - - typedef typename V::EntryType T; - for (int i = 1; i < 100; ++i) { - std::vector > v5(i); - const size_t expectedAlignment = Vc_ALIGNOF(V); - COMPARE((&v5[0] - static_cast(0)) * sizeof(T) & (expectedAlignment - 1), 0u); - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - using namespace Vc; - testAllTypes(stdVectorAlignment); -} diff --git a/math/vc/tests/store.cpp b/math/vc/tests/store.cpp deleted file mode 100644 index aeb3792374534..0000000000000 --- a/math/vc/tests/store.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2011 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#include "unittest.h" -#include -#include - -using namespace Vc; - -template void alignedStore() -{ - typedef typename Vec::EntryType T; - enum { - Count = 256 * 1024 / sizeof(T) - }; - - Memory array; - // do the memset to make sure the array doesn't have the old data from a previous call which - // would mask a real problem - std::memset(array, 0xff, Count * sizeof(T)); - T xValue = 1; - const Vec x(xValue); - for (int i = 0; i < Count; i += Vec::Size) { - x.store(&array[i]); - } - - for (int i = 0; i < Count; ++i) { - COMPARE(array[i], xValue); - } -} - -template void unalignedStore() -{ - typedef typename Vec::EntryType T; - enum { - Count = 256 * 1024 / sizeof(T) - }; - - Memory array; - // do the memset to make sure the array doesn't have the old data from a previous call which - // would mask a real problem - std::memset(array, 0xff, Count * sizeof(T)); - T xValue = 1; - const Vec x(xValue); - for (int i = 1; i < Count - Vec::Size + 1; i += Vec::Size) { - x.store(&array[i], Unaligned); - } - - for (int i = 1; i < Count - Vec::Size + 1; ++i) { - COMPARE(array[i], xValue); - } -} - -template void streamingAndAlignedStore() -{ - typedef typename Vec::EntryType T; - enum { - Count = 256 * 1024 / sizeof(T) - }; - - Memory array; - // do the memset to make sure the array doesn't have the old data from a previous call which - // would mask a real problem - std::memset(array, 0xff, Count * sizeof(T)); - T xValue = 1; - const Vec x(xValue); - for (int i = 0; i < Count; i += Vec::Size) { - x.store(&array[i], Streaming | Aligned); - } - - for (int i = 0; i < Count; ++i) { - COMPARE(array[i], xValue); - } -} - -template void streamingAndUnalignedStore() -{ - typedef typename Vec::EntryType T; - enum { - Count = 256 * 1024 / sizeof(T) - }; - - Memory array; - // do the memset to make sure the array doesn't have the old data from a previous call which - // 
would mask a real problem - std::memset(array, 0xff, Count * sizeof(T)); - T xValue = 1; - const Vec x(xValue); - for (int i = 1; i < Count - Vec::Size + 1; i += Vec::Size) { - x.store(&array[i], Streaming | Unaligned); - } - - for (int i = 1; i < Count - Vec::Size + 1; ++i) { - COMPARE(array[i], xValue); - } -} - -template void maskedStore() -{ - typedef typename Vec::EntryType T; - typedef typename Vec::Mask M; - M mask; - { - typedef typename Vec::IndexType I; - const I tmp(IndexesFromZero); - const typename I::Mask k = (tmp & I(One)) > 0; - mask = M(k); - } - - const int count = 256 * 1024 / sizeof(T); - const int outerCount = count / Vec::Size; - Vc::Memory array(count); - array.setZero(); - const T nullValue = 0; - const T setValue = 170; - const Vec x(setValue); - for (int i = 0; i < count; i += Vec::Size) { - x.store(&array[i], mask); - } - - for (int i = 1; i < count; i += 2) { - COMPARE(array[i], setValue) << ", i: " << i << ", count: " << count << ", outer: " << outerCount; - } - for (int i = 0; i < count; i += 2) { - COMPARE(array[i], nullValue) << ", i: " << i << ", count: " << count << ", outer: " << outerCount; - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - - testAllTypes(alignedStore); - testAllTypes(unalignedStore); - testAllTypes(streamingAndAlignedStore); - testAllTypes(streamingAndUnalignedStore); - - if (float_v::Size > 1) { - runTest(maskedStore); - runTest(maskedStore); - runTest(maskedStore); - runTest(maskedStore); - runTest(maskedStore); - runTest(maskedStore); - runTest(maskedStore); - } - return 0; -} diff --git a/math/vc/tests/supportfunctions.cpp b/math/vc/tests/supportfunctions.cpp deleted file mode 100644 index b0c9120e0054f..0000000000000 --- a/math/vc/tests/supportfunctions.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/*{{{ - Copyright (C) 2013 Matthias Kretz - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free 
Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -}}}*/ - -#include "unittest.h" - -void testCompiledImplementation() -{ - VERIFY(Vc::currentImplementationSupported()); -} - -void testIsSupported() -{ - using Vc::CpuId; - VERIFY(Vc::isImplementationSupported(Vc::ScalarImpl)); - COMPARE(Vc::isImplementationSupported(Vc::SSE2Impl ), CpuId::hasSse2()); - COMPARE(Vc::isImplementationSupported(Vc::SSE3Impl ), CpuId::hasSse3()); - COMPARE(Vc::isImplementationSupported(Vc::SSSE3Impl), CpuId::hasSsse3()); - COMPARE(Vc::isImplementationSupported(Vc::SSE41Impl), CpuId::hasSse41()); - COMPARE(Vc::isImplementationSupported(Vc::SSE42Impl), CpuId::hasSse42()); - COMPARE(Vc::isImplementationSupported(Vc::AVXImpl ), CpuId::hasOsxsave() && CpuId::hasAvx()); - COMPARE(Vc::isImplementationSupported(Vc::AVX2Impl ), false); -} - -void testBestImplementation() -{ - // when building with a recent and fully featured compiler the following should pass - // but - old GCC versions have to fall back to Scalar, even though SSE is supported by the CPU - // - ICC/MSVC can't use XOP/FMA4 - //COMPARE(Vc::bestImplementationSupported(), VC_IMPL); -} - -void testExtraInstructions() -{ - using Vc::CpuId; - unsigned int extra = Vc::extraInstructionsSupported(); - COMPARE(!(extra & Vc::Float16cInstructions), !CpuId::hasF16c()); - COMPARE(!(extra & Vc::XopInstructions), !CpuId::hasXop()); - COMPARE(!(extra & Vc::Fma4Instructions), !CpuId::hasFma4()); - COMPARE(!(extra & Vc::PopcntInstructions), !CpuId::hasPopcnt()); - COMPARE(!(extra & Vc::Sse4aInstructions), !CpuId::hasSse4a()); -} - -int main(int argc, char 
**argv) -{ - initTest(argc, argv); - - runTest(testCompiledImplementation); - runTest(testIsSupported); - runTest(testBestImplementation); - runTest(testExtraInstructions); - - return 0; -} diff --git a/math/vc/tests/swizzles.cpp b/math/vc/tests/swizzles.cpp deleted file mode 100644 index 022d5d5d769b8..0000000000000 --- a/math/vc/tests/swizzles.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#include "unittest.h" - -using namespace Vc; - -enum Swizzle { - BADC, CDAB, AAAA, BBBB, CCCC, DDDD, BCAD, BCDA, DABC, ACBD, DBCA, DCBA -}; - -template V scalarSwizzle(VC_ALIGNED_PARAMETER(V) v, Swizzle s) -{ - V r = v; - for (int i = 0; i + 4 <= V::Size; i += 4) { - switch (s) { - case BADC: - r[i + 0] = v[i + 1]; - r[i + 1] = v[i + 0]; - r[i + 2] = v[i + 3]; - r[i + 3] = v[i + 2]; - break; - case CDAB: - r[i + 0] = v[i + 2]; - r[i + 1] = v[i + 3]; - r[i + 2] = v[i + 0]; - r[i + 3] = v[i + 1]; - break; - case AAAA: - r[i + 0] = v[i + 0]; - r[i + 1] = v[i + 0]; - r[i + 2] = v[i + 0]; - r[i + 3] = v[i + 0]; - break; - case BBBB: - r[i + 0] = v[i + 1]; - r[i + 1] = v[i + 1]; - r[i + 2] = v[i + 1]; - r[i + 3] = v[i + 1]; - break; - case CCCC: - r[i + 0] = v[i + 2]; - r[i + 1] = v[i + 2]; - r[i + 2] = v[i + 2]; - r[i + 3] = v[i + 2]; - break; - case DDDD: - r[i + 0] = v[i + 3]; - r[i + 1] = v[i + 3]; - r[i + 2] = v[i + 3]; - r[i + 3] = v[i + 3]; - break; - case BCAD: - r[i + 0] = v[i + 1]; - r[i + 1] = v[i + 2]; - r[i + 2] = v[i + 0]; - r[i + 3] = v[i + 3]; - break; - case BCDA: - r[i + 0] = v[i + 1]; - r[i + 1] = v[i + 2]; - r[i + 2] = v[i + 3]; - r[i + 3] = v[i + 0]; - break; - case DABC: - r[i + 0] = v[i + 3]; - r[i + 1] = v[i + 0]; - r[i + 2] = v[i + 1]; - r[i + 3] = v[i + 2]; - break; - case ACBD: - r[i + 0] = v[i + 0]; - r[i + 1] = v[i + 2]; - r[i + 2] = v[i + 1]; - r[i + 3] = v[i + 3]; - break; - case DBCA: - r[i + 0] = v[i + 3]; - r[i + 1] = v[i + 1]; - r[i + 2] = v[i + 2]; - r[i + 3] = v[i + 0]; - break; - case DCBA: - r[i + 0] = v[i + 3]; - r[i + 1] = v[i + 2]; - r[i + 2] = v[i + 1]; - r[i + 3] = v[i + 0]; - break; - } - } - return r; -} - -template void testSwizzle() -{ - for (int i = 0; i < 100; ++i) { - const V test = V::Random(); - COMPARE(test.abcd(), test); - COMPARE(test.badc(), scalarSwizzle(test, BADC)); - COMPARE(test.cdab(), scalarSwizzle(test, CDAB)); - COMPARE(test.aaaa(), scalarSwizzle(test, AAAA)); - COMPARE(test.bbbb(), 
scalarSwizzle(test, BBBB)); - COMPARE(test.cccc(), scalarSwizzle(test, CCCC)); - COMPARE(test.dddd(), scalarSwizzle(test, DDDD)); - COMPARE(test.bcad(), scalarSwizzle(test, BCAD)); - COMPARE(test.bcda(), scalarSwizzle(test, BCDA)); - COMPARE(test.dabc(), scalarSwizzle(test, DABC)); - COMPARE(test.acbd(), scalarSwizzle(test, ACBD)); - COMPARE(test.dbca(), scalarSwizzle(test, DBCA)); - COMPARE(test.dcba(), scalarSwizzle(test, DCBA)); - } -} - -int main(int argc, char **argv) -{ - initTest(argc, argv); - -#if VC_DOUBLE_V_SIZE >= 4 || VC_DOUBLE_V_SIZE == 1 - runTest(testSwizzle); -#endif - runTest(testSwizzle); - runTest(testSwizzle); - runTest(testSwizzle); - runTest(testSwizzle); - runTest(testSwizzle); - runTest(testSwizzle); - - return 0; -} diff --git a/math/vc/tests/ulp.h b/math/vc/tests/ulp.h deleted file mode 100644 index 9e23c15fc88f6..0000000000000 --- a/math/vc/tests/ulp.h +++ /dev/null @@ -1,96 +0,0 @@ -/* This file is part of the Vc library. {{{ - - Copyright (C) 2011-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -}}}*/ - -#ifndef TESTS_ULP_H -#define TESTS_ULP_H - -#include -#include - -#ifdef VC_MSVC -namespace std -{ - static inline bool isnan(float x) { return _isnan(x); } - static inline bool isnan(double x) { return _isnan(x); } -} // namespace std -#endif - -template static T ulpDiffToReference(T val, T ref) -{ - if (val == ref || (std::isnan(val) && std::isnan(ref))) { - return 0; - } - if (ref == T(0)) { - return 1 + ulpDiffToReference(std::abs(val), std::numeric_limits::min()); - } - if (val == T(0)) { - return 1 + ulpDiffToReference(std::numeric_limits::min(), std::abs(ref)); - } - - int exp; - /*tmp = */ frexp(ref, &exp); // ref == tmp * 2 ^ exp => tmp == ref * 2 ^ -exp - // tmp is now in the range [0.5, 1.0[ - // now we want to know how many times we can fit 2^-numeric_limits::digits between tmp and - // val * 2 ^ -exp - return ldexp(std::abs(ref - val), std::numeric_limits::digits - exp); -} -template static T ulpDiffToReferenceSigned(T val, T ref) -{ - return ulpDiffToReference(val, ref) * (val - ref < 0 ? 
-1 : 1); -} - -template struct _Ulp_ExponentVector { typedef Vc::int_v Type; }; -template<> struct _Ulp_ExponentVector { typedef Vc::short_v Type; }; - -template static Vc::Vector<_T> ulpDiffToReference(const Vc::Vector<_T> &_val, const Vc::Vector<_T> &_ref) -{ - using namespace Vc; - typedef Vector<_T> V; - typedef typename V::EntryType T; - typedef typename V::Mask M; - - V val = _val; - V ref = _ref; - - V diff = V::Zero(); - - M zeroMask = ref == V::Zero(); - val (zeroMask)= abs(val); - ref (zeroMask)= std::numeric_limits::min(); - diff (zeroMask)= V::One(); - zeroMask = val == V::Zero(); - ref (zeroMask)= abs(ref); - val (zeroMask)= std::numeric_limits::min(); - diff (zeroMask)+= V::One(); - - typename _Ulp_ExponentVector::Type exp; - frexp(ref, &exp); - diff += ldexp(abs(ref - val), std::numeric_limits::digits - exp); - diff.setZero(_val == _ref || (isnan(_val) && isnan(_ref))); - return diff; -} - -template static Vc::Vector<_T> ulpDiffToReferenceSigned(const Vc::Vector<_T> &_val, const Vc::Vector<_T> &_ref) -{ - return ulpDiffToReference(_val, _ref).copySign(_val - _ref); -} - -#endif // TESTS_ULP_H - -// vim: foldmethod=marker diff --git a/math/vc/tests/unittest.h b/math/vc/tests/unittest.h deleted file mode 100644 index fdef2ec997607..0000000000000 --- a/math/vc/tests/unittest.h +++ /dev/null @@ -1,681 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef UNITTEST_H -#define UNITTEST_H - -#ifdef VC_ASSERT -#error "include unittest.h before any Vc header" -#endif -inline void unittest_assert(bool cond, const char *code, const char *file, int line); -#define VC_ASSERT(cond) unittest_assert(cond, #cond, __FILE__, __LINE__); - -#include -#include -#include -#include -#include -#include -#include -#include -#include "ulp.h" -#include -#include - -#define _expand(name) #name -#define runTest(name) _unit_test_global.runTestInt(&name, _expand(name)) -#define testAllTypes(name) \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name "") -#define testRealTypes(name) \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); \ - _unit_test_global.runTestInt(&name, #name ""); - -template struct isEqualType -{ - operator bool() const { return false; } -}; - -template struct isEqualType -{ - operator bool() const { return true; } -}; - -inline void printPass() -{ - std::cout << AnsiColor::green << " PASS: " << AnsiColor::normal; -} - -bool _UnitTest_verify_vector_unit_supported() -{ - bool s = Vc::currentImplementationSupported(); - if (!s) { - std::cerr << "CPU or OS requirements not met for the compiled in vector unit!\n"; - exit(-1); - } - return s; -} - -static bool _UnitTest_verify_vector_unit_supported_result = _UnitTest_verify_vector_unit_supported(); - -class _UnitTest_Failure -{ -}; - -typedef void (*testFunction)(); -class _UnitTest_Global_Object -{ - public: - _UnitTest_Global_Object() - : status(true), - expect_failure(false), - assert_failure(0), - 
expect_assert_failure(false), - float_fuzzyness( 1.f ), - double_fuzzyness( 1. ), - only_name(0), - m_finalized(false), - failedTests(0), passedTests(0), - findMaximumDistance(false), - maximumDistance(0), - meanDistance(0), - meanCount(0) - { - } - - ~_UnitTest_Global_Object() - { - if (m_finalized) { - // on windows std::exit will call the dtor again, leading to infinite recursion - return; - } - if (plotFile.is_open()) { - plotFile.flush(); - plotFile.close(); - } - std::cout << "\n Testing done. " << passedTests << " tests passed. " << failedTests << " tests failed." << std::endl; - m_finalized = true; - std::exit(failedTests); - } - - void runTestInt(testFunction fun, const char *name); - - bool status; - bool expect_failure; - int assert_failure; - bool expect_assert_failure; - float float_fuzzyness; - double double_fuzzyness; - const char *only_name; - std::fstream plotFile; - private: - bool m_finalized; - int failedTests; - public: - int passedTests; - bool findMaximumDistance; - double maximumDistance; - double meanDistance; - int meanCount; -}; - -static _UnitTest_Global_Object _unit_test_global; - -void EXPECT_FAILURE() -{ - _unit_test_global.expect_failure = true; -} - -inline const char *_unittest_fail() -{ - if (_unit_test_global.expect_failure) { - return "XFAIL: "; - } - static const char *str = 0; - if (str == 0) { - if (mayUseColor(std::cout)) { - static const char *fail = " \033[1;40;31mFAIL:\033[0m "; - str = fail; - } else { - static const char *fail = " FAIL: "; - str = fail; - } - } - return str; -} - -void initTest(int argc, char **argv) -{ - for (int i = 1; i < argc; ++i) { - if (0 == std::strcmp(argv[i], "--help") || 0 == std::strcmp(argv[i], "-h")) { - std::cout << - "Usage: " << argv[0] << " [-h|--help] [--only ] [--maxdist] [--plotdist ]\n"; - exit(0); - } - if (0 == std::strcmp(argv[i], "--only") && i + 1 < argc) { - _unit_test_global.only_name = argv[i + 1]; - } else if (0 == std::strcmp(argv[i], "--maxdist")) { - 
_unit_test_global.findMaximumDistance = true; - } else if (0 == std::strcmp(argv[i], "--plotdist") && i + 1 < argc) { - _unit_test_global.plotFile.open(argv[i + 1], std::ios_base::out); - _unit_test_global.plotFile << "# reference\tdistance\n"; - } - } -} - -template inline void setFuzzyness( T ); -template<> inline void setFuzzyness( float fuzz ) { _unit_test_global.float_fuzzyness = fuzz; } -template<> inline void setFuzzyness( double fuzz ) { _unit_test_global.double_fuzzyness = fuzz; } - -void _UnitTest_Global_Object::runTestInt(testFunction fun, const char *name) -{ - if (_unit_test_global.only_name && 0 != std::strcmp(name, _unit_test_global.only_name)) { - return; - } - _unit_test_global.status = true; - _unit_test_global.expect_failure = false; - try { - setFuzzyness(1); - setFuzzyness(1); - maximumDistance = 0.; - meanDistance = 0.; - meanCount = 0; - fun(); - } catch(_UnitTest_Failure) { - } - if (_unit_test_global.expect_failure) { - if (!_unit_test_global.status) { - std::cout << "XFAIL: " << name << std::endl; - } else { - std::cout << "unexpected PASS: " << name << - "\n This test should have failed but didn't. Check the code!" << std::endl; - ++failedTests; - } - } else { - if (!_unit_test_global.status) { - if (findMaximumDistance) { - std::cout << _unittest_fail() << "│ with a maximal distance of " << maximumDistance << " to the reference (mean: " << meanDistance / meanCount << ").\n"; - } - std::cout << _unittest_fail() << "┕ " << name << std::endl; - ++failedTests; - } else { - printPass(); - std::cout << name; - if (findMaximumDistance) { - if (maximumDistance > 0.) 
{ - std::cout << " with a maximal distance of " << maximumDistance << " to the reference (mean: " << meanDistance / meanCount << ")."; - } else { - std::cout << " all values matched the reference precisely."; - } - } - std::cout << std::endl; - ++passedTests; - } - } -} - -template inline bool unittest_compareHelper( const T1 &a, const T2 &b ) { return a == b; } -template<> inline bool unittest_compareHelper( const Vc::int_v &a, const Vc::int_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::uint_v &a, const Vc::uint_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::float_v &a, const Vc::float_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::sfloat_v &a, const Vc::sfloat_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::double_v &a, const Vc::double_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::ushort_v &a, const Vc::ushort_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper( const Vc::short_v &a, const Vc::short_v &b ) { return (a == b).isFull(); } -template<> inline bool unittest_compareHelper(const std::type_info &a, const std::type_info &b ) { return &a == &b; } - -template T ulpDiffToReferenceWrapper(T a, T b) { - const T diff = ulpDiffToReference(a, b); - if (VC_IS_UNLIKELY(_unit_test_global.findMaximumDistance)) { - _unit_test_global.maximumDistance = std::max(std::abs(diff), _unit_test_global.maximumDistance); - _unit_test_global.meanDistance += std::abs(diff); - ++_unit_test_global.meanCount; - } - return diff; -} -template Vc::Vector ulpDiffToReferenceWrapper(VC_ALIGNED_PARAMETER(Vc::Vector) a, VC_ALIGNED_PARAMETER(Vc::Vector) b) { - const Vc::Vector diff = ulpDiffToReference(a, b); - if (VC_IS_UNLIKELY(_unit_test_global.findMaximumDistance)) { - _unit_test_global.maximumDistance = 
std::max(Vc::abs(diff).max(), _unit_test_global.maximumDistance); - _unit_test_global.meanDistance += Vc::abs(diff).sum(); - _unit_test_global.meanCount += Vc::Vector::Size; - } - return diff; -} -template inline bool unittest_fuzzyCompareHelper( const T &a, const T &b ) { return a == b; } -template<> inline bool unittest_fuzzyCompareHelper( const float &a, const float &b ) { - return ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness; -} -template<> inline bool unittest_fuzzyCompareHelper( const Vc::float_v &a, const Vc::float_v &b ) { - return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness).isFull(); -} -template<> inline bool unittest_fuzzyCompareHelper( const Vc::sfloat_v &a, const Vc::sfloat_v &b ) { - return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.float_fuzzyness).isFull(); -} -template<> inline bool unittest_fuzzyCompareHelper( const double &a, const double &b ) { - return ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.double_fuzzyness; -} -template<> inline bool unittest_fuzzyCompareHelper( const Vc::double_v &a, const Vc::double_v &b ) { - return (ulpDiffToReferenceWrapper(a, b) <= _unit_test_global.double_fuzzyness).isFull(); -} - -template inline void unitttest_comparePrintHelper(const T1 &a, const T2 &b, const M &m, const char *aa, const char *bb, const char *file, int line, double fuzzyness = 0.) { - std::cout << " " << aa << " (" << std::setprecision(10) << a << std::setprecision(6) << ") == " << bb << " (" << std::setprecision(10) << b << std::setprecision(6) << ") -> " << m; - if (fuzzyness > 0.) 
{ - std::cout << " with fuzzyness " << fuzzyness; - } - std::cout << " at " << file << ":" << line << " failed.\n"; -} - -template inline double unittest_fuzzynessHelper(const T &) { return 0.; } -template<> inline double unittest_fuzzynessHelper(const float &) { return _unit_test_global.float_fuzzyness; } -template<> inline double unittest_fuzzynessHelper(const Vc::float_v &) { return _unit_test_global.float_fuzzyness; } -template<> inline double unittest_fuzzynessHelper(const double &) { return _unit_test_global.double_fuzzyness; } -template<> inline double unittest_fuzzynessHelper(const Vc::double_v &) { return _unit_test_global.double_fuzzyness; } - -class _UnitTest_Compare -{ - public: - enum OptionFuzzy { Fuzzy }; - enum OptionNoEq { NoEq }; - - template - Vc_ALWAYS_INLINE _UnitTest_Compare(const T1 &a, const T2 &b, const char *_a, const char *_b, const char *_file, int _line) - : m_ip(getIp()), m_failed(!unittest_compareHelper(a, b)) - { - if (VC_IS_UNLIKELY(m_failed)) { - printFirst(); - printPosition(_file, _line); print(":\n"); - print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") == "); - print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); - print(") -> "); print(a == b); - } - } - - template - Vc_ALWAYS_INLINE _UnitTest_Compare(const T1 &a, const T2 &b, const char *_a, const char *_b, const char *_file, int _line, OptionNoEq) - : m_ip(getIp()), m_failed(!unittest_compareHelper(a, b)) - { - if (VC_IS_UNLIKELY(m_failed)) { - printFirst(); - printPosition(_file, _line); print(":\n"); - print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") == "); - print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); - print(')'); - } - } - - template - Vc_ALWAYS_INLINE _UnitTest_Compare(const T &a, const T &b, const char *_a, const char *_b, const char *_file, int _line, OptionFuzzy) - : m_ip(getIp()), m_failed(!unittest_fuzzyCompareHelper(a, b)) - { - 
if (VC_IS_UNLIKELY(m_failed)) { - printFirst(); - printPosition(_file, _line); print(":\n"); - print(_a); print(" ("); print(std::setprecision(10)); print(a); print(") ≈ "); - print(_b); print(" ("); print(std::setprecision(10)); print(b); print(std::setprecision(6)); - print(") -> "); print(a == b); - printFuzzyInfo(a, b); - } - if (_unit_test_global.plotFile.is_open()) { - writePlotData(_unit_test_global.plotFile, a, b); - } - } - - Vc_ALWAYS_INLINE _UnitTest_Compare(bool good, const char *cond, const char *_file, int _line) - : m_ip(getIp()), m_failed(!good) - { - if (VC_IS_UNLIKELY(m_failed)) { - printFirst(); - printPosition(_file, _line); - print(": "); print(cond); - } - } - - Vc_ALWAYS_INLINE _UnitTest_Compare(const char *_file, int _line) - : m_ip(getIp()), m_failed(true) - { - printFirst(); - printPosition(_file, _line); - print(":\n"); - } - - template Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const T &x) const { - if (VC_IS_UNLIKELY(m_failed)) { - print(x); - } - return *this; - } - - Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const char *str) const { - if (VC_IS_UNLIKELY(m_failed)) { - print(str); - } - return *this; - } - - Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(const char ch) const { - if (VC_IS_UNLIKELY(m_failed)) { - print(ch); - } - return *this; - } - - Vc_ALWAYS_INLINE const _UnitTest_Compare &operator<<(bool b) const { - if (VC_IS_UNLIKELY(m_failed)) { - print(b); - } - return *this; - } - - Vc_ALWAYS_INLINE ~_UnitTest_Compare() throw(_UnitTest_Failure) - { - if (VC_IS_UNLIKELY(m_failed)) { - printLast(); - } - } - - private: - static Vc_ALWAYS_INLINE size_t getIp() { - size_t _ip; -#if defined(__x86_64__) && defined(VC_GNU_ASM) - asm("lea 0(%%rip),%0" : "=r"(_ip)); -#else - _ip = 0; -#endif - return _ip; - } - static void printFirst() { std::cout << _unittest_fail() << "┍ "; } - template static void print(const T &x) { std::cout << x; } - static void print(const std::type_info &x) { std::cout << x.name(); } - 
static void print(const char *str) { - const char *pos = 0; - if (0 != (pos = std::strchr(str, '\n'))) { - if (pos == str) { - std::cout << '\n' << _unittest_fail() << "│ " << &str[1]; - } else { - char *left = strdup(str); - left[pos - str] = '\0'; - std::cout << left << '\n' << _unittest_fail() << "│ " << &pos[1]; - free(left); - } - } else { - std::cout << str; - } - } - static void print(const char ch) { - if (ch == '\n') { - std::cout << '\n' << _unittest_fail() << "│ "; - } else { - std::cout << ch; - } - } - static void print(bool b) { - std::cout << (b ? "true" : "false"); - } - static void printLast() { - std::cout << std::endl; - _unit_test_global.status = false; - //if (!_unit_test_global.plotFile.is_open()) { - throw _UnitTest_Failure(); - //} - } - void printPosition(const char *_file, int _line) { - std::cout << "at " << _file << ':' << _line << " (0x" << std::hex << m_ip << std::dec << ')'; - } - template static inline void writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b); - template static inline void printFuzzyInfo(VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b); - template static inline void printFuzzyInfoImpl(VC_ALIGNED_PARAMETER(T) a, VC_ALIGNED_PARAMETER(T) b, double fuzzyness) { - print("\ndistance: "); - print(ulpDiffToReferenceSigned(a, b)); - print(", allowed distance: "); - print(fuzzyness); - } - const size_t m_ip; - const bool m_failed; -}; -template inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(T), VC_ALIGNED_PARAMETER(T)) {} -template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(float) a, VC_ALIGNED_PARAMETER(float) b) { - printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); -} -template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(double) a, VC_ALIGNED_PARAMETER(double) b) { - printFuzzyInfoImpl(a, b, _unit_test_global.double_fuzzyness); -} -template<> inline void 
_UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::float_v) a, VC_ALIGNED_PARAMETER(Vc::float_v) b) { - printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); -} -template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::double_v) a, VC_ALIGNED_PARAMETER(Vc::double_v) b) { - printFuzzyInfoImpl(a, b, _unit_test_global.double_fuzzyness); -} -template<> inline void _UnitTest_Compare::printFuzzyInfo(VC_ALIGNED_PARAMETER(Vc::sfloat_v) a, VC_ALIGNED_PARAMETER(Vc::sfloat_v) b) { - printFuzzyInfoImpl(a, b, _unit_test_global.float_fuzzyness); -} -template inline void _UnitTest_Compare::writePlotData(std::fstream &, VC_ALIGNED_PARAMETER(T), VC_ALIGNED_PARAMETER(T)) {} -template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(float) a, VC_ALIGNED_PARAMETER(float) b) { - file << std::setprecision(12) << b << "\t" << ulpDiffToReferenceSigned(a, b) << "\n"; -} -template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(double) a, VC_ALIGNED_PARAMETER(double) b) { - file << std::setprecision(12) << b << "\t" << ulpDiffToReferenceSigned(a, b) << "\n"; -} -template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(Vc::float_v) a, VC_ALIGNED_PARAMETER(Vc::float_v) b) { - const Vc::float_v ref = b; - const Vc::float_v dist = ulpDiffToReferenceSigned(a, b); - for (int i = 0; i < Vc::float_v::Size; ++i) { - file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n"; - } -} -template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, VC_ALIGNED_PARAMETER(Vc::double_v) a, VC_ALIGNED_PARAMETER(Vc::double_v) b) { - const Vc::double_v ref = b; - const Vc::double_v dist = ulpDiffToReferenceSigned(a, b); - for (int i = 0; i < Vc::double_v::Size; ++i) { - file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n"; - } -} -template<> inline void _UnitTest_Compare::writePlotData(std::fstream &file, 
VC_ALIGNED_PARAMETER(Vc::sfloat_v) a, VC_ALIGNED_PARAMETER(Vc::sfloat_v) b) { - const Vc::sfloat_v ref = b; - const Vc::sfloat_v dist = ulpDiffToReferenceSigned(a, b); - for (int i = 0; i < Vc::sfloat_v::Size; ++i) { - file << std::setprecision(12) << ref[i] << "\t" << dist[i] << "\n"; - } -} - -// Workaround for clang: The "<< ' '" is only added to silence the warnings about unused return -// values. -#define FUZZY_COMPARE( a, b ) \ - _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__, _UnitTest_Compare::Fuzzy) << ' ' - -#define COMPARE( a, b ) \ - _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__) << ' ' - -#define COMPARE_NOEQ( a, b ) \ - _UnitTest_Compare(a, b, #a, #b, __FILE__, __LINE__, _UnitTest_Compare::NoEq) << ' ' - -#define VERIFY(cond) \ - _UnitTest_Compare(cond, #cond, __FILE__, __LINE__) << ' ' - -#define FAIL() \ - _UnitTest_Compare(__FILE__, __LINE__) << ' ' - -class ADD_PASS -{ - public: - ADD_PASS() { ++_unit_test_global.passedTests; printPass(); } - ~ADD_PASS() { std::cout << std::endl; } - template ADD_PASS &operator<<(const T &x) { std::cout << x; return *this; } -}; - -inline void unittest_assert(bool cond, const char *code, const char *file, int line) -{ - if (!cond) { - if (_unit_test_global.expect_assert_failure) { - ++_unit_test_global.assert_failure; - } else { - _UnitTest_Compare(file, line) << "assert(" << code << ") failed."; - } - } -} -#ifdef assert -#undef assert -#endif -#define assert(cond) unittest_assert(cond, #cond, __FILE__, __LINE__) - -#define EXPECT_ASSERT_FAILURE(code) \ - _unit_test_global.expect_assert_failure = true; \ - _unit_test_global.assert_failure = 0; \ - code; \ - if (_unit_test_global.assert_failure == 0) { \ - /* failure expected but it didn't fail */ \ - std::cout << " " << #code << " at " << __FILE__ << ":" << __LINE__ << \ - " did not fail as was expected.\n"; \ - _unit_test_global.status = false; \ - throw _UnitTest_Failure(); \ - return; \ - } \ - _unit_test_global.expect_assert_failure = false - 
-template inline typename Vec::Mask allMasks(int i) -{ - typedef typename Vec::IndexType I; - typedef typename Vec::Mask M; - - if (i == 0) { - return M(true); - } - --i; - if (i < Vec::Size) { - return M (I(Vc::IndexesFromZero) == i); - } - i -= Vec::Size; - if (Vec::Size < 3) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return M(indexes == a || indexes == b); - } - --i; - } - } - if (Vec::Size < 4) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - for (int c = b + 1; c < Vec::Size; ++c) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return M(indexes == a || indexes == b || indexes == c); - } - --i; - } - } - } - if (Vec::Size < 5) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - for (int c = b + 1; c < Vec::Size; ++c) { - for (int d = c + 1; d < Vec::Size; ++d) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return M(indexes == a || indexes == b || indexes == c || indexes == d); - } - --i; - } - } - } - } - if (Vec::Size < 6) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - for (int c = b + 1; c < Vec::Size; ++c) { - for (int d = c + 1; d < Vec::Size; ++d) { - for (int e = d + 1; e < Vec::Size; ++e) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return M(indexes == a || indexes == b || indexes == c || indexes == d || indexes == e); - } - --i; - } - } - } - } - } - if (Vec::Size < 7) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - for (int c = b + 1; c < Vec::Size; ++c) { - for (int d = c + 1; d < Vec::Size; ++d) { - for (int e = d + 1; e < Vec::Size; ++e) { - for (int f = e + 1; f < Vec::Size; ++f) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return 
M(indexes == a || indexes == b || indexes == c || indexes == d || indexes == e || indexes == f); - } - --i; - } - } - } - } - } - } - if (Vec::Size < 8) { - return M(false); - } - for (int a = 0; a < Vec::Size - 1; ++a) { - for (int b = a + 1; b < Vec::Size; ++b) { - for (int c = b + 1; c < Vec::Size; ++c) { - for (int d = c + 1; d < Vec::Size; ++d) { - for (int e = d + 1; e < Vec::Size; ++e) { - for (int f = e + 1; f < Vec::Size; ++f) { - for (int g = f + 1; g < Vec::Size; ++g) { - if (i == 0) { - I indexes(Vc::IndexesFromZero); - return M(indexes == a || indexes == b || indexes == c || indexes == d - || indexes == e || indexes == f || indexes == g); - } - --i; - } - } - } - } - } - } - } - return M(false); -} - -#define for_all_masks(VecType, _mask_) \ - for (int _Vc_for_all_masks_i = 0; _Vc_for_all_masks_i == 0; ++_Vc_for_all_masks_i) \ - for (typename VecType::Mask _mask_ = allMasks(_Vc_for_all_masks_i++); !_mask_.isEmpty(); _mask_ = allMasks(_Vc_for_all_masks_i++)) - -#endif // UNITTEST_H diff --git a/math/vc/tests/utils.cpp b/math/vc/tests/utils.cpp deleted file mode 100644 index f44a740b23391..0000000000000 --- a/math/vc/tests/utils.cpp +++ /dev/null @@ -1,401 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009-2012 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. - - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . 
- -*/ - -#include "unittest.h" -#include -#include "vectormemoryhelper.h" -#include - -using namespace Vc; - -template void testSort() -{ - typedef typename Vec::IndexType IndexType; - - const IndexType _ref(IndexesFromZero); - Vec ref(_ref); - Vec a; - int maxPerm = 1; - for (int x = Vec::Size; x > 0; --x) { - maxPerm *= x; - } - for (int perm = 0; perm < maxPerm; ++perm) { - int rest = perm; - for (int i = 0; i < Vec::Size; ++i) { - a[i] = 0; - for (int j = 0; j < i; ++j) { - if (a[i] == a[j]) { - ++(a[i]); - j = -1; - } - } - a[i] += rest % (Vec::Size - i); - rest /= (Vec::Size - i); - for (int j = 0; j < i; ++j) { - if (a[i] == a[j]) { - ++(a[i]); - j = -1; - } - } - } - //std::cout << a << a.sorted() << std::endl; - COMPARE(ref, a.sorted()) << ", a: " << a; - } - - for (int repetition = 0; repetition < 1000; ++repetition) { - Vec test = Vec::Random(); - Vc::Memory reference; - reference.vector(0) = test; - std::sort(&reference[0], &reference[Vec::Size]); - ref = reference.vector(0); - COMPARE(ref, test.sorted()); - } -} - -template struct Foo -{ - Foo() : i(0) {} - void reset() { i = 0; } - void operator()(T v) { d[i++] = v; } - Mem d; - int i; -}; - -template void testCall() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - typedef typename V::Mask M; - typedef typename I::Mask MI; - const I _indexes(IndexesFromZero); - const MI _odd = (_indexes & I(One)) > 0; - const M odd(_odd); - V a(_indexes); - Foo f; - a.callWithValuesSorted(f); - V b(f.d); - COMPARE(b, a); - - f.reset(); - a(odd) -= 1; - a.callWithValuesSorted(f); - V c(f.d); - for (int i = 0; i < V::Size / 2; ++i) { - COMPARE(a[i * 2], c[i]); - } - for (int i = V::Size / 2; i < V::Size; ++i) { - COMPARE(b[i], c[i]); - } -} - -template void testForeachBit() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - const I indexes(IndexesFromZero); - for_all_masks(V, mask) { - V tmp = V::Zero(); - foreach_bit(int j, mask) { - tmp[j] = T(1); - } - COMPARE(tmp == 
V::One(), mask); - - int count = 0; - foreach_bit(int j, mask) { - ++count; - if (j >= 0) { - continue; - } - } - COMPARE(count, mask.count()); - - count = 0; - foreach_bit(int j, mask) { - if (j >= 0) { - break; - } - ++count; - } - COMPARE(count, 0); - } -} - -template void copySign() -{ - V v(One); - V positive(One); - V negative = -positive; - COMPARE(v, v.copySign(positive)); - COMPARE(-v, v.copySign(negative)); -} - -#ifdef _WIN32 -void bzero(void *p, size_t n) { memset(p, 0, n); } -#else -#include -#endif - -template void Random() -{ - typedef typename V::EntryType T; - enum { - NBits = 3, - NBins = 1 << NBits, // short int - TotalBits = sizeof(T) * 8, // 16 32 - RightShift = TotalBits - NBits, // 13 29 - NHistograms = TotalBits - NBits + 1, // 14 30 - LeftShift = (RightShift + 1) / NHistograms,// 1 1 - Mean = 135791, - MinGood = Mean - Mean/10, - MaxGood = Mean + Mean/10 - }; - const V mask((1 << NBits) - 1); - int histogram[NHistograms][NBins]; - bzero(&histogram[0][0], sizeof(histogram)); - for (size_t i = 0; i < NBins * Mean / V::Size; ++i) { - const V rand = V::Random(); - for (size_t hist = 0; hist < NHistograms; ++hist) { - const V bin = ((rand << (hist * LeftShift)) >> RightShift) & mask; - for (size_t k = 0; k < V::Size; ++k) { - ++histogram[hist][bin[k]]; - } - } - } -//#define PRINT_RANDOM_HISTOGRAM -#ifdef PRINT_RANDOM_HISTOGRAM - for (size_t hist = 0; hist < NHistograms; ++hist) { - std::cout << "histogram[" << std::setw(2) << hist << "]: "; - for (size_t bin = 0; bin < NBins; ++bin) { - std::cout << std::setw(3) << (histogram[hist][bin] - Mean) * 1000 / Mean << "|"; - } - std::cout << std::endl; - } -#endif - for (size_t hist = 0; hist < NHistograms; ++hist) { - for (size_t bin = 0; bin < NBins; ++bin) { - VERIFY(histogram[hist][bin] > MinGood) - << " bin = " << bin << " is " << histogram[0][bin]; - VERIFY(histogram[hist][bin] < MaxGood) - << " bin = " << bin << " is " << histogram[0][bin]; - } - } -} - -template void FloatRandom() -{ - typedef 
typename V::EntryType T; - enum { - NBins = 64, - NHistograms = 1, - Mean = 135791, - MinGood = Mean - Mean/10, - MaxGood = Mean + Mean/10 - }; - int histogram[NHistograms][NBins]; - bzero(&histogram[0][0], sizeof(histogram)); - for (size_t i = 0; i < NBins * Mean / V::Size; ++i) { - const V rand = V::Random(); - const I bin = static_cast(rand * T(NBins)); - for (size_t k = 0; k < V::Size; ++k) { - ++histogram[0][bin[k]]; - } - } -#ifdef PRINT_RANDOM_HISTOGRAM - for (size_t hist = 0; hist < NHistograms; ++hist) { - std::cout << "histogram[" << std::setw(2) << hist << "]: "; - for (size_t bin = 0; bin < NBins; ++bin) { - std::cout << std::setw(3) << (histogram[hist][bin] - Mean) * 1000 / Mean << "|"; - } - std::cout << std::endl; - } -#endif - for (size_t hist = 0; hist < NHistograms; ++hist) { - for (size_t bin = 0; bin < NBins; ++bin) { - VERIFY(histogram[hist][bin] > MinGood) - << " bin = " << bin << " is " << histogram[0][bin]; - VERIFY(histogram[hist][bin] < MaxGood) - << " bin = " << bin << " is " << histogram[0][bin]; - } - } -} - -template<> void Random() { FloatRandom(); } -template<> void Random() { FloatRandom(); } -template<> void Random() { FloatRandom(); } - -template T add2(T x) { return x + T(2); } - -template -class CallTester -{ - public: - CallTester() : v(Vc::Zero), i(0) {} - - void operator()(T x) { - v[i] = x; - ++i; - } - - void reset() { v.setZero(); i = 0; } - - int callCount() const { return i; } - V callValues() const { return v; } - - private: - V v; - int i; -}; - -#if __cplusplus >= 201103 && (!defined(VC_CLANG) || VC_CLANG > 0x30000) -#define DO_LAMBDA_TESTS 1 -#endif - -template -void applyAndCall() -{ - typedef typename V::EntryType T; - - const V two(T(2)); - for (int i = 0; i < 1000; ++i) { - const V rand = V::Random(); - COMPARE(rand.apply(add2), rand + two); -#ifdef DO_LAMBDA_TESTS - COMPARE(rand.apply([](T x) { return x + T(2); }), rand + two); -#endif - - CallTester callTester; - rand.call(callTester); - 
COMPARE(callTester.callCount(), int(V::Size)); - COMPARE(callTester.callValues(), rand); - - for_all_masks(V, mask) { - V copy1 = rand; - V copy2 = rand; - copy1(mask) += two; - - COMPARE(copy2(mask).apply(add2), copy1) << mask; - COMPARE(rand.apply(add2, mask), copy1) << mask; -#ifdef DO_LAMBDA_TESTS - COMPARE(copy2(mask).apply([](T x) { return x + T(2); }), copy1) << mask; - COMPARE(rand.apply([](T x) { return x + T(2); }, mask), copy1) << mask; -#endif - - callTester.reset(); - copy2(mask).call(callTester); - COMPARE(callTester.callCount(), mask.count()); - - callTester.reset(); - rand.call(callTester, mask); - COMPARE(callTester.callCount(), mask.count()); - } - } -} - -template T returnConstant() { return T(value); } -template T returnConstantOffset(int i) { return T(value) + T(i); } -template T returnConstantOffset2(unsigned short i) { return T(value) + T(i); } - -template void fill() -{ - typedef typename V::EntryType T; - typedef typename V::IndexType I; - V test = V::Random(); - test.fill(returnConstant); - COMPARE(test, V(T(2))); - - test = V::Random(); - test.fill(returnConstantOffset); - COMPARE(test, static_cast(I::IndexesFromZero())); - - test = V::Random(); - test.fill(returnConstantOffset2); - COMPARE(test, static_cast(I::IndexesFromZero())); -} - -template void shifted() -{ - typedef typename V::EntryType T; - for (int shift = -2 * V::Size; shift <= 2 * V::Size; ++shift) { - const V reference = V::Random(); - const V test = reference.shifted(shift); - for (int i = 0; i < V::Size; ++i) { - if (i + shift >= 0 && i + shift < V::Size) { - COMPARE(test[i], reference[i + shift]) << "shift: " << shift << ", i: " << i << ", test: " << test << ", reference: " << reference; - } else { - COMPARE(test[i], T(0)) << "shift: " << shift << ", i: " << i << ", test: " << test << ", reference: " << reference; - } - } - } -} - -template void rotated() -{ - for (int shift = -2 * V::Size; shift <= 2 * V::Size; ++shift) { - //std::cout << "amount = " << shift % V::Size 
<< std::endl; - const V reference = V::Random(); - const V test = reference.rotated(shift); - for (int i = 0; i < V::Size; ++i) { - unsigned int refShift = i + shift; - COMPARE(test[i], reference[refShift % V::Size]) << "shift: " << shift << ", i: " << i << ", test: " << test << ", reference: " << reference; - } - } -} - -void testMallocAlignment() -{ - int_v *a = Vc::malloc(10); - - unsigned long mask = VectorAlignment - 1; - for (int i = 0; i < 10; ++i) { - VERIFY((reinterpret_cast(&a[i]) & mask) == 0); - } - const char *data = reinterpret_cast(&a[0]); - for (int i = 0; i < 10; ++i) { - VERIFY(&data[i * int_v::Size * sizeof(int_v::EntryType)] == reinterpret_cast(&a[i])); - } - - a = Vc::malloc(10); - mask = CpuId::cacheLineSize() - 1; - COMPARE((reinterpret_cast(&a[0]) & mask), 0ul); - - // I don't know how to properly check page alignment. So we check for 4 KiB alignment as this is - // the minimum page size on x86 - a = Vc::malloc(10); - mask = 4096 - 1; - COMPARE((reinterpret_cast(&a[0]) & mask), 0ul); -} - -int main() -{ - testAllTypes(testCall); - testAllTypes(testForeachBit); - testAllTypes(testSort); - testRealTypes(copySign); - - testAllTypes(shifted); - testAllTypes(rotated); - testAllTypes(Random); - - testAllTypes(applyAndCall); - testAllTypes(fill); - - runTest(testMallocAlignment); - - return 0; -} diff --git a/math/vc/tests/vectormemoryhelper.h b/math/vc/tests/vectormemoryhelper.h deleted file mode 100644 index c9b9b1529947b..0000000000000 --- a/math/vc/tests/vectormemoryhelper.h +++ /dev/null @@ -1,41 +0,0 @@ -/* This file is part of the Vc library. - - Copyright (C) 2009 Matthias Kretz - - Vc is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of - the License, or (at your option) any later version. 
- - Vc is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with Vc. If not, see . - -*/ - -#ifndef VECTORMEMORYHELPER_H -#define VECTORMEMORYHELPER_H - -#include - -template -class VectorMemoryHelper -{ - char *const mem; - char *const aligned; - public: - VectorMemoryHelper(int count) - : mem(new char[count * sizeof(Vec) + Vc::VectorAlignment]), - aligned(mem + (Vc::VectorAlignment - (reinterpret_cast( mem ) & ( Vc::VectorAlignment - 1 )))) - { - } - ~VectorMemoryHelper() { delete[] mem; } - - operator typename Vec::EntryType *() { return reinterpret_cast(aligned); } -}; - -#endif // VECTORMEMORYHELPER_H