/*
    This file is part of Mitsuba, a physically based rendering system.

    Copyright (c) 2007-2012 by Wenzel Jakob and others.

    Mitsuba is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License Version 3
    as published by the Free Software Foundation.

    Mitsuba is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once
#if !defined(__MITSUBA_CORE_SSEMATH_H_)
#define __MITSUBA_CORE_SSEMATH_H_

#ifdef MTS_SSE

#include <mitsuba/core/sse.h>

MTS_NAMESPACE_BEGIN
namespace math {

/**
 * \brief SIMD (SSE2) implementation of \c log
 * \author Julien Pommier
 */
extern MTS_EXPORT_CORE __m128 log_ps(__m128 x);

/**
 * \brief SIMD (SSE2) implementation of \c exp
 * \author Julien Pommier
 */
extern MTS_EXPORT_CORE __m128 exp_ps(__m128 x);

/**
 * \brief SIMD (SSE2) implementation of \c sin
 * \author Julien Pommier
 */
extern MTS_EXPORT_CORE __m128 sin_ps(__m128 x);

/**
 * \brief SIMD (SSE2) implementation of \c cos
 * \author Julien Pommier
 */
extern MTS_EXPORT_CORE __m128 cos_ps(__m128 x);

/**
 * \brief SIMD (SSE2) implementation which simultaneously
 * computes the sine and cosine of a given value
 * \author Julien Pommier
 */
extern MTS_EXPORT_CORE void sincos_ps(__m128 x, __m128* s, __m128* c);

/**
 * \brief Fast SIMD (SSE2) approximation of \c log
 * which provides about 10-11 mantissa bits.
 * Inspired by the Intel Approximate Math Library.
 */
extern MTS_EXPORT_CORE __m128 fastlog_ps(__m128 x);

/**
 * \brief Fast SIMD (SSE2) approximation of \c pow
 * which provides about 10-11 mantissa bits.
 * Inspired by the Intel Approximate Math Library.
 */
extern MTS_EXPORT_CORE __m128 fastpow_ps(__m128 x, __m128 y);

/**
 * \brief The arguments \c row0, \c row1, \c row2 and
 * \c row3 are \c __m128 values whose elements form the corresponding
 * rows of a 4-by-4 matrix. The matrix transposition is returned in
 * arguments \c row0, \c row1, \c row2 and \c row3,
 * where \c row0 now holds column 0 of the original matrix, \c row1 now
 * holds column 1 of the original matrix, and so on.
 *
 * \author Intel Intrinsics Guide for AVX2
 */
FINLINE void transpose_ps(__m128& row0, __m128& row1,
        __m128& row2, __m128& row3) {
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(row0, row1);
    tmp2 = _mm_unpacklo_ps(row2, row3);
    tmp1 = _mm_unpackhi_ps(row0, row1);
    tmp3 = _mm_unpackhi_ps(row2, row3);
    row0 = _mm_movelh_ps(tmp0, tmp2);
    row1 = _mm_movehl_ps(tmp2, tmp0);
    row2 = _mm_movelh_ps(tmp1, tmp3);
    row3 = _mm_movehl_ps(tmp3, tmp1);
}

/// Component-wise clamp: max(min(x, maxVal), minVal)
inline __m128 clamp_ps(__m128 x, __m128 minVal, __m128 maxVal) {
    return _mm_max_ps(_mm_min_ps(x, maxVal), minVal);
}

/// Sum of all elements in the vector
inline float hsum_ps(__m128 vec) {
    __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
    __m128 sum_tmp = _mm_add_ps(vec, tmp);
    tmp = _mm_shuffle_ps(sum_tmp, sum_tmp, _MM_SHUFFLE(2,3,0,1));
    sum_tmp = _mm_add_ps(sum_tmp, tmp);
    return _mm_cvtss_f32(sum_tmp);
}

/// Maximum across all the elements of a vector
inline float hmax_ps(__m128 vec) {
    __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
    __m128 tmp_max = _mm_max_ps(vec, tmp);
    tmp = _mm_shuffle_ps(tmp_max, tmp_max, _MM_SHUFFLE(2,3,0,1));
    tmp_max = _mm_max_ps(tmp_max, tmp);
    return _mm_cvtss_f32(tmp_max);
}

/// Minimum across all the elements of a vector
inline float hmin_ps(__m128 vec) {
    __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
    __m128 tmp_min = _mm_min_ps(vec, tmp);
    tmp = _mm_shuffle_ps(tmp_min, tmp_min, _MM_SHUFFLE(2,3,0,1));
    tmp_min = _mm_min_ps(tmp_min, tmp);
    return _mm_cvtss_f32(tmp_min);
}

}; /* namespace math */

MTS_NAMESPACE_END

#endif /* MTS_SSE */

#endif /* __MITSUBA_CORE_SSEMATH_H_ */
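
/*
 * Illustrative usage sketch (not part of the original header): it shows how
 * the declarations above are typically combined. The sketch assumes a
 * separate translation unit that includes this header with MTS_SSE defined
 * and links against the Mitsuba core library, which provides the out-of-line
 * implementations of log_ps, exp_ps, sincos_ps, and the other *_ps routines.
 *
 *     using namespace mitsuba;
 *
 *     // Pack four angles into one SSE register (_mm_set_ps lists the
 *     // highest lane first, so element 0 ends up holding 0.0f).
 *     const __m128 angles = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
 *
 *     // Evaluate sine and cosine of all four lanes in a single call.
 *     __m128 s, c;
 *     math::sincos_ps(angles, &s, &c);
 *
 *     // Clamp the sines to [0, 1] and reduce horizontally across the lanes.
 *     const __m128 clamped = math::clamp_ps(s,
 *         _mm_set1_ps(0.0f), _mm_set1_ps(1.0f));
 *     float sum  = math::hsum_ps(clamped);  // sum of the four clamped sines
 *     float peak = math::hmax_ps(c);        // largest of the four cosines
 */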