27 #ifndef AVX256TEMPLATESPFUNCTS_H_
28 #define AVX256TEMPLATESPFUNCTS_H_
32 #include <x86intrin.h>
35 #include <immintrin.h>
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// double input data returned as a __m256d vector.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, and the declaration
// of current_bp_level are missing from this listing; the surviving tail
// (`current_bp_level.height_level_, current_disparity, numDispVals)]`) looks
// like the end of an index computation into inData -- confirm against the
// original header before relying on it.
38 template<>
inline __m256d simd_processing::LoadPackedDataAligned<double, __m256d>(
39 unsigned int x,
unsigned int y,
unsigned int current_disparity,
41 unsigned int numDispVals,
const double* inData)
45 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// float input data returned as a __m256 vector.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, and the declaration
// of current_bp_level are missing from this listing; the surviving tail
// (`current_bp_level.height_level_, current_disparity, numDispVals)]`) looks
// like the end of an index computation into inData -- confirm against the
// original header before relying on it.
48 template<>
inline __m256 simd_processing::LoadPackedDataAligned<float, __m256>(
49 unsigned int x,
unsigned int y,
unsigned int current_disparity,
51 unsigned int numDispVals,
const float* inData)
55 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// short input data returned as a __m128i vector.
// NOTE(review): only the signature is visible in this listing; the function
// body is missing entirely and must be recovered from the original header.
// Presumably the shorts hold half-precision bit patterns, as in the other
// short/__m128i specializations in this file -- TODO confirm.
58 template<>
inline __m128i simd_processing::LoadPackedDataAligned<short, __m128i>(
59 unsigned int x,
unsigned int y,
unsigned int current_disparity,
61 unsigned int numDispVals,
const short* inData)
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// _Float16 input data returned as a __m256h vector; only compiled when
// FLOAT16_VECTORIZATION is defined.
// NOTE(review): only the signature is visible in this listing; the function
// body and the #endif matching the #if below are missing and must be
// recovered from the original header.
69 #if defined(FLOAT16_VECTORIZATION)
71 template<>
inline __m256h simd_processing::LoadPackedDataAligned<_Float16, __m256h>(
72 unsigned int x,
unsigned int y,
unsigned int current_disparity,
74 unsigned int numDispVals,
const _Float16* inData)
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// float input data returned as a __m256 vector.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, and the declaration
// of current_bp_level are missing from this listing; the surviving tail
// (`current_bp_level.height_level_, current_disparity, numDispVals)]`) looks
// like the end of an index computation into inData -- confirm against the
// original header before relying on it.
84 template<>
inline __m256 simd_processing::LoadPackedDataUnaligned<float, __m256>(
85 unsigned int x,
unsigned int y,
unsigned int current_disparity,
87 unsigned int numDispVals,
const float* inData)
91 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// short input data returned as a __m128i vector.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, and the declaration
// of current_bp_level are missing from this listing. The surviving tail ends
// in `)]));`, i.e. one more closing parenthesis than the float/double
// variants, which suggests an extra pointer cast around the address --
// confirm against the original header.
94 template<>
inline __m128i simd_processing::LoadPackedDataUnaligned<short, __m128i>(
95 unsigned int x,
unsigned int y,
unsigned int current_disparity,
97 unsigned int numDispVals,
const short* inData)
101 current_bp_level.
height_level_, current_disparity, numDispVals)]));
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// double input data returned as a __m256d vector.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, and the declaration
// of current_bp_level are missing from this listing; the surviving tail
// (`current_bp_level.height_level_, current_disparity, numDispVals)]`) looks
// like the end of an index computation into inData -- confirm against the
// original header before relying on it.
104 template<>
inline __m256d simd_processing::LoadPackedDataUnaligned<double, __m256d>(
105 unsigned int x,
unsigned int y,
unsigned int current_disparity,
107 unsigned int numDispVals,
const double* inData)
111 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// _Float16 input data returned as a __m256h vector; only compiled when
// FLOAT16_VECTORIZATION is defined.
// NOTE(review): only fragments of this definition are visible here. The
// opening brace, the load call itself, the closing brace, the declaration of
// current_bp_level, and the #endif matching the #if below are missing from
// this listing -- recover them from the original header.
114 #if defined(FLOAT16_VECTORIZATION)
116 template<>
inline __m256h simd_processing::LoadPackedDataUnaligned<_Float16, __m256h>(
117 unsigned int x,
unsigned int y,
unsigned int current_disparity,
119 unsigned int numDispVals,
const _Float16* inData)
123 current_bp_level.
height_level_, current_disparity, numDispVals)]);
128 template<>
inline __m256 simd_processing::createSIMDVectorSameData<__m256>(
float data) {
129 return _mm256_set1_ps(data);
132 template<>
inline __m128i simd_processing::createSIMDVectorSameData<__m128i>(
float data) {
133 return _mm256_cvtps_ph(_mm256_set1_ps(data), 0);
136 template<>
inline __m256d simd_processing::createSIMDVectorSameData<__m256d>(
float data) {
137 return _mm256_set1_pd((
double)data);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of createSIMDVectorSameData for __m256h: builds a
 * vector whose _Float16 lanes all hold the same value. Only compiled when
 * FLOAT16_VECTORIZATION is defined.
 *
 * @param data Value to replicate; converted from float to _Float16 first.
 * @return Result of _mm256_set1_ph on the converted value.
 */
template<>
inline __m256h simd_processing::createSIMDVectorSameData<__m256h>(
  float data)
{
  return _mm256_set1_ph(static_cast<_Float16>(data));
}

#endif //FLOAT16_VECTORIZATION
148 template<>
inline __m256 simd_processing::AddVals<__m256, __m256, __m256>(
149 const __m256& val1,
const __m256& val2)
151 return _mm256_add_ps(val1, val2);
154 template<>
inline __m256d simd_processing::AddVals<__m256d, __m256d, __m256d>(
155 const __m256d& val1,
const __m256d& val2)
157 return _mm256_add_pd(val1, val2);
160 template<>
inline __m256 simd_processing::AddVals<__m256, __m128i, __m256>(
161 const __m256& val1,
const __m128i& val2)
163 return _mm256_add_ps(val1, _mm256_cvtph_ps(val2));
166 template<>
inline __m256 simd_processing::AddVals<__m128i, __m256, __m256>(
167 const __m128i& val1,
const __m256& val2)
169 return _mm256_add_ps(_mm256_cvtph_ps(val1), val2);
172 template<>
inline __m256 simd_processing::AddVals<__m128i, __m128i, __m256>(
173 const __m128i& val1,
const __m128i& val2)
175 return _mm256_add_ps(_mm256_cvtph_ps(val1), _mm256_cvtph_ps(val2));
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of AddVals for two __m256h operands: lane-wise
 * _Float16 addition. Only compiled when FLOAT16_VECTORIZATION is defined.
 *
 * @param val1 First operand.
 * @param val2 Second operand.
 * @return Result of _mm256_add_ph(val1, val2).
 */
template<>
inline __m256h simd_processing::AddVals<__m256h, __m256h, __m256h>(
  const __m256h& val1,
  const __m256h& val2)
{
  return _mm256_add_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION
188 template<>
inline __m256 simd_processing::SubtractVals<__m256, __m256, __m256>(
189 const __m256& val1,
const __m256& val2)
191 return _mm256_sub_ps(val1, val2);
194 template<>
inline __m256d simd_processing::SubtractVals<__m256d, __m256d, __m256d>(
195 const __m256d& val1,
const __m256d& val2)
197 return _mm256_sub_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of SubtractVals for two __m256h operands: lane-wise
 * _Float16 subtraction. Only compiled when FLOAT16_VECTORIZATION is defined.
 *
 * @param val1 Minuend.
 * @param val2 Subtrahend.
 * @return Result of _mm256_sub_ph(val1, val2).
 */
template<>
inline __m256h simd_processing::SubtractVals<__m256h, __m256h, __m256h>(
  const __m256h& val1,
  const __m256h& val2)
{
  return _mm256_sub_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION
210 template<>
inline __m256 simd_processing::divideVals<__m256, __m256, __m256>(
211 const __m256& val1,
const __m256& val2)
213 return _mm256_div_ps(val1, val2);
216 template<>
inline __m256d simd_processing::divideVals<__m256d, __m256d, __m256d>(
217 const __m256d& val1,
const __m256d& val2)
219 return _mm256_div_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of divideVals for two __m256h operands: lane-wise
 * _Float16 division. Only compiled when FLOAT16_VECTORIZATION is defined.
 *
 * @param val1 Dividend.
 * @param val2 Divisor.
 * @return Result of _mm256_div_ph(val1, val2).
 */
template<>
inline __m256h simd_processing::divideVals<__m256h, __m256h, __m256h>(
  const __m256h& val1,
  const __m256h& val2)
{
  return _mm256_div_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION
232 template<>
inline __m256 simd_processing::ConvertValToDatatype<__m256, float>(
float val) {
233 return _mm256_set1_ps(val);
236 template<>
inline __m256d simd_processing::ConvertValToDatatype<__m256d, double>(
double val) {
237 return _mm256_set1_pd(val);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of ConvertValToDatatype producing a __m256h from a
 * _Float16 scalar. Only compiled when FLOAT16_VECTORIZATION is defined.
 *
 * @param val Scalar to place in every _Float16 lane.
 * @return Result of _mm256_set1_ph(val).
 */
template<>
inline __m256h simd_processing::ConvertValToDatatype<__m256h, _Float16>(
  _Float16 val)
{
  // val is already a _Float16, so no conversion is needed before the broadcast.
  return _mm256_set1_ph(val);
}

#endif //FLOAT16_VECTORIZATION
248 template<>
inline __m256 simd_processing::GetMinByElement<__m256>(
249 const __m256& val1,
const __m256& val2)
251 return _mm256_min_ps(val1, val2);
254 template<>
inline __m256d simd_processing::GetMinByElement<__m256d>(
255 const __m256d& val1,
const __m256d& val2)
257 return _mm256_min_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of GetMinByElement for __m256h: lane-wise minimum of
 * two _Float16 vectors. Only compiled when FLOAT16_VECTORIZATION is defined.
 *
 * @param val1 First operand.
 * @param val2 Second operand.
 * @return Result of _mm256_min_ph(val1, val2).
 */
template<>
inline __m256h simd_processing::GetMinByElement<__m256h>(
  const __m256h& val1,
  const __m256h& val2)
{
  return _mm256_min_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION
270 template<>
inline void simd_processing::StorePackedDataAligned<float, __m256>(
271 unsigned int indexDataStore,
float* locationDataStore,
const __m256& dataToStore)
273 _mm256_store_ps(&locationDataStore[indexDataStore], dataToStore);
276 template<>
inline void simd_processing::StorePackedDataAligned<short, __m256>(
277 unsigned int indexDataStore,
short* locationDataStore,
const __m256& dataToStore)
279 _mm_store_si128((__m128i*)(&locationDataStore[indexDataStore]), _mm256_cvtps_ph(dataToStore, 0));
282 template<>
inline void simd_processing::StorePackedDataAligned<double, __m256d>(
283 unsigned int indexDataStore,
double* locationDataStore,
const __m256d& dataToStore)
285 _mm256_store_pd(&locationDataStore[indexDataStore], dataToStore);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of StorePackedDataAligned writing a __m256h vector
 * into a _Float16 array with an aligned store. Only compiled when
 * FLOAT16_VECTORIZATION is defined.
 *
 * @param indexDataStore Index of the first element to write.
 * @param locationDataStore Destination array; the addressed element must
 * satisfy the alignment requirement of _mm256_store_ph.
 * @param dataToStore Vector to write.
 */
template<>
inline void simd_processing::StorePackedDataAligned<_Float16, __m256h>(
  unsigned int indexDataStore,
  _Float16* locationDataStore,
  const __m256h& dataToStore)
{
  _mm256_store_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif //FLOAT16_VECTORIZATION
298 template<>
inline void simd_processing::StorePackedDataUnaligned<float, __m256>(
299 unsigned int indexDataStore,
float* locationDataStore,
const __m256& dataToStore)
301 _mm256_storeu_ps(&locationDataStore[indexDataStore], dataToStore);
304 template<>
inline void simd_processing::StorePackedDataUnaligned<short, __m256>(
305 unsigned int indexDataStore,
short* locationDataStore,
const __m256& dataToStore)
307 _mm_storeu_si128((__m128i*)(&locationDataStore[indexDataStore]), _mm256_cvtps_ph(dataToStore, 0));
310 template<>
inline void simd_processing::StorePackedDataUnaligned<double, __m256d>(
311 unsigned int indexDataStore,
double* locationDataStore,
const __m256d& dataToStore)
313 _mm256_storeu_pd(&locationDataStore[indexDataStore], dataToStore);
#if defined(FLOAT16_VECTORIZATION)

/**
 * @brief Specialization of StorePackedDataUnaligned writing a __m256h vector
 * into a _Float16 array with an unaligned store. Only compiled when
 * FLOAT16_VECTORIZATION is defined.
 *
 * @param indexDataStore Index of the first element to write.
 * @param locationDataStore Destination array.
 * @param dataToStore Vector to write via _mm256_storeu_ph.
 */
template<>
inline void simd_processing::StorePackedDataUnaligned<_Float16, __m256h>(
  unsigned int indexDataStore,
  _Float16* locationDataStore,
  const __m256h& dataToStore)
{
  _mm256_storeu_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif //FLOAT16_VECTORIZATION
Contains namespace with CPU run defaults and constants.
Contains general functions for processing using SIMD vector data types on CPU.
ARCHITECTURE_ADDITION unsigned int RetrieveIndexInDataAndMessage(unsigned int x_val, unsigned int y_val, unsigned int width, unsigned int height, unsigned int current_disparity, unsigned int total_num_disp_vals, unsigned int offset_data=0u)
Retrieve the current 1-D index value of the given point at the given disparity in the data cost and m...
POD struct to store bp level data. Struct can be passed to global CUDA kernels so needs to take restr...
unsigned int height_level_
unsigned int padded_width_checkerboard_level_