27 #ifndef AVX512TEMPLATESPFUNCTS_H_
28 #define AVX512TEMPLATESPFUNCTS_H_
32 #include <x86intrin.h>
36 #include <immintrin.h>
// Specialization of simd_processing::LoadPackedDataAligned for `double` input
// returned as a `__m512d` vector.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, but no parameter of that name is declared among the
// visible parameters, and the start of the body (the expression that the
// trailing `)]);` closes) as well as the enclosing braces are missing.
// NOTE(review): the leading numbers (38, 39, 41, 45) look like line numbers
// from the original listing, not code -- confirm against the upstream header
// and restore the missing lines before compiling.
38 template<>
inline __m512d simd_processing::LoadPackedDataAligned<double, __m512d>(
39 unsigned int x,
unsigned int y,
unsigned int current_disparity,
41 unsigned int numDispVals,
const double* inData)
45 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Specialization of simd_processing::LoadPackedDataAligned for `float` input
// returned as a `__m512` vector.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, which is not declared among the visible parameters,
// and the start of the body (closed by the trailing `)]);`) and the braces
// are missing.
// NOTE(review): the leading numbers (48, 49, 51, 55) look like line numbers
// from the original listing, not code -- restore from the upstream header.
48 template<>
inline __m512 simd_processing::LoadPackedDataAligned<float, __m512>(
49 unsigned int x,
unsigned int y,
unsigned int current_disparity,
51 unsigned int numDispVals,
const float* inData)
55 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Specialization of simd_processing::LoadPackedDataAligned for `short` input
// returned as a `__m256i` vector.
// NOTE(review): only the signature survives in this listing; the function
// body is missing entirely, and the parameter list may also be incomplete
// (the listing numbers jump from 59 to 61).
// NOTE(review): the leading numbers (58, 59, 61) look like line numbers from
// the original listing, not code -- restore from the upstream header.
58 template<>
inline __m256i simd_processing::LoadPackedDataAligned<short, __m256i>(
59 unsigned int x,
unsigned int y,
unsigned int current_disparity,
61 unsigned int numDispVals,
const short* inData)
// Specialization of simd_processing::LoadPackedDataAligned for `_Float16`
// input returned as a `__m512h` vector; guarded by FLOAT16_VECTORIZATION.
// NOTE(review): only the signature survives in this listing; the function
// body and the `#endif` matching the `#if` below are missing, and the
// parameter list may also be incomplete (listing numbers jump from 72 to 74).
// NOTE(review): the leading numbers (69, 71, 72, 74) look like line numbers
// from the original listing, not code -- restore from the upstream header.
69 #if defined(FLOAT16_VECTORIZATION)
71 template<>
inline __m512h simd_processing::LoadPackedDataAligned<_Float16, __m512h>(
72 unsigned int x,
unsigned int y,
unsigned int current_disparity,
74 unsigned int numDispVals,
const _Float16* inData)
// Specialization of simd_processing::LoadPackedDataUnaligned for `float`
// input returned as a `__m512` vector.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, which is not declared among the visible parameters,
// and the start of the body (closed by the trailing `)]);`) and the braces
// are missing.
// NOTE(review): the leading numbers (84, 85, 87, 91) look like line numbers
// from the original listing, not code -- restore from the upstream header.
84 template<>
inline __m512 simd_processing::LoadPackedDataUnaligned<float, __m512>(
85 unsigned int x,
unsigned int y,
unsigned int current_disparity,
87 unsigned int numDispVals,
const float* inData)
91 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Specialization of simd_processing::LoadPackedDataUnaligned for `short`
// input returned as a `__m256i` vector.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, which is not declared among the visible parameters,
// and the start of the body (closed by the trailing `)]));`, which has one
// more closing parenthesis than the float/double variants) and the braces
// are missing.
// NOTE(review): the leading numbers (94, 95, 97, 101) look like line numbers
// from the original listing, not code -- restore from the upstream header.
94 template<>
inline __m256i simd_processing::LoadPackedDataUnaligned<short, __m256i>(
95 unsigned int x,
unsigned int y,
unsigned int current_disparity,
97 unsigned int numDispVals,
const short* inData)
101 current_bp_level.
height_level_, current_disparity, numDispVals)]));
// Specialization of simd_processing::LoadPackedDataUnaligned for `double`
// input returned as a `__m512d` vector.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, which is not declared among the visible parameters,
// and the start of the body (closed by the trailing `)]);`) and the braces
// are missing.
// NOTE(review): the leading numbers (104, 105, 107, 111) look like line
// numbers from the original listing, not code -- restore from the upstream
// header.
104 template<>
inline __m512d simd_processing::LoadPackedDataUnaligned<double, __m512d>(
105 unsigned int x,
unsigned int y,
unsigned int current_disparity,
107 unsigned int numDispVals,
const double* inData)
111 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Specialization of simd_processing::LoadPackedDataUnaligned for `_Float16`
// input returned as a `__m512h` vector; guarded by FLOAT16_VECTORIZATION.
// NOTE(review): this definition is truncated in this listing. The body refers
// to `current_bp_level`, which is not declared among the visible parameters;
// the start of the body (closed by the trailing `)]);`), the braces, and the
// `#endif` matching the `#if` below are missing.
// NOTE(review): the leading numbers (114, 116, 117, 119, 123) look like line
// numbers from the original listing, not code -- restore from the upstream
// header.
114 #if defined(FLOAT16_VECTORIZATION)
116 template<>
inline __m512h simd_processing::LoadPackedDataUnaligned<_Float16, __m512h>(
117 unsigned int x,
unsigned int y,
unsigned int current_disparity,
119 unsigned int numDispVals,
const _Float16* inData)
123 current_bp_level.
height_level_, current_disparity, numDispVals)]);
128 template<>
inline __m512 simd_processing::createSIMDVectorSameData<__m512>(
float data) {
129 return _mm512_set1_ps(data);
132 template<>
inline __m256i simd_processing::createSIMDVectorSameData<__m256i>(
float data) {
133 return _mm512_cvtps_ph(_mm512_set1_ps(data), 0);
136 template<>
inline __m512d simd_processing::createSIMDVectorSameData<__m512d>(
float data) {
137 return _mm512_set1_pd((
double)data);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of createSIMDVectorSameData for `__m512h`.
/// Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param data Value to broadcast; converted to `_Float16` first.
/// @return Vector with every half-precision element set to the converted value.
template<>
inline __m512h simd_processing::createSIMDVectorSameData<__m512h>(float data)
{
  return _mm512_set1_ph(static_cast<_Float16>(data));
}

#endif  // FLOAT16_VECTORIZATION
148 template<>
inline __m512 simd_processing::AddVals<__m512, __m512, __m512>(
149 const __m512& val1,
const __m512& val2)
151 return _mm512_add_ps(val1, val2);
154 template<>
inline __m512d simd_processing::AddVals<__m512d, __m512d, __m512d>(
155 const __m512d& val1,
const __m512d& val2)
157 return _mm512_add_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of AddVals for two `__m512h` operands.
/// Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param val1 First addend.
/// @param val2 Second addend.
/// @return Element-wise half-precision sum of @p val1 and @p val2.
template<>
inline __m512h simd_processing::AddVals<__m512h, __m512h, __m512h>(
  const __m512h& val1,
  const __m512h& val2)
{
  return _mm512_add_ph(val1, val2);
}

#endif  // FLOAT16_VECTORIZATION
170 template<>
inline __m512 simd_processing::AddVals<__m512, __m256i, __m512>(
171 const __m512& val1,
const __m256i& val2)
173 return _mm512_add_ps(val1, _mm512_cvtph_ps(val2));
176 template<>
inline __m512 simd_processing::AddVals<__m256i, __m512, __m512>(
177 const __m256i& val1,
const __m512& val2)
179 return _mm512_add_ps(_mm512_cvtph_ps(val1), val2);
182 template<>
inline __m512 simd_processing::AddVals<__m256i, __m256i, __m512>(
183 const __m256i& val1,
const __m256i& val2)
185 return _mm512_add_ps(_mm512_cvtph_ps(val1), _mm512_cvtph_ps(val2));
188 template<>
inline __m512 simd_processing::SubtractVals<__m512, __m512, __m512>(
189 const __m512& val1,
const __m512& val2)
191 return _mm512_sub_ps(val1, val2);
194 template<>
inline __m512d simd_processing::SubtractVals<__m512d, __m512d, __m512d>(
195 const __m512d& val1,
const __m512d& val2)
197 return _mm512_sub_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of SubtractVals for two `__m512h` operands.
/// Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param val1 Minuend.
/// @param val2 Subtrahend.
/// @return Element-wise half-precision difference `val1 - val2`.
template<>
inline __m512h simd_processing::SubtractVals<__m512h, __m512h, __m512h>(
  const __m512h& val1,
  const __m512h& val2)
{
  return _mm512_sub_ph(val1, val2);
}

#endif  // FLOAT16_VECTORIZATION
210 template<>
inline __m512 simd_processing::divideVals<__m512, __m512, __m512>(
211 const __m512& val1,
const __m512& val2)
213 return _mm512_div_ps(val1, val2);
216 template<>
inline __m512d simd_processing::divideVals<__m512d, __m512d, __m512d>(
217 const __m512d& val1,
const __m512d& val2)
219 return _mm512_div_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of divideVals for two `__m512h` operands.
/// Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param val1 Dividend.
/// @param val2 Divisor.
/// @return Element-wise half-precision quotient `val1 / val2`.
template<>
inline __m512h simd_processing::divideVals<__m512h, __m512h, __m512h>(
  const __m512h& val1,
  const __m512h& val2)
{
  return _mm512_div_ph(val1, val2);
}

#endif  // FLOAT16_VECTORIZATION
232 template<>
inline __m512 simd_processing::ConvertValToDatatype<__m512, float>(
float val) {
233 return _mm512_set1_ps(val);
236 template<>
inline __m512d simd_processing::ConvertValToDatatype<__m512d, double>(
double val) {
237 return _mm512_set1_pd(val);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of ConvertValToDatatype producing a `__m512h` from a
/// `_Float16`. Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param val Scalar value.
/// @return Vector with every half-precision element set to @p val.
template<>
inline __m512h simd_processing::ConvertValToDatatype<__m512h, _Float16>(_Float16 val)
{
  // val is already _Float16, so no conversion is needed before broadcasting.
  return _mm512_set1_ph(val);
}

#endif  // FLOAT16_VECTORIZATION
248 template<>
inline __m512 simd_processing::GetMinByElement<__m512>(
249 const __m512& val1,
const __m512& val2)
251 return _mm512_min_ps(val1, val2);
254 template<>
inline __m512d simd_processing::GetMinByElement<__m512d>(
255 const __m512d& val1,
const __m512d& val2)
257 return _mm512_min_pd(val1, val2);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of GetMinByElement for `__m512h`.
/// Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param val1 First vector.
/// @param val2 Second vector.
/// @return Element-wise half-precision minimum as computed by `_mm512_min_ph`.
template<>
inline __m512h simd_processing::GetMinByElement<__m512h>(
  const __m512h& val1,
  const __m512h& val2)
{
  return _mm512_min_ph(val1, val2);
}

#endif  // FLOAT16_VECTORIZATION
270 template<>
inline void simd_processing::StorePackedDataAligned<float, __m512>(
271 unsigned int indexDataStore,
float* locationDataStore,
const __m512& dataToStore)
273 _mm512_store_ps(&locationDataStore[indexDataStore], dataToStore);
276 template<>
inline void simd_processing::StorePackedDataAligned<short, __m512>(
277 unsigned int indexDataStore,
short* locationDataStore,
const __m512& dataToStore)
279 _mm256_store_si256((__m256i*)(&locationDataStore[indexDataStore]), _mm512_cvtps_ph(dataToStore, 0));
282 template<>
inline void simd_processing::StorePackedDataAligned<double, __m512d>(
283 unsigned int indexDataStore,
double* locationDataStore,
const __m512d& dataToStore)
285 _mm512_store_pd(&locationDataStore[indexDataStore], dataToStore);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of StorePackedDataAligned writing a `__m512h` to `_Float16`
/// storage. Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param indexDataStore Index into @p locationDataStore of the first element written.
/// @param locationDataStore Destination array.
/// @param dataToStore Vector to write.
/// @note Uses `_mm512_store_ph`, whose destination address must be 64-byte aligned.
template<>
inline void simd_processing::StorePackedDataAligned<_Float16, __m512h>(
  unsigned int indexDataStore,
  _Float16* locationDataStore,
  const __m512h& dataToStore)
{
  _mm512_store_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif  // FLOAT16_VECTORIZATION
298 template<>
inline void simd_processing::StorePackedDataUnaligned<float, __m512>(
299 unsigned int indexDataStore,
float* locationDataStore,
const __m512& dataToStore)
301 _mm512_storeu_ps(&locationDataStore[indexDataStore], dataToStore);
304 template<>
inline void simd_processing::StorePackedDataUnaligned<short, __m512>(
305 unsigned int indexDataStore,
short* locationDataStore,
const __m512& dataToStore)
307 _mm256_storeu_si256((__m256i*)(&locationDataStore[indexDataStore]), _mm512_cvtps_ph(dataToStore, 0));
310 template<>
inline void simd_processing::StorePackedDataUnaligned<double, __m512d>(
311 unsigned int indexDataStore,
double* locationDataStore,
const __m512d& dataToStore)
313 _mm512_storeu_pd(&locationDataStore[indexDataStore], dataToStore);
#if defined(FLOAT16_VECTORIZATION)

/// Specialization of StorePackedDataUnaligned writing a `__m512h` to `_Float16`
/// storage. Only compiled when FLOAT16_VECTORIZATION is defined.
/// @param indexDataStore Index into @p locationDataStore of the first element written.
/// @param locationDataStore Destination array.
/// @param dataToStore Vector to write.
/// @note Uses `_mm512_storeu_ph`, which has no alignment requirement.
template<>
inline void simd_processing::StorePackedDataUnaligned<_Float16, __m512h>(
  unsigned int indexDataStore,
  _Float16* locationDataStore,
  const __m512h& dataToStore)
{
  _mm512_storeu_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif  // FLOAT16_VECTORIZATION
// Descriptions of referenced declarations, carried over from the source
// listing (some are cut off in the listing itself):
// - Contains namespace with CPU run defaults and constants.
// - Contains general functions for processing using SIMD vector data types on CPU.
// - ARCHITECTURE_ADDITION unsigned int RetrieveIndexInDataAndMessage(
//     unsigned int x_val, unsigned int y_val, unsigned int width,
//     unsigned int height, unsigned int current_disparity,
//     unsigned int total_num_disp_vals, unsigned int offset_data=0u)
//   Retrieves the current 1-D index value of the given point at the given
//   disparity in the data cost and m... (description truncated)
// - POD struct to store bp level data. The struct can be passed to a global
//   CUDA kernel, so it needs to take restr... (description truncated)
// - unsigned int height_level_
// - unsigned int padded_width_checkerboard_level_