Optimized Belief Propagation (CPU and GPU)
AVX512TemplateSpFuncts.h
/*
Copyright (C) 2024 Scott Grauer-Gray

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#ifndef AVX512TEMPLATESPFUNCTS_H_
#define AVX512TEMPLATESPFUNCTS_H_

#ifdef _WIN32
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include "SIMDProcessing.h"
#include <immintrin.h>

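// Template specializations of the simd_processing load, store, broadcast, and
// arithmetic functions for AVX-512 vector types: __m512 for float, __m512d for
// double, and __m512h for _Float16 (when FLOAT16_VECTORIZATION is defined).
// Data stored as 16-bit "short" elements holds half-precision values; those
// specializations use __m256i vectors and convert to/from single precision
// with _mm512_cvtph_ps / _mm512_cvtps_ph.
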
// Aligned loads of packed data at the given (x, y, disparity) location in the
// checkerboard data/message arrays.
template<> inline __m512d simd_processing::LoadPackedDataAligned<double, __m512d>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const double* inData)
{
  return _mm512_load_pd(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

template<> inline __m512 simd_processing::LoadPackedDataAligned<float, __m512>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const float* inData)
{
  return _mm512_load_ps(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

// 16-bit "short" data holds half-precision values; load them as a __m256i vector.
template<> inline __m256i simd_processing::LoadPackedDataAligned<short, __m256i>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const short* inData)
{
  return _mm256_load_si256((__m256i*)(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]));
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::LoadPackedDataAligned<_Float16, __m512h>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const _Float16* inData)
{
  return _mm512_load_ph(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

#endif //FLOAT16_VECTORIZATION

// Unaligned variants of the packed-data loads.
template<> inline __m512 simd_processing::LoadPackedDataUnaligned<float, __m512>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const float* inData)
{
  return _mm512_loadu_ps(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

template<> inline __m256i simd_processing::LoadPackedDataUnaligned<short, __m256i>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const short* inData)
{
  return _mm256_loadu_si256((__m256i*)(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]));
}

template<> inline __m512d simd_processing::LoadPackedDataUnaligned<double, __m512d>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const double* inData)
{
  return _mm512_loadu_pd(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::LoadPackedDataUnaligned<_Float16, __m512h>(
  unsigned int x, unsigned int y, unsigned int current_disparity,
  const beliefprop::BpLevelProperties& current_bp_level,
  unsigned int numDispVals, const _Float16* inData)
{
  return _mm512_loadu_ph(&inData[beliefprop::RetrieveIndexInDataAndMessage(
    x, y, current_bp_level.padded_width_checkerboard_level_,
    current_bp_level.height_level_, current_disparity, numDispVals)]);
}

#endif //FLOAT16_VECTORIZATION

// Broadcast a single value across all lanes of the given SIMD vector type.
template<> inline __m512 simd_processing::createSIMDVectorSameData<__m512>(float data) {
  return _mm512_set1_ps(data);
}

// For the 16-bit path the value is converted to half precision and packed into
// a __m256i vector.
template<> inline __m256i simd_processing::createSIMDVectorSameData<__m256i>(float data) {
  return _mm512_cvtps_ph(_mm512_set1_ps(data), 0);
}

template<> inline __m512d simd_processing::createSIMDVectorSameData<__m512d>(float data) {
  return _mm512_set1_pd((double)data);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::createSIMDVectorSameData<__m512h>(float data) {
  return _mm512_set1_ph((_Float16)data);
}

#endif //FLOAT16_VECTORIZATION

// Element-wise addition.
template<> inline __m512 simd_processing::AddVals<__m512, __m512, __m512>(
  const __m512& val1, const __m512& val2)
{
  return _mm512_add_ps(val1, val2);
}

template<> inline __m512d simd_processing::AddVals<__m512d, __m512d, __m512d>(
  const __m512d& val1, const __m512d& val2)
{
  return _mm512_add_pd(val1, val2);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::AddVals<__m512h, __m512h, __m512h>(
  const __m512h& val1, const __m512h& val2)
{
  return _mm512_add_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION

// Mixed-type additions: __m256i operands hold half-precision values and are
// converted to single precision before the add.
template<> inline __m512 simd_processing::AddVals<__m512, __m256i, __m512>(
  const __m512& val1, const __m256i& val2)
{
  return _mm512_add_ps(val1, _mm512_cvtph_ps(val2));
}

template<> inline __m512 simd_processing::AddVals<__m256i, __m512, __m512>(
  const __m256i& val1, const __m512& val2)
{
  return _mm512_add_ps(_mm512_cvtph_ps(val1), val2);
}

template<> inline __m512 simd_processing::AddVals<__m256i, __m256i, __m512>(
  const __m256i& val1, const __m256i& val2)
{
  return _mm512_add_ps(_mm512_cvtph_ps(val1), _mm512_cvtph_ps(val2));
}

template<> inline __m512 simd_processing::SubtractVals<__m512, __m512, __m512>(
  const __m512& val1, const __m512& val2)
{
  return _mm512_sub_ps(val1, val2);
}

template<> inline __m512d simd_processing::SubtractVals<__m512d, __m512d, __m512d>(
  const __m512d& val1, const __m512d& val2)
{
  return _mm512_sub_pd(val1, val2);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::SubtractVals<__m512h, __m512h, __m512h>(
  const __m512h& val1, const __m512h& val2)
{
  return _mm512_sub_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION

template<> inline __m512 simd_processing::divideVals<__m512, __m512, __m512>(
  const __m512& val1, const __m512& val2)
{
  return _mm512_div_ps(val1, val2);
}

template<> inline __m512d simd_processing::divideVals<__m512d, __m512d, __m512d>(
  const __m512d& val1, const __m512d& val2)
{
  return _mm512_div_pd(val1, val2);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::divideVals<__m512h, __m512h, __m512h>(
  const __m512h& val1, const __m512h& val2)
{
  return _mm512_div_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION

template<> inline __m512 simd_processing::ConvertValToDatatype<__m512, float>(float val) {
  return _mm512_set1_ps(val);
}

template<> inline __m512d simd_processing::ConvertValToDatatype<__m512d, double>(double val) {
  return _mm512_set1_pd(val);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::ConvertValToDatatype<__m512h, _Float16>(_Float16 val) {
  return _mm512_set1_ph((_Float16)val);
}

#endif //FLOAT16_VECTORIZATION

template<> inline __m512 simd_processing::GetMinByElement<__m512>(
  const __m512& val1, const __m512& val2)
{
  return _mm512_min_ps(val1, val2);
}

template<> inline __m512d simd_processing::GetMinByElement<__m512d>(
  const __m512d& val1, const __m512d& val2)
{
  return _mm512_min_pd(val1, val2);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline __m512h simd_processing::GetMinByElement<__m512h>(
  const __m512h& val1, const __m512h& val2)
{
  return _mm512_min_ph(val1, val2);
}

#endif //FLOAT16_VECTORIZATION

// Aligned stores. The short specialization converts single-precision values
// back to half precision before storing them as 16-bit elements.
template<> inline void simd_processing::StorePackedDataAligned<float, __m512>(
  unsigned int indexDataStore, float* locationDataStore, const __m512& dataToStore)
{
  _mm512_store_ps(&locationDataStore[indexDataStore], dataToStore);
}

template<> inline void simd_processing::StorePackedDataAligned<short, __m512>(
  unsigned int indexDataStore, short* locationDataStore, const __m512& dataToStore)
{
  _mm256_store_si256((__m256i*)(&locationDataStore[indexDataStore]),
    _mm512_cvtps_ph(dataToStore, 0));
}

template<> inline void simd_processing::StorePackedDataAligned<double, __m512d>(
  unsigned int indexDataStore, double* locationDataStore, const __m512d& dataToStore)
{
  _mm512_store_pd(&locationDataStore[indexDataStore], dataToStore);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline void simd_processing::StorePackedDataAligned<_Float16, __m512h>(
  unsigned int indexDataStore, _Float16* locationDataStore, const __m512h& dataToStore)
{
  _mm512_store_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif //FLOAT16_VECTORIZATION

template<> inline void simd_processing::StorePackedDataUnaligned<float, __m512>(
  unsigned int indexDataStore, float* locationDataStore, const __m512& dataToStore)
{
  _mm512_storeu_ps(&locationDataStore[indexDataStore], dataToStore);
}

template<> inline void simd_processing::StorePackedDataUnaligned<short, __m512>(
  unsigned int indexDataStore, short* locationDataStore, const __m512& dataToStore)
{
  _mm256_storeu_si256((__m256i*)(&locationDataStore[indexDataStore]),
    _mm512_cvtps_ph(dataToStore, 0));
}

template<> inline void simd_processing::StorePackedDataUnaligned<double, __m512d>(
  unsigned int indexDataStore, double* locationDataStore, const __m512d& dataToStore)
{
  _mm512_storeu_pd(&locationDataStore[indexDataStore], dataToStore);
}

#if defined(FLOAT16_VECTORIZATION)

template<> inline void simd_processing::StorePackedDataUnaligned<_Float16, __m512h>(
  unsigned int indexDataStore, _Float16* locationDataStore, const __m512h& dataToStore)
{
  _mm512_storeu_ph(&locationDataStore[indexDataStore], dataToStore);
}

#endif //FLOAT16_VECTORIZATION

#endif /* AVX512TEMPLATESPFUNCTS_H_ */
Referenced in this file:
SIMDProcessing.h: general functions for processing using SIMD vector data types on CPU.
beliefprop::RetrieveIndexInDataAndMessage(x_val, y_val, width, height, current_disparity, total_num_disp_vals, offset_data=0u): retrieves the 1-D index of the given point at the given disparity in the data cost and message data.
beliefprop::BpLevelProperties (BpLevel.h): POD struct to store bp level data; can be passed to a global CUDA kernel.
beliefprop::BpLevelProperties::height_level_ (BpLevel.h)
beliefprop::BpLevelProperties::padded_width_checkerboard_level_ (BpLevel.h)
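
As a rough usage sketch only (not code from this repository), the following shows how the specializations above might be combined in a per-disparity loop when float is the processing type, so each __m512 operation covers 16 packed values; the function name AddMessagesAndTrackMin and the message-array arguments are hypothetical.

#include <limits>
#include "AVX512TemplateSpFuncts.h"

// Hypothetical helper: for each disparity, load 16 packed float values from two
// message arrays, add them, track the per-lane minimum, and store the sums.
inline void AddMessagesAndTrackMin(
  unsigned int x, unsigned int y,
  const beliefprop::BpLevelProperties& bp_level,
  unsigned int num_disp_vals,
  const float* messages_a, const float* messages_b, float* messages_out)
{
  __m512 min_vals = simd_processing::ConvertValToDatatype<__m512, float>(
    std::numeric_limits<float>::max());
  for (unsigned int disp = 0; disp < num_disp_vals; ++disp) {
    const __m512 vals_a = simd_processing::LoadPackedDataAligned<float, __m512>(
      x, y, disp, bp_level, num_disp_vals, messages_a);
    const __m512 vals_b = simd_processing::LoadPackedDataAligned<float, __m512>(
      x, y, disp, bp_level, num_disp_vals, messages_b);
    const __m512 sums = simd_processing::AddVals<__m512, __m512, __m512>(vals_a, vals_b);
    min_vals = simd_processing::GetMinByElement<__m512>(min_vals, sums);
    const unsigned int store_idx = beliefprop::RetrieveIndexInDataAndMessage(
      x, y, bp_level.padded_width_checkerboard_level_,
      bp_level.height_level_, disp, num_disp_vals);
    simd_processing::StorePackedDataAligned<float, __m512>(store_idx, messages_out, sums);
  }
  // A real message-passing kernel would go on to use min_vals (e.g. for
  // normalization); it is left unused here to keep the sketch short.
  (void)min_vals;
}

The 16-bit path works the same way, except that the loads return half-precision values packed in __m256i vectors, the AddVals overloads taking __m256i arguments convert them to single precision with _mm512_cvtph_ps, and the short store specializations convert back with _mm512_cvtps_ph.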