Optimized Belief Propagation (CPU and GPU)
KernelBpStereoCPU.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2024 Scott Grauer-Gray
3 
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
28 #ifndef KERNEL_BP_STEREO_CPU_H
29 #define KERNEL_BP_STEREO_CPU_H
30 
31 #include <math.h>
32 #include <omp.h>
33 #include <algorithm>
34 #include <iostream>
35 //TODO: switch use of printf with std::format when it is supported on compiler used for development
36 //#include <format>
43 #include "RunImp/UtilityFuncts.h"
46 
// Forward declarations of the CPU belief propagation functions; the definitions follow
// the namespace (see the "definitions of CPU functions declared in namespace" section).
// NOTE(review): in this listing the line holding each function's return type and name is
// absent (the embedded listing numbers skip it, e.g. 55 -> 57), so every declaration below
// shows only its template header and parameter list. Recover the names from the original
// header before relying on this text.
51 namespace beliefprop_cpu
52 {
53  //initialize the "data cost" for each possible disparity between the two
54  //full-sized input images ("bottom" of the image pyramid)
55  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
57  const beliefprop::BpLevelProperties& current_bp_level,
58  const float* image_1_pixels_device, const float* image_2_pixels_device,
59  T* data_cost_stereo_checkerboard_0, T* data_cost_stereo_checkerboard_1,
60  float lambda_bp, float data_k_bp, unsigned int bp_settings_disp_vals,
61  const ParallelParams& opt_cpu_params);
62 
63  //initialize the "data cost" for each possible disparity at the current
64  //level using the data costs from the previous level
65  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
67  beliefprop::CheckerboardPart checkerboard_part,
68  const beliefprop::BpLevelProperties& current_bp_level, const beliefprop::BpLevelProperties& prev_bp_level,
69  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
70  T* data_cost_current_level, unsigned int offset_num, unsigned int bp_settings_disp_vals,
71  const ParallelParams& opt_cpu_params);
72 
73  //initialize the message values at each pixel of the current level to the
74  //default value
75  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
77  const beliefprop::BpLevelProperties& current_bp_level,
78  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
79  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
80  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
81  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
82  unsigned int bp_settings_disp_vals,
83  const ParallelParams& opt_cpu_params);
84 
85  //run the current iteration of belief propagation using the checkerboard
86  //update method where half the pixels in the "checkerboard" scheme retrieve
87  //messages from each 4-connected neighbor and then update their message based
88  //on the retrieved messages and the data cost
89  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
91  beliefprop::CheckerboardPart checkerboard_to_update,
92  const beliefprop::BpLevelProperties& current_bp_level,
93  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
94  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
95  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
96  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
97  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
98  float disc_k_bp, unsigned int bp_settings_num_disp_vals,
99  const ParallelParams& opt_cpu_params);
100 
    //second declaration with the same template header and the same parameter types as the
    //checkerboard-update iteration above (only two parameter names differ)
    //NOTE(review): presumably an alternate implementation of the same step - confirm
    //against the original header, since the function name is not visible here
101  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
103  beliefprop::CheckerboardPart checkerboard_part_update,
104  const beliefprop::BpLevelProperties& current_bp_level,
105  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
106  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
107  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
108  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
109  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
110  float disc_k_bp, unsigned int bp_settings_disp_vals,
111  const ParallelParams& opt_cpu_params);
112 
113  //copy the computed BP message values at the current level to the
114  //corresponding locations at the "next" level down
115  //the kernel works from the point of view of the pixel at the prev level
116  //that is being copied to four different places
117  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
119  beliefprop::CheckerboardPart checkerboard_part,
120  const beliefprop::BpLevelProperties& current_bp_level,
121  const beliefprop::BpLevelProperties& next_bp_level,
122  const T* message_u_prev_checkerboard_0, const T* message_d_prev_checkerboard_0,
123  const T* message_l_prev_checkerboard_0, const T* message_r_prev_checkerboard_0,
124  const T* message_u_prev_checkerboard_1, const T* message_d_prev_checkerboard_1,
125  const T* message_l_prev_checkerboard_1, const T* message_r_prev_checkerboard_1,
126  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
127  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
128  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
129  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1,
130  unsigned int bp_settings_disp_vals,
131  const ParallelParams& opt_cpu_params);
132 
133  //retrieve the best disparity estimate from image 1 to image 2 for each pixel
134  //in parallel
135  template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
137  const beliefprop::BpLevelProperties& current_bp_level,
138  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
139  const T* message_u_prev_checkerboard_0, const T* message_d_prev_checkerboard_0,
140  const T* message_l_prev_checkerboard_0, const T* message_r_prev_checkerboard_0,
141  const T* message_u_prev_checkerboard_1, const T* message_d_prev_checkerboard_1,
142  const T* message_l_prev_checkerboard_1, const T* message_r_prev_checkerboard_1,
143  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
144  const ParallelParams& opt_cpu_params);
145 
146  //retrieve the best disparity estimate from image 1 to image 2 for each pixel
147  //in parallel using SIMD vectors
148  template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W, unsigned int DISP_VALS>
150  const beliefprop::BpLevelProperties& current_bp_level,
151  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
152  const T* message_u_prev_checkerboard_0, const T* message_d_prev_checkerboard_0,
153  const T* message_l_prev_checkerboard_0, const T* message_r_prev_checkerboard_0,
154  const T* message_u_prev_checkerboard_1, const T* message_d_prev_checkerboard_1,
155  const T* message_l_prev_checkerboard_1, const T* message_r_prev_checkerboard_1,
156  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
157  const ParallelParams& opt_cpu_params);
158 
    //element-type-specific overloads with the same parameter layout as the disparity
    //retrieval declarations above: float, short, _Float16 (only when
    //FLOAT16_VECTORIZATION is defined) and double
    //NOTE(review): presumably one overload family per SIMD instruction set - the
    //function names needed to confirm this are not visible in this listing
159  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
161  const beliefprop::BpLevelProperties& current_bp_level,
162  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
163  const float* message_u_prev_checkerboard_0, const float* message_d_prev_checkerboard_0,
164  const float* message_l_prev_checkerboard_0, const float* message_r_prev_checkerboard_0,
165  const float* message_u_prev_checkerboard_1, const float* message_d_prev_checkerboard_1,
166  const float* message_l_prev_checkerboard_1, const float* message_r_prev_checkerboard_1,
167  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
168  const ParallelParams& opt_cpu_params);
169 
170  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
172  const beliefprop::BpLevelProperties& current_bp_level,
173  const short* data_cost_checkerboard_0, const short* data_cost_checkerboard_1,
174  const short* message_u_prev_checkerboard_0, const short* message_d_prev_checkerboard_0,
175  const short* message_l_prev_checkerboard_0, const short* message_r_prev_checkerboard_0,
176  const short* message_u_prev_checkerboard_1, const short* message_d_prev_checkerboard_1,
177  const short* message_l_prev_checkerboard_1, const short* message_r_prev_checkerboard_1,
178  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
179  const ParallelParams& opt_cpu_params);
180 
181 #if defined(FLOAT16_VECTORIZATION)
182 
183  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
185  const beliefprop::BpLevelProperties& current_bp_level,
186  const _Float16* data_cost_checkerboard_0, const _Float16* data_cost_checkerboard_1,
187  const _Float16* message_u_prev_checkerboard_0, const _Float16* message_d_prev_checkerboard_0,
188  const _Float16* message_l_prev_checkerboard_0, const _Float16* message_r_prev_checkerboard_0,
189  const _Float16* message_u_prev_checkerboard_1, const _Float16* message_d_prev_checkerboard_1,
190  const _Float16* message_l_prev_checkerboard_1, const _Float16* message_r_prev_checkerboard_1,
191  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
192  const ParallelParams& opt_cpu_params);
193 
194 #endif //FLOAT16_VECTORIZATION
195 
196  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
198  const beliefprop::BpLevelProperties& current_bp_level,
199  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
200  const double* message_u_prev_checkerboard_0, const double* message_d_prev_checkerboard_0,
201  const double* message_l_prev_checkerboard_0, const double* message_r_prev_checkerboard_0,
202  const double* message_u_prev_checkerboard_1, const double* message_d_prev_checkerboard_1,
203  const double* message_l_prev_checkerboard_1, const double* message_r_prev_checkerboard_1,
204  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
205  const ParallelParams& opt_cpu_params);
206 
    //further float / short / _Float16 / double overloads with the same parameter layout,
    //declared only when CPU_VECTORIZATION_DEFINE is AVX_512_DEFINE or AVX_512_F16_DEFINE
207 #if ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
208  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
210  const beliefprop::BpLevelProperties& current_bp_level,
211  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
212  const float* message_u_prev_checkerboard_0, const float* message_d_prev_checkerboard_0,
213  const float* message_l_prev_checkerboard_0, const float* message_r_prev_checkerboard_0,
214  const float* message_u_prev_checkerboard_1, const float* message_d_prev_checkerboard_1,
215  const float* message_l_prev_checkerboard_1, const float* message_r_prev_checkerboard_1,
216  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
217  const ParallelParams& opt_cpu_params);
218 
219  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
221  const beliefprop::BpLevelProperties& current_bp_level,
222  const short* data_cost_checkerboard_0, const short* data_cost_checkerboard_1,
223  const short* message_u_prev_checkerboard_0, const short* message_d_prev_checkerboard_0,
224  const short* message_l_prev_checkerboard_0, const short* message_r_prev_checkerboard_0,
225  const short* message_u_prev_checkerboard_1, const short* message_d_prev_checkerboard_1,
226  const short* message_l_prev_checkerboard_1, const short* message_r_prev_checkerboard_1,
227  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
228  const ParallelParams& opt_cpu_params);
229 
230 #if defined(FLOAT16_VECTORIZATION)
231 
232  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
234  const beliefprop::BpLevelProperties& current_bp_level,
235  const _Float16* data_cost_checkerboard_0, const _Float16* data_cost_checkerboard_1,
236  const _Float16* message_u_prev_checkerboard_0, const _Float16* message_d_prev_checkerboard_0,
237  const _Float16* message_l_prev_checkerboard_0, const _Float16* message_r_prev_checkerboard_0,
238  const _Float16* message_u_prev_checkerboard_1, const _Float16* message_d_prev_checkerboard_1,
239  const _Float16* message_l_prev_checkerboard_1, const _Float16* message_r_prev_checkerboard_1,
240  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
241  const ParallelParams& opt_cpu_params);
242 
243 #endif //FLOAT16_VECTORIZATION
244 
245  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
247  const beliefprop::BpLevelProperties& current_bp_level,
248  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
249  const double* message_u_prev_checkerboard_0, const double* message_d_prev_checkerboard_0,
250  const double* message_l_prev_checkerboard_0, const double* message_r_prev_checkerboard_0,
251  const double* message_u_prev_checkerboard_1, const double* message_d_prev_checkerboard_1,
252  const double* message_l_prev_checkerboard_1, const double* message_r_prev_checkerboard_1,
253  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
254  const ParallelParams& opt_cpu_params);
255 #endif //((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
256 
    //unconditionally declared float and double overloads with the same parameter layout
257  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
259  const beliefprop::BpLevelProperties& current_bp_level,
260  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
261  const float* message_u_prev_checkerboard_0, const float* message_d_prev_checkerboard_0,
262  const float* message_l_prev_checkerboard_0, const float* message_r_prev_checkerboard_0,
263  const float* message_u_prev_checkerboard_1, const float* message_d_prev_checkerboard_1,
264  const float* message_l_prev_checkerboard_1, const float* message_r_prev_checkerboard_1,
265  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
266  const ParallelParams& opt_cpu_params);
267 
268  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
270  const beliefprop::BpLevelProperties& current_bp_level,
271  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
272  const double* message_u_prev_checkerboard_0, const double* message_d_prev_checkerboard_0,
273  const double* message_l_prev_checkerboard_0, const double* message_r_prev_checkerboard_0,
274  const double* message_u_prev_checkerboard_1, const double* message_d_prev_checkerboard_1,
275  const double* message_l_prev_checkerboard_1, const double* message_r_prev_checkerboard_1,
276  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
277  const ParallelParams& opt_cpu_params);
278 
    //float16_t overload with the same parameter layout, declared only when
    //COMPILING_FOR_ARM is defined
279 #if defined(COMPILING_FOR_ARM)
280  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
282  const beliefprop::BpLevelProperties& current_bp_level,
283  const float16_t* data_cost_checkerboard_0, const float16_t* data_cost_checkerboard_1,
284  const float16_t* message_u_prev_checkerboard_0, const float16_t* message_d_prev_checkerboard_0,
285  const float16_t* message_l_prev_checkerboard_0, const float16_t* message_r_prev_checkerboard_0,
286  const float16_t* message_u_prev_checkerboard_1, const float16_t* message_d_prev_checkerboard_1,
287  const float16_t* message_l_prev_checkerboard_1, const float16_t* message_r_prev_checkerboard_1,
288  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
289  const ParallelParams& opt_cpu_params);
290 #endif //COMPILING_FOR_ARM
291 
292  //run the current iteration of belief propagation where the input messages and data costs come in as arrays
293  //and the output message values are written to output message arrays
    //this form takes the number of disparity values as the DISP_VALS template parameter
294  template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
296  unsigned int x_val_start_processing, unsigned int y_val,
297  const beliefprop::BpLevelProperties& current_bp_level,
298  const U prev_u_message[DISP_VALS], const U prev_d_message[DISP_VALS],
299  const U prev_l_message[DISP_VALS], const U prev_r_message[DISP_VALS],
300  const U data_message[DISP_VALS],
301  T* current_u_message, T* current_d_message,
302  T* current_l_message, T* current_r_message,
303  const U disc_k_bp_vect, bool data_aligned);
304 
    //counterpart of the declaration above without the DISP_VALS template parameter:
    //the input arrays are plain pointers and the number of disparity values is passed
    //at run time as bp_settings_disp_vals
305  template<RunData_t T, RunDataVect_t U>
307  unsigned int x_val_start_processing, unsigned int y_val,
308  const beliefprop::BpLevelProperties& current_bp_level,
309  const U* prev_u_message, const U* prev_d_message,
310  const U* prev_l_message, const U* prev_r_message,
311  const U* data_message,
312  T* current_u_message, T* current_d_message,
313  T* current_l_message, T* current_r_message,
314  const U disc_k_bp_vect, bool data_aligned,
315  unsigned int bp_settings_disp_vals);
316 
    //element-type-specific overloads with the same parameter layout as the templated
    //checkerboard-update iteration declared near the top of the namespace:
    //float, short, _Float16 (only when FLOAT16_VECTORIZATION is defined) and double
    //NOTE(review): presumably one overload family per SIMD instruction set - the
    //function names needed to confirm this are not visible in this listing
317  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
319  beliefprop::CheckerboardPart checkerboard_to_update,
320  const beliefprop::BpLevelProperties& current_bp_level,
321  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
322  float* message_u_checkerboard_0, float* message_d_checkerboard_0,
323  float* message_l_checkerboard_0, float* message_r_checkerboard_0,
324  float* message_u_checkerboard_1, float* message_d_checkerboard_1,
325  float* message_l_checkerboard_1, float* message_r_checkerboard_1,
326  float disc_k_bp, unsigned int bp_settings_disp_vals,
327  const ParallelParams& opt_cpu_params);
328 
329  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
331  beliefprop::CheckerboardPart checkerboard_to_update,
332  const beliefprop::BpLevelProperties& current_bp_level,
333  const short* data_cost_checkerboard_0, const short* data_cost_checkerboard_1,
334  short* message_u_checkerboard_0, short* message_d_checkerboard_0,
335  short* message_l_checkerboard_0, short* message_r_checkerboard_0,
336  short* message_u_checkerboard_1, short* message_d_checkerboard_1,
337  short* message_l_checkerboard_1, short* message_r_checkerboard_1,
338  float disc_k_bp, unsigned int bp_settings_disp_vals,
339  const ParallelParams& opt_cpu_params);
340 
341 #if defined(FLOAT16_VECTORIZATION)
342 
343  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
345  beliefprop::CheckerboardPart checkerboard_to_update,
346  const beliefprop::BpLevelProperties& current_bp_level,
347  const _Float16* data_cost_checkerboard_0, const _Float16* data_cost_checkerboard_1,
348  _Float16* message_u_checkerboard_0, _Float16* message_d_checkerboard_0,
349  _Float16* message_l_checkerboard_0, _Float16* message_r_checkerboard_0,
350  _Float16* message_u_checkerboard_1, _Float16* message_d_checkerboard_1,
351  _Float16* message_l_checkerboard_1, _Float16* message_r_checkerboard_1,
352  float disc_k_bp, unsigned int bp_settings_disp_vals,
353  const ParallelParams& opt_cpu_params);
354 
355 #endif //FLOAT16_VECTORIZATION
356 
357  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
359  beliefprop::CheckerboardPart checkerboard_to_update,
360  const beliefprop::BpLevelProperties& current_bp_level,
361  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
362  double* message_u_checkerboard_0, double* message_d_checkerboard_0,
363  double* message_l_checkerboard_0, double* message_r_checkerboard_0,
364  double* message_u_checkerboard_1, double* message_d_checkerboard_1,
365  double* message_l_checkerboard_1, double* message_r_checkerboard_1,
366  float disc_k_bp, unsigned int bp_settings_disp_vals,
367  const ParallelParams& opt_cpu_params);
368 
    //further float / short / _Float16 / double overloads with the same parameter layout,
    //declared only when CPU_VECTORIZATION_DEFINE is AVX_512_DEFINE or AVX_512_F16_DEFINE
369 #if ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
370  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
372  beliefprop::CheckerboardPart checkerboard_to_update,
373  const beliefprop::BpLevelProperties& current_bp_level,
374  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
375  float* message_u_checkerboard_0, float* message_d_checkerboard_0,
376  float* message_l_checkerboard_0, float* message_r_checkerboard_0,
377  float* message_u_checkerboard_1, float* message_d_checkerboard_1,
378  float* message_l_checkerboard_1, float* message_r_checkerboard_1,
379  float disc_k_bp, unsigned int bp_settings_disp_vals,
380  const ParallelParams& opt_cpu_params);
381 
382  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
384  beliefprop::CheckerboardPart checkerboard_to_update,
385  const beliefprop::BpLevelProperties& current_bp_level,
386  const short* data_cost_checkerboard_0, const short* data_cost_checkerboard_1,
387  short* message_u_checkerboard_0, short* message_d_checkerboard_0,
388  short* message_l_checkerboard_0, short* message_r_checkerboard_0,
389  short* message_u_checkerboard_1, short* message_d_checkerboard_1,
390  short* message_l_checkerboard_1, short* message_r_checkerboard_1,
391  float disc_k_bp, unsigned int bp_settings_disp_vals,
392  const ParallelParams& opt_cpu_params);
393 
394 #if defined(FLOAT16_VECTORIZATION)
395 
396  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
398  beliefprop::CheckerboardPart checkerboard_to_update,
399  const beliefprop::BpLevelProperties& current_bp_level,
400  const _Float16* data_cost_checkerboard_0, const _Float16* data_cost_checkerboard_1,
401  _Float16* message_u_checkerboard_0, _Float16* message_d_checkerboard_0,
402  _Float16* message_l_checkerboard_0, _Float16* message_r_checkerboard_0,
403  _Float16* message_u_checkerboard_1, _Float16* message_d_checkerboard_1,
404  _Float16* message_l_checkerboard_1, _Float16* message_r_checkerboard_1,
405  float disc_k_bp, unsigned int bp_settings_disp_vals,
406  const ParallelParams& opt_cpu_params);
407 
408 #endif //FLOAT16_VECTORIZATION
409 
410  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
412  beliefprop::CheckerboardPart checkerboard_to_update,
413  const beliefprop::BpLevelProperties& current_bp_level,
414  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
415  double* message_u_checkerboard_0, double* message_d_checkerboard_0,
416  double* message_l_checkerboard_0, double* message_r_checkerboard_0,
417  double* message_u_checkerboard_1, double* message_d_checkerboard_1,
418  double* message_l_checkerboard_1, double* message_r_checkerboard_1,
419  float disc_k_bp, unsigned int bp_settings_disp_vals,
420  const ParallelParams& opt_cpu_params);
421 #endif //((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
422 
    //unconditionally declared float and double overloads with the same parameter layout
423  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
425  beliefprop::CheckerboardPart checkerboard_to_update,
426  const beliefprop::BpLevelProperties& current_bp_level,
427  const float* data_cost_checkerboard_0, const float* data_cost_checkerboard_1,
428  float* message_u_checkerboard_0, float* message_d_checkerboard_0,
429  float* message_l_checkerboard_0, float* message_r_checkerboard_0,
430  float* message_u_checkerboard_1, float* message_d_checkerboard_1,
431  float* message_l_checkerboard_1, float* message_r_checkerboard_1,
432  float disc_k_bp, unsigned int bp_settings_disp_vals,
433  const ParallelParams& opt_cpu_params);
434 
435  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
437  beliefprop::CheckerboardPart checkerboard_to_update,
438  const beliefprop::BpLevelProperties& current_bp_level,
439  const double* data_cost_checkerboard_0, const double* data_cost_checkerboard_1,
440  double* message_u_checkerboard_0, double* message_d_checkerboard_0,
441  double* message_l_checkerboard_0, double* message_r_checkerboard_0,
442  double* message_u_checkerboard_1, double* message_d_checkerboard_1,
443  double* message_l_checkerboard_1, double* message_r_checkerboard_1,
444  float disc_k_bp, unsigned int bp_settings_disp_vals,
445  const ParallelParams& opt_cpu_params);
446 
    //float16_t overload with the same parameter layout, declared only when
    //COMPILING_FOR_ARM is defined
447 #if defined(COMPILING_FOR_ARM)
448  template<unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
450  beliefprop::CheckerboardPart checkerboard_to_update,
451  const beliefprop::BpLevelProperties& current_bp_level,
452  const float16_t* data_cost_checkerboard_0, const float16_t* data_cost_checkerboard_1,
453  float16_t* message_u_checkerboard_0, float16_t* message_d_checkerboard_0,
454  float16_t* message_l_checkerboard_0, float16_t* message_r_checkerboard_0,
455  float16_t* message_u_checkerboard_1, float16_t* message_d_checkerboard_1,
456  float16_t* message_l_checkerboard_1, float16_t* message_r_checkerboard_1,
457  float disc_k_bp, unsigned int bp_settings_disp_vals,
458  const ParallelParams& opt_cpu_params);
459 #endif //COMPILING_FOR_ARM
460 
    //declaration with the same parameter layout as the overloads above, templated on
    //T (constrained by RunData_t) and U (constrained by RunDataVect_t) instead of on
    //the acceleration setting
461  template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
463  beliefprop::CheckerboardPart checkerboard_to_update,
464  const beliefprop::BpLevelProperties& current_bp_level,
465  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
466  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
467  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
468  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
469  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
470  float disc_k_bp, unsigned int bp_settings_disp_vals,
471  const ParallelParams& opt_cpu_params);
472 
473  // compute current message
    // (number of disparity values given by the DISP_VALS template parameter)
474  template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
475  void MsgStereoSIMD(
476  unsigned int x_val, unsigned int y_val,
477  const beliefprop::BpLevelProperties& current_bp_level,
478  const U messages_neighbor_1[DISP_VALS], const U messages_neighbor_2[DISP_VALS],
479  const U messages_neighbor_3[DISP_VALS], const U data_costs[DISP_VALS],
480  T* dst_message_array, const U& disc_k_bp, bool data_aligned);
481 
482  // compute current message
    // (number of disparity values passed at run time as bp_settings_disp_vals)
483  template<RunData_t T, RunDataVect_t U>
484  void MsgStereoSIMD(
485  unsigned int x_val, unsigned int y_val,
486  const beliefprop::BpLevelProperties& current_bp_level,
487  const U* messages_neighbor_1, const U* messages_neighbor_2,
488  const U* messages_neighbor_3, const U* data_costs,
489  T* dst_message_array, const U& disc_k_bp, bool data_aligned,
490  unsigned int bp_settings_disp_vals);
491 
492  // compute current message
    // (same parameters as the run-time-count MsgStereoSIMD above, with two extra
    // template parameters V and W constrained by RunDataProcess_t / RunDataVectProcess_t)
493  template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W>
495  unsigned int x_val, unsigned int y_val,
496  const beliefprop::BpLevelProperties& current_bp_level,
497  const U* messages_neighbor_1, const U* messages_neighbor_2,
498  const U* messages_neighbor_3, const U* data_costs,
499  T* dst_message_array, const U& disc_k_bp, bool data_aligned,
500  unsigned int bp_settings_disp_vals);
501 
502  // compute current message
    // (same parameters as the DISP_VALS MsgStereoSIMD above, with two extra
    // template parameters V and W constrained by RunDataProcess_t / RunDataVectProcess_t)
503  template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W, unsigned int DISP_VALS>
505  unsigned int x_val, unsigned int y_val,
506  const beliefprop::BpLevelProperties& current_bp_level,
507  const U messages_neighbor_1[DISP_VALS], const U messages_neighbor_2[DISP_VALS],
508  const U messages_neighbor_3[DISP_VALS], const U data_costs[DISP_VALS],
509  T* dst_message_array, const U& disc_k_bp, bool data_aligned);
510 
511  //retrieve the minimum value at each 1-d disparity value in O(n) time using Felzenszwalb's method
512  //(see "Efficient Belief Propagation for Early Vision")
    //operates on the array f; the number of disparity values is the DISP_VALS template parameter
513  template<RunDataProcess_t T, RunDataVectProcess_t U, unsigned int DISP_VALS>
514  void DtStereoSIMD(U f[DISP_VALS]);
515 
516  //retrieve the minimum value at each 1-d disparity value in O(n) time using Felzenszwalb's method
517  //(see "Efficient Belief Propagation for Early Vision")
    //operates on the array f; the number of disparity values is passed at run time
518  template<RunDataProcess_t T, RunDataVectProcess_t U>
519  void DtStereoSIMD(U* f, unsigned int bp_settings_disp_vals);
520 
    //generic template for updating the best disparities and values: its body only
    //prints a "data type not supported" message to std::cout and does not modify
    //best_disparities or best_vals
    //NOTE(review): presumably specialized for each supported vector type elsewhere - confirm
521  template<RunDataVectProcess_t T>
523  T& best_disparities, T& best_vals,
524  const T& current_disparity, const T& val_at_disp) {
525  std::cout << "Data type not supported for updating best disparities and values" << std::endl;
526  }
527 
    //NOTE(review): the two declarations below have identical template headers and
    //parameter lists in this listing; only their (missing) function names can
    //distinguish them - check the original header
528  template<RunData_t T, unsigned int DISP_VALS>
530  unsigned int x_val, unsigned int y_val,
531  const beliefprop::BpLevelProperties& current_bp_level,
532  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
533  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
534  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
535  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
536  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1);
537 
538  template<RunData_t T, unsigned int DISP_VALS>
540  unsigned int x_val, unsigned int y_val,
541  const beliefprop::BpLevelProperties& current_bp_level,
542  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
543  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
544  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
545  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
546  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1);
547 };
548 
549 //headers to include differ depending on architecture and CPU vectorization setting
550 #if defined(COMPILING_FOR_ARM)
551 
552 #if (CPU_VECTORIZATION_DEFINE == NEON_DEFINE)
553 #include "KernelBpStereoCPU_NEON.h"
554 #endif //CPU_VECTORIZATION_DEFINE == NEON_DEFINE
555 
556 #else
557 
558 #if ((CPU_VECTORIZATION_DEFINE == AVX_256_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_256_F16_DEFINE))
560 #elif ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
563 #endif //CPU_VECTORIZATION_DEFINE
564 
565 #endif //COMPILING_FOR_ARM
566 
567 //definitions of CPU functions declared in namespace
568 
569 //initialize the "data cost" for each possible disparity between the two full-sized input images ("bottom" of the image pyramid)
570 //the image data is read from the image_1_pixels_device and image_2_pixels_device arrays passed in as parameters
// Definition of the bottom-level data cost initialization: runs one loop iteration per
// pixel of the level (width_level_ * height_level_ iterations) under an OpenMP
// "parallel for" and calls beliefprop::InitializeBottomLevelDataPixel for each (x, y).
// NOTE(review): the line holding the return type and function name is absent from this
// listing (the embedded numbering jumps from 571 to 573); restore it from the original header.
571 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
573  const beliefprop::BpLevelProperties& current_bp_level,
574  const float* image_1_pixels_device, const float* image_2_pixels_device,
575  T* data_cost_stereo_checkerboard_0, T* data_cost_stereo_checkerboard_1,
576  float lambda_bp, float data_k_bp, unsigned int bp_settings_disp_vals,
577  const ParallelParams& opt_cpu_params)
578 {
    // With SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU defined, the OpenMP thread count is the
    // first element returned by opt_cpu_params.OptParamsForKernel for the key
    // {kDataCostsAtLevel, 0}; otherwise the loop uses OpenMP's default thread count.
    // NOTE(review): the 0 is presumably the level number - the current-level definition
    // that follows in this file passes current_bp_level.level_num_ in that position.
579 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
580  int num_threads_kernel{
581  (int)opt_cpu_params.OptParamsForKernel(
582  {static_cast<unsigned int>(beliefprop::BpKernel::kDataCostsAtLevel), 0})[0]};
583  #pragma omp parallel for num_threads(num_threads_kernel)
584 #else
585  #pragma omp parallel for
586 #endif
    // The loop header differs only in the index type: signed int on _WIN32, unsigned int
    // elsewhere. NOTE(review): presumably because the OpenMP version used with MSVC
    // requires a signed loop index - confirm.
587 #ifdef _WIN32
588  for (int val = 0; val < (current_bp_level.width_level_*current_bp_level.height_level_); val++)
589 #else
590  for (unsigned int val = 0; val < (current_bp_level.width_level_*current_bp_level.height_level_); val++)
591 #endif //_WIN32
592  {
    // split the flattened pixel index into row (y_val) and column (x_val)
593  const unsigned int y_val = val / current_bp_level.width_level_;
594  const unsigned int x_val = val % current_bp_level.width_level_;
595 
    // per-pixel work is delegated to the beliefprop helper; ACCELERATION is not forwarded to it
596  beliefprop::InitializeBottomLevelDataPixel<T, DISP_VALS>(x_val, y_val, current_bp_level,
597  image_1_pixels_device, image_2_pixels_device,
598  data_cost_stereo_checkerboard_0, data_cost_stereo_checkerboard_1,
599  lambda_bp, data_k_bp, bp_settings_disp_vals);
600  }
601 }
602 
603 //initialize the data costs at the "next" level up in the pyramid given that the data costs at the lower level have been set
604 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
606  beliefprop::CheckerboardPart checkerboard_part,
607  const beliefprop::BpLevelProperties& current_bp_level,
608  const beliefprop::BpLevelProperties& prev_bp_level,
609  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
610  T* data_cost_current_level, unsigned int offset_num, unsigned int bp_settings_disp_vals,
611  const ParallelParams& opt_cpu_params)
612 {
613 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
614  int num_threads_kernel{(int)opt_cpu_params.OptParamsForKernel(
615  {static_cast<unsigned int>(beliefprop::BpKernel::kDataCostsAtLevel), current_bp_level.level_num_})[0]};
616  #pragma omp parallel for num_threads(num_threads_kernel)
617 #else
618  #pragma omp parallel for
619 #endif
620 #ifdef _WIN32
621  for (int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
622 #else
623  for (unsigned int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
624 #endif //_WIN32
625  {
626  const unsigned int y_val = val / current_bp_level.width_checkerboard_level_;
627  const unsigned int x_val = val % current_bp_level.width_checkerboard_level_;
628 
629  //if datatype is halftype (2 bytes) and acceleration type doesn't support float16
630  //vectorization, set to process data using float
631  if constexpr ((sizeof(T) == 2) &&
632  (((ACCELERATION == run_environment::AccSetting::kAVX256) ||
633  (ACCELERATION == run_environment::AccSetting::kAVX512)) ||
634  (ACCELERATION == run_environment::AccSetting::kNEON)))
635  {
636  beliefprop::InitializeCurrentLevelDataPixel<T, float, DISP_VALS>(
637  x_val, y_val, checkerboard_part,
638  current_bp_level, prev_bp_level,
639  data_cost_checkerboard_0, data_cost_checkerboard_1,
640  data_cost_current_level, offset_num, bp_settings_disp_vals);
641  }
642  else
643  {
644  beliefprop::InitializeCurrentLevelDataPixel<T, T, DISP_VALS>(
645  x_val, y_val, checkerboard_part,
646  current_bp_level, prev_bp_level,
647  data_cost_checkerboard_0, data_cost_checkerboard_1,
648  data_cost_current_level, offset_num, bp_settings_disp_vals);
649  }
650  }
651 }
652 
653 //initialize the message values at each pixel of the current level to the default value
//
//the loop covers width_checkerboard_level_ * height_level_ positions of current_bp_level and is
//parallelized with OpenMP; each position is handled by
//beliefprop::InitializeMessageValsToDefaultKernelPixel<T, DISP_VALS>, which is handed all eight
//message arrays (u/d/l/r for checkerboard 0 and checkerboard 1)
//the ACCELERATION template parameter is not referenced in this body
//opt_cpu_params is only read when SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU is defined
654 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
656  const beliefprop::BpLevelProperties& current_bp_level,
657  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
658  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
659  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
660  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
661  unsigned int bp_settings_disp_vals,
662  const ParallelParams& opt_cpu_params)
663 {
664 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
     //thread count for this kernel is element [0] of the parameters that opt_cpu_params
     //returns for the key (kInitMessageVals, level 0)
665  int num_threads_kernel{
666  (int)opt_cpu_params.OptParamsForKernel(
667  {static_cast<unsigned int>(beliefprop::BpKernel::kInitMessageVals), 0})[0]};
668  #pragma omp parallel for num_threads(num_threads_kernel)
669 #else
670  #pragma omp parallel for
671 #endif
672 #ifdef _WIN32
673  for (int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
674 #else
675  for (unsigned int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
676 #endif //_WIN32
     //NOTE(review): the loop index is signed only on _WIN32, presumably because the OpenMP
     //version supported by MSVC requires a signed loop variable -- TODO confirm
677  {
     //recover the row and the x position within the checkerboard from the flattened loop index
678  const unsigned int y_val = val / current_bp_level.width_checkerboard_level_;
679  const unsigned int x_val_in_checkerboard = val % current_bp_level.width_checkerboard_level_;
680 
681  beliefprop::InitializeMessageValsToDefaultKernelPixel<T, DISP_VALS>(
682  x_val_in_checkerboard, y_val, current_bp_level,
683  message_u_checkerboard_0, message_d_checkerboard_0,
684  message_l_checkerboard_0, message_r_checkerboard_0,
685  message_u_checkerboard_1, message_d_checkerboard_1,
686  message_l_checkerboard_1, message_r_checkerboard_1,
687  bp_settings_disp_vals);
688  }
689 }
690 
//run a belief propagation update of the checkerboard part given by checkerboard_part_update,
//handling one (x_val, y_val) position per loop iteration
//the loop covers (width_level_ / 2) * height_level_ positions of current_bp_level and is
//parallelized with OpenMP; each position is handled by
//beliefprop::RunBPIterationUsingCheckerboardUpdatesKernel, which receives the data costs and
//the message arrays of both checkerboards
//the second template argument of the per-pixel function is float when T is 2 bytes wide and
//ACCELERATION is kAVX256, kAVX512 or kNEON, and T otherwise
//opt_cpu_params is only read when SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU is defined
691 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
693  beliefprop::CheckerboardPart checkerboard_part_update,
694  const beliefprop::BpLevelProperties& current_bp_level,
695  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
696  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
697  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
698  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
699  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
700  float disc_k_bp, unsigned int bp_settings_disp_vals,
701  const ParallelParams& opt_cpu_params)
702 {
     //number of x positions processed per row
703  const unsigned int width_checkerboard_run_processing = current_bp_level.width_level_ / 2;
704 
705  //in cuda kernel storing data one at a time (though it is coalesced), so simd_data_size not relevant here and set to 1
706  //still is a check if start of row is aligned
     //the result is computed once here and passed unchanged to every per-pixel call below
707  const bool data_aligned = beliefprop::MemoryAlignedAtDataStart<T>(
708  0, 1, current_bp_level.bytes_align_memory_, current_bp_level.padded_width_checkerboard_level_);
709 
710 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
     //thread count for this kernel is element [0] of the parameters that opt_cpu_params
     //returns for the key (kBpAtLevel, level number of current_bp_level)
711  int num_threads_kernel{(int)opt_cpu_params.OptParamsForKernel(
712  {static_cast<unsigned int>(beliefprop::BpKernel::kBpAtLevel), current_bp_level.level_num_})[0]};
713  #pragma omp parallel for num_threads(num_threads_kernel)
714 #else
715  #pragma omp parallel for
716 #endif
717 #ifdef _WIN32
718  for (int val = 0; val < (width_checkerboard_run_processing * current_bp_level.height_level_); val++)
719 #else
720  for (unsigned int val = 0; val < (width_checkerboard_run_processing * current_bp_level.height_level_); val++)
721 #endif //_WIN32
     //NOTE(review): the loop index is signed only on _WIN32, presumably because the OpenMP
     //version supported by MSVC requires a signed loop variable -- TODO confirm
722  {
     //recover the (x, y) position from the flattened loop index
723  const unsigned int y_val = val / width_checkerboard_run_processing;
724  const unsigned int x_val = val % width_checkerboard_run_processing;
725 
726  //if datatype is halftype (2 bytes) and acceleration type doesn't support float16
727  //vectorization, set to process data using float
728  if constexpr ((sizeof(T) == 2) &&
729  (((ACCELERATION == run_environment::AccSetting::kAVX256) ||
730  (ACCELERATION == run_environment::AccSetting::kAVX512)) ||
731  (ACCELERATION == run_environment::AccSetting::kNEON)))
732  {
     //NOTE(review): the meaning of the literal 0 passed after disc_k_bp is defined by the
     //callee, which is not visible here -- TODO confirm
733  beliefprop::RunBPIterationUsingCheckerboardUpdatesKernel<T, float, DISP_VALS>(
734  x_val, y_val, checkerboard_part_update, current_bp_level,
735  data_cost_checkerboard_0, data_cost_checkerboard_1,
736  message_u_checkerboard_0, message_d_checkerboard_0,
737  message_l_checkerboard_0, message_r_checkerboard_0,
738  message_u_checkerboard_1, message_d_checkerboard_1,
739  message_l_checkerboard_1, message_r_checkerboard_1,
740  disc_k_bp, 0, data_aligned, bp_settings_disp_vals);
741 
742  }
743  else
744  {
     //same call as above except that T itself is used as the second template argument
745  beliefprop::RunBPIterationUsingCheckerboardUpdatesKernel<T, T, DISP_VALS>(
746  x_val, y_val, checkerboard_part_update, current_bp_level,
747  data_cost_checkerboard_0, data_cost_checkerboard_1,
748  message_u_checkerboard_0, message_d_checkerboard_0,
749  message_l_checkerboard_0, message_r_checkerboard_0,
750  message_u_checkerboard_1, message_d_checkerboard_1,
751  message_l_checkerboard_1, message_r_checkerboard_1,
752  disc_k_bp, 0, data_aligned, bp_settings_disp_vals);
753  }
754  }
755 }
756 
//compute the four outgoing messages (u, d, l, r) for the SIMD vector of pixels that starts at
//(x_val_start_processing, y_val); version where the number of disparity values is the
//compile-time constant DISP_VALS
//each prev_*_message / data_message array holds DISP_VALS vectors of type U
//every output is produced by one MsgStereoSIMD call that is given three of the four previous
//messages plus the data message:
//  current_u_message <- prev_u, prev_l, prev_r
//  current_d_message <- prev_d, prev_l, prev_r
//  current_r_message <- prev_u, prev_d, prev_r
//  current_l_message <- prev_u, prev_d, prev_l
//disc_k_bp_vect and data_aligned are forwarded unchanged to every call
757 template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
759  unsigned int x_val_start_processing, unsigned int y_val,
760  const beliefprop::BpLevelProperties& current_bp_level,
761  const U prev_u_message[DISP_VALS], const U prev_d_message[DISP_VALS],
762  const U prev_l_message[DISP_VALS], const U prev_r_message[DISP_VALS],
763  const U data_message[DISP_VALS],
764  T* current_u_message, T* current_d_message,
765  T* current_l_message, T* current_r_message,
766  const U disc_k_bp_vect, bool data_aligned)
767 {
     //"u" output
768  MsgStereoSIMD<T, U, DISP_VALS>(x_val_start_processing, y_val, current_bp_level,
769  prev_u_message, prev_l_message, prev_r_message, data_message, current_u_message,
770  disc_k_bp_vect, data_aligned);
771 
     //"d" output
772  MsgStereoSIMD<T, U, DISP_VALS>(x_val_start_processing, y_val, current_bp_level,
773  prev_d_message, prev_l_message, prev_r_message, data_message, current_d_message,
774  disc_k_bp_vect, data_aligned);
775 
     //"r" output
776  MsgStereoSIMD<T, U, DISP_VALS>(x_val_start_processing, y_val, current_bp_level,
777  prev_u_message, prev_d_message, prev_r_message, data_message, current_r_message,
778  disc_k_bp_vect, data_aligned);
779 
     //"l" output
780  MsgStereoSIMD<T, U, DISP_VALS>(x_val_start_processing, y_val, current_bp_level,
781  prev_u_message, prev_d_message, prev_l_message, data_message, current_l_message,
782  disc_k_bp_vect, data_aligned);
783 }
784 
//compute the four outgoing messages (u, d, l, r) for the SIMD vector of pixels that starts at
//(x_val_start_processing, y_val); version where the number of disparity values is only known
//at run time and is passed as bp_settings_disp_vals
//the prev_*_message / data_message pointers refer to arrays of U vectors (the caller in this
//file allocates bp_settings_disp_vals elements for each)
//every output is produced by one MsgStereoSIMD call that is given three of the four previous
//messages plus the data message:
//  current_u_message <- prev_u, prev_l, prev_r
//  current_d_message <- prev_d, prev_l, prev_r
//  current_r_message <- prev_u, prev_d, prev_r
//  current_l_message <- prev_u, prev_d, prev_l
//disc_k_bp_vect, data_aligned and bp_settings_disp_vals are forwarded unchanged to every call
785 template<RunData_t T, RunDataVect_t U>
787  unsigned int x_val_start_processing, unsigned int y_val,
788  const beliefprop::BpLevelProperties& current_bp_level,
789  const U* prev_u_message, const U* prev_d_message,
790  const U* prev_l_message, const U* prev_r_message,
791  const U* data_message,
792  T* current_u_message, T* current_d_message,
793  T* current_l_message, T* current_r_message,
794  const U disc_k_bp_vect, bool data_aligned, unsigned int bp_settings_disp_vals)
795 {
     //"u" output
796  MsgStereoSIMD<T, U>(x_val_start_processing, y_val, current_bp_level,
797  prev_u_message, prev_l_message, prev_r_message, data_message, current_u_message,
798  disc_k_bp_vect, data_aligned, bp_settings_disp_vals);
799 
     //"d" output
800  MsgStereoSIMD<T, U>(x_val_start_processing, y_val, current_bp_level,
801  prev_d_message, prev_l_message, prev_r_message, data_message, current_d_message,
802  disc_k_bp_vect, data_aligned, bp_settings_disp_vals);
803 
     //"r" output
804  MsgStereoSIMD<T, U>(x_val_start_processing, y_val, current_bp_level,
805  prev_u_message, prev_d_message, prev_r_message, data_message, current_r_message,
806  disc_k_bp_vect, data_aligned, bp_settings_disp_vals);
807 
     //"l" output
808  MsgStereoSIMD<T, U>(x_val_start_processing, y_val, current_bp_level,
809  prev_u_message, prev_d_message, prev_l_message, data_message, current_l_message,
810  disc_k_bp_vect, data_aligned, bp_settings_disp_vals);
811 }
812 
//run a belief propagation update of the checkerboard part given by checkerboard_to_update
//using SIMD vectors of type U that each hold sizeof(U) / sizeof(T) values of type T
//
//rows 1 .. height_level_ - 2 are processed in parallel with OpenMP (the first and last row are
//not visited); within a row the x positions are walked in steps of one SIMD vector
//for every vector position the data cost of the checkerboard being updated and the u/d/l/r
//messages of the other checkerboard are loaded for every disparity, and the new messages are
//then written to the message arrays of the checkerboard being updated through
//RunBPIterationUpdateMsgValsUseSIMDVectors
//
//two variants of the same processing are selected at compile time:
// - DISP_VALS > 0: the disparity count is the constant DISP_VALS and the temporary vectors
//   are fixed-size local arrays
// - otherwise: the disparity count is bp_settings_disp_vals and the temporary vectors are
//   allocated with new[] / released with delete[]
//opt_cpu_params is only read when SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU is defined
813 template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
815  beliefprop::CheckerboardPart checkerboard_to_update,
816  const beliefprop::BpLevelProperties& current_bp_level,
817  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
818  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
819  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
820  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
821  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
822  float disc_k_bp, unsigned int bp_settings_disp_vals,
823  const ParallelParams& opt_cpu_params)
824 {
     //number of T values per SIMD vector U
825  constexpr size_t simd_data_size{sizeof(U) / sizeof(T)};
     //number of x positions considered per row
826  const unsigned int width_checkerboard_run_processing = current_bp_level.width_level_ / 2;
     //disc_k_bp as a SIMD vector (presumably the same value in every lane, going by the helper name)
827  const U disc_k_bp_vect = simd_processing::createSIMDVectorSameData<U>(disc_k_bp);
828 
829  if constexpr (DISP_VALS > 0) {
830 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
     //thread count for this kernel is element [0] of the parameters that opt_cpu_params
     //returns for the key (kBpAtLevel, level number of current_bp_level)
831  int num_threads_kernel{(int)opt_cpu_params.OptParamsForKernel(
832  {static_cast<unsigned int>(beliefprop::BpKernel::kBpAtLevel), current_bp_level.level_num_})[0]};
833  #pragma omp parallel for num_threads(num_threads_kernel)
834 #else
835  #pragma omp parallel for
836 #endif
     //NOTE(review): the bound height_level_ - 1 assumes height_level_ >= 1; if height_level_ is
     //an unsigned type (its declaration is not visible here) a value of 0 would wrap around
837 #ifdef _WIN32
838  for (int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
839 #else
840  for (unsigned int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
841 #endif //_WIN32
842  //checkerboard_adjustment used for indexing into current checkerboard to update
     //it is y_val % 2 when updating checkerboard part 0 and (y_val + 1) % 2 otherwise
843  const unsigned int checkerboard_adjustment =
844  (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) ?
845  ((y_val) % 2) :
846  ((y_val + 1) % 2);
     //first x position processed in this row: 0 when checkerboard_adjustment is 1, otherwise 1
847  const unsigned int start_x = (checkerboard_adjustment == 1) ? 0 : 1;
     //one past the last x position processed in this row
848  const unsigned int end_final =
849  std::min(
850  current_bp_level.width_checkerboard_level_ - checkerboard_adjustment,
851  width_checkerboard_run_processing);
     //end_final rounded down to a multiple of simd_data_size, minus one vector, clamped to 0
852  const int end_x_simd_vect_start =
853  std::max(0, (int)(end_final / simd_data_size) * (int)simd_data_size - (int)simd_data_size);
854 
855  for (unsigned int x_val = 0; x_val < end_final; x_val += simd_data_size) {
856  unsigned int x_val_process = x_val;
857 
858  //need this check first for case where end_x_simd_vect_start is 0 and start_x is 1
859  //if past the last SIMD vector start (since the next one would go beyond the row),
860  //set to simd_data_size from the final pixel so processing the last
861  //simd_data_size values as one SIMD vector
862  //may be a few pixels that are computed twice but that's OK
863  if (((int)x_val_process > end_x_simd_vect_start) &&
864  (end_final > simd_data_size))
865  {
866  x_val_process = end_final - simd_data_size;
867  }
868 
869  //not processing at x=0 if start_x is 1 (this will cause this
870  //processing to be less aligned than ideal for this iteration)
871  x_val_process = std::max(start_x, x_val_process);
872 
873  //check if the memory is aligned for SIMD instructions at x_val_process
874  //location
875  const bool data_aligned_x_val =
876  beliefprop::MemoryAlignedAtDataStart<T>(
877  x_val_process,
878  simd_data_size,
879  current_bp_level.bytes_align_memory_,
880  current_bp_level.padded_width_checkerboard_level_);
881 
882  //initialize arrays for data and message values
883  U data_message[DISP_VALS], prev_u_message[DISP_VALS],
884  prev_d_message[DISP_VALS], prev_l_message[DISP_VALS],
885  prev_r_message[DISP_VALS];
886 
887  //load using aligned instructions when possible
     //data cost comes from the checkerboard being updated, messages from the other checkerboard
     //u is read from row y_val + 1 and d from row y_val - 1 at x_val_process
     //l is read at x_val_process + checkerboard_adjustment and r one position before that; both
     //always use the unaligned load, presumably because of that x offset
888  if (data_aligned_x_val) {
889  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
890  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
891  data_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
892  x_val_process, y_val, current_disparity, current_bp_level,
893  DISP_VALS, data_cost_checkerboard_0);
894  prev_u_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
895  x_val_process, y_val + 1, current_disparity, current_bp_level,
896  DISP_VALS, message_u_checkerboard_1);
897  prev_d_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
898  x_val_process, y_val - 1, current_disparity, current_bp_level,
899  DISP_VALS, message_d_checkerboard_1);
900  prev_l_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
901  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
902  DISP_VALS, message_l_checkerboard_1);
903  prev_r_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
904  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
905  DISP_VALS, message_r_checkerboard_1);
906  }
907  else //checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart1
908  {
909  data_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
910  x_val_process, y_val, current_disparity, current_bp_level,
911  DISP_VALS, data_cost_checkerboard_1);
912  prev_u_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
913  x_val_process, y_val + 1, current_disparity, current_bp_level,
914  DISP_VALS, message_u_checkerboard_0);
915  prev_d_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
916  x_val_process, y_val - 1, current_disparity, current_bp_level,
917  DISP_VALS, message_d_checkerboard_0);
918  prev_l_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
919  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
920  DISP_VALS, message_l_checkerboard_0);
921  prev_r_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
922  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
923  DISP_VALS, message_r_checkerboard_0);
924  }
925  }
926  } else {
     //same loads as above, all done with the unaligned load
927  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
928  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
929  data_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
930  x_val_process, y_val, current_disparity, current_bp_level,
931  DISP_VALS, data_cost_checkerboard_0);
932  prev_u_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
933  x_val_process, y_val + 1, current_disparity, current_bp_level,
934  DISP_VALS, message_u_checkerboard_1);
935  prev_d_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
936  x_val_process, y_val - 1, current_disparity, current_bp_level,
937  DISP_VALS, message_d_checkerboard_1);
938  prev_l_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
939  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
940  DISP_VALS, message_l_checkerboard_1);
941  prev_r_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
942  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
943  DISP_VALS, message_r_checkerboard_1);
944  }
945  else //checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart1
946  {
947  data_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
948  x_val_process, y_val, current_disparity, current_bp_level,
949  DISP_VALS, data_cost_checkerboard_1);
950  prev_u_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
951  x_val_process, y_val + 1, current_disparity, current_bp_level,
952  DISP_VALS, message_u_checkerboard_0);
953  prev_d_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
954  x_val_process, y_val - 1, current_disparity, current_bp_level,
955  DISP_VALS, message_d_checkerboard_0);
956  prev_l_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
957  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
958  DISP_VALS, message_l_checkerboard_0);
959  prev_r_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
960  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
961  DISP_VALS, message_r_checkerboard_0);
962  }
963  }
964  }
965 
     //compute the new messages and write them to the message arrays of the checkerboard being updated
966  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
967  RunBPIterationUpdateMsgValsUseSIMDVectors<T, U, DISP_VALS>(
968  x_val_process, y_val, current_bp_level,
969  prev_u_message, prev_d_message, prev_l_message, prev_r_message, data_message,
970  message_u_checkerboard_0, message_d_checkerboard_0,
971  message_l_checkerboard_0, message_r_checkerboard_0,
972  disc_k_bp_vect, data_aligned_x_val);
973  }
974  else {
975  RunBPIterationUpdateMsgValsUseSIMDVectors<T, U, DISP_VALS>(
976  x_val_process, y_val, current_bp_level,
977  prev_u_message, prev_d_message, prev_l_message, prev_r_message, data_message,
978  message_u_checkerboard_1, message_d_checkerboard_1,
979  message_l_checkerboard_1, message_r_checkerboard_1,
980  disc_k_bp_vect, data_aligned_x_val);
981  }
982  }
983  }
984  }
985  else {
     //variant for a disparity count that is only known at run time (bp_settings_disp_vals);
     //the row / x position handling is the same as in the DISP_VALS > 0 variant above
986 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
987  int num_threads_kernel{
988  (int)opt_cpu_params.OptParamsForKernel(
989  {static_cast<unsigned int>(beliefprop::BpKernel::kBpAtLevel), current_bp_level.level_num_})[0]};
990  #pragma omp parallel for num_threads(num_threads_kernel)
991 #else
992  #pragma omp parallel for
993 #endif
994 #ifdef _WIN32
995  for (int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
996 #else
997  for (unsigned int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
998 #endif //_WIN32
999  //checkerboard_adjustment used for indexing into current checkerboard to update
1000  const unsigned int checkerboard_adjustment =
1001  (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) ?
1002  ((y_val) % 2) :
1003  ((y_val + 1) % 2);
1004  const unsigned int start_x = (checkerboard_adjustment == 1) ? 0 : 1;
1005  const unsigned int end_final =
1006  std::min(
1007  current_bp_level.width_checkerboard_level_ - checkerboard_adjustment,
1008  width_checkerboard_run_processing);
1009  const int end_x_simd_vect_start =
1010  std::max(0, (int)(end_final / simd_data_size) * (int)simd_data_size - (int)simd_data_size);
1011 
1012  for (unsigned int x_val = 0; x_val < end_final; x_val += simd_data_size) {
1013  unsigned int x_val_process = x_val;
1014 
1015  //need this check first for case where end_x_simd_vect_start is 0 and start_x is 1
1016  //if past the last SIMD vector start (since the next one would go beyond the row),
1017  //set to simd_data_size from the final pixel so processing the last
1018  //simd_data_size values as one SIMD vector
1019  //may be a few pixels that are computed twice but that's OK
1020  if (((int)x_val_process > end_x_simd_vect_start) &&
1021  (end_final > simd_data_size))
1022  {
1023  x_val_process = end_final - simd_data_size;
1024  }
1025 
1026  //not processing at x=0 if start_x is 1 (this will cause this
1027  //processing to be less aligned than ideal for this iteration)
1028  x_val_process = std::max(start_x, x_val_process);
1029 
1030  //check if the memory is aligned for SIMD instructions at x_val_process
1031  //location
1032  const bool data_aligned_x_val =
1033  beliefprop::MemoryAlignedAtDataStart<T>(
1034  x_val_process, simd_data_size, current_bp_level.bytes_align_memory_,
1035  current_bp_level.padded_width_checkerboard_level_);
1036 
1037  //initialize arrays for data and message values
      //NOTE(review): these five arrays are allocated with new[] and released with delete[] on
      //every iteration of the x loop; hoisting the allocation out of the loop may be worth
      //measuring, and any exception between here and the delete[] calls would leak them
1038  U* data_message = new U[bp_settings_disp_vals];
1039  U* prev_u_message = new U[bp_settings_disp_vals];
1040  U* prev_d_message = new U[bp_settings_disp_vals];
1041  U* prev_l_message = new U[bp_settings_disp_vals];
1042  U* prev_r_message = new U[bp_settings_disp_vals];
1043 
1044  //load using aligned instructions when possible
      //data cost comes from the checkerboard being updated, messages from the other checkerboard
      //l and r messages always use the unaligned load
1045  if (data_aligned_x_val) {
1046  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++) {
1047  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1048  data_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
1049  x_val_process, y_val,
1050  current_disparity, current_bp_level, bp_settings_disp_vals, data_cost_checkerboard_0);
1051  prev_u_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
1052  x_val_process, y_val + 1,
1053  current_disparity, current_bp_level, bp_settings_disp_vals, message_u_checkerboard_1);
1054  prev_d_message[current_disparity] = simd_processing::LoadPackedDataAligned<T, U>(
1055  x_val_process, y_val - 1,
1056  current_disparity, current_bp_level, bp_settings_disp_vals, message_d_checkerboard_1);
1057  prev_l_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
1058  x_val_process + checkerboard_adjustment, y_val,
1059  current_disparity, current_bp_level, bp_settings_disp_vals, message_l_checkerboard_1);
1060  prev_r_message[current_disparity] = simd_processing::LoadPackedDataUnaligned<T, U>(
1061  (x_val_process + checkerboard_adjustment) - 1, y_val,
1062  current_disparity, current_bp_level, bp_settings_disp_vals, message_r_checkerboard_1);
1063  }
1064  else //checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart1
1065  {
1066  data_message[current_disparity] =
1067  simd_processing::LoadPackedDataAligned<T, U>(
1068  x_val_process, y_val, current_disparity, current_bp_level,
1069  bp_settings_disp_vals, data_cost_checkerboard_1);
1070  prev_u_message[current_disparity] =
1071  simd_processing::LoadPackedDataAligned<T, U>(
1072  x_val_process, y_val + 1, current_disparity, current_bp_level,
1073  bp_settings_disp_vals, message_u_checkerboard_0);
1074  prev_d_message[current_disparity] =
1075  simd_processing::LoadPackedDataAligned<T, U>(
1076  x_val_process, y_val - 1, current_disparity, current_bp_level,
1077  bp_settings_disp_vals, message_d_checkerboard_0);
1078  prev_l_message[current_disparity] =
1079  simd_processing::LoadPackedDataUnaligned<T, U>(
1080  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
1081  bp_settings_disp_vals, message_l_checkerboard_0);
1082  prev_r_message[current_disparity] =
1083  simd_processing::LoadPackedDataUnaligned<T, U>(
1084  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
1085  bp_settings_disp_vals, message_r_checkerboard_0);
1086  }
1087  }
1088  }
1089  else {
      //same loads as above, all done with the unaligned load
1090  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++) {
1091  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1092  data_message[current_disparity] =
1093  simd_processing::LoadPackedDataUnaligned<T, U>(
1094  x_val_process, y_val, current_disparity, current_bp_level,
1095  bp_settings_disp_vals, data_cost_checkerboard_0);
1096  prev_u_message[current_disparity] =
1097  simd_processing::LoadPackedDataUnaligned<T, U>(
1098  x_val_process, y_val + 1, current_disparity, current_bp_level,
1099  bp_settings_disp_vals, message_u_checkerboard_1);
1100  prev_d_message[current_disparity] =
1101  simd_processing::LoadPackedDataUnaligned<T, U>(
1102  x_val_process, y_val - 1, current_disparity, current_bp_level,
1103  bp_settings_disp_vals, message_d_checkerboard_1);
1104  prev_l_message[current_disparity] =
1105  simd_processing::LoadPackedDataUnaligned<T, U>(
1106  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
1107  bp_settings_disp_vals, message_l_checkerboard_1);
1108  prev_r_message[current_disparity] =
1109  simd_processing::LoadPackedDataUnaligned<T, U>(
1110  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
1111  bp_settings_disp_vals, message_r_checkerboard_1);
1112  }
1113  else //checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart1
1114  {
1115  data_message[current_disparity] =
1116  simd_processing::LoadPackedDataUnaligned<T, U>(
1117  x_val_process, y_val, current_disparity, current_bp_level,
1118  bp_settings_disp_vals, data_cost_checkerboard_1);
1119  prev_u_message[current_disparity] =
1120  simd_processing::LoadPackedDataUnaligned<T, U>(
1121  x_val_process, y_val + 1, current_disparity, current_bp_level,
1122  bp_settings_disp_vals, message_u_checkerboard_0);
1123  prev_d_message[current_disparity] =
1124  simd_processing::LoadPackedDataUnaligned<T, U>(
1125  x_val_process, y_val - 1, current_disparity, current_bp_level,
1126  bp_settings_disp_vals, message_d_checkerboard_0);
1127  prev_l_message[current_disparity] =
1128  simd_processing::LoadPackedDataUnaligned<T, U>(
1129  x_val_process + checkerboard_adjustment, y_val, current_disparity, current_bp_level,
1130  bp_settings_disp_vals, message_l_checkerboard_0);
1131  prev_r_message[current_disparity] =
1132  simd_processing::LoadPackedDataUnaligned<T, U>(
1133  (x_val_process + checkerboard_adjustment) - 1, y_val, current_disparity, current_bp_level,
1134  bp_settings_disp_vals, message_r_checkerboard_0);
1135  }
1136  }
1137  }
1138 
      //compute the new messages and write them to the message arrays of the checkerboard being updated
1139  if (checkerboard_to_update == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1140  RunBPIterationUpdateMsgValsUseSIMDVectors<T, U>(
1141  x_val_process, y_val, current_bp_level,
1142  prev_u_message, prev_d_message, prev_l_message, prev_r_message, data_message,
1143  message_u_checkerboard_0, message_d_checkerboard_0,
1144  message_l_checkerboard_0, message_r_checkerboard_0,
1145  disc_k_bp_vect, data_aligned_x_val, bp_settings_disp_vals);
1146  }
1147  else {
1148  RunBPIterationUpdateMsgValsUseSIMDVectors<T, U>(
1149  x_val_process, y_val, current_bp_level,
1150  prev_u_message, prev_d_message, prev_l_message, prev_r_message, data_message,
1151  message_u_checkerboard_1, message_d_checkerboard_1,
1152  message_l_checkerboard_1, message_r_checkerboard_1,
1153  disc_k_bp_vect, data_aligned_x_val, bp_settings_disp_vals);
1154  }
1155 
      //release the temporary vectors allocated for this x position
1156  delete [] data_message;
1157  delete [] prev_u_message;
1158  delete [] prev_d_message;
1159  delete [] prev_l_message;
1160  delete [] prev_r_message;
1161  }
1162  }
1163  }
1164 }
1165 
1166 //kernel function to run the current iteration of belief propagation in parallel using
1167 //the checkerboard update method where half the pixels in the "checkerboard" scheme
1168 //retrieve messages from each 4-connected neighbor and then update their message based
1169 //on the retrieved messages and the data cost
1170 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
1172  beliefprop::CheckerboardPart checkerboard_to_update,
1173  const beliefprop::BpLevelProperties& current_bp_level,
1174  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
1175  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
1176  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
1177  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
1178  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
1179  float disc_k_bp, unsigned int bp_settings_num_disp_vals,
1180  const ParallelParams& opt_cpu_params)
1181 {
1182 #if defined(COMPILING_FOR_ARM)
1183  if constexpr (ACCELERATION == run_environment::AccSetting::kNEON)
1184  {
1185  //only use NEON if width of processing checkerboard w/ padding over the number of
1186  //elements in SIMD vector plus one to account for case when getting
1187  //messages from the right neighbor
1188  //if type is half, float vectorization is used and check is adjusted using
1189  //float size
1190  //NEON vectors are 128 bits and divided by 8 to get bytes in vector
1191  constexpr size_t kNEONSimdBytes{128 / 8};
1192  if ((current_bp_level.padded_width_checkerboard_level_ > ((kNEONSimdBytes / sizeof(T)) + 1)) ||
1193  ((sizeof(T) == 2) &&
1194  (current_bp_level.padded_width_checkerboard_level_ > ((kNEONSimdBytes / sizeof(float)) + 1))))
1195  {
1196  RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsNEON<DISP_VALS, ACCELERATION>(
1197  checkerboard_to_update, current_bp_level,
1198  data_cost_checkerboard_0, data_cost_checkerboard_1,
1199  message_u_checkerboard_0, message_d_checkerboard_0,
1200  message_l_checkerboard_0, message_r_checkerboard_0,
1201  message_u_checkerboard_1, message_d_checkerboard_1,
1202  message_l_checkerboard_1, message_r_checkerboard_1,
1203  disc_k_bp, bp_settings_num_disp_vals, opt_cpu_params);
1204 
1205  //return now that bp iteration run
1206  return;
1207  }
1208  }
1209 #else
1210 #if (((CPU_VECTORIZATION_DEFINE == AVX_256_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_256_F16_DEFINE)) || ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE)))
1211  if constexpr ((ACCELERATION == run_environment::AccSetting::kAVX256) ||
1212  (ACCELERATION == run_environment::AccSetting::kAVX256_F16))
1213  {
1214  //only use AVX-256 if width of processing checkerboard w/ padding over the number of
1215  //elements in SIMD vector plus one to account for case when getting
1216  //messages from the right neighbor
1217  //if type is half but acceleration doesn't support half vectorization, then
1218  //float vectorization is used and check is adjusted using float size
1219  //AVX256 vectors are 256 bits and divided by 8 to get bytes in vector
1220  constexpr size_t kAVX256SimdBytes{256 / 8};
1221  if ((current_bp_level.padded_width_checkerboard_level_ > ((kAVX256SimdBytes / sizeof(T)) + 1)) ||
1222  (((sizeof(T) == 2) && (ACCELERATION != run_environment::AccSetting::kAVX256_F16)) &&
1223  (current_bp_level.padded_width_checkerboard_level_ > ((kAVX256SimdBytes / sizeof(float)) + 1))))
1224  {
1225  RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsAVX256<DISP_VALS, ACCELERATION>(
1226  checkerboard_to_update, current_bp_level,
1227  data_cost_checkerboard_0, data_cost_checkerboard_1,
1228  message_u_checkerboard_0, message_d_checkerboard_0,
1229  message_l_checkerboard_0, message_r_checkerboard_0,
1230  message_u_checkerboard_1, message_d_checkerboard_1,
1231  message_l_checkerboard_1, message_r_checkerboard_1,
1232  disc_k_bp, bp_settings_num_disp_vals, opt_cpu_params);
1233 
1234  //return now that bp iteration run
1235  return;
1236  }
1237  }
1238 #endif //CPU_VECTORIZATION_DEFINE
1239 #if ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
1240  else if constexpr ((ACCELERATION == run_environment::AccSetting::kAVX512) ||
1241  (ACCELERATION == run_environment::AccSetting::kAVX512_F16))
1242  {
1243  //only use AVX-512 if width of processing checkerboard w/ padding over the number of
1244  //elements in SIMD vector plus one to account for case when getting
1245  //messages from the right neighbor
1246  //if type is half but acceleration doesn't support half vectorization, then
1247  //float vectorization is used and check is adjusted using float size
1248  //AVX512 vectors are 512 bits and divided by 8 to get bytes in vector
1249  constexpr size_t kAVX512SimdBytes{512 / 8};
1250  if ((current_bp_level.padded_width_checkerboard_level_ > ((kAVX512SimdBytes / sizeof(T)) + 1)) ||
1251  (((sizeof(T) == 2) && (ACCELERATION != run_environment::AccSetting::kAVX256_F16)) &&
1252  (current_bp_level.padded_width_checkerboard_level_ > ((kAVX512SimdBytes / sizeof(float)) + 1))))
1253  {
1254  RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsAVX512<DISP_VALS, ACCELERATION>(
1255  checkerboard_to_update, current_bp_level,
1256  data_cost_checkerboard_0, data_cost_checkerboard_1,
1257  message_u_checkerboard_0, message_d_checkerboard_0,
1258  message_l_checkerboard_0, message_r_checkerboard_0,
1259  message_u_checkerboard_1, message_d_checkerboard_1,
1260  message_l_checkerboard_1, message_r_checkerboard_1,
1261  disc_k_bp, bp_settings_num_disp_vals, opt_cpu_params);
1262 
1263  //return now that bp iteration run
1264  return;
1265  }
1266  }
1267 #endif //CPU_VECTORIZATION_DEFINE
1268 #endif //COMPILING_FOR_ARM
1269  //run bp iteration without vectorization if it hasn't been run
1270  //function should have already returned if bp iteration run
1271  RunBPIterationUsingCheckerboardUpdatesNoPackedInstructions<T, DISP_VALS, ACCELERATION>(
1272  checkerboard_to_update, current_bp_level,
1273  data_cost_checkerboard_0, data_cost_checkerboard_1,
1274  message_u_checkerboard_0, message_d_checkerboard_0,
1275  message_l_checkerboard_0, message_r_checkerboard_0,
1276  message_u_checkerboard_1, message_d_checkerboard_1,
1277  message_l_checkerboard_1, message_r_checkerboard_1,
1278  disc_k_bp, bp_settings_num_disp_vals, opt_cpu_params);
1279 }
1280 
//Copies BP message values from the current pyramid level to the next level.
//NOTE(review): the declarator line (listing line 1284, between the template header and the
//parameter list) is not present in this listing; the function is presumably
//beliefprop_cpu::CopyMsgDataToNextLevel -- confirm against the original header.
//
//Every (x, y) position in a width_checkerboard_level_ x height_level_ grid of current_bp_level
//is visited exactly once and handed to beliefprop::CopyMsgDataToNextLevelPixel<T, DISP_VALS>,
//which receives all source ("prev") and destination message arrays.
//
//Template parameters:
//  T            - element type of the message arrays
//  DISP_VALS    - forwarded to the per-pixel routine
//  ACCELERATION - not referenced in the body shown here
//Parameters:
//  checkerboard_part                 - forwarded unchanged to the per-pixel routine
//  current_bp_level                  - supplies the iteration extents (and level_num_ for the thread-count lookup)
//  next_bp_level                     - forwarded unchanged to the per-pixel routine
//  message_*_prev_checkerboard_{0,1} - read-only source message arrays
//  message_*_checkerboard_{0,1}      - destination message arrays
//  bp_settings_disp_vals             - forwarded unchanged to the per-pixel routine
//  opt_cpu_params                    - only consulted when SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU is defined
1281 //kernel to copy the computed BP message values at the current level to the corresponding locations at the "next" level down
1282 //the kernel works from the point of view of the pixel at the prev level that is being copied to four different places
1283 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
1285  beliefprop::CheckerboardPart checkerboard_part,
1286  const beliefprop::BpLevelProperties& current_bp_level,
1287  const beliefprop::BpLevelProperties& next_bp_level,
1288  const T* message_u_prev_checkerboard_0, const T* message_d_prev_checkerboard_0,
1289  const T* message_l_prev_checkerboard_0, const T* message_r_prev_checkerboard_0,
1290  const T* message_u_prev_checkerboard_1, const T* message_d_prev_checkerboard_1,
1291  const T* message_l_prev_checkerboard_1, const T* message_r_prev_checkerboard_1,
1292  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
1293  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
1294  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
1295  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
1296  unsigned int bp_settings_disp_vals,
1297  const ParallelParams& opt_cpu_params)
1298 {
      //when per-kernel thread counts are enabled, the OpenMP thread count is element [0] of the
      //parameters stored for (kCopyAtLevel, level number of current_bp_level)
1299 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
1300  int num_threads_kernel{(int)opt_cpu_params.OptParamsForKernel(
1301  {static_cast<unsigned int>(beliefprop::BpKernel::kCopyAtLevel), current_bp_level.level_num_})[0]};
1302  #pragma omp parallel for num_threads(num_threads_kernel)
1303 #else
1304  #pragma omp parallel for
1305 #endif
      //the loop index is a signed int only in _WIN32 builds
      //NOTE(review): presumably because the MSVC OpenMP implementation requires a signed loop variable -- confirm
1306 #ifdef _WIN32
1307  for (int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
1308 #else
1309  for (unsigned int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
1310 #endif //_WIN32
1311  {
      //split the flattened index into row (y_val) and column (x_val) within the checkerboard
1312  const unsigned int y_val = val / current_bp_level.width_checkerboard_level_;
1313  const unsigned int x_val = val % current_bp_level.width_checkerboard_level_;
1314 
      //per-pixel copy; all array pointers are passed through untouched
1315  beliefprop::CopyMsgDataToNextLevelPixel<T, DISP_VALS>(
1316  x_val, y_val, checkerboard_part,
1317  current_bp_level, next_bp_level,
1318  message_u_prev_checkerboard_0, message_d_prev_checkerboard_0,
1319  message_l_prev_checkerboard_0, message_r_prev_checkerboard_0,
1320  message_u_prev_checkerboard_1, message_d_prev_checkerboard_1,
1321  message_l_prev_checkerboard_1, message_r_prev_checkerboard_1,
1322  message_u_checkerboard_0, message_d_checkerboard_0,
1323  message_l_checkerboard_0, message_r_checkerboard_0,
1324  message_u_checkerboard_1, message_d_checkerboard_1,
1325  message_l_checkerboard_1, message_r_checkerboard_1,
1326  bp_settings_disp_vals);
1327  }
1328 }
1329 
//Computes the output disparity map, selecting the implementation at compile time from ACCELERATION.
//NOTE(review): the declarator line (listing line 1331) is not present in this listing; the function
//is presumably beliefprop_cpu::RetrieveOutputDisparity -- confirm against the original header.
//
//  ACCELERATION == kNone : one call to beliefprop::RetrieveOutputDisparityPixel per (x, y) position in a
//                          width_checkerboard_level_ x height_level_ grid, inside an OpenMP parallel loop
//  otherwise             : delegates to the NEON routine when COMPILING_FOR_ARM is defined, else to the
//                          AVX-512 or AVX-256 routine matching ACCELERATION
//
//Parameters:
//  current_bp_level                 - level properties (iteration extents)
//  data_cost_checkerboard_{0,1}     - read-only data costs for each checkerboard
//  message_*_checkerboard_{0,1}     - read-only message arrays for each checkerboard
//  disparity_between_images_device  - output buffer, passed on to the selected implementation
//  bp_settings_disp_vals            - passed on to the selected implementation
//  opt_cpu_params                   - passed to the SIMD routines; in the scalar path it is only
//                                     consulted when SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU is defined
1330 template<RunData_t T, unsigned int DISP_VALS, run_environment::AccSetting ACCELERATION>
1332  const beliefprop::BpLevelProperties& current_bp_level,
1333  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
1334  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
1335  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
1336  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
1337  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1,
1338  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
1339  const ParallelParams& opt_cpu_params)
1340 {
      //scalar (non-vectorized) path
1341  if constexpr (ACCELERATION == run_environment::AccSetting::kNone) {
      //thread count lookup uses level index 0 for the kOutputDisp kernel
1342 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
1343  int num_threads_kernel{
1344  (int)opt_cpu_params.OptParamsForKernel(
1345  {static_cast<unsigned int>(beliefprop::BpKernel::kOutputDisp), 0})[0]};
1346  #pragma omp parallel for num_threads(num_threads_kernel)
1347 #else
1348  #pragma omp parallel for
1349 #endif
      //the loop index is a signed int only in _WIN32 builds
1350 #ifdef _WIN32
1351  for (int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
1352 #else
1353  for (unsigned int val = 0; val < (current_bp_level.width_checkerboard_level_*current_bp_level.height_level_); val++)
1354 #endif //_WIN32
1355  {
      //split the flattened index into row (y_val) and column (x_val) within the checkerboard
1356  const unsigned int y_val = val / current_bp_level.width_checkerboard_level_;
1357  const unsigned int x_val = val % current_bp_level.width_checkerboard_level_;
1358 
1359  //if datatype is halftype (2 bytes) and acceleration type doesn't support float16
1360  //vectorization, set to process data using float
      //NOTE(review): this branch sits inside the ACCELERATION == kNone guard above, yet the condition
      //below requires ACCELERATION to be kAVX256, kAVX512 or kNEON, so the <T, float> call looks
      //unreachable here and the <T, T> call is always the one instantiated -- confirm intent
1361  if constexpr ((sizeof(T) == 2) &&
1362  (((ACCELERATION == run_environment::AccSetting::kAVX256) ||
1363  (ACCELERATION == run_environment::AccSetting::kAVX512)) ||
1364  (ACCELERATION == run_environment::AccSetting::kNEON)))
1365  {
1366  beliefprop::RetrieveOutputDisparityPixel<T, float, DISP_VALS>(
1367  x_val, y_val, current_bp_level,
1368  data_cost_checkerboard_0, data_cost_checkerboard_1,
1369  message_u_checkerboard_0, message_d_checkerboard_0,
1370  message_l_checkerboard_0, message_r_checkerboard_0,
1371  message_u_checkerboard_1, message_d_checkerboard_1,
1372  message_l_checkerboard_1, message_r_checkerboard_1,
1373  disparity_between_images_device, bp_settings_disp_vals);
1374  }
1375  else
1376  {
1377  beliefprop::RetrieveOutputDisparityPixel<T, T, DISP_VALS>(
1378  x_val, y_val, current_bp_level,
1379  data_cost_checkerboard_0, data_cost_checkerboard_1,
1380  message_u_checkerboard_0, message_d_checkerboard_0,
1381  message_l_checkerboard_0, message_r_checkerboard_0,
1382  message_u_checkerboard_1, message_d_checkerboard_1,
1383  message_l_checkerboard_1, message_r_checkerboard_1,
1384  disparity_between_images_device, bp_settings_disp_vals);
1385  }
1386  }
1387  }
      //vectorized path: architecture chosen by preprocessor, vector width by ACCELERATION
1388  else {
1389 #if defined(COMPILING_FOR_ARM)
1390  RetrieveOutputDisparityUseSIMDVectorsNEON<DISP_VALS, ACCELERATION>(
1391  current_bp_level,
1392  data_cost_checkerboard_0, data_cost_checkerboard_1,
1393  message_u_checkerboard_0, message_d_checkerboard_0,
1394  message_l_checkerboard_0, message_r_checkerboard_0,
1395  message_u_checkerboard_1, message_d_checkerboard_1,
1396  message_l_checkerboard_1, message_r_checkerboard_1,
1397  disparity_between_images_device, bp_settings_disp_vals, opt_cpu_params);
1398 #else
1399  //SIMD vectorization of output disparity
1400  if constexpr ((ACCELERATION == run_environment::AccSetting::kAVX512) ||
1401  (ACCELERATION == run_environment::AccSetting::kAVX512_F16))
1402  {
      //the AVX-512 call is only compiled when the build targets AVX-512; otherwise this branch is empty
1403 #if ((CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE) || (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE))
1404  RetrieveOutputDisparityUseSIMDVectorsAVX512<DISP_VALS, ACCELERATION>(
1405  current_bp_level,
1406  data_cost_checkerboard_0, data_cost_checkerboard_1,
1407  message_u_checkerboard_0, message_d_checkerboard_0,
1408  message_l_checkerboard_0, message_r_checkerboard_0,
1409  message_u_checkerboard_1, message_d_checkerboard_1,
1410  message_l_checkerboard_1, message_r_checkerboard_1,
1411  disparity_between_images_device, bp_settings_disp_vals, opt_cpu_params);
1412 #endif //(CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE)
1413  }
      //NOTE(review): unlike the AVX-512 call above, the AVX-256 call is not wrapped in a
      //CPU_VECTORIZATION_DEFINE check -- confirm that is intended
1414  else if constexpr ((ACCELERATION == run_environment::AccSetting::kAVX256) ||
1415  (ACCELERATION == run_environment::AccSetting::kAVX256_F16)) {
1416  RetrieveOutputDisparityUseSIMDVectorsAVX256<DISP_VALS, ACCELERATION>(
1417  current_bp_level,
1418  data_cost_checkerboard_0, data_cost_checkerboard_1,
1419  message_u_checkerboard_0, message_d_checkerboard_0,
1420  message_l_checkerboard_0, message_r_checkerboard_0,
1421  message_u_checkerboard_1, message_d_checkerboard_1,
1422  message_l_checkerboard_1, message_r_checkerboard_1,
1423  disparity_between_images_device, bp_settings_disp_vals, opt_cpu_params);
1424  }
1425 #endif //COMPILING_FOR_ARM
1426  }
1427 }
1428 
//Computes the output disparity map using SIMD vectors.
//NOTE(review): the declarator line (listing line 1431) is not present in this listing; the function
//is presumably beliefprop_cpu::RetrieveOutputDisparityUseSIMDVectors -- confirm against the original header.
//
//Template parameters (as used in the body):
//  T - element type of the stored data cost / message arrays
//  U - vector type produced by the LoadPackedData* calls on T data
//  V - element type of the intermediate disparity buffer
//  W - vector type used for the running sums, best values and best disparities
//  DISP_VALS - if > 0 the disparity count is this compile-time value; otherwise the runtime
//              bp_settings_disp_vals is used
//
//Steps:
//  1. allocate one temporary buffer holding two per-checkerboard disparity maps back to back
//     (checkerboard 0 at offset 0, checkerboard 1 at offset num_data_disp_checkerboard)
//  2. for each checkerboard and each row 1..height_level_-2, process the row in chunks of
//     simd_data_size: per disparity, sum the u/d/l/r messages read from the opposite
//     checkerboard's arrays plus this checkerboard's data cost, and keep the disparity selected
//     by UpdateBestDispBestVals
//  3. interleave the two per-checkerboard maps into disparity_between_images_device, writing 0
//     on the first/last row and first/last column
//  4. release the temporary buffer
1429 //retrieve the best disparity estimate from image 1 to image 2 for each pixel in parallel using SIMD vectors
1430 template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W, unsigned int DISP_VALS>
1432  const beliefprop::BpLevelProperties& current_bp_level,
1433  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
1434  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
1435  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
1436  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
1437  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1,
1438  float* disparity_between_images_device, unsigned int bp_settings_disp_vals,
1439  const ParallelParams& opt_cpu_params)
1440 {
      //number of V-sized elements that fit in one W vector; used as the x step below
1441  constexpr size_t simd_data_size{sizeof(W) / sizeof(V)};
1442  const unsigned int width_checkerboard_run_processing = current_bp_level.width_level_ / 2;
1443 
1444  //initially get output for each checkerboard
1445  //round the padded checkerboard width up to the next multiple of (bytes_align_memory_ / sizeof(T)) elements
1446  unsigned int width_disp_checkerboard =
1447  ((current_bp_level.padded_width_checkerboard_level_ % (current_bp_level.bytes_align_memory_ / sizeof(T)) == 0) ?
1448  current_bp_level.padded_width_checkerboard_level_ :
1449  (current_bp_level.padded_width_checkerboard_level_ + ((current_bp_level.bytes_align_memory_ / sizeof(T)) -
1450  (current_bp_level.padded_width_checkerboard_level_ % (current_bp_level.bytes_align_memory_ / sizeof(T))))));
1451  const unsigned int num_data_disp_checkerboard = width_disp_checkerboard * current_bp_level.height_level_;
      //single allocation sized for both checkerboards (2 * num_data_disp_checkerboard elements of V)
      //NOTE(review): the returned pointer is not checked for null before it is written to below
1452 #ifdef _WIN32
1453  V* disparity_checkerboard_0 =
1454  static_cast<V*>(
1455  _aligned_malloc(2 * num_data_disp_checkerboard * sizeof(V), current_bp_level.bytes_align_memory_));
1456 #else
1457  V* disparity_checkerboard_0 =
1458  static_cast<V*>(std::aligned_alloc(
1459  current_bp_level.bytes_align_memory_, 2 * num_data_disp_checkerboard * sizeof(V)));
1460 #endif
1461 
      //NOTE(review): listing line 1463 (the rest of the brace-enclosed list being iterated and the
      //closing parenthesis of the for header) is not present in this listing
1462  for (const auto checkerboardGetDispMap : {beliefprop::CheckerboardPart::kCheckerboardPart0,
1464  {
1465 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
1466  int num_threads_kernel{
1467  (int)opt_cpu_params.OptParamsForKernel(
1468  {static_cast<unsigned int>(beliefprop::BpKernel::kOutputDisp), 0})[0]};
1469  #pragma omp parallel for num_threads(num_threads_kernel)
1470 #else
1471  #pragma omp parallel for
1472 #endif
      //rows 0 and height_level_-1 are skipped here; the loads below read rows y_val - 1 and y_val + 1
1473 #ifdef _WIN32
1474  for (int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
1475 #else
1476  for (unsigned int y_val = 1; y_val < current_bp_level.height_level_ - 1; y_val++) {
1477 #endif //_WIN32
1478  //checkerboard_adjustment used for indexing into current checkerboard to retrieve best disparities
1479  const unsigned int checkerboard_adjustment =
1480  (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) ?
1481  ((y_val) % 2) :
1482  ((y_val + 1) % 2);
      //column 0 is skipped when checkerboard_adjustment is 0
1483  const unsigned int start_x = (checkerboard_adjustment == 1) ? 0 : 1;
1484  const unsigned int end_final =
1485  std::min(
1486  current_bp_level.width_checkerboard_level_ - checkerboard_adjustment,
1487  width_checkerboard_run_processing);
      //start of the last full simd_data_size-wide chunk before the final one (clamped at 0)
1488  const int end_x_simd_vect_start =
1489  std::max(0, (int)(end_final / simd_data_size) * (int)simd_data_size - (int)simd_data_size);
1490 
1491  for (unsigned int x_val = 0; x_val < end_final; x_val += simd_data_size) {
1492  unsigned int x_val_process = x_val;
1493 
1494  //need this check first for case where end_x_simd_vect_start is 0 and start_x is 1
1495  //if past the last SIMD vector start (since the next one would go beyond the row),
1496  //set to simd_data_size from the final pixel so the last simd_data_size values are processed in one vector
1497  //may be a few pixels that are computed twice but that's OK
1498  if (((int)x_val_process > end_x_simd_vect_start) &&
1499  (end_final > simd_data_size))
1500  {
1501  x_val_process = end_final - simd_data_size;
1502  }
1503 
1504  //not processing at x=0 if start_x is 1 (this will cause this
1505  //processing to be less aligned than ideal for this iteration)
1506  x_val_process = std::max(start_x, x_val_process);
1507 
1508  //get index for output into disparity map corresponding to checkerboard
1509  const unsigned int index_output = (y_val * width_disp_checkerboard) + x_val_process;
1510 
1511  //check if the memory is aligned for SIMD instructions at x_val_process location
1512  const bool data_aligned_x_val =
1513  beliefprop::MemoryAlignedAtDataStart<T>(
1514  x_val_process,
1515  simd_data_size,
1516  current_bp_level.bytes_align_memory_,
1517  current_bp_level.padded_width_checkerboard_level_);
1518 
1519  //declare SIMD vectors for data and message values at each disparity
1520  //U data_message, prev_u_message, prev_d_message, prev_l_message, prev_r_message;
1521 
1522  //declare SIMD vectors for current best values and best disparities
1523  W best_vals, best_disparities, val_at_disp;
1524 
1525  //load using aligned instructions when possible
      //compile-time disparity count: DISP_VALS is used as the loop bound and passed to the loads
1526  if constexpr (DISP_VALS > 0) {
1527  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
      //checkerboard 0 pixels read messages from the checkerboard 1 arrays and data cost from checkerboard 0
1528  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
      //u, d and data cost use aligned loads here; the l and r loads are shifted in x and always unaligned
1529  if (data_aligned_x_val) {
1530  //retrieve and get sum of message and data values
1531  val_at_disp = simd_processing::AddVals<U, U, W>(
1532  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val + 1,
1533  current_disparity, current_bp_level, DISP_VALS, message_u_checkerboard_1),
1534  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val - 1,
1535  current_disparity, current_bp_level, DISP_VALS, message_d_checkerboard_1));
1536  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1537  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1538  current_disparity, current_bp_level, DISP_VALS, message_l_checkerboard_1));
1539  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1540  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1541  current_disparity, current_bp_level, DISP_VALS, message_r_checkerboard_1));
1542  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1543  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val,
1544  current_disparity, current_bp_level, DISP_VALS, data_cost_checkerboard_0));
1545  }
1546  else {
1547  //retrieve and get sum of message and data values
1548  val_at_disp = simd_processing::AddVals<U, U, W>(
1549  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val + 1,
1550  current_disparity, current_bp_level, DISP_VALS, message_u_checkerboard_1),
1551  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val - 1,
1552  current_disparity, current_bp_level, DISP_VALS, message_d_checkerboard_1));
1553  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1554  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1555  current_disparity, current_bp_level, DISP_VALS, message_l_checkerboard_1));
1556  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1557  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1558  current_disparity, current_bp_level, DISP_VALS, message_r_checkerboard_1));
1559  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1560  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val,
1561  current_disparity, current_bp_level, DISP_VALS, data_cost_checkerboard_0));
1562  }
1563  }
1564  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1565  {
1566  if (data_aligned_x_val) {
1567  //retrieve and get sum of message and data values
1568  val_at_disp = simd_processing::AddVals<U, U, W>(
1569  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val + 1,
1570  current_disparity, current_bp_level, DISP_VALS, message_u_checkerboard_0),
1571  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val - 1,
1572  current_disparity, current_bp_level, DISP_VALS, message_d_checkerboard_0));
1573  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1574  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1575  current_disparity, current_bp_level, DISP_VALS, message_l_checkerboard_0));
1576  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1577  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1578  current_disparity, current_bp_level, DISP_VALS, message_r_checkerboard_0));
1579  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1580  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val,
1581  current_disparity, current_bp_level, DISP_VALS, data_cost_checkerboard_1));
1582  }
1583  else {
1584  //retrieve and get sum of message and data values
1585  val_at_disp = simd_processing::AddVals<U, U, W>(
1586  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val + 1,
1587  current_disparity, current_bp_level, DISP_VALS, message_u_checkerboard_0),
1588  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val - 1,
1589  current_disparity, current_bp_level, DISP_VALS, message_d_checkerboard_0));
1590  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1591  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1592  current_disparity, current_bp_level, DISP_VALS, message_l_checkerboard_0));
1593  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1594  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1595  current_disparity, current_bp_level, DISP_VALS, message_r_checkerboard_0));
1596  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1597  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val,
1598  current_disparity, current_bp_level, DISP_VALS, data_cost_checkerboard_1));
1599  }
1600  }
1601  if (current_disparity == 0) {
1602  best_vals = val_at_disp;
1603  //set disp at min vals to all 0
1604  best_disparities = simd_processing::createSIMDVectorSameData<W>(0.0f);
1605  }
1606  else {
1607  //update best disparity and best values
1608  //if value at current disparity is lower than current best value, need
1609  //to update best value to current value and set best disparity to current disparity
1610  UpdateBestDispBestVals(best_disparities, best_vals,
1611  simd_processing::createSIMDVectorSameData<W>((float)current_disparity), val_at_disp);
1612  }
1613  }
      //store best disparities: checkerboard 0 at index_output, checkerboard 1 offset by num_data_disp_checkerboard
1614  if (data_aligned_x_val) {
1615  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1616  simd_processing::StorePackedDataAligned<V, W>(
1617  index_output, disparity_checkerboard_0, best_disparities);
1618  }
1619  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1620  {
1621  simd_processing::StorePackedDataAligned<V, W>(
1622  num_data_disp_checkerboard + index_output, disparity_checkerboard_0, best_disparities);
1623  }
1624  }
1625  else {
1626  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1627  simd_processing::StorePackedDataUnaligned<V, W>(
1628  index_output, disparity_checkerboard_0, best_disparities);
1629  }
1630  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1631  {
1632  simd_processing::StorePackedDataUnaligned<V, W>(
1633  num_data_disp_checkerboard + index_output, disparity_checkerboard_0, best_disparities);
1634  }
1635  }
1636  }
      //runtime disparity count: same processing as above with bp_settings_disp_vals in place of DISP_VALS
1637  else {
1638  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++) {
1639  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1640  if (data_aligned_x_val) {
1641  //retrieve and get sum of message and data values
1642  val_at_disp = simd_processing::AddVals<U, U, W>(
1643  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val + 1,
1644  current_disparity, current_bp_level, bp_settings_disp_vals, message_u_checkerboard_1),
1645  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val - 1,
1646  current_disparity, current_bp_level, bp_settings_disp_vals, message_d_checkerboard_1));
1647  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1648  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1649  current_disparity, current_bp_level, bp_settings_disp_vals, message_l_checkerboard_1));
1650  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1651  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1652  current_disparity, current_bp_level, bp_settings_disp_vals, message_r_checkerboard_1));
1653  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1654  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val,
1655  current_disparity, current_bp_level, bp_settings_disp_vals, data_cost_checkerboard_0));
1656  }
1657  else {
1658  //retrieve and get sum of message and data values
1659  val_at_disp = simd_processing::AddVals<U, U, W>(
1660  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val + 1,
1661  current_disparity, current_bp_level, bp_settings_disp_vals, message_u_checkerboard_1),
1662  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val - 1,
1663  current_disparity, current_bp_level, bp_settings_disp_vals, message_d_checkerboard_1));
1664  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1665  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1666  current_disparity, current_bp_level, bp_settings_disp_vals, message_l_checkerboard_1));
1667  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1668  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1669  current_disparity, current_bp_level, bp_settings_disp_vals, message_r_checkerboard_1));
1670  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1671  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val,
1672  current_disparity, current_bp_level, bp_settings_disp_vals, data_cost_checkerboard_0));
1673  }
1674  }
1675  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1676  {
1677  if (data_aligned_x_val) {
1678  //retrieve and get sum of message and data values
1679  val_at_disp = simd_processing::AddVals<U, U, W>(
1680  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val + 1,
1681  current_disparity, current_bp_level, bp_settings_disp_vals, message_u_checkerboard_0),
1682  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val - 1,
1683  current_disparity, current_bp_level, bp_settings_disp_vals, message_d_checkerboard_0));
1684  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1685  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1686  current_disparity, current_bp_level, bp_settings_disp_vals, message_l_checkerboard_0));
1687  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1688  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1689  current_disparity, current_bp_level, bp_settings_disp_vals, message_r_checkerboard_0));
1690  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1691  simd_processing::LoadPackedDataAligned<T, U>(x_val_process, y_val,
1692  current_disparity, current_bp_level, bp_settings_disp_vals, data_cost_checkerboard_1));
1693  }
1694  else {
1695  //retrieve and get sum of message and data values
1696  val_at_disp = simd_processing::AddVals<U, U, W>(
1697  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val + 1,
1698  current_disparity, current_bp_level, bp_settings_disp_vals, message_u_checkerboard_0),
1699  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val - 1,
1700  current_disparity, current_bp_level, bp_settings_disp_vals, message_d_checkerboard_0));
1701  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1702  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process + checkerboard_adjustment, y_val,
1703  current_disparity, current_bp_level, bp_settings_disp_vals, message_l_checkerboard_0));
1704  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1705  simd_processing::LoadPackedDataUnaligned<T, U>((x_val_process + checkerboard_adjustment) - 1, y_val,
1706  current_disparity, current_bp_level, bp_settings_disp_vals, message_r_checkerboard_0));
1707  val_at_disp = simd_processing::AddVals<W, U, W>(val_at_disp,
1708  simd_processing::LoadPackedDataUnaligned<T, U>(x_val_process, y_val,
1709  current_disparity, current_bp_level, bp_settings_disp_vals, data_cost_checkerboard_1));
1710  }
1711  }
1712  if (current_disparity == 0) {
1713  best_vals = val_at_disp;
1714  //set disp at min vals to all 0
1715  best_disparities = simd_processing::createSIMDVectorSameData<W>(0.0f);
1716  }
1717  else {
1718  //update best disparity and best values
1719  //if value at current disparity is lower than current best value, need
1720  //to update best value to current value and set best disparity to current disparity
      //NOTE(review): listing line 1721 (the start of this call) is not present in this listing; the
      //DISP_VALS > 0 branch above passes the same four arguments to UpdateBestDispBestVals
1722  best_disparities,
1723  best_vals,
1724  simd_processing::createSIMDVectorSameData<W>((float)current_disparity),
1725  val_at_disp);
1726  }
1727  }
1728  //store best disparities in checkerboard being updated
1729  if (data_aligned_x_val) {
1730  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1731  simd_processing::StorePackedDataAligned<V, W>(
1732  index_output, disparity_checkerboard_0, best_disparities);
1733  }
1734  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1735  {
1736  simd_processing::StorePackedDataAligned<V, W>(
1737  num_data_disp_checkerboard + index_output, disparity_checkerboard_0, best_disparities);
1738  }
1739  }
1740  else {
1741  if (checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart0) {
1742  simd_processing::StorePackedDataUnaligned<V, W>(
1743  index_output, disparity_checkerboard_0, best_disparities);
1744  }
1745  else //checkerboardGetDispMap == beliefprop::CheckerboardPart::kCheckerboardPart1
1746  {
1747  simd_processing::StorePackedDataUnaligned<V, W>(
1748  num_data_disp_checkerboard + index_output, disparity_checkerboard_0, best_disparities);
1749  }
1750  }
1751  }
1752  }
1753  }
1754  }
1755 
1756  //combine output disparity maps from each checkerboard
1757  //start with checkerboard 0 in first row since (0, 0) corresponds to (0, 0)
1758  //in checkerboard 0 and (1, 0) corresponds to (0, 0) in checkerboard 1
1759 #if defined(SET_THREAD_COUNT_INDIVIDUAL_KERNELS_CPU)
1760  int num_threads_kernel{
1761  (int)opt_cpu_params.OptParamsForKernel(
1762  {static_cast<unsigned int>(beliefprop::BpKernel::kOutputDisp), 0})[0]};
1763  #pragma omp parallel for num_threads(num_threads_kernel)
1764 #else
1765  #pragma omp parallel for
1766 #endif
1767 #ifdef _WIN32
1768  for (int y=0; y < current_bp_level.height_level_; y++)
1769 #else
1770  for (unsigned int y=0; y < current_bp_level.height_level_; y++)
1771 #endif //_WIN32
1772  {
      //even rows take output pixel x from checkerboard 0 and x + 1 from checkerboard 1; odd rows the reverse
1773  const bool start_checkerboard_0 = ((y%2) == 0);
1774  unsigned int checkerboard_index = y * width_disp_checkerboard;
      //each iteration handles output pixels x and x + 1, which share one checkerboard_index
1775  for (unsigned int x=0; x < (current_bp_level.width_level_); x += 2) {
      //first and last rows are written as 0
      //NOTE(review): the (x + 1) write below has no (x + 1) < width_level_ guard, unlike the interior-row
      //branches; when width_level_ is odd the last iteration writes index (y + 1) * width_level_, which on
      //the last row is one element past a width_level_ * height_level_ buffer -- confirm the buffer size
1776  if ((y == 0) || (y == (current_bp_level.height_level_ - 1))) {
1777  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 0)] = 0;
1778  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 1)] = 0;
1779  }
1780  else {
1781  if (start_checkerboard_0) {
      //first and last columns are written as 0
1782  if ((x == 0) || (x == (current_bp_level.width_level_ - 1))) {
1783  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 0)] = 0;
1784  }
1785  else {
1786  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 0)] =
1787  (float)disparity_checkerboard_0[checkerboard_index];
1788  }
1789  if ((x + 1) == (current_bp_level.width_level_ - 1)) {
1790  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 1)] = 0;
1791  }
1792  else if ((x + 1) < current_bp_level.width_level_) {
1793  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 1)] =
1794  (float)disparity_checkerboard_0[num_data_disp_checkerboard + checkerboard_index];
1795  }
1796  }
1797  else {
1798  if ((x == 0) || (x == (current_bp_level.width_level_ - 1))) {
1799  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 0)] = 0;
1800  }
1801  else {
1802  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 0)] =
1803  (float)disparity_checkerboard_0[num_data_disp_checkerboard + checkerboard_index];
1804  }
1805  if ((x + 1) == (current_bp_level.width_level_ - 1)) {
1806  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 1)] = 0;
1807  }
1808  else if ((x + 1) < current_bp_level.width_level_) {
1809  disparity_between_images_device[y * current_bp_level.width_level_ + (x + 1)] =
1810  (float)disparity_checkerboard_0[checkerboard_index];
1811  }
1812  }
1813  //increment checkerboard index for next x-value
1814  checkerboard_index++;
1815  }
1816  }
1817  }
1818 
1819  //delete [] disparity_checkerboard_0;
      //NOTE(review): in _WIN32 builds the buffer above comes from _aligned_malloc, whose documented
      //release function is _aligned_free rather than free -- confirm and pair the calls per platform
1820  free(disparity_checkerboard_0);
1821 }
1822 
//In-place two-sweep minimum pass over the DISP_VALS vectors in f.
//  forward sweep  (disparity 1 .. DISP_VALS-1): f[d] = element-wise min(f[d], f[d-1] + 1)
//  backward sweep (disparity DISP_VALS-2 .. 0): f[d] = element-wise min(f[d], f[d+1] + 1)
//Template parameters: T is the scalar type named when building the constant vector, U is the vector
//type of each entry of f, DISP_VALS is the number of entries in f.
//Parameter: f - array of DISP_VALS vectors, read and overwritten.
//Neither loop body executes when DISP_VALS is less than 2.
1823 //function retrieves the minimum value at each 1-d disparity value in O(n) time
1824 //using Felzenszwalb's method (see "Efficient Belief Propagation for Early Vision")
1825 template<RunDataProcess_t T, RunDataVectProcess_t U, unsigned int DISP_VALS>
1826 void beliefprop_cpu::DtStereoSIMD(U f[DISP_VALS])
1827 {
1828  U prev;
      //constant added to the neighboring entry in both sweeps
      //NOTE(review): presumably a vector with every element equal to 1 -- confirm ConvertValToDatatype semantics
1829  const U vector_all_one_val = simd_processing::ConvertValToDatatype<U, T>(1.0f);
      //forward sweep: each entry is compared against its lower-disparity neighbor plus the constant
1830  for (unsigned int current_disparity = 1; current_disparity < DISP_VALS; current_disparity++)
1831  {
1832  //prev = f[current_disparity-1] + (T)1.0;
1833  prev = simd_processing::AddVals<U, U, U>(f[current_disparity - 1], vector_all_one_val);
1834 
1835  /*if (prev < f[current_disparity])
1836  f[current_disparity] = prev;*/
1837  f[current_disparity] = simd_processing::GetMinByElement<U>(prev, f[current_disparity]);
1838  }
1839 
      //backward sweep: signed index so the loop can terminate once it drops below 0
1840  for (int current_disparity = (int)DISP_VALS-2; current_disparity >= 0; current_disparity--)
1841  {
1842  //prev = f[current_disparity+1] + (T)1.0;
1843  prev = simd_processing::AddVals<U, U, U>(f[current_disparity + 1], vector_all_one_val);
1844 
1845  //if (prev < f[current_disparity])
1846  // f[current_disparity] = prev;
1847  f[current_disparity] = simd_processing::GetMinByElement<U>(prev, f[current_disparity]);
1848  }
1849 }
1850 
1851 //compute current message
1852 template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W, unsigned int DISP_VALS>
1854  unsigned int x_val, unsigned int y_val,
1855  const beliefprop::BpLevelProperties& current_bp_level,
1856  const U messages_neighbor_1[DISP_VALS], const U messages_neighbor_2[DISP_VALS],
1857  const U messages_neighbor_3[DISP_VALS], const U data_costs[DISP_VALS],
1858  T* dst_message_array, const U& disc_k_bp, bool data_aligned)
1859 {
1860  // aggregate and find min
1861  //T minimum = beliefprop::kHighValBp;
1862  W minimum = simd_processing::ConvertValToDatatype<W, V>(beliefprop::kHighValBp<V>);
1863  W dst[DISP_VALS];
1864 
1865  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++)
1866  {
1867  //dst[current_disparity] = messages_neighbor_1[current_disparity] + messages_neighbor_2[current_disparity] +
1868  // messages_neighbor_3[current_disparity] + data_costs[current_disparity];
1869  dst[current_disparity] =
1870  simd_processing::AddVals<U, U, W>(
1871  messages_neighbor_1[current_disparity],
1872  messages_neighbor_2[current_disparity]);
1873  dst[current_disparity] =
1874  simd_processing::AddVals<W, U, W>(
1875  dst[current_disparity],
1876  messages_neighbor_3[current_disparity]);
1877  dst[current_disparity] =
1878  simd_processing::AddVals<W, U, W>(
1879  dst[current_disparity],
1880  data_costs[current_disparity]);
1881 
1882  //if (dst[current_disparity] < minimum)
1883  // minimum = dst[current_disparity];
1884  minimum =
1885  simd_processing::GetMinByElement<W>(
1886  minimum,
1887  dst[current_disparity]);
1888  }
1889 
1890  //retrieve the minimum value at each disparity in O(n) time using Felzenszwalb's method
1891  //(see "Efficient Belief Propagation for Early Vision")
1892  DtStereoSIMD<V, W, DISP_VALS>(dst);
1893 
1894  // truncate
1895  //minimum += disc_k_bp;
1896  minimum =
1897  simd_processing::AddVals<W, U, W>(
1898  minimum,
1899  disc_k_bp);
1900 
1901  // normalize
1902  //T val_to_normalize = 0;
1903  W val_to_normalize = simd_processing::ConvertValToDatatype<W, V>(0.0);
1904 
1905  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++)
1906  {
1907  /*if (minimum < dst[current_disparity]) {
1908  dst[current_disparity] = minimum;
1909  }*/
1910  dst[current_disparity] =
1911  simd_processing::GetMinByElement<W>(
1912  minimum,
1913  dst[current_disparity]);
1914 
1915  //val_to_normalize += dst[current_disparity];
1916  val_to_normalize =
1917  simd_processing::AddVals<W, W, W>(
1918  val_to_normalize,
1919  dst[current_disparity]);
1920  }
1921 
1922  //val_to_normalize /= DISP_VALS;
1923  val_to_normalize =
1924  simd_processing::divideVals<W, W, W>(
1925  val_to_normalize,
1926  simd_processing::ConvertValToDatatype<W, V>((double)DISP_VALS));
1927 
1928  unsigned int dest_message_array_index =
1930  x_val, y_val, current_bp_level.padded_width_checkerboard_level_,
1931  current_bp_level.height_level_, 0, DISP_VALS);
1932 
1933  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++)
1934  {
1935  //dst[current_disparity] -= val_to_normalize;
1936  dst[current_disparity] =
1937  simd_processing::SubtractVals<W, W, W>(
1938  dst[current_disparity],
1939  val_to_normalize);
1940 
1941  if (data_aligned) {
1942  simd_processing::StorePackedDataAligned<T, W>(
1943  dest_message_array_index,
1944  dst_message_array,
1945  dst[current_disparity]);
1946  }
1947  else {
1948  simd_processing::StorePackedDataUnaligned<T, W>(
1949  dest_message_array_index,
1950  dst_message_array,
1951  dst[current_disparity]);
1952  }
1953 
1954  if constexpr (beliefprop::kOptimizedIndexingSetting) {
1955  dest_message_array_index +=
1956  current_bp_level.padded_width_checkerboard_level_;
1957  }
1958  else {
1959  dest_message_array_index++;
1960  }
1961  }
1962 }
1963 
1964 //function retrieve the minimum value at each 1-d disparity value in O(n) time using Felzenszwalb's method
1965 //(see "Efficient Belief Propagation for Early Vision")
1966 template<RunDataProcess_t T, RunDataVectProcess_t U>
1967 void beliefprop_cpu::DtStereoSIMD(U* f, unsigned int bp_settings_disp_vals)
1968 {
1969  U prev;
1970  const U vector_all_one_val = simd_processing::ConvertValToDatatype<U, T>(1.0f);
1971  for (unsigned int current_disparity = 1; current_disparity < bp_settings_disp_vals; current_disparity++)
1972  {
1973  //prev = f[current_disparity-1] + (T)1.0;
1974  prev =
1975  simd_processing::AddVals<U, U, U>(
1976  f[current_disparity - 1],
1977  vector_all_one_val);
1978 
1979  /*if (prev < f[current_disparity])
1980  f[current_disparity] = prev;*/
1981  f[current_disparity] =
1982  simd_processing::GetMinByElement<U>(
1983  prev,
1984  f[current_disparity]);
1985  }
1986 
1987  for (int current_disparity = (int)bp_settings_disp_vals-2; current_disparity >= 0; current_disparity--)
1988  {
1989  //prev = f[current_disparity+1] + (T)1.0;
1990  prev =
1991  simd_processing::AddVals<U, U, U>(
1992  f[current_disparity + 1],
1993  vector_all_one_val);
1994 
1995  //if (prev < f[current_disparity])
1996  // f[current_disparity] = prev;
1997  f[current_disparity] =
1998  simd_processing::GetMinByElement<U>(
1999  prev,
2000  f[current_disparity]);
2001  }
2002 }
2003 
2004 // compute current message
2005 template<RunData_t T, RunDataVect_t U, RunDataProcess_t V, RunDataVectProcess_t W>
2007  unsigned int x_val, unsigned int y_val,
2008  const beliefprop::BpLevelProperties& current_bp_level,
2009  const U* messages_neighbor_1, const U* messages_neighbor_2,
2010  const U* messages_neighbor_3, const U* data_costs,
2011  T* dst_message_array,
2012  const U& disc_k_bp, bool data_aligned,
2013  unsigned int bp_settings_disp_vals)
2014 {
2015  // aggregate and find min
2016  //T minimum = beliefprop::kHighValBp;
2017  W minimum = simd_processing::ConvertValToDatatype<W, V>(beliefprop::kHighValBp<V>);
2018  W* dst = new W[bp_settings_disp_vals];
2019 
2020  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++)
2021  {
2022  //dst[current_disparity] = messages_neighbor_1[current_disparity] + messages_neighbor_2[current_disparity] +
2023  // messages_neighbor_3[current_disparity] + data_costs[current_disparity];
2024  dst[current_disparity] =
2025  simd_processing::AddVals<U, U, W>(
2026  messages_neighbor_1[current_disparity],
2027  messages_neighbor_2[current_disparity]);
2028  dst[current_disparity] =
2029  simd_processing::AddVals<W, U, W>(
2030  dst[current_disparity],
2031  messages_neighbor_3[current_disparity]);
2032  dst[current_disparity] =
2033  simd_processing::AddVals<W, U, W>(
2034  dst[current_disparity],
2035  data_costs[current_disparity]);
2036 
2037  //if (dst[current_disparity] < minimum)
2038  // minimum = dst[current_disparity];
2039  minimum =
2040  simd_processing::GetMinByElement<W>(
2041  minimum,
2042  dst[current_disparity]);
2043  }
2044 
2045  //retrieve the minimum value at each disparity in O(n) time using Felzenszwalb's method
2046  //(see "Efficient Belief Propagation for Early Vision")
2047  DtStereoSIMD<V, W>(dst, bp_settings_disp_vals);
2048 
2049  // truncate
2050  //minimum += disc_k_bp;
2051  minimum =
2052  simd_processing::AddVals<W, U, W>(
2053  minimum,
2054  disc_k_bp);
2055 
2056  // normalize
2057  //T val_to_normalize = 0;
2058  W val_to_normalize = simd_processing::ConvertValToDatatype<W, V>(0.0f);
2059 
2060  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++)
2061  {
2062  //if (minimum < dst[current_disparity]) {
2063  // dst[current_disparity] = minimum;
2064  //}
2065  dst[current_disparity] =
2066  simd_processing::GetMinByElement<W>(
2067  minimum,
2068  dst[current_disparity]);
2069 
2070  //val_to_normalize += dst[current_disparity];
2071  val_to_normalize =
2072  simd_processing::AddVals<W, W, W>(
2073  val_to_normalize,
2074  dst[current_disparity]);
2075  }
2076 
2077  //val_to_normalize /= DISP_VALS;
2078  val_to_normalize =
2079  simd_processing::divideVals<W, W, W>(
2080  val_to_normalize,
2081  simd_processing::ConvertValToDatatype<W, V>((float)bp_settings_disp_vals));
2082 
2083  unsigned int dest_message_array_index =
2085  x_val, y_val, current_bp_level.padded_width_checkerboard_level_,
2086  current_bp_level.height_level_, 0, bp_settings_disp_vals);
2087 
2088  for (unsigned int current_disparity = 0; current_disparity < bp_settings_disp_vals; current_disparity++)
2089  {
2090  //dst[current_disparity] -= val_to_normalize;
2091  dst[current_disparity] =
2092  simd_processing::SubtractVals<W, W, W>(
2093  dst[current_disparity],
2094  val_to_normalize);
2095 
2096  if (data_aligned) {
2097  simd_processing::StorePackedDataAligned<T, W>(
2098  dest_message_array_index,
2099  dst_message_array,
2100  dst[current_disparity]);
2101  }
2102  else {
2103  simd_processing::StorePackedDataUnaligned<T, W>(
2104  dest_message_array_index,
2105  dst_message_array,
2106  dst[current_disparity]);
2107  }
2108 
2109  if constexpr (beliefprop::kOptimizedIndexingSetting) {
2110  dest_message_array_index +=
2111  current_bp_level.padded_width_checkerboard_level_;
2112  }
2113  else {
2114  dest_message_array_index++;
2115  }
2116  }
2117 
2118  delete [] dst;
2119 }
2120 
2121 // compute current message
2122 template<RunData_t T, RunDataVect_t U, unsigned int DISP_VALS>
2124  unsigned int x_val, unsigned int y_val,
2125  const beliefprop::BpLevelProperties& current_bp_level,
2126  const U messages_neighbor_1[DISP_VALS], const U messages_neighbor_2[DISP_VALS],
2127  const U messages_neighbor_3[DISP_VALS], const U data_costs[DISP_VALS],
2128  T* dst_message_array,
2129  const U& disc_k_bp, bool data_aligned)
2130 {
2131  MsgStereoSIMDProcessing<T, U, T, U, DISP_VALS>(x_val, y_val,
2132  current_bp_level, messages_neighbor_1, messages_neighbor_2,
2133  messages_neighbor_3, data_costs, dst_message_array, disc_k_bp, data_aligned);
2134 }
2135 
2136 // compute current message
2137 template<RunData_t T, RunDataVect_t U>
2139  unsigned int x_val, unsigned int y_val,
2140  const beliefprop::BpLevelProperties& current_bp_level,
2141  const U* messages_neighbor_1, const U* messages_neighbor_2,
2142  const U* messages_neighbor_3, const U* data_costs,
2143  T* dst_message_array,
2144  const U& disc_k_bp, bool data_aligned, unsigned int bp_settings_disp_vals)
2145 {
2146  MsgStereoSIMDProcessing<T, U, T, U>(
2147  x_val, y_val, current_bp_level,
2148  messages_neighbor_1, messages_neighbor_2,
2149  messages_neighbor_3, data_costs,
2150  dst_message_array, disc_k_bp, data_aligned,
2151  bp_settings_disp_vals);
2152 }
2153 
2154 template<RunData_t T, unsigned int DISP_VALS>
2156  unsigned int x_val, unsigned int y_val,
2157  const beliefprop::BpLevelProperties& current_bp_level,
2158  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
2159  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
2160  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
2161  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
2162  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1)
2163 {
2164  if (((x_val + y_val) % 2) == 0) {
2165  printf("x_val: %u\n", x_val);
2166  printf("y_val: %u\n", y_val);
2167  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
2168  printf("DISP: %u\n", current_disparity);
2169  printf("messageUPrevStereoCheckerboard: %f \n",
2170  (float)message_u_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2171  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2172  current_disparity, DISP_VALS)]);
2173  printf("messageDPrevStereoCheckerboard: %f \n",
2174  (float)message_d_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2175  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2176  current_disparity, DISP_VALS)]);
2177  printf("messageLPrevStereoCheckerboard: %f \n",
2178  (float)message_l_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2179  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2180  current_disparity, DISP_VALS)]);
2181  printf("messageRPrevStereoCheckerboard: %f \n",
2182  (float)message_r_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2183  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2184  current_disparity, DISP_VALS)]);
2185  printf("dataCostStereoCheckerboard: %f \n",
2186  (float)data_cost_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2187  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2188  current_disparity, DISP_VALS)]);
2189  }
2190  } else {
2191  printf("x_val: %u\n", x_val);
2192  printf("y_val: %u\n", y_val);
2193  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
2194  printf("DISP: %u\n", current_disparity);
2195  printf("messageUPrevStereoCheckerboard: %f \n",
2196  (float)message_u_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2197  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2198  current_disparity, DISP_VALS)]);
2199  printf("messageDPrevStereoCheckerboard: %f \n",
2200  (float)message_d_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2201  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2202  current_disparity, DISP_VALS)]);
2203  printf("messageLPrevStereoCheckerboard: %f \n",
2204  (float)message_l_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2205  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2206  current_disparity, DISP_VALS)]);
2207  printf("messageRPrevStereoCheckerboard: %f \n",
2208  (float)message_r_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2209  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2210  current_disparity, DISP_VALS)]);
2211  printf("dataCostStereoCheckerboard: %f \n",
2212  (float)data_cost_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2213  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2214  current_disparity, DISP_VALS)]);
2215  }
2216  }
2217 }
2218 
2219 template<RunData_t T, unsigned int DISP_VALS>
2221  unsigned int x_val, unsigned int y_val,
2222  const beliefprop::BpLevelProperties& current_bp_level,
2223  const T* data_cost_checkerboard_0, const T* data_cost_checkerboard_1,
2224  const T* message_u_checkerboard_0, const T* message_d_checkerboard_0,
2225  const T* message_l_checkerboard_0, const T* message_r_checkerboard_0,
2226  const T* message_u_checkerboard_1, const T* message_d_checkerboard_1,
2227  const T* message_l_checkerboard_1, const T* message_r_checkerboard_1)
2228 {
2229  const unsigned int checkerboard_adjustment = (((x_val + y_val) % 2) == 0) ? ((y_val)%2) : ((y_val+1)%2);
2230  if (((x_val + y_val) % 2) == 0) {
2231  //TODO: switch use of printf with std::format when it is supported on compiler used for development
2232  //std::cout << std::format("x_val: {}", x_val) << std::endl;
2233  printf("x_val: %u\n", x_val);
2234  printf("y_val: %u\n", y_val);
2235  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
2236  printf("DISP: %u\n", current_disparity);
2237  printf("messageUPrevStereoCheckerboard: %f \n",
2238  (float) message_u_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2239  x_val / 2, y_val + 1, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2240  current_disparity, DISP_VALS)]);
2241  printf("messageDPrevStereoCheckerboard: %f \n",
2242  (float) message_d_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2243  x_val / 2, y_val - 1, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2244  current_disparity, DISP_VALS)]);
2245  printf("messageLPrevStereoCheckerboard: %f \n",
2246  (float) message_l_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2247  x_val / 2 + checkerboard_adjustment, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2248  current_disparity, DISP_VALS)]);
2249  printf("messageRPrevStereoCheckerboard: %f \n",
2250  (float) message_r_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2251  (x_val / 2 - 1) + checkerboard_adjustment, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2252  current_disparity, DISP_VALS)]);
2253  printf("dataCostStereoCheckerboard: %f \n",
2254  (float) data_cost_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2255  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2256  current_disparity, DISP_VALS)]);
2257  }
2258  }
2259  else {
2260  printf("x_val: %u\n", x_val);
2261  printf("y_val: %u\n", y_val);
2262  for (unsigned int current_disparity = 0; current_disparity < DISP_VALS; current_disparity++) {
2263  printf("DISP: %u\n", current_disparity);
2264  printf("messageUPrevStereoCheckerboard: %f \n",
2265  (float) message_u_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2266  x_val / 2, y_val + 1, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2267  current_disparity, DISP_VALS)]);
2268  printf("messageDPrevStereoCheckerboard: %f \n",
2269  (float) message_d_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2270  x_val / 2, y_val - 1, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2271  current_disparity, DISP_VALS)]);
2272  printf("messageLPrevStereoCheckerboard: %f \n",
2273  (float) message_l_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2274  x_val / 2 + checkerboard_adjustment, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2275  current_disparity, DISP_VALS)]);
2276  printf("messageRPrevStereoCheckerboard: %f \n",
2277  (float) message_r_checkerboard_0[beliefprop::RetrieveIndexInDataAndMessage(
2278  (x_val / 2 - 1) + checkerboard_adjustment, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2279  current_disparity, DISP_VALS)]);
2280  printf("dataCostStereoCheckerboard: %f \n",
2281  (float) data_cost_checkerboard_1[beliefprop::RetrieveIndexInDataAndMessage(
2282  x_val / 2, y_val, current_bp_level.padded_width_checkerboard_level_, current_bp_level.height_level_,
2283  current_disparity, DISP_VALS)]);
2284  }
2285  }
2286 }
2287 
2288 #endif //KERNEL_BP_STEREO_CPU_H
File with namespace for enums, constants, structures, and functions specific to belief propagation pr...
Header file that contains information about the stereo sets used for evaluation of the bp implementat...
Defines functions used in processing belief propagation that are specific to implementation with AVX2...
Defines functions used in processing belief propagation that are specific to implementation with AVX5...
Defines functions used in processing belief propagation that are specific to implementation with NEON...
Declares child class of ParallelParams to store and process parallelization parameters to use in each...
Contains namespace with CPU run defaults and constants.
Define constraints for data type in processing.
Contains general functions for processing using SIMD vector data types on CPU.
Functions for belief propagation processing that are used in both optimized CPU and CUDA implementati...
Contains namespace with utility functions for implementation.
Abstract class for holding and processing parallelization parameters. Child class(es) specific to im...
virtual std::array< unsigned int, 2 > OptParamsForKernel(const std::array< unsigned int, 2 > &kernel_location) const =0
Get optimized parallel parameters for parallel processing kernel for kernel that is indexed as an arr...
Namespace to define global kernel functions for optimized belief propagation processing on the CPU us...
void RunBPIterationUsingCheckerboardUpdates(beliefprop::CheckerboardPart checkerboard_to_update, const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_num_disp_vals, const ParallelParams &opt_cpu_params)
void RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsAVX512(beliefprop::CheckerboardPart checkerboard_to_update, const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, float *message_u_checkerboard_0, float *message_d_checkerboard_0, float *message_l_checkerboard_0, float *message_r_checkerboard_0, float *message_u_checkerboard_1, float *message_d_checkerboard_1, float *message_l_checkerboard_1, float *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void DtStereoSIMD(U f[DISP_VALS])
void RetrieveOutputDisparityUseSIMDVectorsAVX256(const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, const float *message_u_prev_checkerboard_0, const float *message_d_prev_checkerboard_0, const float *message_l_prev_checkerboard_0, const float *message_r_prev_checkerboard_0, const float *message_u_prev_checkerboard_1, const float *message_d_prev_checkerboard_1, const float *message_l_prev_checkerboard_1, const float *message_r_prev_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsAVX256(beliefprop::CheckerboardPart checkerboard_to_update, const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, float *message_u_checkerboard_0, float *message_d_checkerboard_0, float *message_l_checkerboard_0, float *message_r_checkerboard_0, float *message_u_checkerboard_1, float *message_d_checkerboard_1, float *message_l_checkerboard_1, float *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void MsgStereoSIMD(unsigned int x_val, unsigned int y_val, const beliefprop::BpLevelProperties &current_bp_level, const U messages_neighbor_1[DISP_VALS], const U messages_neighbor_2[DISP_VALS], const U messages_neighbor_3[DISP_VALS], const U data_costs[DISP_VALS], T *dst_message_array, const U &disc_k_bp, bool data_aligned)
void InitializeBottomLevelData(const beliefprop::BpLevelProperties &current_bp_level, const float *image_1_pixels_device, const float *image_2_pixels_device, T *data_cost_stereo_checkerboard_0, T *data_cost_stereo_checkerboard_1, float lambda_bp, float data_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RetrieveOutputDisparityUseSIMDVectorsNEON(const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, const float *message_u_prev_checkerboard_0, const float *message_d_prev_checkerboard_0, const float *message_l_prev_checkerboard_0, const float *message_r_prev_checkerboard_0, const float *message_u_prev_checkerboard_1, const float *message_d_prev_checkerboard_1, const float *message_l_prev_checkerboard_1, const float *message_r_prev_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RunBPIterationUpdateMsgValsUseSIMDVectors(unsigned int x_val_start_processing, unsigned int y_val, const beliefprop::BpLevelProperties &current_bp_level, const U prev_u_message[DISP_VALS], const U prev_d_message[DISP_VALS], const U prev_l_message[DISP_VALS], const U prev_r_message[DISP_VALS], const U data_message[DISP_VALS], T *current_u_message, T *current_d_message, T *current_l_message, T *current_r_message, const U disc_k_bp_vect, bool data_aligned)
void PrintDataAndMessageValsAtPointKernel(unsigned int x_val, unsigned int y_val, const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, const T *message_u_checkerboard_0, const T *message_d_checkerboard_0, const T *message_l_checkerboard_0, const T *message_r_checkerboard_0, const T *message_u_checkerboard_1, const T *message_d_checkerboard_1, const T *message_l_checkerboard_1, const T *message_r_checkerboard_1)
void RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsProcess(beliefprop::CheckerboardPart checkerboard_to_update, const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void PrintDataAndMessageValsToPointKernel(unsigned int x_val, unsigned int y_val, const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, const T *message_u_checkerboard_0, const T *message_d_checkerboard_0, const T *message_l_checkerboard_0, const T *message_r_checkerboard_0, const T *message_u_checkerboard_1, const T *message_d_checkerboard_1, const T *message_l_checkerboard_1, const T *message_r_checkerboard_1)
void RetrieveOutputDisparityUseSIMDVectors(const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, const T *message_u_prev_checkerboard_0, const T *message_d_prev_checkerboard_0, const T *message_l_prev_checkerboard_0, const T *message_r_prev_checkerboard_0, const T *message_u_prev_checkerboard_1, const T *message_d_prev_checkerboard_1, const T *message_l_prev_checkerboard_1, const T *message_r_prev_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void MsgStereoSIMDProcessing(unsigned int x_val, unsigned int y_val, const beliefprop::BpLevelProperties &current_bp_level, const U *messages_neighbor_1, const U *messages_neighbor_2, const U *messages_neighbor_3, const U *data_costs, T *dst_message_array, const U &disc_k_bp, bool data_aligned, unsigned int bp_settings_disp_vals)
void InitializeCurrentLevelData(beliefprop::CheckerboardPart checkerboard_part, const beliefprop::BpLevelProperties &current_bp_level, const beliefprop::BpLevelProperties &prev_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, T *data_cost_current_level, unsigned int offset_num, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RunBPIterationUsingCheckerboardUpdatesNoPackedInstructions(beliefprop::CheckerboardPart checkerboard_part_update, const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RunBPIterationUsingCheckerboardUpdatesUseSIMDVectorsNEON(beliefprop::CheckerboardPart checkerboard_to_update, const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, float *message_u_checkerboard_0, float *message_d_checkerboard_0, float *message_l_checkerboard_0, float *message_r_checkerboard_0, float *message_u_checkerboard_1, float *message_d_checkerboard_1, float *message_l_checkerboard_1, float *message_r_checkerboard_1, float disc_k_bp, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RetrieveOutputDisparityUseSIMDVectorsAVX512(const beliefprop::BpLevelProperties &current_bp_level, const float *data_cost_checkerboard_0, const float *data_cost_checkerboard_1, const float *message_u_prev_checkerboard_0, const float *message_d_prev_checkerboard_0, const float *message_l_prev_checkerboard_0, const float *message_r_prev_checkerboard_0, const float *message_u_prev_checkerboard_1, const float *message_d_prev_checkerboard_1, const float *message_l_prev_checkerboard_1, const float *message_r_prev_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void InitializeMessageValsToDefaultKernel(const beliefprop::BpLevelProperties &current_bp_level, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void RetrieveOutputDisparity(const beliefprop::BpLevelProperties &current_bp_level, const T *data_cost_checkerboard_0, const T *data_cost_checkerboard_1, const T *message_u_prev_checkerboard_0, const T *message_d_prev_checkerboard_0, const T *message_l_prev_checkerboard_0, const T *message_r_prev_checkerboard_0, const T *message_u_prev_checkerboard_1, const T *message_d_prev_checkerboard_1, const T *message_l_prev_checkerboard_1, const T *message_r_prev_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
void UpdateBestDispBestVals(T &best_disparities, T &best_vals, const T &current_disparity, const T &val_at_disp)
void CopyMsgDataToNextLevel(beliefprop::CheckerboardPart checkerboard_part, const beliefprop::BpLevelProperties &current_bp_level, const beliefprop::BpLevelProperties &next_bp_level, const T *message_u_prev_checkerboard_0, const T *message_d_prev_checkerboard_0, const T *message_l_prev_checkerboard_0, const T *message_r_prev_checkerboard_0, const T *message_u_prev_checkerboard_1, const T *message_d_prev_checkerboard_1, const T *message_l_prev_checkerboard_1, const T *message_r_prev_checkerboard_1, const T *message_u_checkerboard_0, const T *message_d_checkerboard_0, const T *message_l_checkerboard_0, const T *message_r_checkerboard_0, const T *message_u_checkerboard_1, const T *message_d_checkerboard_1, const T *message_l_checkerboard_1, const T *message_r_checkerboard_1, unsigned int bp_settings_disp_vals, const ParallelParams &opt_cpu_params)
constexpr bool kOptimizedIndexingSetting
Definition: BpRunUtils.h:114
CheckerboardPart
Define the two checkerboard "parts" that the image is divided into.
ARCHITECTURE_ADDITION unsigned int RetrieveIndexInDataAndMessage(unsigned int x_val, unsigned int y_val, unsigned int width, unsigned int height, unsigned int current_disparity, unsigned int total_num_disp_vals, unsigned int offset_data=0u)
Retrieve the current 1-D index value of the given point at the given disparity in the data cost and m...
POD struct to store bp level data. Struct can be passed to global CUDA kernels so needs to take restr...
Definition: BpLevel.h:42
unsigned int height_level_
Definition: BpLevel.h:44
unsigned int width_checkerboard_level_
Definition: BpLevel.h:46
unsigned int bytes_align_memory_
Definition: BpLevel.h:45
unsigned int width_level_
Definition: BpLevel.h:43
unsigned int padded_width_checkerboard_level_
Definition: BpLevel.h:47