Optimized Belief Propagation (CPU and GPU)
KernelBpStereo.cu
Go to the documentation of this file.
1 /*
2 Copyright (C) 2024 Scott Grauer-Gray
3 
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
29 
30 //uncomment to set CHECK_VAL_TO_NORMALIZE_VALID_CUDA_HALF (disabled by default) since that could
31 //get overflow in message values during processing in some bp settings (only happened on largest stereo set in testing)
32 //shouldn't be needed if using bfloat since that has a higher exponent and not likely to overflow
33 //recommend using bfloat rather than enabling this setting if target GPU supports bflow
34 //#define CHECK_VAL_TO_NORMALIZE_VALID_CUDA_HALF
35 
36 #if defined(CHECK_VAL_TO_NORMALIZE_VALID_CUDA_HALF)
37 #include "KernelBpStereoHalf.cu"
38 #endif //CHECK_VAL_TO_NORMALIZE_VALID_CUDA_HALF
39 
40 //uncomment for CUDA kernel debug functions for belief propagation processing
41 //#include "KernelBpStereoDebug.h"
42 
48 namespace beliefprop_cuda {
49 
67 template<RunData_t T, unsigned int DISP_VALS>
68 __global__ void InitializeBottomLevelData(
69  beliefprop::BpLevelProperties current_bp_level,
70  float* image_1_pixels_device, float* image_2_pixels_device,
71  T* data_cost_stereo_checkerboard_0, T* data_cost_stereo_checkerboard_1,
72  float lambda_bp, float data_k_bp, unsigned int bp_settings_disp_vals)
73 {
74  //get the x and y indices for the current CUDA thread
75  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
76  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
77 
78  //get the x value within the current "checkerboard"
79  const unsigned int x_checkerboard = x_val / 2;
80 
82  x_checkerboard, y_val, current_bp_level.width_level_, current_bp_level.height_level_))
83  {
84  beliefprop::InitializeBottomLevelDataPixel<T, DISP_VALS>(x_val, y_val,
85  current_bp_level, image_1_pixels_device,
86  image_2_pixels_device, data_cost_stereo_checkerboard_0,
87  data_cost_stereo_checkerboard_1, lambda_bp,
88  data_k_bp, bp_settings_disp_vals);
89  }
90 }
91 
107 template<RunData_t T, unsigned int DISP_VALS>
109  beliefprop::CheckerboardPart checkerboard_part,
110  beliefprop::BpLevelProperties current_bp_level,
111  beliefprop::BpLevelProperties prev_bp_level, T* data_cost_checkerboard_0,
112  T* data_cost_checkerboard_1, T* data_cost_current_level,
113  unsigned int offset_num, unsigned int bp_settings_disp_vals)
114 {
115  //get the x and y indices for the current CUDA thread
116  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
117  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
118 
120  x_val, y_val, current_bp_level.width_checkerboard_level_, current_bp_level.height_level_))
121  {
122  beliefprop::InitializeCurrentLevelDataPixel<T, T, DISP_VALS>(
123  x_val, y_val, checkerboard_part, current_bp_level, prev_bp_level,
124  data_cost_checkerboard_0, data_cost_checkerboard_1, data_cost_current_level,
125  offset_num, bp_settings_disp_vals);
126  }
127 }
128 
145 template<RunData_t T, unsigned int DISP_VALS>
147  beliefprop::BpLevelProperties current_bp_level,
148  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
149  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
150  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
151  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
152  unsigned int bp_settings_disp_vals)
153 {
154  //get the x and y indices for the current CUDA thread
155  const unsigned int x_val_in_checkerboard = blockIdx.x * blockDim.x + threadIdx.x;
156  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
157 
159  x_val_in_checkerboard, y_val, current_bp_level.width_checkerboard_level_, current_bp_level.height_level_))
160  {
161  //initialize message values in both checkerboards
162  beliefprop::InitializeMessageValsToDefaultKernelPixel<T, DISP_VALS>(
163  x_val_in_checkerboard, y_val, current_bp_level,
164  message_u_checkerboard_0, message_d_checkerboard_0,
165  message_l_checkerboard_0, message_r_checkerboard_0,
166  message_u_checkerboard_1, message_d_checkerboard_1,
167  message_l_checkerboard_1, message_r_checkerboard_1,
168  bp_settings_disp_vals);
169  }
170 }
171 
196 template<RunData_t T, unsigned int DISP_VALS>
198  beliefprop::CheckerboardPart checkerboard_to_update, beliefprop::BpLevelProperties current_bp_level,
199  T* data_cost_checkerboard_0, T* data_cost_checkerboard_1,
200  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
201  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
202  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
203  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
204  float disc_k_bp, bool data_aligned, unsigned int bp_settings_disp_vals)
205 {
206  //get the x and y indices for the current CUDA thread
207  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
208  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
209 
211  x_val, y_val, current_bp_level.width_level_/2, current_bp_level.height_level_))
212  {
213  beliefprop::RunBPIterationUsingCheckerboardUpdatesKernel<T, T, DISP_VALS>(
214  x_val, y_val, checkerboard_to_update, current_bp_level,
215  data_cost_checkerboard_0, data_cost_checkerboard_1,
216  message_u_checkerboard_0, message_d_checkerboard_0,
217  message_l_checkerboard_0, message_r_checkerboard_0,
218  message_u_checkerboard_1, message_d_checkerboard_1,
219  message_l_checkerboard_1, message_r_checkerboard_1,
220  disc_k_bp, 0, data_aligned, bp_settings_disp_vals);
221  }
222 }
223 
251 template<RunData_t T, unsigned int DISP_VALS>
253  beliefprop::CheckerboardPart checkerboard_to_update, beliefprop::BpLevelProperties current_bp_level,
254  T* data_cost_checkerboard_0, T* data_cost_checkerboard_1,
255  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
256  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
257  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
258  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
259  float disc_k_bp, bool data_aligned, unsigned int bp_settings_disp_vals,
260  void* dst_processing)
261 {
262  //get the x and y indices for the current CUDA thread
263  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
264  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
265 
267  x_val, y_val, current_bp_level.width_level_/2, current_bp_level.height_level_))
268  {
269  beliefprop::RunBPIterationUsingCheckerboardUpdatesKernel<T, T, DISP_VALS>(
270  x_val, y_val, checkerboard_to_update, current_bp_level,
271  data_cost_checkerboard_0, data_cost_checkerboard_1,
272  message_u_checkerboard_0, message_d_checkerboard_0,
273  message_l_checkerboard_0, message_r_checkerboard_0,
274  message_u_checkerboard_1, message_d_checkerboard_1,
275  message_l_checkerboard_1, message_r_checkerboard_1,
276  disc_k_bp, 0, data_aligned, bp_settings_disp_vals, dst_processing);
277  }
278 }
279 
309 template<RunData_t T, unsigned int DISP_VALS>
310 __global__ void CopyMsgDataToNextLevel(
311  beliefprop::CheckerboardPart checkerboard_part,
312  beliefprop::BpLevelProperties current_bp_level,
313  beliefprop::BpLevelProperties next_bp_level,
314  T* message_u_prev_checkerboard_0, T* message_d_prev_checkerboard_0,
315  T* message_l_prev_checkerboard_0, T* message_r_prev_checkerboard_0,
316  T* message_u_prev_checkerboard_1, T* message_d_prev_checkerboard_1,
317  T* message_l_prev_checkerboard_1, T* message_r_prev_checkerboard_1,
318  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
319  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
320  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
321  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
322  unsigned int bp_settings_disp_vals)
323 {
324  //get the x and y indices for the current CUDA thread
325  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
326  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
327 
329  x_val, y_val, current_bp_level.width_checkerboard_level_, current_bp_level.height_level_))
330  {
331  beliefprop::CopyMsgDataToNextLevelPixel<T, DISP_VALS>(x_val, y_val,
332  checkerboard_part, current_bp_level, next_bp_level,
333  message_u_prev_checkerboard_0, message_d_prev_checkerboard_0,
334  message_l_prev_checkerboard_0, message_r_prev_checkerboard_0,
335  message_u_prev_checkerboard_1, message_d_prev_checkerboard_1,
336  message_l_prev_checkerboard_1, message_r_prev_checkerboard_1,
337  message_u_checkerboard_0, message_d_checkerboard_0,
338  message_l_checkerboard_0, message_r_checkerboard_0,
339  message_u_checkerboard_1, message_d_checkerboard_1,
340  message_l_checkerboard_1, message_r_checkerboard_1,
341  bp_settings_disp_vals);
342  }
343 }
344 
364 template<RunData_t T, unsigned int DISP_VALS>
365 __global__ void RetrieveOutputDisparity(
366  beliefprop::BpLevelProperties current_bp_level,
367  T* data_cost_checkerboard_0, T* data_cost_checkerboard_1,
368  T* message_u_checkerboard_0, T* message_d_checkerboard_0,
369  T* message_l_checkerboard_0, T* message_r_checkerboard_0,
370  T* message_u_checkerboard_1, T* message_d_checkerboard_1,
371  T* message_l_checkerboard_1, T* message_r_checkerboard_1,
372  float* disparity_between_images_device, unsigned int bp_settings_disp_vals)
373 {
374  //get x and y indices for the current CUDA thread
375  const unsigned int x_val = blockIdx.x * blockDim.x + threadIdx.x;
376  const unsigned int y_val = blockIdx.y * blockDim.y + threadIdx.y;
377 
379  x_val, y_val, current_bp_level.width_checkerboard_level_, current_bp_level.height_level_))
380  {
381  beliefprop::RetrieveOutputDisparityPixel<T, T, DISP_VALS>(
382  x_val, y_val, current_bp_level,
383  data_cost_checkerboard_0, data_cost_checkerboard_1,
384  message_u_checkerboard_0, message_d_checkerboard_0,
385  message_l_checkerboard_0, message_r_checkerboard_0,
386  message_u_checkerboard_1, message_d_checkerboard_1,
387  message_l_checkerboard_1, message_r_checkerboard_1,
388  disparity_between_images_device, bp_settings_disp_vals);
389  }
390 }
391 
392 };
This file defines the template specialization to perform belief propagation using half precision for ...
Functions for belief propagation processing that are used in both optimized CPU and CUDA implementati...
Namespace to define global kernel functions for parallel belief propagation processing using CUDA.
__global__ void CopyMsgDataToNextLevel(beliefprop::CheckerboardPart checkerboard_part, beliefprop::BpLevelProperties current_bp_level, beliefprop::BpLevelProperties next_bp_level, T *message_u_prev_checkerboard_0, T *message_d_prev_checkerboard_0, T *message_l_prev_checkerboard_0, T *message_r_prev_checkerboard_0, T *message_u_prev_checkerboard_1, T *message_d_prev_checkerboard_1, T *message_l_prev_checkerboard_1, T *message_r_prev_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, unsigned int bp_settings_disp_vals)
Kernel to copy the computed BP message values at the current level to the corresponding locations at ...
__global__ void RunBPIterationUsingCheckerboardUpdates(beliefprop::CheckerboardPart checkerboard_to_update, beliefprop::BpLevelProperties current_bp_level, T *data_cost_checkerboard_0, T *data_cost_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, float disc_k_bp, bool data_aligned, unsigned int bp_settings_disp_vals)
Kernel function to run the current iteration of belief propagation in parallel using the checkerboard...
__global__ void InitializeMessageValsToDefaultKernel(beliefprop::BpLevelProperties current_bp_level, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, unsigned int bp_settings_disp_vals)
Initialize the message values at each pixel of the current level to the default value.
__global__ void InitializeBottomLevelData(beliefprop::BpLevelProperties current_bp_level, float *image_1_pixels_device, float *image_2_pixels_device, T *data_cost_stereo_checkerboard_0, T *data_cost_stereo_checkerboard_1, float lambda_bp, float data_k_bp, unsigned int bp_settings_disp_vals)
Initialize the "data cost" for each possible disparity between the two full-sized input images ("bott...
__global__ void InitializeCurrentLevelData(beliefprop::CheckerboardPart checkerboard_part, beliefprop::BpLevelProperties current_bp_level, beliefprop::BpLevelProperties prev_bp_level, T *data_cost_checkerboard_0, T *data_cost_checkerboard_1, T *data_cost_current_level, unsigned int offset_num, unsigned int bp_settings_disp_vals)
Initialize the data costs at the "next" level up in the pyramid given that the data at the lower has ...
__global__ void RetrieveOutputDisparity(beliefprop::BpLevelProperties current_bp_level, T *data_cost_checkerboard_0, T *data_cost_checkerboard_1, T *message_u_checkerboard_0, T *message_d_checkerboard_0, T *message_l_checkerboard_0, T *message_r_checkerboard_0, T *message_u_checkerboard_1, T *message_d_checkerboard_1, T *message_l_checkerboard_1, T *message_r_checkerboard_1, float *disparity_between_images_device, unsigned int bp_settings_disp_vals)
Retrieve the best disparity estimate from image 1 to image 2 for each pixel in parallel.
CheckerboardPart
Define the two checkerboard "parts" that the image is divided into.
ARCHITECTURE_ADDITION bool WithinImageBounds(unsigned int x_val, unsigned int y_val, unsigned int width, unsigned int height)
Checks if the current point is within the image bounds Assumed that input x/y vals are above zero sin...
POD struct to store bp level data. Struct can be passed to global CUDAs kernel so needs to take restr...
Definition: BpLevel.h:42
unsigned int height_level_
Definition: BpLevel.h:44
unsigned int width_checkerboard_level_
Definition: BpLevel.h:46
unsigned int width_level_
Definition: BpLevel.h:43