Optimized Belief Propagation (CPU and GPU)
BpRunUtils.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2024 Scott Grauer-Gray
3 
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
28 #ifndef BP_RUN_UTILS_H
29 #define BP_RUN_UTILS_H
30 
31 #include <string>
32 #include <string_view>
33 #include <array>
34 #include <limits>
37 #include "RunEval/RunData.h"
38 
43 namespace beliefprop {
44 
46 template <typename T>
47 const T kHighValBp{std::numeric_limits<T>::max()}; //high value for type T (std::numeric_limits<T>::max()) to use if initializing to "high" value; explicitly specialized below for half-precision types
48 
//high value as used in kernel; currently hard-coded to 32000, which is below
//the maximum short value of 32767
//declared inline so that every translation unit including this header shares
//a single definition instead of getting its own internal-linkage copy
inline constexpr float kHighValBpKernel{32000.0f};
52 
53 #if defined(OPTIMIZED_CPU_RUN)
54 #if defined(FLOAT16_VECTORIZATION)
55 
56 //specialization of high value for half type (_Float16);
57 //65504 corresponds to the max value in float16
58 template<> inline
59 const _Float16 kHighValBp<_Float16>(65504);
60 
61 #endif //FLOAT16_VECTORIZATION
62 #endif //OPTIMIZED_CPU_RUN
63 
64 //define specialization for high value in half precision if using CUDA
65 #if defined(OPTIMIZED_CUDA_RUN)
66 
67 //set data type used for half-precision with CUDA
68 #if defined(USE_BFLOAT16_FOR_HALF_PRECISION)
69 #include <cuda_bf16.h>
70 //specialization of high value for CUDA bfloat16 type (__nv_bfloat16), set to CUDART_MAX_NORMAL_BF16
71 template<> inline
72 const __nv_bfloat16 kHighValBp<__nv_bfloat16>{CUDART_MAX_NORMAL_BF16};
73 #else
74 #include <cuda_fp16.h>
75 //specialization of high value for CUDA half type (used when bfloat16 is not selected for half precision), set to CUDART_MAX_NORMAL_FP16
76 template<> inline
77 const half kHighValBp<half>{CUDART_MAX_NORMAL_FP16};
78 #endif //USE_BFLOAT16_FOR_HALF_PRECISION
79 
80 #endif //OPTIMIZED_CUDA_RUN
81 
/**
 * @brief Get number of stereo runs to perform when evaluating the implementation.
 *
 * When FEWER_RUNS_PER_CONFIG is defined, always returns 3 and the argument is
 * ignored. Otherwise returns 7 if disparity_vals is greater than 100 and 15
 * if it is not, so fewer runs are performed for larger disparity counts.
 *
 * Usable in constant expressions.
 *
 * @param disparity_vals Disparity value count used to select the number of runs
 * @return Number of stereo runs to perform
 */
inline constexpr unsigned int NumBpStereoRuns([[maybe_unused]] unsigned int disparity_vals) noexcept {
#if defined(FEWER_RUNS_PER_CONFIG)
  //fewer runs if set to use limited parameters/fewer runs
  //for faster processing
  //(parameter is intentionally unused in this configuration)
  return 3;
#else
  //disparity count above which the reduced number of runs is used
  constexpr unsigned int kManyDisparityValsThreshold{100};
  //number of runs when disparity count is above the threshold
  constexpr unsigned int kRunsManyDisparityVals{7};
  //number of runs when disparity count is at or below the threshold
  constexpr unsigned int kRunsFewDisparityVals{15};

  return (disparity_vals > kManyDisparityValsThreshold) ?
    kRunsManyDisparityVals : kRunsFewDisparityVals;
#endif //FEWER_RUNS_PER_CONFIG
}
105 
//by default, optimized memory management and optimized indexing used
//See http://scottgg.net/OptimizingGlobalStereoMatchingOnNVIDIAGPUs.pdf for more info on these
//optimizations (note that the optimized indexing was present in the initial implementation)
//Can remove optimized memory management (making the processing more similar to the initial work)
//by setting kUseOptMemManagement to false
//Optimized indexing can be turned off by changing the kOptimizedIndexingSetting value to false
//(not recommended; this slows down processing)
//Constants are declared inline so that every translation unit including this header
//shares a single definition of each instead of getting its own internal-linkage copy
inline constexpr bool kUseOptMemManagement{true};
inline constexpr bool kOptimizedIndexingSetting{true};
//NOTE(review): name suggests memory for BP is allocated/freed outside of runs when true;
//confirm against where this setting is used
inline constexpr bool kAllocateFreeBpMemoryOutsideRuns{true};
116 
//constants for headers for run settings in evaluation
//declared inline so that every translation unit including this header shares
//a single definition of each instead of getting its own internal-linkage copy
inline constexpr std::string_view kMemAllocOptHeader{"Memory allocation of all BP data run at or before start of run"};
inline constexpr std::string_view kMemoryCoalescedBpDataHeader{"BP data arranged for memory coalescence"};
inline constexpr std::string_view kAllocateFreeMemOutsideRunsHeader{"Memory for BP allocated/freed outside of runs"};
121 
128  RunData curr_run_data;
129  curr_run_data.AddDataWHeader(
130  std::string(kMemAllocOptHeader),
132  curr_run_data.AddDataWHeader(
133  std::string(kMemoryCoalescedBpDataHeader),
135  curr_run_data.AddDataWHeader(
138  return curr_run_data;
139 }
140 
151 template <RunData_t T>
153  unsigned int x_val_data_start,
154  unsigned int simd_data_size,
155  unsigned int data_bytes_align_width,
156  unsigned int padded_width_data)
157 {
158  //assuming that the padded checkerboard width divides evenly by
159  //beliefprop::NUM_DATA_ALIGN_WIDTH (if that's not the case it's a bug)
160  return (((x_val_data_start % simd_data_size) == 0) &&
161  (padded_width_data % ((data_bytes_align_width / sizeof(T))) == 0));
162 }
163 
164 };
165 
166 #endif //BP_RUN_UTILS_H
Declares class to store headers with data corresponding to current program run and evaluation.
Contains namespace with constants and enums related to run environment and settings for run.
Define constraints for data type in processing.
Class to store headers with data corresponding to current program run and evaluation.
Definition: RunData.h:42
void AddDataWHeader(const std::string &header, const std::string &data)
Add string data with header describing added data.
Definition: RunData.cpp:49
Namespace for enums, constants, structures, and functions specific to belief propagation processing.
constexpr bool kOptimizedIndexingSetting
Definition: BpRunUtils.h:114
constexpr bool kAllocateFreeBpMemoryOutsideRuns
Definition: BpRunUtils.h:115
RunData RunSettings()
Retrieve run settings as a RunData object for output.
Definition: BpRunUtils.h:127
constexpr std::string_view kAllocateFreeMemOutsideRunsHeader
Definition: BpRunUtils.h:120
constexpr std::string_view kMemoryCoalescedBpDataHeader
Definition: BpRunUtils.h:119
constexpr bool kUseOptMemManagement
Definition: BpRunUtils.h:113
bool MemoryAlignedAtDataStart(unsigned int x_val_data_start, unsigned int simd_data_size, unsigned int data_bytes_align_width, unsigned int padded_width_data)
Inline function to check if data is aligned at x_val_data_start for SIMD loads/stores that require al...
Definition: BpRunUtils.h:152
const T kHighValBp
High value for type to use if initializing to "high" value.
Definition: BpRunUtils.h:47
unsigned int NumBpStereoRuns(unsigned int disparity_vals)
Get number of stereo runs when evaluating implementation Perform less stereo runs if greater number o...
Definition: BpRunUtils.h:91
constexpr std::string_view kMemAllocOptHeader
Definition: BpRunUtils.h:118
constexpr float kHighValBpKernel
High value as used in kernel; currently hard-coded to be below the maximum short value of 32767.
Definition: BpRunUtils.h:51