Optimized Belief Propagation (CPU and GPU)
DriverBpStereoCPU.cpp
Go to the documentation of this file.
1 /*
2 Copyright (C) 2024 Scott Grauer-Gray
3 
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
29 #include <iostream>
30 #include <array>
37 
38 //enum to define the setting used to run the implementation; a value of this type
//is taken by runImp() as its impSetting parameter
//NOTE(review): the enumerator lines (original lines 40-42) are not present in this listing
39 enum class RunImpSetting {
43 };
44 
45 //run and evaluate the optimized CPU implementation for belief propagation
//argc, argv: command line arguments; argv[1], if present, is used as the run name
//            (otherwise the run name is "CurrentRun")
//impSetting: setting to run implementation with
//the set of vectorized implementations that are run is selected at compile time
//via CPU_VECTORIZATION_DEFINE
//NOTE(review): several statements in this listing are truncated or missing (e.g. the
//right-hand sides of some run_imp_settings assignments and the condition that opens the
//single-CPU simulation block), so the comments below describe only what is visible
54 void runImp(int argc, char** argv, RunImpSetting impSetting)
55 {
56  //initialize settings to run implementation and evaluation
57  run_environment::RunImpSettings run_imp_settings;
58 
59  //enable optimization of parallel parameters with setting to use the same parallel parameters for all kernels in run
60  //testing on i7-11800H has found that using different parallel parameters (corresponding to OpenMP thread counts)
61  //in different kernels in the optimized CPU implementation can increase runtime (may want to test on additional processors)
62  run_imp_settings.opt_parallel_params_setting =
64 
65  //set default parallel parameters and parallel parameters to benchmark when searching for optimal
66  //parallel parameters
67  run_imp_settings.p_params_default_alt_options =
70 
71  //set run name to first argument if it exists
72  //otherwise set to "CurrentRun"
73  run_imp_settings.run_name = (argc > 1) ? argv[1] : "CurrentRun";
74 
75  //adjust thread count to simulate single CPU on dual-CPU system
76  //currently only works as expected if environment variables are set before run such that threads are pinned to socket via
77  //"export OMP_PLACES="sockets"" and "export OMP_PROC_BIND=true" commands on command line
79  {
80  //adjust settings to simulate run on single CPU in two-CPU system, specifically set parallel thread count options so that
81  //maximum number of parallel threads is thread count of single CPU and set environment variables so that CPU threads
82  //are pinned to socket
83  //set default parallel threads count to be number of threads on a single CPU in the two-CPU system
 //(computed as half of std::thread::hardware_concurrency())
 //NOTE(review): std::thread::hardware_concurrency() may return 0 if the value is not computable,
 //in which case the default thread count set here would be 0 -- confirm that is handled
84  run_imp_settings.p_params_default_alt_options.first =
85  {std::thread::hardware_concurrency() / 2, 1};
86 
87  //erase parallel thread count options with more than the number of threads on a single CPU in the two-CPU system
88  run_imp_settings.RemoveParallelParamAboveMaxThreads(
89  std::thread::hardware_concurrency() / 2);
90 
91  //adjust settings so that CPU threads pinned to socket to simulate run on single CPU
92  //TODO: commented out since currently has no effect; needs to be set before run by
93  //calling "export OMP_PLACES="sockets"" and "export OMP_PROC_BIND=true" commands on
94  //command line before run
95  //run_environment::CPUThreadsPinnedToSocket().operator()(true);
96 
97  //append run name to specify that simulating single CPU on dual-CPU system
98  run_imp_settings.run_name += "_SimSingleCPUOnDualCPUSystem";
99  }
100  //check if running implementation with CPU threads pinned to socket (for cases with multiple CPUs)
101  //TODO: currently not supported due to code for setting CPU threads to be pinned to socket not working as expected
102  //can still run implementation with CPU threads pinned to socket by calling "export OMP_PLACES="sockets"" and
103  //"export OMP_PROC_BIND=true" commands on command line before run
104  /*else if (impSetting == RunImpSetting::kRunImpThreadsPinnedToSocket) {
105  //adjust settings so that CPU threads pinned to socket
106  //TODO: commented out since currently has no effect
107  //run_environment::CPUThreadsPinnedToSocket().operator()(true);
108 
109  //append run name to specify that CPU threads pinned to socket
110  run_imp_settings.run_name += "_ThreadsPinnedToSocket";
111  }*/
112 
113  //set datatype(s) to use in run processing in evaluation
114  run_imp_settings.datatypes_eval_sizes =
117 
118  //set whether or not to run and evaluate alternate optimized implementations
119  //in addition to the "fastest" optimized implementation available
121 
122  //set setting of whether or not to use templated loop iterations in implementation
123  //in evaluation runs
124  run_imp_settings.templated_iters_setting =
126 
127  //set path of baseline runtimes and baseline description
128  run_imp_settings.baseline_runtimes_path_desc =
131 
132  //set data subsets to evaluate separate from all data
134 
135  //remove any parallel processing below given minimum number of threads
136  run_imp_settings.RemoveParallelParamBelowMinThreads(
138 
139 //run belief propagation across multiple inputs and configurations,
140 //with specific vectorization configurations dependent on what's supported on
141 //the current CPU as defined at compile time
142 #if (CPU_VECTORIZATION_DEFINE == AVX_512_DEFINE)
143  //run belief propagation with AVX512, AVX256, and no vectorization implementations,
144  //with the AVX512 implementation given first as the expected fastest implementation
145  RunImpMultTypesAccels().operator()(
146  {{std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX512)},
147  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX256)},
148  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kNone)}},
149  run_imp_settings,
150  std::make_unique<EvaluateImpResultsBp>());
151 #elif (CPU_VECTORIZATION_DEFINE == AVX_512_F16_DEFINE)
152  //run belief propagation with AVX512 w/ f16 vectorization, AVX512 without f16
153  //vectorization, AVX256 w/ f16 vectorization, AVX256 without f16
154  //vectorization, and no vectorization implementations, with the AVX512 w/ f16
155  //implementation given first as the expected fastest implementation
156  RunImpMultTypesAccels().operator()(
157  {{std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX512_F16)},
158  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX512)},
159  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX256_F16)},
160  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX256)},
161  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kNone)}},
162  run_imp_settings,
163  std::make_unique<EvaluateImpResultsBp>());
164 #elif (CPU_VECTORIZATION_DEFINE == AVX_256_DEFINE)
165  //run belief propagation with AVX256 and no vectorization implementations,
166  //with the AVX256 implementation given first as the expected fastest implementation
167  RunImpMultTypesAccels().operator()(
168  {{std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kAVX256)},
169  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kNone)}},
170  run_imp_settings,
171  std::make_unique<EvaluateImpResultsBp>());
172 #elif (CPU_VECTORIZATION_DEFINE == NEON_DEFINE)
173  //run belief propagation with NEON and no vectorization implementations,
174  //with the NEON implementation given first as the expected fastest implementation
175  RunImpMultTypesAccels().operator()(
176  {{std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kNEON)},
177  {std::make_shared<RunImpMultInputsBp>(run_environment::AccSetting::kNone)}},
178  run_imp_settings,
179  std::make_unique<EvaluateImpResultsBp>());
180 #endif //CPU_VECTORIZATION_DEFINE
181 }
182 
//main() function that drives the optimized CPU belief propagation implementation evaluation
//argv[1] (optional) is used by runImp() as the run name; argv[2] (optional) is compared
//against run_cpu::kSimulateSingleCPU to select the single-CPU simulation branch
//always returns 0
//NOTE(review): the statement following the console output in each branch (original lines
//208 and 213) is not present in this listing; presumably each is the runImp() call with the
//corresponding RunImpSetting -- confirm against the actual source file
199 int main(int argc, char** argv)
200 {
201  //if running on a system with two cpus, run implementation with settings adjusted
202  //to simulate run on single CPU if specified in second command line argument
203  if ((argc > 2) && (std::string(argv[2]) == std::string(run_cpu::kSimulateSingleCPU))) {
204  std::cout << "Running optimized CPU implementation with settings adjusted such that "
205  "a single CPU is simulated on a dual-CPU system." << std::endl;
206  std::cout << "Results only as expected if running on dual-CPU system and "
207  "environment variables set so that threads pinned to CPU socket." << std::endl;
209  }
210  else {
211  //run default implementation
212  std::cout << "Running optimized CPU implementation" << std::endl;
214  }
215 
216  return 0;
217 }
Belief propagation implementation constants related to file processing.
Declares structure to store the belief propagation settings including the number of levels and iterat...
int main(int argc, char **argv)
Main() function that drives the optimized CPU belief propagation implementation evaluation across mult...
RunImpSetting
@ kRunImpThreadsPinnedToSocket
@ kRunImpSimSingleCPUTwoCPUSystem
void runImp(int argc, char **argv, RunImpSetting impSetting)
Run and evaluate optimized CPU implementation for belief propagation using input parameters from comm...
Declares child class of EvaluateImpResults that defines member functions for belief propagation evalu...
Contains namespace with CPU run defaults and constants.
Declares child class of RunImpMultInputs to run specified belief propagation implementation on a numb...
Declares class to run and evaluate implementation(s) of an algorithm using multiple settings includin...
Class to run and evaluate implementation(s) of an algorithm using multiple settings including differe...
constexpr std::string_view kBaselineRunDesc
constexpr std::string_view kBaselineRunDataPath
const std::vector< std::pair< std::string, std::vector< InputSignature > > > kEvalDataSubsets
Define subsets for evaluating run results on specified inputs. The first three stereo sets are labele...
constexpr std::string_view kSimulateSingleCPU
Constant that specifies that run is simulating single CPU on a dual-CPU system.
const std::array< unsigned int, 2 > kParallelParamsDefault
Default parallel parameters setting on CPU.
const unsigned int kMinNumThreadsRun
Minimum number of threads to allow for any parallel parameters setting on CPU.
const std::set< std::array< unsigned int, 2 > > kParallelParameterAltOptions
Parallel parameters options that are tested in order to find optimized configuration in run....
constexpr run_environment::TemplatedItersSetting kTemplatedItersEvalSettings
constexpr bool kRunAltOptimizedImps
constexpr std::array< size_t, 3 > kDataTypesEvalSizes
Structure that stores settings for current implementation run.
Definition: RunSettings.h:84
TemplatedItersSetting templated_iters_setting
Definition: RunSettings.h:86
std::optional< std::array< std::string_view, 2 > > baseline_runtimes_path_desc
Definition: RunSettings.h:93
std::vector< unsigned int > datatypes_eval_sizes
Definition: RunSettings.h:85
std::pair< std::array< unsigned int, 2 >, std::set< std::array< unsigned int, 2 > > > p_params_default_alt_options
Definition: RunSettings.h:89
OptParallelParamsSetting opt_parallel_params_setting
Definition: RunSettings.h:87
std::vector< std::pair< std::string, std::vector< InputSignature > > > subset_desc_input_sig
Definition: RunSettings.h:95
void RemoveParallelParamBelowMinThreads(unsigned int min_threads)
Remove parallel parameters with less than specified number of threads.
Definition: RunSettings.h:103
void RemoveParallelParamAboveMaxThreads(unsigned int max_threads)
Remove parallel parameters with greater than specified number of threads.
Definition: RunSettings.h:114