Optimized Belief Propagation (CPU and GPU)
NEONTemplateSpFuncts.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2024 Scott Grauer-Gray
3 
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
28 #ifndef NEONTEMPLATESPFUNCTS_H_
29 #define NEONTEMPLATESPFUNCTS_H_
30 
31 //NEON only used when processing on an ARM CPU that supports NEON instructions
32 #include <arm_neon.h>
33 
34 template<> inline float64x2_t simd_processing::LoadPackedDataAligned<double, float64x2_t>(
35  unsigned int x, unsigned int y, unsigned int current_disparity,
36  const beliefprop::BpLevelProperties& current_bp_level,
37  unsigned int numDispVals, const double* inData)
38 {
39  return vld1q_f64(&inData[beliefprop::RetrieveIndexInDataAndMessage(
40  x, y, current_bp_level.padded_width_checkerboard_level_,
41  current_bp_level.height_level_, current_disparity, numDispVals)]);
42 }
43 
44 template<> inline float32x4_t simd_processing::LoadPackedDataAligned<float, float32x4_t>(
45  unsigned int x, unsigned int y, unsigned int current_disparity,
46  const beliefprop::BpLevelProperties& current_bp_level,
47  unsigned int numDispVals, const float* inData)
48 {
49  return vld1q_f32(&inData[beliefprop::RetrieveIndexInDataAndMessage(
50  x, y, current_bp_level.padded_width_checkerboard_level_,
51  current_bp_level.height_level_, current_disparity, numDispVals)]);
52 }
53 
54 template<> inline float16x4_t simd_processing::LoadPackedDataAligned<float16_t, float16x4_t>(
55  unsigned int x, unsigned int y, unsigned int current_disparity,
56  const beliefprop::BpLevelProperties& current_bp_level,
57  unsigned int numDispVals, const float16_t* inData)
58 {
59  return vld1_f16(&inData[beliefprop::RetrieveIndexInDataAndMessage(
60  x, y, current_bp_level.padded_width_checkerboard_level_,
61  current_bp_level.height_level_, current_disparity,
62  numDispVals)]);
63 }
64 
65 template<> inline float32x4_t simd_processing::LoadPackedDataUnaligned<float, float32x4_t>(
66  unsigned int x, unsigned int y, unsigned int current_disparity,
67  const beliefprop::BpLevelProperties& current_bp_level,
68  unsigned int numDispVals, const float* inData)
69 {
70  return vld1q_f32(&inData[beliefprop::RetrieveIndexInDataAndMessage(
71  x, y, current_bp_level.padded_width_checkerboard_level_,
72  current_bp_level.height_level_, current_disparity, numDispVals)]);
73 }
74 
75 template<> inline float16x4_t simd_processing::LoadPackedDataUnaligned<float16_t, float16x4_t>(
76  unsigned int x, unsigned int y, unsigned int current_disparity,
77  const beliefprop::BpLevelProperties& current_bp_level,
78  unsigned int numDispVals, const float16_t* inData)
79 {
80  return vld1_f16(&inData[beliefprop::RetrieveIndexInDataAndMessage(
81  x, y, current_bp_level.padded_width_checkerboard_level_,
82  current_bp_level.height_level_, current_disparity, numDispVals)]);
83 }
84 
85 template<> inline float64x2_t simd_processing::LoadPackedDataUnaligned<double, float64x2_t>(
86  unsigned int x, unsigned int y, unsigned int current_disparity,
87  const beliefprop::BpLevelProperties& current_bp_level,
88  unsigned int numDispVals, const double* inData)
89 {
90  return vld1q_f64(&inData[beliefprop::RetrieveIndexInDataAndMessage(
91  x, y, current_bp_level.padded_width_checkerboard_level_,
92  current_bp_level.height_level_, current_disparity, numDispVals)]);
93 }
94 
95 template<> inline float32x4_t simd_processing::createSIMDVectorSameData<float32x4_t>(float data) {
96  return vdupq_n_f32(data);
97 }
98 
99 template<> inline float16x4_t simd_processing::createSIMDVectorSameData<float16x4_t>(float data) {
100  return vcvt_f16_f32(createSIMDVectorSameData<float32x4_t>(data));
101 }
102 
103 template<> inline float64x2_t simd_processing::createSIMDVectorSameData<float64x2_t>(float data) {
104  return vdupq_n_f64((double)data);
105 }
106 
107 template<> inline float32x4_t simd_processing::AddVals<float32x4_t, float32x4_t, float32x4_t>(
108  const float32x4_t& val1, const float32x4_t& val2)
109 {
110  return vaddq_f32(val1, val2);
111 }
112 
113 template<> inline float64x2_t simd_processing::AddVals<float64x2_t, float64x2_t, float64x2_t>(
114  const float64x2_t& val1, const float64x2_t& val2)
115 {
116  return vaddq_f64(val1, val2);
117 }
118 
119 template<> inline float32x4_t simd_processing::AddVals<float32x4_t, float16x4_t, float32x4_t>(
120  const float32x4_t& val1, const float16x4_t& val2)
121 {
122  return vaddq_f32(val1, vcvt_f32_f16(val2));
123 }
124 
125 template<> inline float32x4_t simd_processing::AddVals<float16x4_t, float32x4_t, float32x4_t>(
126  const float16x4_t& val1, const float32x4_t& val2)
127 {
128  return vaddq_f32(vcvt_f32_f16(val1), val2);
129 }
130 
131 template<> inline float32x4_t simd_processing::AddVals<float16x4_t, float16x4_t, float32x4_t>(
132  const float16x4_t& val1, const float16x4_t& val2)
133 {
134  return vaddq_f32(vcvt_f32_f16(val1), vcvt_f32_f16(val2));
135 }
136 
137 template<> inline float32x4_t simd_processing::SubtractVals<float32x4_t, float32x4_t, float32x4_t>(
138  const float32x4_t& val1, const float32x4_t& val2)
139 {
140  return vsubq_f32(val1, val2);
141 }
142 
143 template<> inline float64x2_t simd_processing::SubtractVals<float64x2_t, float64x2_t, float64x2_t>(
144  const float64x2_t& val1, const float64x2_t& val2)
145 {
146  return vsubq_f64(val1, val2);
147 }
148 
149 template<> inline float32x4_t simd_processing::divideVals<float32x4_t, float32x4_t, float32x4_t>(
150  const float32x4_t& val1, const float32x4_t& val2)
151 {
152  return vdivq_f32(val1, val2);
153 }
154 
155 template<> inline float64x2_t simd_processing::divideVals<float64x2_t, float64x2_t, float64x2_t>(
156  const float64x2_t& val1, const float64x2_t& val2)
157 {
158  return vdivq_f64(val1, val2);
159 }
160 
161 template<> inline float32x4_t simd_processing::ConvertValToDatatype<float32x4_t, float>(float val) {
162  return vdupq_n_f32(val);
163 }
164 
165 template<> inline float64x2_t simd_processing::ConvertValToDatatype<float64x2_t, double>(double val) {
166  return vdupq_n_f64(val);
167 }
168 
169 template<> inline float32x4_t simd_processing::GetMinByElement<float32x4_t>(
170  const float32x4_t& val1, const float32x4_t& val2)
171 {
172  return vminnmq_f32(val1, val2);
173 }
174 
175 template<> inline float64x2_t simd_processing::GetMinByElement<float64x2_t>(
176  const float64x2_t& val1, const float64x2_t& val2)
177 {
178  return vminnmq_f64(val1, val2);
179 }
180 
181 template<> inline void simd_processing::StorePackedDataAligned<float, float32x4_t>(
182  unsigned int indexDataStore, float* locationDataStore, const float32x4_t& dataToStore)
183 {
184  vst1q_f32(&locationDataStore[indexDataStore], dataToStore);
185 }
186 
187 template<> inline void simd_processing::StorePackedDataAligned<float16_t, float32x4_t>(
188  unsigned int indexDataStore, float16_t* locationDataStore, const float32x4_t& dataToStore)
189 {
190  vst1_f16(&locationDataStore[indexDataStore], vcvt_f16_f32(dataToStore));
191 }
192 
193 template<> inline void simd_processing::StorePackedDataAligned<double, float64x2_t>(
194  unsigned int indexDataStore, double* locationDataStore, const float64x2_t& dataToStore)
195 {
196  vst1q_f64(&locationDataStore[indexDataStore], dataToStore);
197 }
198 
199 template<> inline void simd_processing::StorePackedDataUnaligned<float, float32x4_t>(
200  unsigned int indexDataStore, float* locationDataStore, const float32x4_t& dataToStore)
201 {
202  vst1q_f32(&locationDataStore[indexDataStore], dataToStore);
203 }
204 
205 template<> inline void simd_processing::StorePackedDataUnaligned<float16_t, float32x4_t>(
206  unsigned int indexDataStore, float16_t* locationDataStore, const float32x4_t& dataToStore)
207 {
208  vst1_f16(&locationDataStore[indexDataStore], vcvt_f16_f32(dataToStore));
209 }
210 
211 template<> inline void simd_processing::StorePackedDataUnaligned<double, float64x2_t>(
212  unsigned int indexDataStore, double* locationDataStore, const float64x2_t& dataToStore)
213 {
214  vst1q_f64(&locationDataStore[indexDataStore], dataToStore);
215 }
216 
217 #endif /* NEONTEMPLATESPFUNCTS_H_ */
ARCHITECTURE_ADDITION unsigned int RetrieveIndexInDataAndMessage(unsigned int x_val, unsigned int y_val, unsigned int width, unsigned int height, unsigned int current_disparity, unsigned int total_num_disp_vals, unsigned int offset_data=0u)
Retrieve the current 1-D index value of the given point at the given disparity in the data cost and m...
POD struct to store bp level data. Struct can be passed to global CUDA kernels so needs to take restr...
Definition: BpLevel.h:42
unsigned int height_level_
Definition: BpLevel.h:44
unsigned int padded_width_checkerboard_level_
Definition: BpLevel.h:47