28 #ifndef NEONTEMPLATESPFUNCTS_H_
29 #define NEONTEMPLATESPFUNCTS_H_
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// double elements, returning a float64x2_t.
// NOTE(review): as visible here the definition is incomplete - the fragment
// below reads `current_bp_level.height_level_`, but no `current_bp_level`
// parameter appears in the parameter list, and the beginning of the
// expression that is closed by `)]);` is missing. Presumably lines were lost
// when this listing was extracted; confirm against the original header
// before modifying this function.
34 template<>
inline float64x2_t simd_processing::LoadPackedDataAligned<double, float64x2_t>(
35 unsigned int x,
unsigned int y,
unsigned int current_disparity,
37 unsigned int numDispVals,
const double* inData)
41 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// float elements, returning a float32x4_t.
// NOTE(review): as visible here the definition is incomplete - the fragment
// below reads `current_bp_level.height_level_`, but no `current_bp_level`
// parameter appears in the parameter list, and the beginning of the
// expression that is closed by `)]);` is missing. Presumably lines were lost
// when this listing was extracted; confirm against the original header
// before modifying this function.
44 template<>
inline float32x4_t simd_processing::LoadPackedDataAligned<float, float32x4_t>(
45 unsigned int x,
unsigned int y,
unsigned int current_disparity,
47 unsigned int numDispVals,
const float* inData)
51 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataAligned for
// float16_t elements, returning a float16x4_t.
// NOTE(review): only the signature is visible in this view; no function body
// follows the parameter list. Presumably the body was lost when this listing
// was extracted; confirm against the original header before modifying.
54 template<>
inline float16x4_t simd_processing::LoadPackedDataAligned<float16_t, float16x4_t>(
55 unsigned int x,
unsigned int y,
unsigned int current_disparity,
57 unsigned int numDispVals,
const float16_t* inData)
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// float elements, returning a float32x4_t.
// NOTE(review): as visible here the definition is incomplete - the fragment
// below reads `current_bp_level.height_level_`, but no `current_bp_level`
// parameter appears in the parameter list, and the beginning of the
// expression that is closed by `)]);` is missing. Presumably lines were lost
// when this listing was extracted; confirm against the original header
// before modifying this function.
65 template<>
inline float32x4_t simd_processing::LoadPackedDataUnaligned<float, float32x4_t>(
66 unsigned int x,
unsigned int y,
unsigned int current_disparity,
68 unsigned int numDispVals,
const float* inData)
72 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// float16_t elements, returning a float16x4_t.
// NOTE(review): as visible here the definition is incomplete - the fragment
// below reads `current_bp_level.height_level_`, but no `current_bp_level`
// parameter appears in the parameter list, and the beginning of the
// expression that is closed by `)]);` is missing. Presumably lines were lost
// when this listing was extracted; confirm against the original header
// before modifying this function.
75 template<>
inline float16x4_t simd_processing::LoadPackedDataUnaligned<float16_t, float16x4_t>(
76 unsigned int x,
unsigned int y,
unsigned int current_disparity,
78 unsigned int numDispVals,
const float16_t* inData)
82 current_bp_level.
height_level_, current_disparity, numDispVals)]);
// Explicit specialization of simd_processing::LoadPackedDataUnaligned for
// double elements, returning a float64x2_t.
// NOTE(review): as visible here the definition is incomplete - the fragment
// below reads `current_bp_level.height_level_`, but no `current_bp_level`
// parameter appears in the parameter list, and the beginning of the
// expression that is closed by `)]);` is missing. Presumably lines were lost
// when this listing was extracted; confirm against the original header
// before modifying this function.
85 template<>
inline float64x2_t simd_processing::LoadPackedDataUnaligned<double, float64x2_t>(
86 unsigned int x,
unsigned int y,
unsigned int current_disparity,
88 unsigned int numDispVals,
const double* inData)
92 current_bp_level.
height_level_, current_disparity, numDispVals)]);
95 template<>
inline float32x4_t simd_processing::createSIMDVectorSameData<float32x4_t>(
float data) {
96 return vdupq_n_f32(data);
99 template<>
inline float16x4_t simd_processing::createSIMDVectorSameData<float16x4_t>(
float data) {
100 return vcvt_f16_f32(createSIMDVectorSameData<float32x4_t>(data));
103 template<>
inline float64x2_t simd_processing::createSIMDVectorSameData<float64x2_t>(
float data) {
104 return vdupq_n_f64((
double)data);
107 template<>
inline float32x4_t simd_processing::AddVals<float32x4_t, float32x4_t, float32x4_t>(
108 const float32x4_t& val1,
const float32x4_t& val2)
110 return vaddq_f32(val1, val2);
113 template<>
inline float64x2_t simd_processing::AddVals<float64x2_t, float64x2_t, float64x2_t>(
114 const float64x2_t& val1,
const float64x2_t& val2)
116 return vaddq_f64(val1, val2);
119 template<>
inline float32x4_t simd_processing::AddVals<float32x4_t, float16x4_t, float32x4_t>(
120 const float32x4_t& val1,
const float16x4_t& val2)
122 return vaddq_f32(val1, vcvt_f32_f16(val2));
125 template<>
inline float32x4_t simd_processing::AddVals<float16x4_t, float32x4_t, float32x4_t>(
126 const float16x4_t& val1,
const float32x4_t& val2)
128 return vaddq_f32(vcvt_f32_f16(val1), val2);
131 template<>
inline float32x4_t simd_processing::AddVals<float16x4_t, float16x4_t, float32x4_t>(
132 const float16x4_t& val1,
const float16x4_t& val2)
134 return vaddq_f32(vcvt_f32_f16(val1), vcvt_f32_f16(val2));
137 template<>
inline float32x4_t simd_processing::SubtractVals<float32x4_t, float32x4_t, float32x4_t>(
138 const float32x4_t& val1,
const float32x4_t& val2)
140 return vsubq_f32(val1, val2);
143 template<>
inline float64x2_t simd_processing::SubtractVals<float64x2_t, float64x2_t, float64x2_t>(
144 const float64x2_t& val1,
const float64x2_t& val2)
146 return vsubq_f64(val1, val2);
149 template<>
inline float32x4_t simd_processing::divideVals<float32x4_t, float32x4_t, float32x4_t>(
150 const float32x4_t& val1,
const float32x4_t& val2)
152 return vdivq_f32(val1, val2);
155 template<>
inline float64x2_t simd_processing::divideVals<float64x2_t, float64x2_t, float64x2_t>(
156 const float64x2_t& val1,
const float64x2_t& val2)
158 return vdivq_f64(val1, val2);
161 template<>
inline float32x4_t simd_processing::ConvertValToDatatype<float32x4_t, float>(
float val) {
162 return vdupq_n_f32(val);
165 template<>
inline float64x2_t simd_processing::ConvertValToDatatype<float64x2_t, double>(
double val) {
166 return vdupq_n_f64(val);
169 template<>
inline float32x4_t simd_processing::GetMinByElement<float32x4_t>(
170 const float32x4_t& val1,
const float32x4_t& val2)
172 return vminnmq_f32(val1, val2);
175 template<>
inline float64x2_t simd_processing::GetMinByElement<float64x2_t>(
176 const float64x2_t& val1,
const float64x2_t& val2)
178 return vminnmq_f64(val1, val2);
181 template<>
inline void simd_processing::StorePackedDataAligned<float, float32x4_t>(
182 unsigned int indexDataStore,
float* locationDataStore,
const float32x4_t& dataToStore)
184 vst1q_f32(&locationDataStore[indexDataStore], dataToStore);
187 template<>
inline void simd_processing::StorePackedDataAligned<float16_t, float32x4_t>(
188 unsigned int indexDataStore, float16_t* locationDataStore,
const float32x4_t& dataToStore)
190 vst1_f16(&locationDataStore[indexDataStore], vcvt_f16_f32(dataToStore));
193 template<>
inline void simd_processing::StorePackedDataAligned<double, float64x2_t>(
194 unsigned int indexDataStore,
double* locationDataStore,
const float64x2_t& dataToStore)
196 vst1q_f64(&locationDataStore[indexDataStore], dataToStore);
199 template<>
inline void simd_processing::StorePackedDataUnaligned<float, float32x4_t>(
200 unsigned int indexDataStore,
float* locationDataStore,
const float32x4_t& dataToStore)
202 vst1q_f32(&locationDataStore[indexDataStore], dataToStore);
205 template<>
inline void simd_processing::StorePackedDataUnaligned<float16_t, float32x4_t>(
206 unsigned int indexDataStore, float16_t* locationDataStore,
const float32x4_t& dataToStore)
208 vst1_f16(&locationDataStore[indexDataStore], vcvt_f16_f32(dataToStore));
211 template<>
inline void simd_processing::StorePackedDataUnaligned<double, float64x2_t>(
212 unsigned int indexDataStore,
double* locationDataStore,
const float64x2_t& dataToStore)
214 vst1q_f64(&locationDataStore[indexDataStore], dataToStore);
ARCHITECTURE_ADDITION unsigned int RetrieveIndexInDataAndMessage(unsigned int x_val, unsigned int y_val, unsigned int width, unsigned int height, unsigned int current_disparity, unsigned int total_num_disp_vals, unsigned int offset_data=0u)
Retrieve the current 1-D index value of the given point at the given disparity in the data cost and m...
POD struct to store bp level data. Struct can be passed to global CUDA kernels so needs to take restr...
unsigned int height_level_
unsigned int padded_width_checkerboard_level_