forked from uTensor/uTensor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquantization_utils.cpp
More file actions
44 lines (42 loc) · 2.08 KB
/
quantization_utils.cpp
File metadata and controls
44 lines (42 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#include "quantization_utils.hpp"
// Requantizes `count` 32-bit quantized codes into 8-bit quantized codes using
// 16.16 fixed-point integer arithmetic.
//
// The arithmetic below treats an input code of 0 as the midpoint of
// [min_input, max_input] and the 2^32 possible codes as spanning the width of
// that range. Each value is then re-expressed relative to
// [min_output, max_output], scaled so that the output range maps onto
// [0, 255], rounded, and clamped to [0, 255].
//
// Parameters:
//   input       - array of at least `count` 32-bit quantized codes.
//   count       - number of elements to convert; nothing is written when it
//                 is zero or negative.
//   min_input,
//   max_input   - real-valued range represented by the input codes.
//   min_output,
//   max_output  - real-valued range the 8-bit output codes should represent.
//   output      - array of at least `count` bytes receiving the result.
//
// If max_output == min_output every scale factor is forced to zero, so (for
// finite range arguments) every output element is written as 0.
//
// NOTE(review): `input_value * range_scale_fp` is a 64-bit product; an extreme
// input_range / output_range ratio could overflow it - confirm that callers
// keep the ratio moderate.
void RequantizeManyInNewRangeReference(const int* input, int32_t count,
                                       float min_input, float max_input,
                                       float min_output,
                                       float max_output,
                                       unsigned char* output) {
  // Initially we calculate all the constants we need once, before we go into
  // the inner loop. If this is updated, also update the Eigen version.
  const int fp_shift = 16;
  const float input_range = max_input - min_input;
  const float output_range = max_output - min_output;
  // 255 / output_range, or 0 when the output range is degenerate so that we
  // never divide by zero.
  const float recip_output_range =
      output_range == 0.0 ? 0.0 : (255.0 / output_range);
  // Real value represented by an input code of 0 (midpoint of the input range).
  const float input_rezero = (min_input + max_input) / 2.0;
  // Fixed-point multiplier applied to each input code; the product is shifted
  // right by 32 in the loop below.
  const int64_t range_scale_fp =
      output_range == 0.0 ? 0
                          : static_cast<int64_t>(255.0 * (1 << fp_shift) *
                                                 input_range / output_range);
  // Fixed-point output-scale position of the input midpoint.
  const int64_t input_offset_fp =
      static_cast<int64_t>(input_rezero * recip_output_range * (1 << fp_shift));
  // Fixed-point output-scale position of min_output; subtracting it makes
  // min_output map to code 0.
  const int64_t output_offset_fp =
      output_range == 0.0
          ? 0
          : static_cast<int64_t>((1 << fp_shift) * (min_output * 255.0) /
                                 output_range);
  // Half of one output step in fixed point, added before the final shift so
  // that the shift rounds to nearest instead of rounding down.
  const int64_t rounding_delta = 1 << (fp_shift - 1);

  // The clamp bounds are typed as int64_t on purpose: std::max/std::min need
  // both arguments to have the same type, and int64_t is `long` (not
  // `long long`) on LP64 platforms, so `0LL`/`255LL` literals do not compile
  // there.
  const int64_t lowest_quantized = 0;
  const int64_t highest_quantized = 255;

  // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
  // that could be easily adapted for a SIMD implementation. It should also be
  // possible to perform all the calculations in 32-bit rather than 64, but
  // that's not been implemented yet.
  for (int32_t index = 0; index < count; ++index) {
    const int64_t input_value = static_cast<int64_t>(input[index]);
    // Right shifts of negative intermediates rely on arithmetic
    // (sign-preserving) shift behaviour.
    const int64_t fp_value =
        ((input_value * range_scale_fp) >> 32) + input_offset_fp;
    const int64_t offset_intermediate = fp_value - output_offset_fp;
    const int64_t round_intermediate = offset_intermediate + rounding_delta;
    int64_t quantized_int64 = round_intermediate >> fp_shift;
    quantized_int64 = std::max(quantized_int64, lowest_quantized);
    quantized_int64 = std::min(quantized_int64, highest_quantized);
    output[index] =
        static_cast<unsigned char>(static_cast<int32_t>(quantized_int64));
  }
}