Floating-point representation
Source: CS:APP3e, Bryant and O’Hallaron, practice problem 2.97
Description
Compute (float) i
, that is, the bit-level representation of an integer
value cast to a float.
Notes
IEEE single-precision (32-bit) floating-point word format:
S EEEEEEEE 1.MMMMMMM MMMMMMMM MMMMMMMM
S
: sign bit fieldE
: 8-bit exponent field1
: implicit (not stored) 24th bit, assumed to be 1M
: 23-bit mantissa/fraction field
Solution
1
2
3
4
5
6
7
8
9
10
11
12
13
#include <assert.h>
#include <stdio.h>
// Bit-level representation of floating-point number.
typedef unsigned float_bits;
// I implemented three separate functions so I could test them separately.
// Obviously, everything could have been done inside `float_i2f` instead.
unsigned char sign_bit(int i);
unsigned exponent(int i);
unsigned fraction(int i);
float_bits float_i2f(int i);
sign_bit()
Given a signed integer, return an unsigned char representing the value of the sign bit in IEEE single-precision floating point format.
1
2
3
4
5
6
7
unsigned char sign_bit(int i) { return i < 0 ? 1 : 0; }
void test_sign_bit() {
assert(sign_bit( 0) == 0); // sign bit should be 0
assert(sign_bit( 1) == 0); // sign bit should be 0
assert(sign_bit(-1) == 1); // sign bit should be 1
}
exponent()
Given an integer, return an unsigned integer representing the 8-bit exponent in IEEE single-precision floating point format.
Shift i
to the right until i
represents a value of zero,
incrementing a counter if i
still represents a value greater
than zero after each shift. When i
represents a value of zero,
the value of the counter represents the exponent. (This is
equivalent to repeatedly dividing i
by 2.) Note: this returns
a value to which the bias of 127 specified by IEEE single-precision
floating-point format has yet to be added.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
unsigned exponent(int i) {
unsigned expt = 0;
while (i > 0) {
i >>= 1;
if (i > 0) expt++; // test before incrementing!
}
return expt;
}
// Ref: https://www.h-schmidt.net/FloatConverter/IEEE754.html
void test_exponent() {
assert(exponent( 0) == 0);
assert(exponent( 1) == 0);
assert(exponent( 2) == 1);
assert(exponent( 3) == 1);
assert(exponent( 4) == 2);
assert(exponent( 7) == 2);
assert(exponent( 8) == 3);
assert(exponent( 9) == 3);
assert(exponent(15) == 3);
assert(exponent(16) == 4);
assert(exponent(31) == 4);
assert(exponent(32) == 5);
assert(exponent(2048) == 11);
assert(exponent(16777215) == 23);
assert(exponent(16777216) == 24);
assert(exponent(33554432) == 25);
assert(exponent(268435440) == 27);
}
fraction()
Given a signed integer, return an unsigned integer representing the 24-bit fraction field in IEEE single-precision floating point format. Round to even when necessary. Note: this returns a 24-bit pattern, whose most significant bit must be masked out before it is used (the 24th bit is implicit, not stored).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
unsigned fraction(int i) {
unsigned expt = exponent(i);
unsigned fract;
// printf("i: 0x%x expt: %d ", i, expt + 127); // debug
if (expt <= 23) {
// `expt` represents the number of significant bits in the 32-bit
// integer `i`. To get the value that will represent the fraction
// in IEEE single-precision floating point, left-shift `i` by a
// value equivalent to the difference between 23 (the last
// possibly significant bit in a 24-bit fraction field) and `expt`.
// After the shift, the most significant bit of `i` is at the 24th
// place. For example, the 32-bit signed integer 0x3039 is encoded
// in binary as follows:
//
// 00000000 00000000 00110000 00111001
// ^ MSB
//
// For 0x3039, the value of `expt` is 13, so we left-shift
// 23 - 13 = 10 places to bring the MSB to position 24:
//
// 00000000 11000000 11100100 00000000
// ^ new MSB
//
fract = i << (23 - expt);
} else {
// Handle the case where `fract` has more than 24 significant bits
// and therefore cannot be fully represented using the 24 bits of
// the IEEE single-precision floating point fraction field. This
// requires that we round the portion of `fract` that WILL fit in
// a 24-bit field.
//
// To get the value that will represent the fraction in IEEE
// single-precision floating point, left-shift `i` by a
// value equivalent to the difference between 31 (the last
// possibly significant bit in a 32-bit fraction field) and `expt`.
// (We do not want to lose any information, so we use the entire
// 32-bit field of `i` to start with.) After the shift, the most
// significant bit of `i` is at the 31st place.
//
fract = i << (31 - expt);
// Get the rightmost 8 bits (bits 0-7) of `fract`. This is the
// residual part of the fraction, which will not fit into a
// 24-bit field. Do this by masking with 0xff, equivalent to
// 0b_0000_0000_0000_0000_0000_0000_1111_1111.
//
// |---- `fract`, derived from `i` -----|
// | 24-bit fraction field | residue
// ........ ........ ........ ........
// 31 24 23 16 15 8 7 0
//
unsigned residue = fract & 0xff;
// printf("residue: 0x%x ", residue); // debug
// Note: we perform this step BEFORE the rounding operation.
// Right-shift 8 places, discarding the residue and leaving only
// the part of the fraction that will fit in a 24-bit field. Note
// also: we are returning a 24-bit pattern, not the 23-bit pattern
// specified by IEEE single-precision floating-point format, so the
// 24th bit will have to be masked out later before using it.
//
// After the shift, it looks like this:
//
// | 24-bit fraction field |
// 00000000 ........ ........ ........
// 31 24 23 16 15 8 7 0
//
fract >>= 8;
// If the residual part of the fraction represents 0x80, equivalent
// to 0b_1000_0000 (representing exactly one-half of the previous
// bit's place value), we must round to even.
//
if (residue == 0x80) {
// printf("[round to even] "); // debug
// Get the value of the rightmost bit of `fract`. This is
// the least significant bit of the part of the fraction
// that *will* fit in a 24-bit field.
//
// |---- `fract`, derived from `i` -----|
// |-- 24-bit fraction field -| residue
// ........ ........ .......* ........
// 31 24 23 16 15 8 7 0
//
unsigned fract_lsb = fract & 1; // equivalent to `(fract >> 0) & 1`
// printf("fract_lsb: %d --> ", fract_lsb); // debug
// Round to even: if `fract_lsb` is 1, round up by adding 1
// to `fract`.
if (fract_lsb) {
// printf("round up "); // debug
// if (fract == 0xffffff) printf("OVERFLOW! "); // debug
fract += 1;
} else {
// printf("round down "); // debug
}
} else {
// If the residue is less than 0b_1000_000, we round down, so don't
// do anything here. If it is greater than 0b_1000_000, we round up
// by adding 1 to `fract`.
if (residue > 0x80) {
// printf("[round up] "); // debug
fract += 1;
} else {
// printf("[no rounding or round down] "); // debug
}
}
}
// printf("fract: 0x%x\n", fract & 0x7fffff); // debug
return fract;
}
// Ref: https://www.h-schmidt.net/FloatConverter/IEEE754.html
void test_fraction() {
// Exponents < 23, no rounding required.
assert((fraction( 0) & 0x7fffff) == 0x0);
assert((fraction( 1) & 0x7fffff) == 0x0);
assert((fraction( 2) & 0x7fffff) == 0x0);
assert((fraction( 3) & 0x7fffff) == 0x400000);
assert((fraction( 4) & 0x7fffff) == 0x0);
assert((fraction( 6) & 0x7fffff) == 0x400000);
assert((fraction( 7) & 0x7fffff) == 0x600000);
assert((fraction( 8) & 0x7fffff) == 0x0);
assert((fraction( 9) & 0x7fffff) == 0x100000);
assert((fraction(10) & 0x7fffff) == 0x200000);
assert((fraction(2048) & 0x7fffff)== 0x0);
// Exponents > 24, requiring rounding.
assert((fraction(0xFFFFFF0) & 0x7fffff) == 0x7FFFFF);
assert((fraction(0x08000001) & 0x7fffff) == 0x0);
assert((fraction(0x080000F9) & 0x7fffff) == 0x10);
assert((fraction(0x080000F8) & 0x7fffff) == 0x10);
assert((fraction(0x080000E8) & 0x7fffff) == 0xE);
assert((fraction(0x01000003) & 0x7fffff) == 0x2);
assert((fraction(0x0FFFFFF8) & 0x7fffff) == 0x0);
}
float_i2f()
Compute (float) i
, that is, the bit-level representation
of an integer value cast to a float.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
float_bits float_i2f(int i) {
// Special cases.
if (i == 0) return 0x0;
if (i == -0x80000000) return 0xCF000000; // -2^31
unsigned sbit = sign_bit(i);
// If `i` is negative, get its absolute value.
if (sbit) i = -i;
unsigned expt = exponent(i);
unsigned fract = fraction(i);
// Another special case: if the 24-bit fraction was 0xffffff
// (0b_1111_1111_1111_1111_1111_1111), it will have overflowed the
// 24-bit fraction field if 1 was added to the fraction to
// round it. So we test for that and increment the exponent here.
//
if (fract == 0xffffff + 1) expt += 1;
// IEEE floating point specifies a bias of 127 for the exponent.
expt += 127;
// Fields we need:
// S EEEEEEEE 1.MMMMMMM MMMMMMMM MMMMMMMM
sbit <<= 31;
expt <<= 23;
fract &= 0x7fffff; // the 24th bit is implicit, so we don't need it
return sbit | expt | fract;
}
// Ref: https://www.h-schmidt.net/FloatConverter/IEEE754.html
void test_float_i2f() {
// Special cases.
assert(float_i2f(0) == 0);
assert(float_i2f(-0x80000000) == 0xCF000000);
// Representable in 24 bits.
assert(float_i2f(1) == 0x3f800000);
assert(float_i2f(-1) == 0xBF800000);
assert(float_i2f(2) == 0x40000000);
assert(float_i2f(3) == 0x40400000);
assert(float_i2f(4) == 0x40800000);
assert(float_i2f(7) == 0x40e00000);
assert(float_i2f(8) == 0x41000000);
assert(float_i2f(9) == 0x41100000);
assert(float_i2f(15) == 0x41700000);
assert(float_i2f(16) == 0x41800000);
assert(float_i2f(31) == 0x41f80000);
assert(float_i2f(32) == 0x42000000);
assert(float_i2f(64) == 0x42800000);
// Upper bound of what is representable in 24 bits.
// Upper bound with exponent of 23:
assert(float_i2f(16777215) == 0x4b7fffff);
// Upper bound of what is representable: 16777216 == 1 * 2^24
assert(float_i2f(16777216) == 0x4b800000);
// Rounding.
assert(float_i2f(0x08000001) == 0x4D000000);
assert(float_i2f(0x080000F9) == 0x4D000010);
assert(float_i2f(0x080000F8) == 0x4D000010);
assert(float_i2f(0x080000E8) == 0x4D00000E);
assert(float_i2f(0x0ABCD130) == 0x4D2BCD13);
assert(float_i2f(0x01000003) == 0x4B800002);
// Overflows 24-bit fraction field.
assert(float_i2f(0x0FFFFFF8) == 0x4D800000);
}
Testing
1
2
3
4
5
6
7
8
9
10
11
12
int main() {
// System data types. int and float should be 32 bits.
assert(sizeof(int) == 4);
assert(sizeof(float) == 4);
test_sign_bit();
test_exponent();
test_fraction();
test_float_i2f();
return 0;
}