Update qoa.h
This commit is contained in:
parent
6ebfc6023b
commit
925978ffde
1 changed files with 119 additions and 51 deletions
170
src/external/qoa.h
vendored
170
src/external/qoa.h
vendored
|
@ -8,71 +8,96 @@ QOA - The "Quite OK Audio" format for fast, lossy audio compression
|
||||||
|
|
||||||
-- Data Format
|
-- Data Format
|
||||||
|
|
||||||
A QOA file has an 8 byte file header, followed by a number of frames. Each frame
|
QOA encodes pulse-code modulated (PCM) audio data with up to 255 channels,
|
||||||
consists of an 8 byte frame header, the current 8 byte en-/decoder state per
|
sample rates from 1 up to 16777215 hertz and a bit depth of 16 bits.
|
||||||
channel and 256 slices per channel. Each slice is 8 bytes wide and encodes 20
|
|
||||||
samples of audio data.
|
|
||||||
|
|
||||||
Note that the last frame of a file may contain less than 256 slices per channel.
|
The compression method employed in QOA is lossy; it discards some information
|
||||||
The last slice (per channel) in the last frame may contain less 20 samples, but
|
from the uncompressed PCM data. For many types of audio signals this compression
|
||||||
the slice will still be 8 bytes wide, with the unused samples zeroed out.
|
is "transparent", i.e. the difference from the original file is often not
|
||||||
|
audible.
|
||||||
|
|
||||||
The samplerate and number of channels is only stated in the frame headers, but
|
QOA encodes 20 samples of 16 bit PCM data into slices of 64 bits. A single
|
||||||
not in the file header. A decoder may peek into the first frame of the file to
|
sample therefore requires 3.2 bits of storage space, resulting in a 5x
|
||||||
find these values.
|
compression (16 / 3.2).
|
||||||
|
|
||||||
In a valid QOA file all frames have the same number of channels and the same
|
A QOA file consists of an 8 byte file header, followed by a number of frames.
|
||||||
samplerate. These restrictions may be relaxed for streaming. This remains to
|
Each frame contains an 8 byte frame header, the current 16 byte en-/decoder
|
||||||
be decided.
|
state per channel and 256 slices per channel. Each slice is 8 bytes wide and
|
||||||
|
encodes 20 samples of audio data.
|
||||||
|
|
||||||
All values in a QOA file are BIG ENDIAN. Luckily, EVERYTHING in a QOA file,
|
All values, including the slices, are big endian. The file layout is as follows:
|
||||||
including the headers, is 64 bit aligned, so it's possible to read files with
|
|
||||||
just a read_u64() that does the byte swapping if necessary.
|
|
||||||
|
|
||||||
In pseudocode, the file layout is as follows:
|
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
struct {
|
struct {
|
||||||
char magic[4]; // magic bytes 'qoaf'
|
char magic[4]; // magic bytes "qoaf"
|
||||||
uint32_t samples; // number of samples per channel in this file
|
uint32_t samples; // samples per channel in this file
|
||||||
} file_header; // = 64 bits
|
} file_header;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
struct {
|
struct {
|
||||||
uint8_t num_channels; // number of channels
|
uint8_t num_channels; // no. of channels
|
||||||
uint24_t samplerate; // samplerate in hz
|
uint24_t samplerate; // samplerate in hz
|
||||||
uint16_t fsamples; // sample count per channel in this frame
|
uint16_t fsamples; // samples per channel in this frame
|
||||||
uint16_t fsize; // frame size (including the frame header)
|
uint16_t fsize; // frame size (includes this header)
|
||||||
} frame_header; // = 64 bits
|
} frame_header;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
int16_t history[4]; // = 64 bits
|
int16_t history[4]; // most recent last
|
||||||
int16_t weights[4]; // = 64 bits
|
int16_t weights[4]; // most recent last
|
||||||
} lms_state[num_channels];
|
} lms_state[num_channels];
|
||||||
|
|
||||||
qoa_slice_t slices[256][num_channels]; // = 64 bits each
|
qoa_slice_t slices[256][num_channels];
|
||||||
} frames[samples * channels / qoa_max_framesize()];
|
|
||||||
} qoa_file;
|
|
||||||
|
|
||||||
Wheras the 64bit qoa_slice_t is defined as follows:
|
} frames[ceil(samples / (256 * 20))];
|
||||||
|
} qoa_file_t;
|
||||||
|
|
||||||
|
Each `qoa_slice_t` contains a quantized scalefactor `sf_quant` and 20 quantized
|
||||||
|
residuals `qrNN`:
|
||||||
|
|
||||||
.- QOA_SLICE -- 64 bits, 20 samples --------------------------/ /------------.
|
.- QOA_SLICE -- 64 bits, 20 samples --------------------------/ /------------.
|
||||||
| Byte[0] | Byte[1] | Byte[2] \ \ Byte[7] |
|
| Byte[0] | Byte[1] | Byte[2] \ \ Byte[7] |
|
||||||
| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 / / 2 1 0 |
|
| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 / / 2 1 0 |
|
||||||
|------------+--------+--------+--------+---------+---------+-\ \--+---------|
|
|------------+--------+--------+--------+---------+---------+-\ \--+---------|
|
||||||
| sf_index | r00 | r01 | r02 | r03 | r04 | / / | r19 |
|
| sf_quant | qr00 | qr01 | qr02 | qr03 | qr04 | / / | qr19 |
|
||||||
`-------------------------------------------------------------\ \------------`
|
`-------------------------------------------------------------\ \------------`
|
||||||
|
|
||||||
`sf_index` defines the scalefactor to use for this slice as an index into the
|
Each frame except the last must contain exactly 256 slices per channel. The last
|
||||||
qoa_scalefactor_tab[16]
|
frame may contain between 1 .. 256 (inclusive) slices per channel. The last
|
||||||
|
slice (for each channel) in the last frame may contain less than 20 samples; the
|
||||||
|
slice still must be 8 bytes wide, with the unused samples zeroed out.
|
||||||
|
|
||||||
`r00`--`r19` are the residuals for the individual samples, divided by the
|
Channels are interleaved per slice. E.g. for 2 channel stereo:
|
||||||
scalefactor and quantized by the qoa_quant_tab[].
|
slice[0] = L, slice[1] = R, slice[2] = L, slice[3] = R ...
|
||||||
|
|
||||||
In the decoder, a prediction of the next sample is computed by multiplying the
|
A valid QOA file or stream must have at least one frame. Each frame must contain
|
||||||
state (the last four output samples) with the predictor. The residual from the
|
at least one channel and one sample with a samplerate between 1 .. 16777215
|
||||||
slice is then dequantized using the qoa_dequant_tab[] and added to the
|
(inclusive).
|
||||||
prediction. The result is clamped to int16 to form the final output sample.
|
|
||||||
|
If the total number of samples is not known by the encoder, the samples in the
|
||||||
|
file header may be set to 0x00000000 to indicate that the encoder is
|
||||||
|
"streaming". In a streaming context, the samplerate and number of channels may
|
||||||
|
differ from frame to frame. For static files (those with samples set to a
|
||||||
|
non-zero value), each frame must have the same number of channels and same
|
||||||
|
samplerate.
|
||||||
|
|
||||||
|
Note that this implementation of QOA only handles files with a known total
|
||||||
|
number of samples.
|
||||||
|
|
||||||
|
A decoder should support at least 8 channels. The channel layout for channel
|
||||||
|
counts 1 .. 8 is:
|
||||||
|
|
||||||
|
1. Mono
|
||||||
|
2. L, R
|
||||||
|
3. L, R, C
|
||||||
|
4. FL, FR, B/SL, B/SR
|
||||||
|
5. FL, FR, C, B/SL, B/SR
|
||||||
|
6. FL, FR, C, LFE, B/SL, B/SR
|
||||||
|
7. FL, FR, C, LFE, B, SL, SR
|
||||||
|
8. FL, FR, C, LFE, BL, BR, SL, SR
|
||||||
|
|
||||||
|
QOA predicts each audio sample based on the previously decoded ones using a
|
||||||
|
"Sign-Sign Least Mean Squares Filter" (LMS). This prediction plus the
|
||||||
|
dequantized residual forms the final output sample.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -158,7 +183,7 @@ the higher end. Note that the residual zero is identical to the lowest positive
|
||||||
value. This is mostly fine, since the qoa_div() function always rounds away
|
value. This is mostly fine, since the qoa_div() function always rounds away
|
||||||
from zero. */
|
from zero. */
|
||||||
|
|
||||||
static int qoa_quant_tab[17] = {
|
static const int qoa_quant_tab[17] = {
|
||||||
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
|
7, 7, 7, 5, 5, 3, 3, 1, /* -8..-1 */
|
||||||
0, /* 0 */
|
0, /* 0 */
|
||||||
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
|
0, 2, 2, 4, 4, 6, 6, 6 /* 1.. 8 */
|
||||||
|
@ -169,13 +194,13 @@ static int qoa_quant_tab[17] = {
|
||||||
less accurate at the higher end. In theory, the highest scalefactor that we
|
less accurate at the higher end. In theory, the highest scalefactor that we
|
||||||
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
|
would need to encode the highest 16bit residual is (2**16)/8 = 8192. However we
|
||||||
rely on the LMS filter to predict samples accurately enough that a maximum
|
rely on the LMS filter to predict samples accurately enough that a maximum
|
||||||
residual of one quarter of the 16 bit range is high sufficient. I.e. with the
|
residual of one quarter of the 16 bit range is sufficient. I.e. with the
|
||||||
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
|
scalefactor 2048 times the quant range of 8 we can encode residuals up to 2**14.
|
||||||
|
|
||||||
The scalefactor values are computed as:
|
The scalefactor values are computed as:
|
||||||
scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
|
scalefactor_tab[s] <- round(pow(s + 1, 2.75)) */
|
||||||
|
|
||||||
static int qoa_scalefactor_tab[16] = {
|
static const int qoa_scalefactor_tab[16] = {
|
||||||
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
|
1, 7, 21, 45, 84, 138, 211, 304, 421, 562, 731, 928, 1157, 1419, 1715, 2048
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -188,7 +213,7 @@ do this in .16 fixed point with integers, instead of floats.
|
||||||
The reciprocal_tab is computed as:
|
The reciprocal_tab is computed as:
|
||||||
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
|
reciprocal_tab[s] <- ((1<<16) + scalefactor_tab[s] - 1) / scalefactor_tab[s] */
|
||||||
|
|
||||||
static int qoa_reciprocal_tab[16] = {
|
static const int qoa_reciprocal_tab[16] = {
|
||||||
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
|
65536, 9363, 3121, 1457, 781, 475, 311, 216, 156, 117, 90, 71, 57, 47, 39, 32
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -200,9 +225,13 @@ Since qoa_div rounds away from the zero, the smallest entries are mapped to 3/4
|
||||||
instead of 1. The dequant_tab assumes the following dequantized values for each
|
instead of 1. The dequant_tab assumes the following dequantized values for each
|
||||||
of the quant_tab indices and is computed as:
|
of the quant_tab indices and is computed as:
|
||||||
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
|
float dqt[8] = {0.75, -0.75, 2.5, -2.5, 4.5, -4.5, 7, -7};
|
||||||
dequant_tab[s][q] <- round(scalefactor_tab[s] * dqt[q]) */
|
dequant_tab[s][q] <- round_ties_away_from_zero(scalefactor_tab[s] * dqt[q])
|
||||||
|
|
||||||
static int qoa_dequant_tab[16][8] = {
|
The rounding employed here is "to nearest, ties away from zero", i.e. positive
|
||||||
|
and negative values are treated symmetrically.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static const int qoa_dequant_tab[16][8] = {
|
||||||
{ 1, -1, 3, -3, 5, -5, 7, -7},
|
{ 1, -1, 3, -3, 5, -5, 7, -7},
|
||||||
{ 5, -5, 18, -18, 32, -32, 49, -49},
|
{ 5, -5, 18, -18, 32, -32, 49, -49},
|
||||||
{ 16, -16, 53, -53, 95, -95, 147, -147},
|
{ 16, -16, 53, -53, 95, -95, 147, -147},
|
||||||
|
@ -270,7 +299,21 @@ static inline int qoa_div(int v, int scalefactor) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int qoa_clamp(int v, int min, int max) {
|
static inline int qoa_clamp(int v, int min, int max) {
|
||||||
return (v < min) ? min : (v > max) ? max : v;
|
if (v < min) { return min; }
|
||||||
|
if (v > max) { return max; }
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* This specialized clamp function for the signed 16 bit range improves decode
|
||||||
|
performance quite a bit. The extra if() statement works nicely with the CPUs
|
||||||
|
branch prediction as this branch is rarely taken. */
|
||||||
|
|
||||||
|
static inline int qoa_clamp_s16(int v) {
|
||||||
|
if ((unsigned int)(v + 32768) > 65535) {
|
||||||
|
if (v < -32768) { return -32768; }
|
||||||
|
if (v > 32767) { return 32767; }
|
||||||
|
}
|
||||||
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
|
static inline qoa_uint64_t qoa_read_u64(const unsigned char *bytes, unsigned int *p) {
|
||||||
|
@ -312,6 +355,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
|
||||||
unsigned int p = 0;
|
unsigned int p = 0;
|
||||||
unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
|
unsigned int slices = (frame_len + QOA_SLICE_LEN - 1) / QOA_SLICE_LEN;
|
||||||
unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
|
unsigned int frame_size = QOA_FRAME_SIZE(channels, slices);
|
||||||
|
int prev_scalefactor[QOA_MAX_CHANNELS] = {0};
|
||||||
|
|
||||||
/* Write the frame header */
|
/* Write the frame header */
|
||||||
qoa_write_u64((
|
qoa_write_u64((
|
||||||
|
@ -321,8 +365,24 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
|
||||||
(qoa_uint64_t)frame_size
|
(qoa_uint64_t)frame_size
|
||||||
), bytes, &p);
|
), bytes, &p);
|
||||||
|
|
||||||
/* Write the current LMS state */
|
|
||||||
for (int c = 0; c < channels; c++) {
|
for (int c = 0; c < channels; c++) {
|
||||||
|
/* If the weights have grown too large, reset them to 0. This may happen
|
||||||
|
with certain high-frequency sounds. This is a last resort and will
|
||||||
|
introduce quite a bit of noise, but should at least prevent pops/clicks */
|
||||||
|
int weights_sum =
|
||||||
|
qoa->lms[c].weights[0] * qoa->lms[c].weights[0] +
|
||||||
|
qoa->lms[c].weights[1] * qoa->lms[c].weights[1] +
|
||||||
|
qoa->lms[c].weights[2] * qoa->lms[c].weights[2] +
|
||||||
|
qoa->lms[c].weights[3] * qoa->lms[c].weights[3];
|
||||||
|
if (weights_sum > 0x2fffffff) {
|
||||||
|
qoa->lms[c].weights[0] = 0;
|
||||||
|
qoa->lms[c].weights[1] = 0;
|
||||||
|
qoa->lms[c].weights[2] = 0;
|
||||||
|
qoa->lms[c].weights[3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the current LMS state */
|
||||||
qoa_uint64_t weights = 0;
|
qoa_uint64_t weights = 0;
|
||||||
qoa_uint64_t history = 0;
|
qoa_uint64_t history = 0;
|
||||||
for (int i = 0; i < QOA_LMS_LEN; i++) {
|
for (int i = 0; i < QOA_LMS_LEN; i++) {
|
||||||
|
@ -348,8 +408,13 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
|
||||||
qoa_uint64_t best_error = -1;
|
qoa_uint64_t best_error = -1;
|
||||||
qoa_uint64_t best_slice;
|
qoa_uint64_t best_slice;
|
||||||
qoa_lms_t best_lms;
|
qoa_lms_t best_lms;
|
||||||
|
int best_scalefactor;
|
||||||
|
|
||||||
for (int scalefactor = 0; scalefactor < 16; scalefactor++) {
|
for (int sfi = 0; sfi < 16; sfi++) {
|
||||||
|
/* There is a strong correlation between the scalefactors of
|
||||||
|
neighboring slices. As an optimization, start testing
|
||||||
|
the best scalefactor of the previous slice first. */
|
||||||
|
int scalefactor = (sfi + prev_scalefactor[c]) % 16;
|
||||||
|
|
||||||
/* We have to reset the LMS state to the last known good one
|
/* We have to reset the LMS state to the last known good one
|
||||||
before trying each scalefactor, as each pass updates the LMS
|
before trying each scalefactor, as each pass updates the LMS
|
||||||
|
@ -367,7 +432,7 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
|
||||||
int clamped = qoa_clamp(scaled, -8, 8);
|
int clamped = qoa_clamp(scaled, -8, 8);
|
||||||
int quantized = qoa_quant_tab[clamped + 8];
|
int quantized = qoa_quant_tab[clamped + 8];
|
||||||
int dequantized = qoa_dequant_tab[scalefactor][quantized];
|
int dequantized = qoa_dequant_tab[scalefactor][quantized];
|
||||||
int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
|
int reconstructed = qoa_clamp_s16(predicted + dequantized);
|
||||||
|
|
||||||
long long error = (sample - reconstructed);
|
long long error = (sample - reconstructed);
|
||||||
current_error += error * error;
|
current_error += error * error;
|
||||||
|
@ -383,9 +448,12 @@ unsigned int qoa_encode_frame(const short *sample_data, qoa_desc *qoa, unsigned
|
||||||
best_error = current_error;
|
best_error = current_error;
|
||||||
best_slice = slice;
|
best_slice = slice;
|
||||||
best_lms = lms;
|
best_lms = lms;
|
||||||
|
best_scalefactor = scalefactor;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prev_scalefactor[c] = best_scalefactor;
|
||||||
|
|
||||||
qoa->lms[c] = best_lms;
|
qoa->lms[c] = best_lms;
|
||||||
#ifdef QOA_RECORD_TOTAL_ERROR
|
#ifdef QOA_RECORD_TOTAL_ERROR
|
||||||
qoa->error += best_error;
|
qoa->error += best_error;
|
||||||
|
@ -553,7 +621,7 @@ unsigned int qoa_decode_frame(const unsigned char *bytes, unsigned int size, qoa
|
||||||
int predicted = qoa_lms_predict(&qoa->lms[c]);
|
int predicted = qoa_lms_predict(&qoa->lms[c]);
|
||||||
int quantized = (slice >> 57) & 0x7;
|
int quantized = (slice >> 57) & 0x7;
|
||||||
int dequantized = qoa_dequant_tab[scalefactor][quantized];
|
int dequantized = qoa_dequant_tab[scalefactor][quantized];
|
||||||
int reconstructed = qoa_clamp(predicted + dequantized, -32768, 32767);
|
int reconstructed = qoa_clamp_s16(predicted + dequantized);
|
||||||
|
|
||||||
sample_data[si] = reconstructed;
|
sample_data[si] = reconstructed;
|
||||||
slice <<= 3;
|
slice <<= 3;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue