[Sound-open-firmware] [PATCH 1/6] SRC: Files structure change and add Xtensa optimized versions
Seppo Ingalsuo
seppo.ingalsuo at linux.intel.com
Thu Mar 8 15:22:17 CET 2018
This patch moves the generic common code from src_core.c/h to src.c/h and
places the generic C filter implementation in src_generic.c. The HiFi EP
version is in src_hifi2ep.c and the HiFi3 version is in src_hifi3.c. Use of
the Xtensa optimized versions requires the xt-xcc compiler.
The unused SRC in/out rates query code is removed. The 24-bit
coefficients were replaced by 32-bit coefficients, which are compatible
with the Xtensa fractional integer types.
Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
---
src/audio/Makefile.am | 4 +-
src/audio/src.c | 346 ++++++++++++++++++--
src/audio/{src_core.h => src.h} | 17 +-
src/audio/src_config.h | 57 +++-
src/audio/src_core.c | 676 ----------------------------------------
src/audio/src_generic.c | 435 ++++++++++++++++++++++++++
src/audio/src_hifi2ep.c | 562 +++++++++++++++++++++++++++++++++
src/audio/src_hifi3.c | 567 +++++++++++++++++++++++++++++++++
8 files changed, 1938 insertions(+), 726 deletions(-)
rename src/audio/{src_core.h => src.h} (93%)
delete mode 100644 src/audio/src_core.c
create mode 100644 src/audio/src_generic.c
create mode 100644 src/audio/src_hifi2ep.c
create mode 100644 src/audio/src_hifi3.c
diff --git a/src/audio/Makefile.am b/src/audio/Makefile.am
index bccedbf..ae58289 100644
--- a/src/audio/Makefile.am
+++ b/src/audio/Makefile.am
@@ -1006,7 +1006,9 @@ libaudio_a_SOURCES = \
fir.c \
tone.c \
src.c \
- src_core.c \
+ src_generic.c \
+ src_hifi2ep.c \
+ src_hifi3.c \
mixer.c \
mux.c \
volume.c \
diff --git a/src/audio/src.c b/src/audio/src.c
index c7ac649..cca0cbc 100644
--- a/src/audio/src.c
+++ b/src/audio/src.c
@@ -43,7 +43,17 @@
#include <reef/audio/component.h>
#include <reef/audio/pipeline.h>
#include <uapi/ipc.h>
-#include "src_core.h"
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_SHORT
+#include <reef/audio/coefficients/src/src_tiny_int16_define.h>
+#include <reef/audio/coefficients/src/src_tiny_int16_table.h>
+#else
+#include <reef/audio/coefficients/src/src_std_int32_define.h>
+#include <reef/audio/coefficients/src/src_std_int32_table.h>
+#endif
#ifdef MODULE_TEST
#include <stdio.h>
@@ -53,6 +63,10 @@
#define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e)
#define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e)
+/* The FIR maximum lengths are per channel so need to multiply them */
+#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE)
+#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE)
+
/* src component private data */
struct comp_data {
struct polyphase_src src;
@@ -63,14 +77,273 @@ struct comp_data {
int32_t *sbuf_w_ptr;
int32_t *sbuf_r_ptr;
int sbuf_avail;
- void (* src_func)(struct comp_dev *dev,
+ void (*src_func)(struct comp_dev *dev,
struct comp_buffer *source,
struct comp_buffer *sink,
size_t *consumed,
size_t *produced);
- void (* polyphase_func)(struct src_stage_prm *s);
+ void (*polyphase_func)(struct src_stage_prm *s);
};
+/* Integer division of a by b where the truncated quotient is bumped by
+ * one whenever quotient * b falls short of a.
+ */
+int src_ceil_divide(int a, int b)
+{
+	int quotient = a / b;
+
+	return (quotient * b < a) ? quotient + 1 : quotient;
+}
+
+/* FIR input delay line length of a stage. The value is for one channel,
+ * src_buffer_lengths() scales it by the channel count.
+ */
+static int src_fir_delay_length(struct src_stage *s)
+{
+	int subfilter_span = (s->num_of_subfilters - 1) * s->idm;
+
+	return s->subfilter_length + subfilter_span + s->blk_in;
+}
+
+/* FIR output delay line length of a stage. The value is for one channel,
+ * src_buffer_lengths() scales it by the channel count.
+ */
+static int src_out_delay_length(struct src_stage *s)
+{
+	int subfilter_span = (s->num_of_subfilters - 1) * s->odm;
+
+	return 1 + subfilter_span;
+}
+
+/* Search fs_list for the sample rate fs. Returns the index of the first
+ * match, or -EINVAL when the rate is not in the list.
+ */
+static int src_find_fs(int fs_list[], int list_length, int fs)
+{
+	int idx = 0;
+
+	while (idx < list_length) {
+		if (fs_list[idx] == fs)
+			return idx;
+		idx++;
+	}
+
+	return -EINVAL;
+}
+
+/* Calculates buffers to allocate for a SRC mode
+ *
+ * a: receives the rate table indices, the block sizes, the stage run
+ *    counts and the delay line / intermediate buffer lengths
+ * fs_in, fs_out: input and output sample rates, both must be present in
+ *    the SRC rate lists
+ * nch: channel count, 1 .. PLATFORM_MAX_CHANNELS
+ * frames: period length in frames
+ * frames_is_for_source: non-zero when frames describes the source period,
+ *    zero when it describes the sink period
+ *
+ * Returns 0 on success and -EINVAL for an unsupported channel count or
+ * in/out rate combination.
+ */
+int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
+		       int frames, int frames_is_for_source)
+{
+	struct src_stage *stage1;
+	struct src_stage *stage2;
+	int q;
+	int den;
+	int num;
+	int frames2;
+
+	/* A zero or negative channel count would give empty or negative
+	 * buffer lengths below, so reject it like a too large count.
+	 */
+	if (nch < 1 || nch > PLATFORM_MAX_CHANNELS) {
+		trace_src_error("che");
+		tracev_value(nch);
+		return -EINVAL;
+	}
+
+	a->nch = nch;
+	a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in);
+	a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out);
+
+	/* Check that both in and out rates are supported */
+	if (a->idx_in < 0 || a->idx_out < 0) {
+		trace_src_error("us1");
+		tracev_value(fs_in);
+		tracev_value(fs_out);
+		return -EINVAL;
+	}
+
+	stage1 = src_table1[a->idx_out][a->idx_in];
+	stage2 = src_table2[a->idx_out][a->idx_in];
+
+	/* Check from stage1 parameter for a deleted in/out rate combination.*/
+	if (stage1->filter_length < 1) {
+		trace_src_error("us2");
+		tracev_value(fs_in);
+		tracev_value(fs_out);
+		return -EINVAL;
+	}
+
+	a->fir_s1 = nch * src_fir_delay_length(stage1);
+	a->out_s1 = nch * src_out_delay_length(stage1);
+
+	/* Find out how many additional times the SRC can be executed
+	 * while having block size less or equal to max_frames.
+	 */
+	if (frames_is_for_source) {
+		/* Times that stage1 needs to run to input length of frames */
+		a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in);
+		q = frames / stage1->blk_in;
+		a->stage1_times = MAX(q, 1);
+		a->blk_in = a->stage1_times * stage1->blk_in;
+
+		/* Times that stage2 needs to run. frames2 is the frame count
+		 * after scaling by the out/in block ratio of both stages.
+		 */
+		den = stage2->blk_in * stage1->blk_in;
+		num = frames * stage2->blk_out * stage1->blk_out;
+		frames2 = src_ceil_divide(num, den);
+		a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out);
+		q = frames2 / stage2->blk_out;
+		a->stage2_times = MAX(q, 1);
+		a->blk_out = a->stage2_times * stage2->blk_out;
+	} else {
+		/* Times that stage2 needs to run to output length of frames */
+		a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out);
+		q = frames / stage2->blk_out;
+		a->stage2_times = MAX(q, 1);
+		a->blk_out = a->stage2_times * stage2->blk_out;
+
+		/* Times that stage1 needs to run. frames2 is the frame count
+		 * after scaling by the in/out block ratio of both stages.
+		 */
+		num = frames * stage2->blk_in * stage1->blk_in;
+		den = stage2->blk_out * stage1->blk_out;
+		frames2 = src_ceil_divide(num, den);
+		a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in);
+		q = frames2 / stage1->blk_in;
+		a->stage1_times = MAX(q, 1);
+		a->blk_in = a->stage1_times * stage1->blk_in;
+	}
+
+	if (stage2->filter_length == 1) {
+		/* Second stage filter length of one: no second stage state
+		 * and no intermediate buffer are reserved. This overrides
+		 * the stage2 run counts computed above.
+		 */
+		a->fir_s2 = 0;
+		a->out_s2 = 0;
+		a->stage2_times = 0;
+		a->stage2_times_max = 0;
+		a->sbuf_length = 0;
+	} else {
+		a->fir_s2 = nch * src_fir_delay_length(stage2);
+		a->out_s2 = nch * src_out_delay_length(stage2);
+		/* 2x is an empirically tested length. Since the sink buffer
+		 * capability to receive samples varies a shorter stage 2 output
+		 * block will create a peak in internal buffer usage.
+		 */
+
+		/* TODO 1: Equation for needed length */
+		a->sbuf_length = 2 * nch * stage1->blk_out
+			* a->stage1_times_max;
+	}
+
+	a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
+	a->total = a->sbuf_length + a->src_multich;
+
+	return 0;
+}
+
+/* Return a stage state to the unconfigured condition: zero delay line
+ * sizes and no delay line memory attached. The delay memory itself is not
+ * freed here.
+ */
+static void src_state_reset(struct src_state *state)
+{
+	state->fir_delay_size = 0;
+	state->out_delay_size = 0;
+
+	/* The write/read positions are pointers, so drop them together with
+	 * the delay line pointers to avoid stale addresses after a reset.
+	 */
+	state->fir_delay = NULL;
+	state->out_delay = NULL;
+	state->fir_wp = NULL;
+	state->out_rp = NULL;
+}
+
+/* Detach all delay line pointers of a stage state */
+static void src_state_clear_delay(struct src_state *state)
+{
+	state->fir_delay = NULL;
+	state->out_delay = NULL;
+	state->fir_wp = NULL;
+	state->out_rp = NULL;
+}
+
+/* Attach the stage descriptors to src and place the delay lines into the
+ * memory starting at delay_lines_start. The layout is stage 1 FIR delay,
+ * stage 1 output delay, then for n > 1 stage 2 FIR delay and stage 2
+ * output delay, with the sizes taken from p.
+ *
+ * Returns 0 on success. Returns -EINVAL when a single stage setup has a
+ * zero output block, when a subfilter length is not a multiple of 4, or
+ * when a delay line size exceeds the per platform maximum. In the last
+ * case all delay line pointers are cleared.
+ */
+static int init_stages(struct src_stage *stage1, struct src_stage *stage2,
+		       struct polyphase_src *src, struct src_param *p,
+		       int n, int32_t *delay_lines_start)
+{
+	/* Clear FIR state */
+	src_state_reset(&src->state1);
+	src_state_reset(&src->state2);
+
+	src->number_of_stages = n;
+	src->stage1 = stage1;
+	src->stage2 = stage2;
+	if (n == 1 && stage1->blk_out == 0)
+		return -EINVAL;
+
+	/* Optimized SRC requires subfilter length multiple of 4 */
+	if (stage1->filter_length > 1 && (stage1->subfilter_length & 0x3) > 0)
+		return -EINVAL;
+
+	if (stage2->filter_length > 1 && (stage2->subfilter_length & 0x3) > 0)
+		return -EINVAL;
+
+	/* Delay line sizes */
+	src->state1.fir_delay_size = p->fir_s1;
+	src->state1.out_delay_size = p->out_s1;
+	src->state1.fir_delay = delay_lines_start;
+	src->state1.out_delay =
+		src->state1.fir_delay + src->state1.fir_delay_size;
+	/* Initialize to last ensures that circular wrap cannot happen
+	 * mid-frame. The size is multiple of channels count.
+	 */
+	src->state1.fir_wp = &src->state1.fir_delay[p->fir_s1 - 1];
+	src->state1.out_rp = src->state1.out_delay;
+	if (n > 1) {
+		src->state2.fir_delay_size = p->fir_s2;
+		src->state2.out_delay_size = p->out_s2;
+		src->state2.fir_delay =
+			src->state1.out_delay + src->state1.out_delay_size;
+		src->state2.out_delay =
+			src->state2.fir_delay + src->state2.fir_delay_size;
+		/* Initialize to last ensures that circular wrap cannot happen
+		 * mid-frame. The size is multiple of channels count.
+		 */
+		src->state2.fir_wp = &src->state2.fir_delay[p->fir_s2 - 1];
+		src->state2.out_rp = src->state2.out_delay;
+	} else {
+		src->state2.fir_delay_size = 0;
+		src->state2.out_delay_size = 0;
+		src_state_clear_delay(&src->state2);
+	}
+
+	/* Check the sizes are less than MAX */
+	if (src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH ||
+	    src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH ||
+	    src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH ||
+	    src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH) {
+		/* Clear the write/read positions too, otherwise they keep
+		 * pointing into delay memory that is no longer attached.
+		 */
+		src_state_clear_delay(&src->state1);
+		src_state_clear_delay(&src->state2);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Put the polyphase SRC into the unconfigured condition: both stage
+ * states reset, no stage descriptors attached and zero stages.
+ */
+void src_polyphase_reset(struct polyphase_src *src)
+{
+	src_state_reset(&src->state1);
+	src_state_reset(&src->state2);
+	src->stage2 = NULL;
+	src->stage1 = NULL;
+	src->number_of_stages = 0;
+}
+
+/* Set up the polyphase SRC for the in/out rate pair selected in p.
+ *
+ * src: converter to initialize
+ * p: parameters filled by src_buffer_lengths(), idx_in and idx_out select
+ *    the conversion from the stage tables
+ * delay_lines_start: start of the memory used for the delay lines
+ *
+ * Returns the number of stages to compute: 2 or 1 for a filtered
+ * conversion, 0 when input and output rate are the same and a simple copy
+ * can be used instead of a one tap FIR. Returns -EINVAL for out of range
+ * indices, for a failed init_stages() and for a deleted in/out mode.
+ */
+int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
+		       int32_t *delay_lines_start)
+{
+	struct src_stage *stage1;
+	struct src_stage *stage2;
+
+	/* The indices address the rate lists and stage tables, so reject
+	 * anything outside of them before the lookup.
+	 */
+	if (p->idx_in < 0 || p->idx_in >= NUM_IN_FS ||
+	    p->idx_out < 0 || p->idx_out >= NUM_OUT_FS)
+		return -EINVAL;
+
+	/* Get setup for 2 stage conversion */
+	stage1 = src_table1[p->idx_out][p->idx_in];
+	stage2 = src_table2[p->idx_out][p->idx_in];
+	if (init_stages(stage1, stage2, src, p, 2, delay_lines_start) < 0)
+		return -EINVAL;
+
+	/* If filter length for first stage is zero this is a deleted
+	 * mode from in/out matrix. Computing of such SRC mode needs
+	 * to be prevented.
+	 */
+	if (src->stage1->filter_length == 0)
+		return -EINVAL;
+
+	/* Compare the rates and not the indices: the input and output rate
+	 * lists are separate, so equal indices need not mean equal rates.
+	 */
+	if (src_in_fs[p->idx_in] == src_out_fs[p->idx_out])
+		return 0;
+
+	/* 2nd stage length is one if conversion needs only one stage */
+	return (src->stage2->filter_length == 1) ? 1 : 2;
+}
+
/* Fallback function */
static void src_fallback(struct comp_dev *dev, struct comp_buffer *source,
struct comp_buffer *sink, size_t *bytes_read, size_t *bytes_written)
@@ -91,8 +364,9 @@ static void src_2s_s32_default(struct comp_dev *dev,
int s2_blk_in;
int s2_blk_out;
struct comp_data *cd = comp_get_drvdata(dev);
- int32_t *dest = (int32_t *) sink->w_ptr;
- int32_t *src = (int32_t *) source->r_ptr;
+ int32_t *dest = (int32_t *)sink->w_ptr;
+ int32_t *src = (int32_t *)source->r_ptr;
+ int32_t *sbuf_addr = cd->delay_lines;
int32_t *sbuf_end_addr = &cd->delay_lines[cd->param.sbuf_length];
int32_t sbuf_size = cd->param.sbuf_length * sizeof(int32_t);
int nch = dev->params.channels;
@@ -107,6 +381,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
s1.x_end_addr = source->end_addr;
s1.x_size = source->size;
+ s1.y_addr = sbuf_addr;
s1.y_end_addr = sbuf_end_addr;
s1.y_size = sbuf_size;
s1.state = &cd->src.state1;
@@ -117,6 +392,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
s2.x_end_addr = sbuf_end_addr;
s2.x_size = sbuf_size;
+ s2.y_addr = sink->addr;
s2.y_end_addr = sink->end_addr;
s2.y_size = sink->size;
s2.state = &cd->src.state2;
@@ -125,14 +401,13 @@ static void src_2s_s32_default(struct comp_dev *dev,
s2.y_wptr = dest;
s2.nch = nch;
-
/* Test if 1st stage can be run with default block length to reach
* the period length or just under it.
*/
s1.times = cd->param.stage1_times;
s1_blk_in = s1.times * cd->src.stage1->blk_in * nch;
s1_blk_out = s1.times * cd->src.stage1->blk_out * nch;
- if ((avail_b >= s1_blk_in * sz) && (sbuf_free >= s1_blk_out)) {
+ if (avail_b >= s1_blk_in * sz && sbuf_free >= s1_blk_out) {
cd->polyphase_func(&s1);
cd->sbuf_w_ptr = s1.y_wptr;
@@ -147,8 +422,9 @@ static void src_2s_s32_default(struct comp_dev *dev,
s1.times = 1;
s1_blk_in = cd->src.stage1->blk_in * nch;
s1_blk_out = cd->src.stage1->blk_out * nch;
- while ((n1 < cd->param.stage1_times_max) && (avail_b >= s1_blk_in * sz)
- && (sbuf_free >= s1_blk_out)) {
+ while (n1 < cd->param.stage1_times_max &&
+ avail_b >= s1_blk_in * sz &&
+ sbuf_free >= s1_blk_out) {
cd->polyphase_func(&s1);
cd->sbuf_w_ptr = s1.y_wptr;
@@ -163,7 +439,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
s2.times = cd->param.stage2_times;
s2_blk_in = s2.times * cd->src.stage2->blk_in * nch;
s2_blk_out = s2.times * cd->src.stage2->blk_out * nch;
- if ((cd->sbuf_avail >= s2_blk_in) && (free_b >= s2_blk_out * sz)) {
+ if (cd->sbuf_avail >= s2_blk_in && free_b >= s2_blk_out * sz) {
cd->polyphase_func(&s2);
cd->sbuf_r_ptr = s2.x_rptr;
@@ -173,14 +449,13 @@ static void src_2s_s32_default(struct comp_dev *dev,
n2 = s2.times;
}
-
/* Run one block at time the remaining 2nd stage output */
s2.times = 1;
s2_blk_in = cd->src.stage2->blk_in * nch;
s2_blk_out = cd->src.stage2->blk_out * nch;
- while ((n2 < cd->param.stage2_times_max)
- && (cd->sbuf_avail >= s2_blk_in)
- && (free_b >= s2_blk_out * sz)) {
+ while (n2 < cd->param.stage2_times_max &&
+ cd->sbuf_avail >= s2_blk_in &&
+ free_b >= s2_blk_out * sz) {
cd->polyphase_func(&s2);
cd->sbuf_r_ptr = s2.x_rptr;
@@ -205,10 +480,10 @@ static void src_1s_s32_default(struct comp_dev *dev,
int n_written = 0;
s1.times = cd->param.stage1_times;
- s1.x_rptr = (int32_t *) source->r_ptr;
+ s1.x_rptr = (int32_t *)source->r_ptr;
s1.x_end_addr = source->end_addr;
s1.x_size = source->size;
- s1.y_wptr = (int32_t *) sink->w_ptr;
+ s1.y_wptr = (int32_t *)sink->w_ptr;
s1.y_end_addr = sink->end_addr;
s1.y_size = sink->size;
s1.state = &cd->src.state1;
@@ -229,8 +504,8 @@ static void src_copy_s32_default(struct comp_dev *dev,
size_t *bytes_read, size_t *bytes_written)
{
struct comp_data *cd = comp_get_drvdata(dev);
- int32_t *src = (int32_t *) source->r_ptr;
- int32_t *snk = (int32_t *) sink->w_ptr;
+ int32_t *src = (int32_t *)source->r_ptr;
+ int32_t *snk = (int32_t *)sink->w_ptr;
int nch = dev->params.channels;
int frames = cd->param.blk_in;
int n;
@@ -241,9 +516,10 @@ static void src_copy_s32_default(struct comp_dev *dev,
n = frames * nch;
while (n > 0) {
- n_wrap_src = (int32_t *) source->end_addr - src;
- n_wrap_snk = (int32_t *) sink->end_addr - snk;
- n_wrap_min = (n_wrap_src < n_wrap_snk) ? n_wrap_src : n_wrap_snk;
+ n_wrap_src = (int32_t *)source->end_addr - src;
+ n_wrap_snk = (int32_t *)sink->end_addr - snk;
+ n_wrap_min = (n_wrap_src < n_wrap_snk) ?
+ n_wrap_src : n_wrap_snk;
n_copy = (n < n_wrap_min) ? n : n_wrap_min;
memcpy(snk, src, n_copy * sizeof(int32_t));
@@ -253,7 +529,6 @@ static void src_copy_s32_default(struct comp_dev *dev,
snk += n_copy;
src_circ_inc_wrap(&src, source->end_addr, source->size);
src_circ_inc_wrap(&snk, sink->end_addr, sink->size);
-
}
*bytes_read = frames * nch * sizeof(int32_t);
*bytes_written = frames * nch * sizeof(int32_t);
@@ -263,7 +538,7 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
{
struct comp_dev *dev;
struct sof_ipc_comp_src *src;
- struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *) comp;
+ struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *)comp;
struct comp_data *cd;
trace_src("new");
@@ -276,14 +551,14 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
dev = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM,
COMP_SIZE(struct sof_ipc_comp_src));
- if (dev == NULL)
+ if (!dev)
return NULL;
- src = (struct sof_ipc_comp_src *) &dev->comp;
+ src = (struct sof_ipc_comp_src *)&dev->comp;
memcpy(src, ipc_src, sizeof(struct sof_ipc_comp_src));
cd = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM, sizeof(*cd));
- if (cd == NULL) {
+ if (!cd) {
rfree(dev);
return NULL;
}
@@ -306,7 +581,7 @@ static void src_free(struct comp_dev *dev)
trace_src("fre");
/* Free dynamically reserved buffers for SRC algorithm */
- if (cd->delay_lines != NULL)
+ if (cd->delay_lines)
rfree(cd->delay_lines);
rfree(cd);
@@ -347,7 +622,8 @@ static int src_params(struct comp_dev *dev)
}
/* Calculate source and sink rates, one rate will come from IPC new
- * and the other from params. */
+ * and the other from params.
+ */
if (src->source_rate == 0) {
/* params rate is source rate */
source_rate = params->rate;
@@ -383,12 +659,12 @@ static int src_params(struct comp_dev *dev)
}
/* free any existing delay lines. TODO reuse if same size */
- if (cd->delay_lines != NULL)
+ if (cd->delay_lines)
rfree(cd->delay_lines);
cd->delay_lines = rballoc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM,
delay_lines_size);
- if (cd->delay_lines == NULL) {
+ if (!cd->delay_lines) {
trace_src_error("sr3");
trace_value(delay_lines_size);
return -EINVAL;
@@ -424,7 +700,6 @@ static int src_params(struct comp_dev *dev)
trace_src("SFa");
cd->src_func = src_fallback;
return -EINVAL;
- break;
}
/* Calculate period size based on config. First make sure that
@@ -438,7 +713,7 @@ static int src_params(struct comp_dev *dev)
* buffer_set_size will return an error if the required length would
* be too long.
*/
- q = src_ceil_divide(cd->param.blk_out, (int) dev->frames) + 1;
+ q = src_ceil_divide(cd->param.blk_out, (int)dev->frames) + 1;
/* Configure downstream buffer */
sink = list_first_item(&dev->bsink_list, struct comp_buffer,
@@ -459,7 +734,6 @@ static int src_params(struct comp_dev *dev)
return -EINVAL;
}
-
return 0;
}
@@ -518,7 +792,8 @@ static int src_copy(struct comp_dev *dev)
/* make sure source component buffer has enough data available and that
* the sink component buffer has enough free bytes for copy. Also
- * check for XRUNs */
+ * check for XRUNs.
+ */
if (source->avail < need_source) {
trace_src_error("xru");
return -EIO; /* xrun */
@@ -530,6 +805,9 @@ static int src_copy(struct comp_dev *dev)
cd->src_func(dev, source, sink, &consumed, &produced);
+ tracev_value(consumed >> 3);
+ tracev_value(produced >> 3);
+
/* Calc new free and available if data was processed. These
* functions must not be called with 0 consumed/produced.
*/
diff --git a/src/audio/src_core.h b/src/audio/src.h
similarity index 93%
rename from src/audio/src_core.h
rename to src/audio/src.h
index 3ea6028..3208693 100644
--- a/src/audio/src_core.h
+++ b/src/audio/src.h
@@ -29,8 +29,8 @@
*
*/
-#ifndef SRC_CORE_H
-#define SRC_CORE_H
+#ifndef SRC_H
+#define SRC_H
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -68,12 +68,12 @@ struct src_stage {
};
struct src_state {
- int fir_delay_size;
- int out_delay_size;
- int fir_wi;
- int out_ri;
+ int fir_delay_size; /* samples */
+ int out_delay_size; /* samples */
int32_t *fir_delay;
int32_t *out_delay;
+ int32_t *fir_wp;
+ int32_t *out_rp;
};
struct polyphase_src {
@@ -91,6 +91,7 @@ struct src_stage_prm {
int32_t *x_end_addr;
size_t x_size;
int32_t *y_wptr;
+ int32_t *y_addr;
int32_t *y_end_addr;
size_t y_size;
struct src_state *state;
@@ -100,13 +101,13 @@ struct src_stage_prm {
static inline void src_circ_inc_wrap(int32_t **ptr, int32_t *end, size_t size)
{
if (*ptr >= end)
- *ptr = (int32_t *) ((size_t) * ptr - size);
+ *ptr = (int32_t *)((size_t)*ptr - size);
}
static inline void src_circ_dec_wrap(int32_t **ptr, int32_t *addr, size_t size)
{
if (*ptr < addr)
- *ptr = (int32_t *) ((size_t) * ptr + size);
+ *ptr = (int32_t *)((size_t)*ptr + size);
}
void src_polyphase_reset(struct polyphase_src *src);
diff --git a/src/audio/src_config.h b/src/audio/src_config.h
index 3ad4c78..65d6247 100644
--- a/src/audio/src_config.h
+++ b/src/audio/src_config.h
@@ -34,14 +34,57 @@
#include <config.h>
-#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL || defined CONFIG_BROADWELL || defined CONFIG_HASWELL
-#define SRC_SHORT 1
-#include <reef/audio/coefficients/src/src_tiny_int16_define.h>
-#include <reef/audio/coefficients/src/src_tiny_int16_table.h>
+/* If the next defines are set to 1 the SRC is configured automatically.
+ * Setting them to zero temporarily is useful for testing needs.
+ * Setting SRC_AUTOARCH to 0 allows the code variant to be set manually.
+ * Setting SRC_AUTOCOEF to 0 allows the coefficient type to be selected.
+ */
+#define SRC_AUTOARCH 1
+#define SRC_AUTOCOEF 1
+
+/* Manually force a code variant when SRC_AUTOARCH is set to zero. These
+ * are useful in code debugging.
+ */
+#if SRC_AUTOARCH == 0
+#define SRC_GENERIC 1
+#define SRC_HIFIEP 0
+#define SRC_HIFI3 0
+#endif
+#if SRC_AUTOCOEF == 0
+#define SRC_SHORT 0
+#endif
+
+/* Select 16 bit coefficients for specific platforms.
+ * Otherwise 32 bits is the default.
+ */
+#if SRC_AUTOCOEF == 1
+#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL \
+ || defined CONFIG_BROADWELL || defined CONFIG_HASWELL
+#define SRC_SHORT 1 /* Use int16_t filter coefficients */
#else
-#define SHORT_SHORT 0
-#include <reef/audio/coefficients/src/src_std_int24_define.h>
-#include <reef/audio/coefficients/src/src_std_int24_table.h>
+#define SRC_SHORT 0 /* Use int32_t filter coefficients */
+#endif
+#endif
+
+/* Select optimized code variant when xt-xcc compiler is used. Exactly one
+ * of SRC_GENERIC, SRC_HIFIEP and SRC_HIFI3 is set to 1. HiFi3 is preferred
+ * if the core reports both HiFi variants, and the generic C version is
+ * used if the core reports neither.
+ */
+#if SRC_AUTOARCH == 1
+#if defined __XCC__
+#include <xtensa/config/core-isa.h>
+#if XCHAL_HAVE_HIFI3 == 1
+#define SRC_GENERIC	0
+#define SRC_HIFIEP	0
+#define SRC_HIFI3	1
+#elif XCHAL_HAVE_HIFI2EP == 1
+#define SRC_GENERIC	0
+#define SRC_HIFIEP	1
+#define SRC_HIFI3	0
+#else
+#define SRC_GENERIC	1
+#define SRC_HIFIEP	0
+#define SRC_HIFI3	0
+#endif
+#else
+/* GCC */
+#define SRC_GENERIC	1
+#define SRC_HIFIEP	0
+#define SRC_HIFI3	0
+#endif
#endif
#endif
diff --git a/src/audio/src_core.c b/src/audio/src_core.c
deleted file mode 100644
index d8b9a3d..0000000
--- a/src/audio/src_core.c
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
- * Copyright (c) 2016, Intel Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the Intel Corporation nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
- *
- */
-
-/* Non optimized default C implementation guaranteed to work on any
- * architecture.
- */
-
-#include <stdint.h>
-
-#ifdef MODULE_TEST
-#include <stdio.h>
-#endif
-
-#include <reef/alloc.h>
-#include <reef/audio/format.h>
-#include <reef/math/numbers.h>
-#include "src_core.h"
-#include "src_config.h"
-
-#define trace_src(__e) trace_event(TRACE_CLASS_SRC, __e)
-#define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e)
-#define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e)
-
-/* TODO: These should be defined somewhere else. */
-#define SOF_RATES_LENGTH 15
-int sof_rates[SOF_RATES_LENGTH] = {8000, 11025, 12000, 16000, 18900,
- 22050, 24000, 32000, 44100, 48000, 64000, 88200, 96000, 176400,
- 192000};
-
-/* The FIR maximum lengths are per channel so need to multiply them */
-#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE)
-#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE)
-
-/* Calculate ceil() for integer division */
-int src_ceil_divide(int a, int b)
-{
- int c;
-
- c = a / b;
- if (c * b < a)
- c++;
-
- return c;
-}
-
-/* Calculates the needed FIR delay line length */
-static int src_fir_delay_length(struct src_stage *s)
-{
- return s->subfilter_length + (s->num_of_subfilters - 1) * s->idm
- + s->blk_in;
-}
-
-/* Calculates the FIR output delay line length */
-static int src_out_delay_length(struct src_stage *s)
-{
-
- return 1 + (s->num_of_subfilters - 1) * s->odm;
-}
-
-/* Returns index of a matching sample rate */
-static int src_find_fs(int fs_list[], int list_length, int fs)
-{
- int i;
-
- for (i = 0; i < list_length; i++) {
- if (fs_list[i] == fs)
- return i;
- }
- return -EINVAL;
-}
-
-/* Match SOF and defined SRC input rates into a bit mask */
-int32_t src_input_rates(void)
-{
- int n;
- int b;
- int mask = 0;
-
- for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) {
- b = (src_find_fs(src_in_fs, NUM_IN_FS, sof_rates[n]) >= 0)
- ? 1 : 0;
- mask = (mask << 1) | b;
- }
- return mask;
-}
-
-/* Match SOF and defined SRC output rates into a bit mask */
-int32_t src_output_rates(void)
-{
- int n;
- int b;
- int mask = 0;
-
- for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) {
- b = (src_find_fs(src_out_fs, NUM_OUT_FS, sof_rates[n]) >= 0)
- ? 1 : 0;
- mask = (mask << 1) | b;
- }
- return mask;
-}
-
-/* Calculates buffers to allocate for a SRC mode */
-int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
- int frames, int frames_is_for_source)
-{
- struct src_stage *stage1;
- struct src_stage *stage2;
- int q;
- int den;
- int num;
- int frames2;
-
- if (nch > PLATFORM_MAX_CHANNELS) {
- trace_src_error("che");
- tracev_value(nch);
- return -EINVAL;
- }
-
- a->nch = nch;
- a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in);
- a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out);
-
- /* Check that both in and out rates are supported */
- if ((a->idx_in < 0) || (a->idx_out < 0)) {
- trace_src_error("us1");
- tracev_value(fs_in);
- tracev_value(fs_out);
- return -EINVAL;
- }
-
- stage1 = src_table1[a->idx_out][a->idx_in];
- stage2 = src_table2[a->idx_out][a->idx_in];
-
- /* Check from stage1 parameter for a deleted in/out rate combination.*/
- if (stage1->filter_length < 1) {
- trace_src_error("us2");
- tracev_value(fs_in);
- tracev_value(fs_out);
- return -EINVAL;
- }
-
- a->fir_s1 = nch * src_fir_delay_length(stage1);
- a->out_s1 = nch * src_out_delay_length(stage1);
-
- /* Find out how many additional times the SRC can be executed
- while having block size less or equal to max_frames.
- */
- if (frames_is_for_source) {
- /* Times that stage1 needs to run to input length of frames */
- a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in);
- q = frames / stage1->blk_in;
- a->stage1_times = MAX(q, 1);
- a->blk_in = a->stage1_times * stage1->blk_in;
-
- /* Times that stage2 needs to run */
- den = stage2->blk_in * stage1->blk_in;
- num = frames * stage2->blk_out * stage1->blk_out;
- frames2 = src_ceil_divide(num, den);
- a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out);
- q = frames2 / stage2->blk_out;
- a->stage2_times = MAX(q, 1);
- a->blk_out = a->stage2_times * stage2->blk_out;
- } else {
- /* Times that stage2 needs to run to output length of frames */
- a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out);
- q = frames / stage2->blk_out;
- a->stage2_times = MAX(q, 1);
- a->blk_out = a->stage2_times * stage2->blk_out;
-
- /* Times that stage1 needs to run */
- num = frames * stage2->blk_in * stage1->blk_in;
- den = stage2->blk_out * stage1->blk_out;
- frames2 = src_ceil_divide(num, den);
- a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in);
- q = frames2 / stage1->blk_in;
- a->stage1_times = MAX(q, 1);
- a->blk_in = a->stage1_times * stage1->blk_in;
- }
-
- if (stage2->filter_length == 1) {
- a->fir_s2 = 0;
- a->out_s2 = 0;
- a->stage2_times = 0;
- a->stage2_times_max = 0;
- a->sbuf_length = 0;
- } else {
- a->fir_s2 = nch * src_fir_delay_length(stage2);
- a->out_s2 = nch * src_out_delay_length(stage2);
- /* 2x is an empirically tested length. Since the sink buffer
- * capability to receive samples varies a shorter stage 2 output
- * block will create a peak in internal buffer usage.
- */
- a->sbuf_length = 2 * nch * stage1->blk_out * a->stage1_times_max;
- }
-
- a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
- a->total = a->sbuf_length + a->src_multich;
-
- return 0;
-}
-
-static void src_state_reset(struct src_state *state)
-{
-
- state->fir_delay_size = 0;
- state->out_delay_size = 0;
- state->fir_wi = 0;
- state->out_ri = 0;
-}
-
-static int init_stages(
- struct src_stage *stage1, struct src_stage *stage2,
- struct polyphase_src *src, struct src_param *p,
- int n, int32_t *delay_lines_start)
-{
- /* Clear FIR state */
- src_state_reset(&src->state1);
- src_state_reset(&src->state2);
-
- src->number_of_stages = n;
- src->stage1 = stage1;
- src->stage2 = stage2;
- if ((n == 1) && (stage1->blk_out == 0))
- return -EINVAL;
-
- /* Delay line sizes */
- src->state1.fir_delay_size = p->fir_s1;
- src->state1.out_delay_size = p->out_s1;
- src->state1.fir_delay = delay_lines_start;
- src->state1.out_delay =
- src->state1.fir_delay + src->state1.fir_delay_size;
- if (n > 1) {
- src->state2.fir_delay_size = p->fir_s2;
- src->state2.out_delay_size = p->out_s2;
- src->state2.fir_delay =
- src->state1.out_delay + src->state1.out_delay_size;
- src->state2.out_delay =
- src->state2.fir_delay + src->state2.fir_delay_size;
- } else {
- src->state2.fir_delay_size = 0;
- src->state2.out_delay_size = 0;
- src->state2.fir_delay = NULL;
- src->state2.out_delay = NULL;
- }
-
- /* Check the sizes are less than MAX */
- if ((src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
- || (src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)
- || (src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
- || (src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)) {
- src->state1.fir_delay = NULL;
- src->state1.out_delay = NULL;
- src->state2.fir_delay = NULL;
- src->state2.out_delay = NULL;
- return -EINVAL;
- }
-
- return 0;
-}
-
-void src_polyphase_reset(struct polyphase_src *src)
-{
-
- src->number_of_stages = 0;
- src->stage1 = NULL;
- src->stage2 = NULL;
- src_state_reset(&src->state1);
- src_state_reset(&src->state2);
-}
-
-int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
- int32_t *delay_lines_start)
-{
- struct src_stage *stage1;
- struct src_stage *stage2;
- int n_stages;
- int ret;
-
- if ((p->idx_in < 0) || (p->idx_out < 0)) {
- return -EINVAL;
- }
-
- /* Get setup for 2 stage conversion */
- stage1 = src_table1[p->idx_out][p->idx_in];
- stage2 = src_table2[p->idx_out][p->idx_in];
- ret = init_stages(stage1, stage2, src, p, 2, delay_lines_start);
- if (ret < 0)
- return -EINVAL;
-
- /* Get number of stages used for optimize opportunity. 2nd
- * stage length is one if conversion needs only one stage.
- * If input and output rate is the same return 0 to
- * use a simple copy function instead of 1 stage FIR with one
- * tap.
- */
- n_stages = (src->stage2->filter_length == 1) ? 1 : 2;
- if (p->idx_in == p->idx_out)
- n_stages = 0;
-
- /* If filter length for first stage is zero this is a deleted
- * mode from in/out matrix. Computing of such SRC mode needs
- * to be prevented.
- */
- if (src->stage1->filter_length == 0)
- return -EINVAL;
-
- return n_stages;
-}
-
-#if SRC_SHORT == 1
-
-/* Calculate a FIR filter part that does not need circular modification */
-
-static inline void fir_part(int64_t y[], int *id, int *ic,
- const int32_t data[], const int16_t coef[], int nch_x_taps, int nch)
-{
- int64_t tap0;
- int64_t tap1;
- int n;
- int64_t a = 0;
- int64_t b = 0;
- int c = *ic;
- int d = *id;
- int d_end = d - nch_x_taps;
-
- /* Data is Q1.31, coef is Q1.15, product is Q2.46 */
- if (nch == 2) {
- for (n = 0; n < (nch_x_taps >> 2); n++) {
- tap0 = coef[c++];
- tap1 = coef[c++];
- b += data[d--] * tap0;
- a += data[d--] * tap0;
- b += data[d--] * tap1;
- a += data[d--] * tap1;
- }
- if (d > d_end) {
- tap0 = coef[c++];
- b += data[d--] * tap0;
- a += data[d--] * tap0;
- }
- y[1] += b;
- y[0] += a;
- } else {
- while (d > d_end) {
- tap0 = coef[c++];
- for (n = nch - 1; n >= 0; n--)
- y[n] += data[d--] * tap0;
- }
- }
- *ic = c;
- *id = d;
-}
-
-#else
-
-static inline void fir_part(int64_t y[], int *id, int *ic,
- const int32_t data[], const int32_t coef[], int nch_x_taps, int nch)
-{
- int64_t tap0;
- int64_t tap1;
- int n;
- int64_t a = 0;
- int64_t b = 0;
- int c = *ic;
- int d = *id;
- int d_end = d - nch_x_taps;
-
- /* Data is Q1.31, coef is Q1.23, product is Q2.54 */
- if (nch == 2) {
- for (n = 0; n < (nch_x_taps >> 2); n++) {
- tap0 = coef[c++];
- tap1 = coef[c++];
- b += data[d--] * tap0;
- a += data[d--] * tap0;
- b += data[d--] * tap1;
- a += data[d--] * tap1;
- }
- if (d > d_end) {
- tap0 = coef[c++];
- b += data[d--] * tap0;
- a += data[d--] * tap0;
- }
- y[1] += b;
- y[0] += a;
- } else {
- while (d > d_end) {
- tap0 = coef[c++];
- for (n = nch - 1; n >= 0; n--)
- y[n] += data[d--] * tap0;
- }
- }
- *ic = c;
- *id = d;
-}
-
-#endif
-
-#if SRC_SHORT == 1
-
-static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
- int32_t out_delay[], const int16_t coefs[], int dsm1, int taps,
- int shift, int nch)
-{
- int n2;
- int i;
- int64_t y[PLATFORM_MAX_CHANNELS];
- int ri = ri0;
- int wi = wi0;
- int n1 = ri0 + 1; /* Convert to number of sequential frames */
- int qshift = 15 + shift; /* Q2.46 -> Q2.31 */
- int32_t rnd = 1 << (qshift - 1); /* Half LSB */
- int nch_x_taps = nch * taps;
-
- /* Initialize to half LSB for rounding */
- for (i = 0; i < nch; i++)
- y[i] = rnd;
-
- if (n1 >= nch_x_taps) {
- fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
- } else {
- n2 = nch_x_taps - n1;
- fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
- ri = dsm1;
- fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
- }
-
- for (i = 0; i < nch; i++)
- out_delay[wi++] = sat_int32(y[i] >> qshift);
-}
-#else
-
-static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
- int32_t out_delay[], const int32_t coefs[], int dsm1, int taps,
- int shift, int nch)
-{
- int n2;
- int i;
- int64_t y[PLATFORM_MAX_CHANNELS];
- int ri = ri0;
- int wi = wi0;
- int n1 = ri0 + 1; /* Convert to number of sequential frames */
- int qshift = 23 + shift; /* Q2.54 -> Q2.31 */
- int32_t rnd = 1 << (qshift - 1); /* Half LSB */
- int nch_x_taps = nch * taps;
-
- /* Initialize to half LSB for rounding */
- for (i = 0; i < nch; i++)
- y[i] = rnd;
-
- if (n1 >= nch_x_taps) {
- fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
- } else {
- n2 = nch_x_taps - n1;
- fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
- ri = dsm1;
- fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
- }
-
- for (i = 0; i < nch; i++)
- out_delay[wi++] = sat_int32(y[i] >> qshift);
-
-}
-
-#endif
-
-void src_polyphase_stage_cir(struct src_stage_prm * s)
-{
- struct src_state *fir = s->state;
- struct src_stage *cfg = s->stage;
- int n;
- int m;
- int f;
- int ci;
- int ri;
- int n_wrap_fir;
- int n_wrap_buf;
- int n_wrap_min;
- int n_min;
- int wi;
- const void *coef = cfg->coefs;
- int32_t *in_delay = fir->fir_delay;
- int32_t *out_delay = fir->out_delay;
- int dsm1 = fir->fir_delay_size - 1;
- int shift = cfg->shift;
- int nch = s->nch;
- int rewind = -nch * (cfg->blk_in
- + (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
- int nch_x_idm = cfg->idm * nch;
- int nch_x_odm = cfg->odm * nch;
- size_t sz = sizeof(int32_t);
- int blk_in_bytes = nch * cfg->blk_in * sz;
- int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
-
-
- for (n = 0; n < s->times; n++) {
- /* Input data */
- m = blk_in_bytes;
- while (m > 0) {
- n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
- n_wrap_buf = s->x_end_addr - s->x_rptr;
- n_wrap_min = (n_wrap_fir < n_wrap_buf)
- ? n_wrap_fir : n_wrap_buf;
- n_min = (m < n_wrap_min) ? m : n_wrap_min;
- while (n_min > 0) {
- fir->fir_delay[fir->fir_wi++] = *s->x_rptr;
- s->x_rptr++;
- n_min -= sz;
- m -= sz;
- }
- /* Check for wrap */
- src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
- if (fir->fir_wi == fir->fir_delay_size)
- fir->fir_wi = 0;
- }
-
- /* Filter */
- ci = 0; /* Reset to 1st coefficient */
- ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
- if (ri < 0)
- ri += fir->fir_delay_size;
-
- wi = fir->out_ri;
- for (f = 0; f < cfg->num_of_subfilters; f++) {
- fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
- dsm1, cfg->subfilter_length, shift, nch);
-
- wi += nch_x_odm;
- if (wi >= fir->out_delay_size)
- wi -= fir->out_delay_size;
-
- ri += nch_x_idm; /* Next sub-filter start */
- if (ri >= fir->fir_delay_size)
- ri -= fir->fir_delay_size;
- }
-
- /* Output */
- m = blk_out_bytes;
- while (m > 0) {
- n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
- n_wrap_buf = s->y_end_addr - s->y_wptr;
- n_wrap_min = (n_wrap_fir < n_wrap_buf)
- ? n_wrap_fir : n_wrap_buf;
- n_min = (m < n_wrap_min) ? m : n_wrap_min;
- while (n_min > 0) {
- *s->y_wptr = fir->out_delay[fir->out_ri++];
- s->y_wptr++;
- n_min -= sz;
- m -= sz;
- }
- /* Check wrap */
- src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
- if (fir->out_ri == fir->out_delay_size)
- fir->out_ri = 0;
- }
- }
-}
-
-void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
-{
- struct src_state *fir = s->state;
- struct src_stage *cfg = s->stage;
- int n;
- int m;
- int f;
- int ci;
- int ri;
- int n_wrap_fir;
- int n_wrap_buf;
- int n_wrap_min;
- int n_min;
- int wi;
- const void *coef = cfg->coefs;
- int32_t *in_delay = fir->fir_delay;
- int32_t *out_delay = fir->out_delay;
- int dsm1 = fir->fir_delay_size - 1;
- int shift = cfg->shift;
- int nch = s->nch;
- int rewind = -nch * (cfg->blk_in
- + (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
- int nch_x_idm = cfg->idm * nch;
- int nch_x_odm = cfg->odm * nch;
- size_t sz = sizeof(int32_t);
- int blk_in_bytes = nch * cfg->blk_in * sz;
- int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
-
- for (n = 0; n < s->times; n++) {
- /* Input data */
- m = blk_in_bytes;
- while (m > 0) {
- n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
- n_wrap_buf = s->x_end_addr - s->x_rptr;
- n_wrap_min = (n_wrap_fir < n_wrap_buf)
- ? n_wrap_fir : n_wrap_buf;
- n_min = (m < n_wrap_min) ? m : n_wrap_min;
- while (n_min > 0) {
- fir->fir_delay[fir->fir_wi++] = *s->x_rptr << 8;
- s->x_rptr++;
- n_min -= sz;
- m -= sz;
- }
- /* Check for wrap */
- src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
- if (fir->fir_wi == fir->fir_delay_size)
- fir->fir_wi = 0;
- }
-
- /* Filter */
- ci = 0; /* Reset to 1st coefficient */
- ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
- if (ri < 0)
- ri += fir->fir_delay_size;
-
- wi = fir->out_ri;
- for (f = 0; f < cfg->num_of_subfilters; f++) {
- fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
- dsm1, cfg->subfilter_length, shift, nch);
-
- wi += nch_x_odm;
- if (wi >= fir->out_delay_size)
- wi -= fir->out_delay_size;
-
- ri += nch_x_idm; /* Next sub-filter start */
- if (ri >= fir->fir_delay_size)
- ri -= fir->fir_delay_size;
- }
-
- /* Output */
- m = blk_out_bytes;
- while (m > 0) {
- n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
- n_wrap_buf = s->y_end_addr - s->y_wptr;
- n_wrap_min = (n_wrap_fir < n_wrap_buf)
- ? n_wrap_fir : n_wrap_buf;
- n_min = (m < n_wrap_min) ? m : n_wrap_min;
- while (n_min > 0) {
- *s->y_wptr = fir->out_delay[fir->out_ri++] >> 8;
- s->y_wptr++;
- n_min -= sz;
- m -= sz;
- }
- /* Check wrap */
- src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
- if (fir->out_ri == fir->out_delay_size)
- fir->out_ri = 0;
- }
- }
-
-}
diff --git a/src/audio/src_generic.c b/src/audio/src_generic.c
new file mode 100644
index 0000000..9caa090
--- /dev/null
+++ b/src/audio/src_generic.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Intel Corporation nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* Default C implementation guaranteed to work on any
+ * architecture.
+ */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_GENERIC
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+/* Compute one output frame (nch samples) of a polyphase sub-filter with
+ * 16 bit coefficients. Output channel j is filtered from the samples at
+ * rp - j, rp - j + nch, ... and the read position wraps at most once from
+ * fir_end back into the delay line that starts at fir_start and holds
+ * fir_delay_length samples. cp points to the taps_x_nch / nch coefficients
+ * of the sub-filter, wp0 receives the nch results, and shift is the extra
+ * right shift on top of the Q2.46 -> Q2.31 conversion. The results are
+ * rounded and saturated to 32 bits.
+ */
+static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0,
+	int32_t *fir_start, int32_t *fir_end, const int fir_delay_length,
+	const int taps_x_nch, const int shift, const int nch)
+{
+	int64_t y0, y1;
+	int32_t *data;
+	const int16_t *coef;
+	int i, j;
+	int n1, n2, samples;
+	const int qshift = 15 + shift; /* Q2.46 -> Q2.31 */
+	/* Half LSB. Computed in 64 bits: 1 << 31 or more overflows an int. */
+	const int64_t rnd = (int64_t)1 << (qshift - 1);
+	int32_t *d = rp;
+	int32_t *wp = wp0;
+
+	/* Check for 2ch FIR case */
+	if (nch == 2) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d - 1;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		y1 = rnd;
+		coef = (const int16_t *)cp;
+		samples = fir_end - data; /* Samples until wrap */
+		n1 = ((taps_x_nch < samples) ? taps_x_nch : samples) >> 1;
+		n2 = (taps_x_nch >> 1) - n1;
+
+		/* The FIR is calculated as Q1.15 x Q1.31 -> Q2.46. The
+		 * output shift includes the shift by 15 for Qx.46 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i++) {
+			y0 += (int64_t)(*coef) * (*data);
+			data++;
+			y1 += (int64_t)(*coef) * (*data);
+			data++;
+			coef++;
+		}
+		if (data == fir_end)
+			data = fir_start;
+
+		for (i = 0; i < n2; i++) {
+			y0 += (int64_t)(*coef) * (*data);
+			data++;
+			y1 += (int64_t)(*coef) * (*data);
+			data++;
+			coef++;
+		}
+
+		*wp = sat_int32(y1 >> qshift);
+		*(wp + 1) = sat_int32(y0 >> qshift);
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Channel j data starts at rp - j, then step d to next ch */
+		data = d--;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		coef = (const int16_t *)cp;
+		samples = fir_end - data + nch - j - 1; /* Until wrap */
+		n1 = (taps_x_nch < samples) ? taps_x_nch : samples;
+		n2 = taps_x_nch - n1;
+
+		/* Q1.15 x Q1.31 -> Q2.46 as above, one tap per nch samples */
+		for (i = 0; i < n1; i += nch) {
+			y0 += (int64_t)(*coef) * (*data);
+			coef++;
+			data += nch;
+		}
+		if (data >= fir_end)
+			data -= fir_delay_length;
+
+		for (i = 0; i < n2; i += nch) {
+			y0 += (int64_t)(*coef) * (*data);
+			coef++;
+			data += nch;
+		}
+
+		*wp = sat_int32(y0 >> qshift);
+		wp++;
+	}
+}
+
+#else /* 32bit coefficients version */
+
+/* Compute one output frame (nch samples) of a polyphase sub-filter with
+ * 32 bit coefficients, of which the 24 most significant bits are used.
+ * Output channel j is filtered from the samples at rp - j, rp - j + nch,
+ * ... and the read position wraps at most once from fir_end back into the
+ * delay line that starts at fir_start and holds fir_delay_length samples.
+ * cp points to the taps_x_nch / nch coefficients of the sub-filter, wp0
+ * receives the nch results, and shift is the extra right shift on top of
+ * the Q2.54 -> Q2.31 conversion. Results are rounded and saturated.
+ */
+static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0,
+	int32_t *fir_start, int32_t *fir_end, int fir_delay_length,
+	const int taps_x_nch, const int shift, const int nch)
+{
+	int64_t y0, y1;
+	int32_t *data;
+	const int32_t *coef;
+	int i, j;
+	int n1, n2, samples;
+
+	const int qshift = 23 + shift; /* Qx.54 -> Qx.31 */
+	/* Half LSB. In 64 bits since shift >= 9 would overflow a 32 bit int. */
+	const int64_t rnd = (int64_t)1 << (qshift - 1);
+	int32_t *d = rp;
+	int32_t *wp = wp0;
+
+	/* Check for 2ch FIR case */
+	if (nch == 2) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d - 1;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		y1 = rnd;
+		coef = (const int32_t *)cp;
+		samples = fir_end - data; /* Samples until wrap */
+		n1 = ((taps_x_nch < samples) ? taps_x_nch : samples) >> 1;
+		n2 = (taps_x_nch >> 1) - n1;
+
+		/* The FIR is calculated as Q1.23 x Q1.31 -> Q2.54. The
+		 * output shift includes the shift by 23 for Qx.54 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i++) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			y1 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			coef++;
+		}
+		if (data == fir_end)
+			data = fir_start;
+
+		for (i = 0; i < n2; i++) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			y1 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			coef++;
+		}
+		*wp = sat_int32(y1 >> qshift);
+		*(wp + 1) = sat_int32(y0 >> qshift);
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Channel j data starts at rp - j, then step d to next ch */
+		data = d--;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		coef = (const int32_t *)cp;
+		samples = fir_end - data + nch - j - 1; /* Until wrap */
+		n1 = (taps_x_nch < samples) ? taps_x_nch : samples;
+		n2 = taps_x_nch - n1;
+
+		/* Q1.23 x Q1.31 -> Q2.54 as above, one tap per nch samples */
+		for (i = 0; i < n1; i += nch) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			coef++;
+			data += nch;
+		}
+		if (data >= fir_end)
+			data -= fir_delay_length;
+
+		for (i = 0; i < n2; i += nch) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			coef++;
+			data += nch;
+		}
+		*wp = sat_int32(y0 >> qshift);
+		wp++;
+	}
+}
+
+#endif /* 32bit coefficients version */
+
+/* Process s->times blocks of one polyphase SRC stage for 32 bit samples.
+ * Each block copies blk_in frames from the circular buffer at s->x_rptr to
+ * the FIR delay line (written downwards), computes num_of_subfilters output
+ * frames into the output delay line and copies them out to s->y_wptr.
+ */
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	int i, n, m;
+	int n_wrap_buf, n_wrap_fir, n_min;
+	int32_t *rp, *wp;
+
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_delay = fir->fir_delay;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int fir_length = fir->fir_delay_size;
+	const int rewind = nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch;
+	const int nch_x_idm = nch * cfg->idm;
+	const size_t fir_size = fir->fir_delay_size * sizeof(int32_t);
+	const int taps_x_nch = cfg->subfilter_length * nch;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+		m = blk_in_words;
+		while (m > 0) {
+			/* Number of words without circular wrap */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_wrap_fir = fir->fir_wp - fir->fir_delay + 1;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*fir->fir_wp = *s->x_rptr;
+				fir->fir_wp--;
+				s->x_rptr++;
+			}
+			/* Check for wrap */
+			src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size);
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Filter; step cp as char *, void * arithmetic is not ISO C */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = fir->fir_wp + rewind;
+		src_circ_inc_wrap(&rp, fir_end, fir_size);
+		wp = fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter_generic(rp, cp, wp,
+				fir_delay, fir_end, fir_length,
+				taps_x_nch, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp = (const char *)cp + subfilter_size;
+			src_circ_inc_wrap(&wp, out_delay_end, out_size);
+			rp -= nch_x_idm; /* Next sub-filter start */
+			src_circ_dec_wrap(&rp, fir_delay, fir_size);
+		}
+
+		/* Output */
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_fir = out_delay_end - fir->out_rp;
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*s->y_wptr = *fir->out_rp;
+				s->y_wptr++;
+				fir->out_rp++;
+			}
+			/* Check wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+			src_circ_inc_wrap(&fir->out_rp, out_delay_end,
+				out_size);
+		}
+	}
+}
+
+/* Process s->times blocks of one polyphase SRC stage for s24 samples. Input
+ * is shifted left by 8 into the FIR delay line (as unsigned: left shift of
+ * a negative int is undefined), output is shifted right by 8 to s->y_wptr.
+ */
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+	int i, n, m;
+	int n_wrap_buf, n_wrap_fir, n_min;
+	int32_t *rp, *wp;
+
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_delay = fir->fir_delay;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int fir_length = fir->fir_delay_size;
+	const int rewind = nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch;
+	const int nch_x_idm = nch * cfg->idm;
+	const size_t fir_size = fir->fir_delay_size * sizeof(int32_t);
+	const int taps_x_nch = cfg->subfilter_length * nch;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+		m = blk_in_words;
+		while (m > 0) {
+			/* Number of words without circular wrap */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_wrap_fir = fir->fir_wp - fir->fir_delay + 1;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*fir->fir_wp =
+					(int32_t)((uint32_t)*s->x_rptr << 8);
+				fir->fir_wp--;
+				s->x_rptr++;
+			}
+			/* Check for wrap */
+			src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size);
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Filter; step cp as char *, void * arithmetic is not ISO C */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = fir->fir_wp + rewind;
+		src_circ_inc_wrap(&rp, fir_end, fir_size);
+		wp = fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter_generic(rp, cp, wp,
+				fir_delay, fir_end, fir_length,
+				taps_x_nch, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp = (const char *)cp + subfilter_size;
+			src_circ_inc_wrap(&wp, out_delay_end, out_size);
+			rp -= nch_x_idm; /* Next sub-filter start */
+			src_circ_dec_wrap(&rp, fir_delay, fir_size);
+		}
+
+		/* Output */
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_fir = out_delay_end - fir->out_rp;
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*s->y_wptr = *fir->out_rp >> 8;
+				s->y_wptr++;
+				fir->out_rp++;
+			}
+			/* Check wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+			src_circ_inc_wrap(&fir->out_rp, out_delay_end,
+				out_size);
+		}
+	}
+}
+
+#endif
diff --git a/src/audio/src_hifi2ep.c b/src/audio/src_hifi2ep.c
new file mode 100644
index 0000000..0d03ffa
--- /dev/null
+++ b/src/audio/src_hifi2ep.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Intel Corporation nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* HiFi EP optimized code parts for SRC */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_HIFIEP
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi2.h>
+
+/* HiFi EP has
+ * 4x 56 bit registers in register file Q
+ * 8x 48 bit registers in register file P
+ */
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* Compute one output frame (nch samples) of a sub-filter with 16 bit
+	 * coefficients using circular data loads. Uses 2x 56 bit registers Q,
+	 * 4x 48 bit registers P, 3x integers and 4x address pointers. The
+	 * load stride inc is one frame of nch samples in bytes, which for
+	 * 2ch equals the 8 bytes of one ae_p24x2f pair.
+	 */
+	ae_q56s a0;
+	ae_q56s a1;
+	ae_p24x2f data2;
+	ae_p24x2f coef2;
+	ae_p24x2f p0;
+	ae_p24x2f p1;
+	ae_p16x2s *coefp;
+	ae_p24x2f *dp = (ae_p24x2f *)rp;
+	ae_p24x2f *dp0;
+	ae_q32s *wp = wp0;
+	int i;
+	int j;
+	const int inc = nch * sizeof(int32_t);
+
+	/* 2ch FIR case */
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p16x2s *)cp;
+		a0 = AE_ZEROQ56();
+		a1 = AE_ZEROQ56();
+
+		/* Compute FIR filter for both channels with four
+		 * taps per every loop iteration. Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer one frame, i.e. one sample
+		 * per channel, at a time.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			coef2 = AE_LP16X2F_I(coefp, 0);
+			coefp++;
+
+			/* Load two data samples from two channels */
+			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
+			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */
+
+			/* Select the L halves of p0 and p1 to data2 for
+			 * accumulator a0 and then the H halves for a1. Each
+			 * accumulates data2_h * coef2_h + data2_l * coef2_l.
+			 * The Q1.31 data and Q1.15 coefficients are used as
+			 * 24 bits as Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			coef2 = AE_LP16X2F_I(coefp, 0);
+			coefp++;
+			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
+			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
+			sizeof(int32_t));
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp;
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p16x2s *)cp;
+		a0 = AE_ZEROQ56();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples */
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+
+			/* Pack p0 and p1 to data2_h and data2_l */
+			data2 = AE_SELP24_LL(p0, p1);
+
+			/* Accumulate data2_h * coef2_h + data2_l * coef2_l */
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+
+			/* Repeat for next two filter taps */
+			coef2 = *coefp++;
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		wp++;
+	}
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* Compute one output frame (nch samples) of a sub-filter with 32 bit
+	 * coefficients using circular data loads. Uses 2x 56 bit registers Q,
+	 * 4x 48 bit registers P, 3x integers and 4x address pointers. The
+	 * load stride inc is one frame of nch samples in bytes, which for
+	 * 2ch equals the 8 bytes of one ae_p24x2f pair.
+	 */
+	ae_q56s a0;
+	ae_q56s a1;
+	ae_p24x2f p0;
+	ae_p24x2f p1;
+	ae_p24x2f data2;
+	ae_p24x2f coef2;
+	ae_p24x2f *coefp;
+	ae_p24x2f *dp = (ae_p24x2f *)rp;
+	ae_p24x2f *dp0;
+	ae_q32s *wp = wp0;
+	int i;
+	int j;
+	const int inc = nch * sizeof(int32_t);
+
+	/* 2ch FIR case */
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p24x2f *)cp;
+		a0 = AE_ZEROQ56();
+		a1 = AE_ZEROQ56();
+
+		/* Compute FIR filter for both channels with four
+		 * taps per every loop iteration. Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer one frame, i.e. one sample
+		 * per channel, at a time.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			/* TODO: Ensure coefficients are 64 bits aligned */
+			coef2 = AE_LP24X2F_I(coefp, 0);
+			coefp++;
+
+			/* Load two data samples from two channels */
+			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
+			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */
+
+			/* Select the L halves of p0 and p1 to data2 for
+			 * accumulator a0, then the H halves for a1.
+			 */
+
+			/* Accumulate to a0 and a1
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and Q1.31 coefficients are used as the most
+			 * significant 24 bits as Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			coef2 = AE_LP24X2F_I(coefp, 0);
+			coefp++;
+			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
+			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
+			sizeof(int32_t));
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp;
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p24x2f *)cp;
+		a0 = AE_ZEROQ56();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples and place them to L and H of
+			 * data2.
+			 */
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LH(p0, p1);
+
+			/* Accumulate to a0
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and coefficients are used as the most
+			 * significant 24 bits as Q1.23 values.
+			 */
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+
+			/* Repeat for next two filter taps */
+			coef2 = *coefp++;
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		wp++;
+	}
+}
+#endif /* 32bit coefficients version */
+
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	/* Process s->times blocks of one SRC stage for 32 bit samples: feed
+	 * blk_in frames to the FIR delay line, run all sub-filters and copy
+	 * the result to the output buffer. Uses 1x 56 bit register Q, 0x 48
+	 * bit registers P, 16x integers and 7x address pointers. Assumes
+	 * subfilter_length is a multiple of 4, see taps_div_4 - TODO confirm.
+	 */
+	ae_q56s q;
+	ae_q32s *rp;
+	ae_q32s *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * (nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data to filter */
+		m = blk_in_words;
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		while (m > 0) {
+			/* Number of words until circular wrap */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bits sample to accumulator */
+				q = AE_LQ32F_I((ae_q32s *)s->x_rptr++, 0);
+
+				/* Store to circular buffer, advance pointer */
+				AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for wrap */
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_q32s *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to data start. Loaded value q is discarded.
+		 */
+		AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);
+
+		/* Reset FIR write pointer and compute all polyphase
+		 * sub-filters. cp is stepped in bytes via char *.
+		 */
+		wp = (ae_q32s *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp = (const char *)cp + subfilter_size;
+			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+				out_size);
+
+			/* Circular advance pointer rp by number of
+			 * channels x input delay multiplier. Loaded value q
+			 * is discarded.
+			 */
+			AE_LQ32F_C(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load followed by linear store */
+				AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
+				AE_SQ32F_I(q, (ae_q32s *)s->y_wptr, 0);
+				s->y_wptr++;
+			}
+			/* Check wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+		}
+	}
+}
+
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+ /* Run s->times blocks of one polyphase SRC stage. Each input word is
+ * shifted left by 8 into the FIR delay line and each output word is
+ * shifted right by 8 to a sign extended 24 bit value.
+ * Register use: 1x 56 bit Q, 0x 48 bit P,
+ * 16x integers, 7x address pointers.
+ */
+ ae_q56s q;
+ ae_q32s *rp;
+ ae_q32s *wp;
+ int i;
+ int n;
+ int m;
+ int n_wrap_buf;
+ int n_min;
+ struct src_state *fir = s->state;
+ struct src_stage *cfg = s->stage;
+ int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+ int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+ const void *cp; /* int16_t data if SRC_SHORT, else int32_t */
+ const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+ const int nch = s->nch;
+ const int nch_x_odm = cfg->odm * nch;
+ const int blk_in_words = nch * cfg->blk_in;
+ const int blk_out_words = nch * cfg->num_of_subfilters;
+ const int sz = sizeof(int32_t);
+ const int n_sz = -sizeof(int32_t);
+ const int rewind_sz = sz * (nch * (cfg->blk_in
+ + (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+ const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+ const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+ for (n = 0; n < s->times; n++) {
+ /* Copy blk_in_words input words to the FIR delay line */
+ m = blk_in_words;
+
+ /* Setup circular buffer for FIR input data delay */
+ AE_SETCBEGIN0(fir->fir_delay);
+ AE_SETCEND0(fir_end);
+
+ while (m > 0) {
+ /* Number of words until input buffer end address */
+ n_wrap_buf = s->x_end_addr - s->x_rptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Load 32 bits sample to accumulator
+ * and left shift by 8, advance read
+ * pointer.
+ */
+ q = AE_SLLIQ56(AE_LQ32F_I(
+ (ae_q32s *)s->x_rptr++, 0), 8);
+
+ /* Circular store to FIR delay, step
+ * write pointer fir_wp by n_sz.
+ */
+ AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
+ }
+
+ /* Check for wrap of the input read pointer */
+ src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+ }
+
+ /* Do filter */
+ cp = cfg->coefs; /* Reset to 1st coefficient */
+ rp = (ae_q32s *)fir->fir_wp;
+
+ /* Do circular modification to pointer rp by amount of
+ * rewind to data start. Loaded value q is discarded.
+ */
+ AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);
+
+ /* Reset FIR output write pointer and compute all polyphase
+ * sub-filters.
+ */
+ wp = (ae_q32s *)fir->out_rp;
+ for (i = 0; i < cfg->num_of_subfilters; i++) {
+ fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+ wp += nch_x_odm;
+ cp += subfilter_size; /* void * step in bytes (GNU C) */
+ src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+ out_size);
+
+ /* Circular advance pointer rp by number of
+ * channels x input delay multiplier. Loaded value q
+ * is discarded.
+ */
+ AE_LQ32F_C(q, rp, nch_x_idm_sz);
+ }
+
+ /* Copy blk_out_words words from out delay to output */
+
+ /* Setup circular buffer for SRC out delay access */
+ AE_SETCBEGIN0(fir->out_delay);
+ AE_SETCEND0(out_delay_end);
+ m = blk_out_words;
+ while (m > 0) {
+ n_wrap_buf = s->y_end_addr - s->y_wptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Circular load for 32 bit sample,
+ * advance pointer.
+ */
+ AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
+
+ /* Store value as shifted right by 8 for
+ * sign extended 24 bit value, advance pointer.
+ */
+ AE_SQ32F_I(AE_SRAIQ56(q, 8),
+ (ae_q32s *)s->y_wptr, 0);
+ s->y_wptr++;
+ }
+ /* Check for wrap of the output write pointer */
+ src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+ }
+ }
+}
+
+#endif
diff --git a/src/audio/src_hifi3.c b/src/audio/src_hifi3.c
new file mode 100644
index 0000000..96d3c99
--- /dev/null
+++ b/src/audio/src_hifi3.c
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Intel Corporation nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* HiFi3 optimized code parts for SRC */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_HIFI3
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi3.h>
+
+/* HiFi3 has
+ * 16x 64 bit registers in register file AE_DR
+ */
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+ const int taps_div_4, const int shift, const int nch)
+{
+ /* Compute one output sample per channel for a sub-filter with 16 bit
+ * coefficients at cp. Data is read from rp with circular addressing
+ * and results are written to wp0. Taps count is 4 * taps_div_4.
+ * Uses 6x 64 bit registers, 3x integers, 5x address pointers.
+ */
+ ae_f64 a0;
+ ae_f64 a1;
+ ae_valign u;
+ ae_f16x4 coef4;
+ ae_f32x2 d0;
+ ae_f32x2 d1;
+ ae_f32x2 data2;
+ ae_f16x4 *coefp;
+ ae_f32x2 *dp;
+ ae_f32 *dp0;
+ ae_f32 *dp1;
+ int i;
+ int j;
+ ae_f32 *wp = wp0;
+ const int inc = nch * sizeof(int32_t);
+
+ if (nch == 2) {
+ /* Move data pointer back by one sample to start from right
+ * channel sample. Discard read value d0.
+ */
+ dp = (ae_f32x2 *)rp;
+ AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+ /* Reset and prime coefficient load, clear accumulators */
+ coefp = (ae_f16x4 *)cp;
+ u = AE_LA64_PP(coefp);
+ a0 = AE_ZERO64();
+ a1 = AE_ZERO64();
+
+ /* Compute FIR filter for both channels with four taps per
+ * loop iteration. Four coefficients are loaded at once.
+ * Data is read from interleaved buffer with stride of
+ * channels count.
+ */
+ for (i = 0; i < taps_div_4; i++) {
+ /* Load four coefficients, aligning load via u */
+ AE_LA16X4_IP(coef4, u, coefp);
+
+ /* Load two data samples from two channels */
+ AE_L32X2_XC(d0, dp, inc); /* r0, l0 */
+ AE_L32X2_XC(d1, dp, inc); /* r1, l1 */
+
+ /* Select to data2 sequential samples from a channel
+ * and then accumulate to a0 and a1
+ * data2_h * coef4_3 + data2_l * coef4_2.
+ * The data is 32 bits Q1.31 and coefficient 16 bits
+ * Q1.15. The accumulators are Q17.47.
+ */
+ data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
+ AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+ data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
+ AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
+
+ /* Load two data samples from two channels */
+ AE_L32X2_XC(d0, dp, inc); /* r2, l2 */
+ AE_L32X2_XC(d1, dp, inc); /* r3, l3 */
+
+ /* Accumulate
+ * data2_h * coef4_1 + data2_l * coef4_0.
+ */
+ data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
+ AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+ data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
+ AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
+ }
+
+ /* Scale FIR output with right shifts, round/saturate
+ * to Q1.31, and store 32 bit output.
+ */
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+ sizeof(int32_t));
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+ sizeof(int32_t));
+
+ return;
+ }
+
+ dp1 = (ae_f32 *)rp;
+ for (j = 0; j < nch; j++) {
+ /* Copy pointer and advance to next ch with dummy load */
+ dp0 = dp1;
+ AE_L32_XC(d0, dp1, -sizeof(ae_f32));
+
+ /* Reset and prime coefficient load, clear accumulator */
+ coefp = (ae_f16x4 *)cp;
+ u = AE_LA64_PP(coefp);
+ a0 = AE_ZERO64();
+
+ /* Compute FIR for current channel, four taps per iteration.
+ * Data is read from interleaved buffer with channels stride.
+ */
+ for (i = 0; i < taps_div_4; i++) {
+ /* Load four coefficients, aligning load via u */
+ AE_LA16X4_IP(coef4, u, coefp);
+
+ /* Load two data samples, place to high and
+ * low of data2.
+ */
+ AE_L32_XC(d0, dp0, inc);
+ AE_L32_XC(d1, dp0, inc);
+ data2 = AE_SEL32_LL(d0, d1);
+
+ /* Accumulate
+ * data2_h * coef4_3 + data2_l* coef4_2.
+ * The data is 32 bits Q1.31 and coefficient 16 bits
+ * Q1.15. The accumulator is Q17.47.
+ */
+ AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+
+ /* Repeat with next two samples */
+ AE_L32_XC(d0, dp0, inc);
+ AE_L32_XC(d1, dp0, inc);
+ data2 = AE_SEL32_LL(d0, d1);
+
+ /* Accumulate
+ * data2_h * coef4_1 + data2_l * coef4_0.
+ */
+ AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+ }
+
+ /* Scale FIR output with right shifts, round/saturate Q17.47
+ * to Q1.31, and store 32 bit output. Advance write
+ * pointer to next sample.
+ */
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+ sizeof(int32_t));
+ }
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+ const int taps_div_4, const int shift, const int nch)
+{
+ /* Compute one output sample per channel for a sub-filter with 32 bit
+ * coefficients at cp. Data is read from rp with circular addressing
+ * and results are written to wp0. Taps count is 4 * taps_div_4.
+ * Uses 6x 64 bit registers, 3x integers, 5x address pointers.
+ */
+ ae_f64 a0;
+ ae_f64 a1;
+ ae_f24x2 data2;
+ ae_f24x2 coef2;
+ ae_f24x2 d0;
+ ae_f24x2 d1;
+ ae_f24x2 *coefp;
+ ae_f24x2 *dp;
+ ae_f24 *dp1;
+ ae_f24 *dp0;
+ int i;
+ int j;
+ ae_f32 *wp = wp0;
+ const int inc = nch * sizeof(int32_t);
+
+ if (nch == 2) {
+ /* Move data pointer back by one sample to start from right
+ * channel sample. Discard read value d0.
+ */
+ dp = (ae_f24x2 *)rp;
+ AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
+
+ /* Reset coefficient pointer and clear accumulators */
+ coefp = (ae_f24x2 *)cp;
+ a0 = AE_ZERO64();
+ a1 = AE_ZERO64();
+
+ /* Compute FIR filter for both channels with four
+ * taps per every loop iteration. Two coefficients
+ * are loaded simultaneously. Data is read
+ * from interleaved buffer with stride of channels
+ * count.
+ */
+ for (i = 0; i < taps_div_4; i++) {
+ /* Load two coefficients. Coef2_h contains tap *coefp
+ * and coef2_l contains the next tap.
+ */
+ /* TODO: Ensure coefficients are 64 bits aligned */
+ AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+
+ /* Load two data samples from two channels */
+ AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
+ AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
+
+ /* Select to data2 the low halves of d0 and d1
+ * (successive left channel samples), then the high
+ * halves (right channel). Accumulate to a0 and a1
+ * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+ * data and the 32 bit coefficients are used as 24
+ * bits as Q1.23 values.
+ */
+ data2 = AE_SELP24_LL(d0, d1);
+ AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+ data2 = AE_SELP24_HH(d0, d1);
+ AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+ /* Repeat for next two taps */
+ AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+ AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
+ AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
+ data2 = AE_SELP24_LL(d0, d1);
+ AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+ data2 = AE_SELP24_HH(d0, d1);
+ AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+ }
+
+ /* Scale FIR output with right shifts, round/saturate
+ * to Q1.31, and store 32 bit output.
+ */
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+ sizeof(int32_t));
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+ sizeof(int32_t));
+
+ return;
+ }
+
+ dp1 = (ae_f24 *)rp;
+ for (j = 0; j < nch; j++) {
+ /* Copy pointer and advance to next ch with dummy load */
+ dp0 = dp1;
+ AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
+
+ /* Reset coefficient pointer and clear accumulator */
+ coefp = (ae_f24x2 *)cp;
+ a0 = AE_ZERO64();
+
+ /* Compute FIR filter for current channel with four
+ * taps per every loop iteration. Data is read from
+ * interleaved buffer with stride of channels count.
+ */
+ for (i = 0; i < taps_div_4; i++) {
+ /* Load two coefficients */
+ coef2 = *coefp++;
+
+ /* Load two data samples, place to high and
+ * low of data2.
+ */
+ AE_L32F24_XC(d0, dp0, inc);
+ AE_L32F24_XC(d1, dp0, inc);
+ data2 = AE_SELP24_LL(d0, d1);
+
+ /* Accumulate to a0 data2_h * coef2_h +
+ * data2_l * coef2_l. The Q1.31 bit data is used
+ * as Q1.23 from MSB side bits of the 32 bit
+ * word. The accumulator a0 is Q17.47.
+ */
+ AE_MULAAFD24_HH_LL(a0, data2, coef2);
+
+ /* Repeat the same for next two filter taps */
+ coef2 = *coefp++;
+ AE_L32F24_XC(d0, dp0, inc);
+ AE_L32F24_XC(d1, dp0, inc);
+ data2 = AE_SELP24_LL(d0, d1);
+ AE_MULAAFD24_HH_LL(a0, data2, coef2);
+ }
+
+ /* Scale FIR output with right shifts, round/saturate Q17.47
+ * to Q1.31, and store 32 bit output. Advance write
+ * pointer to next sample.
+ */
+ AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+ sizeof(int32_t));
+ }
+}
+
+#endif /* 32bit coefficients version */
+
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+ /* Run s->times blocks of one polyphase SRC stage: copy input into
+ * the FIR delay line, run fir_filter() for every sub-filter and
+ * copy the out delay to the output buffer as 32 bit words.
+ * Register use: 1x 64 bit, 16x integers, 7x address pointers.
+ */
+ ae_int32x2 q;
+ ae_f32 *rp;
+ ae_f32 *wp;
+ int i;
+ int n;
+ int m;
+ int n_wrap_buf;
+ int n_min;
+ struct src_state *fir = s->state;
+ struct src_stage *cfg = s->stage;
+ int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+ int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+ const void *cp; /* int16_t data if SRC_SHORT, else int32_t */
+ const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+ const int nch = s->nch;
+ const int nch_x_odm = cfg->odm * nch;
+ const int blk_in_words = nch * cfg->blk_in;
+ const int blk_out_words = nch * cfg->num_of_subfilters;
+ const int sz = sizeof(int32_t);
+ const int n_sz = -sizeof(int32_t);
+ const int rewind_sz = sz * (nch * (cfg->blk_in
+ + (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+ const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+ const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+ for (n = 0; n < s->times; n++) {
+ /* Copy blk_in_words input words to the FIR delay line */
+ m = blk_in_words;
+
+ /* Setup circular buffer for FIR input data delay */
+ AE_SETCBEGIN0(fir->fir_delay);
+ AE_SETCEND0(fir_end);
+
+ while (m > 0) {
+ /* Number of words until input buffer end address */
+ n_wrap_buf = s->x_end_addr - s->x_rptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Load 32 bits sample to accumulator,
+ * advance pointer.
+ */
+ AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);
+
+ /* Circular store, step fir_wp by n_sz */
+ AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+ }
+
+ /* Check for wrap of the input read pointer */
+ src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+ }
+
+ /* Do filter */
+ cp = cfg->coefs; /* Reset to 1st coefficient */
+ rp = (ae_f32 *)fir->fir_wp;
+
+ /* Do circular modification to pointer rp by amount of
+ * rewind to data start. Loaded value q is discarded.
+ */
+ AE_L32_XC(q, rp, rewind_sz);
+
+ /* Reset FIR output write pointer and compute all polyphase
+ * sub-filters.
+ */
+ wp = (ae_f32 *)fir->out_rp;
+ for (i = 0; i < cfg->num_of_subfilters; i++) {
+ fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+ wp += nch_x_odm;
+ cp += subfilter_size; /* void * step in bytes (GNU C) */
+ src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+ out_size);
+
+ /* Circular advance pointer rp by number of
+ * channels x input delay multiplier. Loaded value q
+ * is discarded.
+ */
+ AE_L32_XC(q, rp, nch_x_idm_sz);
+ }
+
+ /* Copy blk_out_words words from out delay to output */
+
+ /* Setup circular buffer for SRC out delay access */
+ AE_SETCBEGIN0(fir->out_delay);
+ AE_SETCEND0(out_delay_end);
+ m = blk_out_words;
+ while (m > 0) {
+ n_wrap_buf = s->y_end_addr - s->y_wptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Circular load followed by linear store,
+ * advance read and write pointers.
+ */
+ AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+ AE_S32_L_XP(q, (ae_int32 *)s->y_wptr, sz);
+ }
+
+ /* Check for wrap of the output write pointer */
+ src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+ }
+ }
+}
+
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+ /* Run s->times blocks of one polyphase SRC stage. Each input word is
+ * shifted left by 8 into the FIR delay line and each output word is
+ * shifted right by 8 to a sign extended 24 bit value.
+ * Register use: 1x 64 bit, 16x integers, 7x address pointers.
+ */
+ ae_int32x2 q;
+ ae_f32 *rp;
+ ae_f32 *wp;
+ int i;
+ int n;
+ int m;
+ int n_wrap_buf;
+ int n_min;
+
+ struct src_state *fir = s->state;
+ struct src_stage *cfg = s->stage;
+ int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+ int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+ const void *cp; /* int16_t data if SRC_SHORT, else int32_t */
+ const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+ const int nch = s->nch;
+ const int nch_x_odm = cfg->odm * nch;
+ const int blk_in_words = nch * cfg->blk_in;
+ const int blk_out_words = nch * cfg->num_of_subfilters;
+ const int sz = sizeof(int32_t);
+ const int n_sz = -sizeof(int32_t);
+ const int rewind_sz = sz * (nch * (cfg->blk_in
+ + (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+ const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+ const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+ const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+ for (n = 0; n < s->times; n++) {
+ /* Copy blk_in_words input words to the FIR delay line */
+ m = blk_in_words;
+
+ /* Setup circular buffer for FIR input data delay */
+ AE_SETCBEGIN0(fir->fir_delay);
+ AE_SETCEND0(fir_end);
+
+ while (m > 0) {
+ /* Number of words until input buffer end address */
+ n_wrap_buf = s->x_end_addr - s->x_rptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Load 32 bits sample and advance read
+ * pointer, then circular store the value
+ * shifted left by 8, step fir_wp by n_sz.
+ */
+ AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);
+ AE_S32_L_XC(AE_SLAI32(q, 8),
+ (ae_int32 *)fir->fir_wp, n_sz);
+ }
+
+ /* Check for wrap of the input read pointer */
+ src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+ }
+
+ /* Do filter */
+ cp = cfg->coefs; /* Reset to 1st coefficient */
+ rp = (ae_f32 *)fir->fir_wp;
+
+ /* Do circular modification to pointer rp by amount of
+ * rewind to data start. Loaded value q is discarded.
+ */
+ AE_L32_XC(q, rp, rewind_sz);
+
+ /* Reset FIR output write pointer and compute all polyphase
+ * sub-filters.
+ */
+ wp = (ae_f32 *)fir->out_rp;
+ for (i = 0; i < cfg->num_of_subfilters; i++) {
+ fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+ wp += nch_x_odm;
+ cp += subfilter_size; /* void * step in bytes (GNU C) */
+ src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+ out_size);
+
+ /* Circular advance pointer rp by number of
+ * channels x input delay multiplier. Loaded value q
+ * is discarded.
+ */
+ AE_L32_XC(q, rp, nch_x_idm_sz);
+ }
+
+ /* Copy blk_out_words words from out delay to output */
+
+ /* Setup circular buffer for SRC out delay access */
+ AE_SETCBEGIN0(fir->out_delay);
+ AE_SETCEND0(out_delay_end);
+ m = blk_out_words;
+ while (m > 0) {
+ n_wrap_buf = s->y_end_addr - s->y_wptr;
+ n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+ m -= n_min;
+ for (i = 0; i < n_min; i++) {
+ /* Circular load for 32 bit sample,
+ * advance read pointer.
+ */
+ AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+
+ /* Store value as shifted right by 8
+ * for sign extended 24 bit value,
+ * advance write pointer.
+ */
+ AE_S32_L_XP(AE_SRAI32(q, 8),
+ (ae_int32 *)s->y_wptr, sz);
+ }
+
+ /* Check for wrap of the output write pointer */
+ src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+ }
+ }
+}
+
+#endif
--
2.14.1
More information about the Sound-open-firmware
mailing list