[Sound-open-firmware] [PATCH 1/6] SRC: Files structure change and add Xtensa optimized versions

Thu Mar 8 15:22:17 CET 2018

This patch moves the generic common code from src_core.c/h to src.c/h and
places the generic C optimized filter in src_generic.c. The HiFi2 EP
version is in src_hifi2ep.c and the HiFi3 version is in src_hifi3.c. Use of
the Xtensa optimized versions requires the xt-xcc compiler.

The unused SRC in/out rates query code is removed. The 24 bit
coefficients are replaced by 32 bit coefficients, which are compatible
with the Xtensa fractional integer types.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
---
 src/audio/Makefile.am           |   4 +-
 src/audio/src.c                 | 346 ++++++++++++++++++--
 src/audio/{src_core.h => src.h} |  17 +-
 src/audio/src_config.h          |  57 +++-
 src/audio/src_core.c            | 676 ----------------------------------------
 src/audio/src_generic.c         | 435 ++++++++++++++++++++++++++
 src/audio/src_hifi2ep.c         | 562 +++++++++++++++++++++++++++++++++
 src/audio/src_hifi3.c           | 567 +++++++++++++++++++++++++++++++++
 8 files changed, 1938 insertions(+), 726 deletions(-)
 rename src/audio/{src_core.h => src.h} (93%)
 delete mode 100644 src/audio/src_core.c
 create mode 100644 src/audio/src_generic.c
 create mode 100644 src/audio/src_hifi2ep.c
 create mode 100644 src/audio/src_hifi3.c

diff --git a/src/audio/Makefile.am b/src/audio/Makefile.am
index bccedbf..ae58289 100644
--- a/src/audio/Makefile.am
+++ b/src/audio/Makefile.am
@@ -1006,7 +1006,9 @@ libaudio_a_SOURCES = \
 	fir.c \
 	tone.c \
 	src.c \
-	src_core.c \
+	src_generic.c \
+	src_hifi2ep.c \
+	src_hifi3.c \
 	mixer.c \
 	mux.c \
 	volume.c \
diff --git a/src/audio/src.c b/src/audio/src.c
index c7ac649..cca0cbc 100644
--- a/src/audio/src.c
+++ b/src/audio/src.c
@@ -43,7 +43,17 @@
 #include <reef/audio/component.h>
 #include <reef/audio/pipeline.h>
 #include <uapi/ipc.h>
-#include "src_core.h"
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_SHORT
+#include <reef/audio/coefficients/src/src_tiny_int16_define.h>
+#include <reef/audio/coefficients/src/src_tiny_int16_table.h>
+#else
+#include <reef/audio/coefficients/src/src_std_int32_define.h>
+#include <reef/audio/coefficients/src/src_std_int32_table.h>
+#endif
 
 #ifdef MODULE_TEST
 #include <stdio.h>
@@ -53,6 +63,10 @@
 #define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e)
 #define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e)
 
+/* The FIR maximum lengths are per channel so need to multiply them */
+#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE)
+#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE)
+
 /* src component private data */
 struct comp_data {
 	struct polyphase_src src;
@@ -63,14 +77,273 @@ struct comp_data {
 	int32_t *sbuf_w_ptr;
 	int32_t *sbuf_r_ptr;
 	int sbuf_avail;
-	void (* src_func)(struct comp_dev *dev,
+	void (*src_func)(struct comp_dev *dev,
 		struct comp_buffer *source,
 		struct comp_buffer *sink,
 		size_t *consumed,
 		size_t *produced);
-	void (* polyphase_func)(struct src_stage_prm *s);
+	void (*polyphase_func)(struct src_stage_prm *s);
 };
 
+/* Calculate ceil() for integer division */
+int src_ceil_divide(int a, int b)
+{
+	int c;
+
+	c = a / b;
+	if (c * b < a)
+		c++;
+
+	return c;
+}
+
+/* Calculates the needed FIR delay line length */
+static int src_fir_delay_length(struct src_stage *s)
+{
+	return s->subfilter_length + (s->num_of_subfilters - 1) * s->idm
+		+ s->blk_in;
+}
+
+/* Calculates the FIR output delay line length */
+static int src_out_delay_length(struct src_stage *s)
+{
+	return 1 + (s->num_of_subfilters - 1) * s->odm;
+}
+
+/* Returns index of a matching sample rate */
+static int src_find_fs(int fs_list[], int list_length, int fs)
+{
+	int i;
+
+	for (i = 0; i < list_length; i++) {
+		if (fs_list[i] == fs)
+			return i;
+	}
+	return -EINVAL;
+}
+
+/* Calculates buffers to allocate for a SRC mode */
+int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
+	int frames, int frames_is_for_source)
+{
+	struct src_stage *stage1;
+	struct src_stage *stage2;
+	int q;
+	int den;
+	int num;
+	int frames2;
+
+	if (nch > PLATFORM_MAX_CHANNELS) {
+		trace_src_error("che");
+		tracev_value(nch);
+		return -EINVAL;
+	}
+
+	a->nch = nch;
+	a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in);
+	a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out);
+
+	/* Check that both in and out rates are supported */
+	if (a->idx_in < 0 || a->idx_out < 0) {
+		trace_src_error("us1");
+		tracev_value(fs_in);
+		tracev_value(fs_out);
+		return -EINVAL;
+	}
+
+	stage1 = src_table1[a->idx_out][a->idx_in];
+	stage2 = src_table2[a->idx_out][a->idx_in];
+
+	/* Check from stage1 parameter for a deleted in/out rate combination.*/
+	if (stage1->filter_length < 1) {
+		trace_src_error("us2");
+		tracev_value(fs_in);
+		tracev_value(fs_out);
+		return -EINVAL;
+	}
+
+	a->fir_s1 = nch * src_fir_delay_length(stage1);
+	a->out_s1 = nch * src_out_delay_length(stage1);
+
+	/* Find out how many additional times the SRC can be executed
+	 * while having block size less or equal to max_frames.
+	 */
+	if (frames_is_for_source) {
+		/* Times that stage1 needs to run to input length of frames */
+		a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in);
+		q = frames / stage1->blk_in;
+		a->stage1_times = MAX(q, 1);
+		a->blk_in = a->stage1_times * stage1->blk_in;
+
+		/* Times that stage2 needs to run */
+		den = stage2->blk_in * stage1->blk_in;
+		num = frames * stage2->blk_out * stage1->blk_out;
+		frames2 = src_ceil_divide(num, den);
+		a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out);
+		q = frames2 / stage2->blk_out;
+		a->stage2_times = MAX(q, 1);
+		a->blk_out = a->stage2_times * stage2->blk_out;
+	} else {
+		/* Times that stage2 needs to run to output length of frames */
+		a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out);
+		q = frames / stage2->blk_out;
+		a->stage2_times = MAX(q, 1);
+		a->blk_out = a->stage2_times * stage2->blk_out;
+
+		/* Times that stage1 needs to run */
+		num = frames * stage2->blk_in * stage1->blk_in;
+		den = stage2->blk_out * stage1->blk_out;
+		frames2 = src_ceil_divide(num, den);
+		a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in);
+		q = frames2 / stage1->blk_in;
+		a->stage1_times = MAX(q, 1);
+		a->blk_in = a->stage1_times * stage1->blk_in;
+	}
+
+	if (stage2->filter_length == 1) {
+		a->fir_s2 = 0;
+		a->out_s2 = 0;
+		a->stage2_times = 0;
+		a->stage2_times_max = 0;
+		a->sbuf_length = 0;
+	} else {
+		a->fir_s2 = nch * src_fir_delay_length(stage2);
+		a->out_s2 = nch * src_out_delay_length(stage2);
+		/* 2x is an empirically tested length. Since the sink buffer
+		 * capability to receive samples varies a shorter stage 2 output
+		 * block will create a peak in internal buffer usage.
+		 */
+
+		/* TODO 1: Equation for needed length */
+		a->sbuf_length = 2 * nch * stage1->blk_out
+			* a->stage1_times_max;
+	}
+
+	a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
+	a->total = a->sbuf_length + a->src_multich;
+
+	return 0;
+}
+
+static void src_state_reset(struct src_state *state)
+{
+	state->fir_delay_size = 0;
+	state->out_delay_size = 0;
+}
+
+static int init_stages(struct src_stage *stage1, struct src_stage *stage2,
+	struct polyphase_src *src, struct src_param *p,
+	int n, int32_t *delay_lines_start)
+{
+	/* Clear FIR state */
+	src_state_reset(&src->state1);
+	src_state_reset(&src->state2);
+
+	src->number_of_stages = n;
+	src->stage1 = stage1;
+	src->stage2 = stage2;
+	if (n == 1 && stage1->blk_out == 0)
+		return -EINVAL;
+
+	/* Optimized SRC requires subfilter length multiple of 4 */
+	if (stage1->filter_length > 1 && (stage1->subfilter_length & 0x3) > 0)
+		return -EINVAL;
+
+	if (stage2->filter_length > 1 && (stage2->subfilter_length & 0x3) > 0)
+		return -EINVAL;
+
+	/* Delay line sizes */
+	src->state1.fir_delay_size = p->fir_s1;
+	src->state1.out_delay_size = p->out_s1;
+	src->state1.fir_delay = delay_lines_start;
+	src->state1.out_delay =
+		src->state1.fir_delay + src->state1.fir_delay_size;
+	/* Initialize to last ensures that circular wrap cannot happen
+	 * mid-frame. The size is multiple of channels count.
+	 */
+	src->state1.fir_wp = &src->state1.fir_delay[p->fir_s1 - 1];
+	src->state1.out_rp = src->state1.out_delay;
+	if (n > 1) {
+		src->state2.fir_delay_size = p->fir_s2;
+		src->state2.out_delay_size = p->out_s2;
+		src->state2.fir_delay =
+			src->state1.out_delay + src->state1.out_delay_size;
+		src->state2.out_delay =
+			src->state2.fir_delay + src->state2.fir_delay_size;
+		/* Initialize to last ensures that circular wrap cannot happen
+		 * mid-frame. The size is multiple of channels count.
+		 */
+		src->state2.fir_wp = &src->state2.fir_delay[p->fir_s2 - 1];
+		src->state2.out_rp = src->state2.out_delay;
+	} else {
+		src->state2.fir_delay_size = 0;
+		src->state2.out_delay_size = 0;
+		src->state2.fir_delay = NULL;
+		src->state2.out_delay = NULL;
+	}
+
+	/* Check the sizes are less than MAX */
+	if (src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH ||
+		src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH ||
+		src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH ||
+		src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH) {
+		src->state1.fir_delay = NULL;
+		src->state1.out_delay = NULL;
+		src->state2.fir_delay = NULL;
+		src->state2.out_delay = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void src_polyphase_reset(struct polyphase_src *src)
+{
+	src->number_of_stages = 0;
+	src->stage1 = NULL;
+	src->stage2 = NULL;
+	src_state_reset(&src->state1);
+	src_state_reset(&src->state2);
+}
+
+int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
+	int32_t *delay_lines_start)
+{
+	struct src_stage *stage1;
+	struct src_stage *stage2;
+	int n_stages;
+	int ret;
+
+	if (p->idx_in < 0 || p->idx_out < 0)
+		return -EINVAL;
+
+	/* Get setup for 2 stage conversion */
+	stage1 = src_table1[p->idx_out][p->idx_in];
+	stage2 = src_table2[p->idx_out][p->idx_in];
+	ret = init_stages(stage1, stage2, src, p, 2, delay_lines_start);
+	if (ret < 0)
+		return -EINVAL;
+
+	/* Get number of stages used for optimize opportunity. 2nd
+	 * stage length is one if conversion needs only one stage.
+	 * If input and output rate is the same return 0 to
+	 * use a simple copy function instead of 1 stage FIR with one
+	 * tap.
+	 */
+	n_stages = (src->stage2->filter_length == 1) ? 1 : 2;
+	if (p->idx_in == p->idx_out)
+		n_stages = 0;
+
+	/* If filter length for first stage is zero this is a deleted
+	 * mode from in/out matrix. Computing of such SRC mode needs
+	 * to be prevented.
+	 */
+	if (src->stage1->filter_length == 0)
+		return -EINVAL;
+
+	return n_stages;
+}
+
 /* Fallback function */
 static void src_fallback(struct comp_dev *dev, struct comp_buffer *source,
 	struct comp_buffer *sink, size_t *bytes_read, size_t *bytes_written)
@@ -91,8 +364,9 @@ static void src_2s_s32_default(struct comp_dev *dev,
 	int s2_blk_in;
 	int s2_blk_out;
 	struct comp_data *cd = comp_get_drvdata(dev);
-	int32_t *dest = (int32_t *) sink->w_ptr;
-	int32_t *src = (int32_t *) source->r_ptr;
+	int32_t *dest = (int32_t *)sink->w_ptr;
+	int32_t *src = (int32_t *)source->r_ptr;
+	int32_t *sbuf_addr = cd->delay_lines;
 	int32_t *sbuf_end_addr = &cd->delay_lines[cd->param.sbuf_length];
 	int32_t sbuf_size = cd->param.sbuf_length * sizeof(int32_t);
 	int nch = dev->params.channels;
@@ -107,6 +381,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
 
 	s1.x_end_addr = source->end_addr;
 	s1.x_size = source->size;
+	s1.y_addr = sbuf_addr;
 	s1.y_end_addr = sbuf_end_addr;
 	s1.y_size = sbuf_size;
 	s1.state = &cd->src.state1;
@@ -117,6 +392,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
 
 	s2.x_end_addr = sbuf_end_addr;
 	s2.x_size = sbuf_size;
+	s2.y_addr = sink->addr;
 	s2.y_end_addr = sink->end_addr;
 	s2.y_size = sink->size;
 	s2.state = &cd->src.state2;
@@ -125,14 +401,13 @@ static void src_2s_s32_default(struct comp_dev *dev,
 	s2.y_wptr = dest;
 	s2.nch = nch;
 
-
 	/* Test if 1st stage can be run with default block length to reach
 	 * the period length or just under it.
 	 */
 	s1.times = cd->param.stage1_times;
 	s1_blk_in = s1.times * cd->src.stage1->blk_in * nch;
 	s1_blk_out = s1.times * cd->src.stage1->blk_out * nch;
-	if ((avail_b >= s1_blk_in * sz) && (sbuf_free >= s1_blk_out)) {
+	if (avail_b >= s1_blk_in * sz && sbuf_free >= s1_blk_out) {
 		cd->polyphase_func(&s1);
 
 		cd->sbuf_w_ptr = s1.y_wptr;
@@ -147,8 +422,9 @@ static void src_2s_s32_default(struct comp_dev *dev,
 	s1.times = 1;
 	s1_blk_in = cd->src.stage1->blk_in * nch;
 	s1_blk_out = cd->src.stage1->blk_out * nch;
-	while ((n1 < cd->param.stage1_times_max) && (avail_b >= s1_blk_in * sz)
-		&& (sbuf_free >= s1_blk_out)) {
+	while (n1 < cd->param.stage1_times_max &&
+		avail_b >= s1_blk_in * sz &&
+		sbuf_free >= s1_blk_out) {
 		cd->polyphase_func(&s1);
 
 		cd->sbuf_w_ptr = s1.y_wptr;
@@ -163,7 +439,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
 	s2.times = cd->param.stage2_times;
 	s2_blk_in = s2.times * cd->src.stage2->blk_in * nch;
 	s2_blk_out = s2.times * cd->src.stage2->blk_out * nch;
-	if ((cd->sbuf_avail >= s2_blk_in) && (free_b >= s2_blk_out * sz)) {
+	if (cd->sbuf_avail >= s2_blk_in && free_b >= s2_blk_out * sz) {
 		cd->polyphase_func(&s2);
 
 		cd->sbuf_r_ptr = s2.x_rptr;
@@ -173,14 +449,13 @@ static void src_2s_s32_default(struct comp_dev *dev,
 		n2 = s2.times;
 	}
 
-
 	/* Run one block at time the remaining 2nd stage output */
 	s2.times = 1;
 	s2_blk_in = cd->src.stage2->blk_in * nch;
 	s2_blk_out = cd->src.stage2->blk_out * nch;
-	while ((n2 < cd->param.stage2_times_max)
-		&& (cd->sbuf_avail >= s2_blk_in)
-		&& (free_b >= s2_blk_out * sz)) {
+	while (n2 < cd->param.stage2_times_max &&
+		cd->sbuf_avail >= s2_blk_in &&
+		free_b >= s2_blk_out * sz) {
 		cd->polyphase_func(&s2);
 
 		cd->sbuf_r_ptr = s2.x_rptr;
@@ -205,10 +480,10 @@ static void src_1s_s32_default(struct comp_dev *dev,
 	int n_written = 0;
 
 	s1.times = cd->param.stage1_times;
-	s1.x_rptr = (int32_t *) source->r_ptr;
+	s1.x_rptr = (int32_t *)source->r_ptr;
 	s1.x_end_addr = source->end_addr;
 	s1.x_size = source->size;
-	s1.y_wptr = (int32_t *) sink->w_ptr;
+	s1.y_wptr = (int32_t *)sink->w_ptr;
 	s1.y_end_addr = sink->end_addr;
 	s1.y_size = sink->size;
 	s1.state = &cd->src.state1;
@@ -229,8 +504,8 @@ static void src_copy_s32_default(struct comp_dev *dev,
 	size_t *bytes_read, size_t *bytes_written)
 {
 	struct comp_data *cd = comp_get_drvdata(dev);
-	int32_t *src = (int32_t *) source->r_ptr;
-	int32_t *snk = (int32_t *) sink->w_ptr;
+	int32_t *src = (int32_t *)source->r_ptr;
+	int32_t *snk = (int32_t *)sink->w_ptr;
 	int nch = dev->params.channels;
 	int frames = cd->param.blk_in;
 	int n;
@@ -241,9 +516,10 @@ static void src_copy_s32_default(struct comp_dev *dev,
 
 	n = frames * nch;
 	while (n > 0) {
-		n_wrap_src = (int32_t *) source->end_addr - src;
-		n_wrap_snk = (int32_t *) sink->end_addr - snk;
-		n_wrap_min = (n_wrap_src < n_wrap_snk) ? n_wrap_src : n_wrap_snk;
+		n_wrap_src = (int32_t *)source->end_addr - src;
+		n_wrap_snk = (int32_t *)sink->end_addr - snk;
+		n_wrap_min = (n_wrap_src < n_wrap_snk) ?
+			n_wrap_src : n_wrap_snk;
 		n_copy = (n < n_wrap_min) ? n : n_wrap_min;
 		memcpy(snk, src, n_copy * sizeof(int32_t));
 
@@ -253,7 +529,6 @@ static void src_copy_s32_default(struct comp_dev *dev,
 		snk += n_copy;
 		src_circ_inc_wrap(&src, source->end_addr, source->size);
 		src_circ_inc_wrap(&snk, sink->end_addr, sink->size);
-
 	}
 	*bytes_read = frames * nch * sizeof(int32_t);
 	*bytes_written = frames * nch * sizeof(int32_t);
@@ -263,7 +538,7 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
 {
 	struct comp_dev *dev;
 	struct sof_ipc_comp_src *src;
-	struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *) comp;
+	struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *)comp;
 	struct comp_data *cd;
 
 	trace_src("new");
@@ -276,14 +551,14 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
 
 	dev = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM,
 		COMP_SIZE(struct sof_ipc_comp_src));
-	if (dev == NULL)
+	if (!dev)
 		return NULL;
 
-	src = (struct sof_ipc_comp_src *) &dev->comp;
+	src = (struct sof_ipc_comp_src *)&dev->comp;
 	memcpy(src, ipc_src, sizeof(struct sof_ipc_comp_src));
 
 	cd = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM, sizeof(*cd));
-	if (cd == NULL) {
+	if (!cd) {
 		rfree(dev);
 		return NULL;
 	}
@@ -306,7 +581,7 @@ static void src_free(struct comp_dev *dev)
 	trace_src("fre");
 
 	/* Free dynamically reserved buffers for SRC algorithm */
-	if (cd->delay_lines != NULL)
+	if (cd->delay_lines)
 		rfree(cd->delay_lines);
 
 	rfree(cd);
@@ -347,7 +622,8 @@ static int src_params(struct comp_dev *dev)
 	}
 
 	/* Calculate source and sink rates, one rate will come from IPC new
-	 * and the other from params. */
+	 * and the other from params.
+	 */
 	if (src->source_rate == 0) {
 		/* params rate is source rate */
 		source_rate = params->rate;
@@ -383,12 +659,12 @@ static int src_params(struct comp_dev *dev)
 	}
 
 	/* free any existing delay lines. TODO reuse if same size */
-	if (cd->delay_lines != NULL)
+	if (cd->delay_lines)
 		rfree(cd->delay_lines);
 
 	cd->delay_lines = rballoc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM,
 		delay_lines_size);
-	if (cd->delay_lines == NULL) {
+	if (!cd->delay_lines) {
 		trace_src_error("sr3");
 		trace_value(delay_lines_size);
 		return -EINVAL;
@@ -424,7 +700,6 @@ static int src_params(struct comp_dev *dev)
 		trace_src("SFa");
 		cd->src_func = src_fallback;
 		return -EINVAL;
-		break;
 	}
 
 	/* Calculate period size based on config. First make sure that
@@ -438,7 +713,7 @@ static int src_params(struct comp_dev *dev)
 	 * buffer_set_size will return an error if the required length would
 	 * be too long.
 	 */
-	q = src_ceil_divide(cd->param.blk_out, (int) dev->frames) + 1;
+	q = src_ceil_divide(cd->param.blk_out, (int)dev->frames) + 1;
 
 	/* Configure downstream buffer */
 	sink = list_first_item(&dev->bsink_list, struct comp_buffer,
@@ -459,7 +734,6 @@ static int src_params(struct comp_dev *dev)
 		return -EINVAL;
 	}
 
-
 	return 0;
 }
 
@@ -518,7 +792,8 @@ static int src_copy(struct comp_dev *dev)
 
 	/* make sure source component buffer has enough data available and that
 	 * the sink component buffer has enough free bytes for copy. Also
-	 * check for XRUNs */
+	 * check for XRUNs.
+	 */
 	if (source->avail < need_source) {
 		trace_src_error("xru");
 		return -EIO;	/* xrun */
@@ -530,6 +805,9 @@ static int src_copy(struct comp_dev *dev)
 
 	cd->src_func(dev, source, sink, &consumed, &produced);
 
+	tracev_value(consumed >> 3);
+	tracev_value(produced >> 3);
+
 	/* Calc new free and available if data was processed. These
 	 * functions must not be called with 0 consumed/produced.
 	 */
diff --git a/src/audio/src_core.h b/src/audio/src.h
similarity index 93%
rename from src/audio/src_core.h
rename to src/audio/src.h
index 3ea6028..3208693 100644
--- a/src/audio/src_core.h
+++ b/src/audio/src.h
@@ -29,8 +29,8 @@
  *
  */
 
-#ifndef SRC_CORE_H
-#define SRC_CORE_H
+#ifndef SRC_H
+#define SRC_H
 
 #define MAX(a, b) (((a) > (b)) ? (a) : (b))
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -68,12 +68,12 @@ struct src_stage {
 };
 
 struct src_state {
-	int fir_delay_size;
-	int out_delay_size;
-	int fir_wi;
-	int out_ri;
+	int fir_delay_size;	/* samples */
+	int out_delay_size;	/* samples */
 	int32_t *fir_delay;
 	int32_t *out_delay;
+	int32_t *fir_wp;
+	int32_t *out_rp;
 };
 
 struct polyphase_src {
@@ -91,6 +91,7 @@ struct src_stage_prm {
 	int32_t *x_end_addr;
 	size_t x_size;
 	int32_t *y_wptr;
+	int32_t *y_addr;
 	int32_t *y_end_addr;
 	size_t y_size;
 	struct src_state *state;
@@ -100,13 +101,13 @@ struct src_stage_prm {
 static inline void src_circ_inc_wrap(int32_t **ptr, int32_t *end, size_t size)
 {
 	if (*ptr >= end)
-		*ptr = (int32_t *) ((size_t) * ptr - size);
+		*ptr = (int32_t *)((size_t)*ptr - size);
 }
 
 static inline void src_circ_dec_wrap(int32_t **ptr, int32_t *addr, size_t size)
 {
 	if (*ptr < addr)
-		*ptr = (int32_t *) ((size_t) * ptr + size);
+		*ptr = (int32_t *)((size_t)*ptr + size);
 }
 
 void src_polyphase_reset(struct polyphase_src *src);
diff --git a/src/audio/src_config.h b/src/audio/src_config.h
index 3ad4c78..65d6247 100644
--- a/src/audio/src_config.h
+++ b/src/audio/src_config.h
@@ -34,14 +34,57 @@
 
 #include <config.h>
 
-#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL || defined CONFIG_BROADWELL || defined CONFIG_HASWELL
-#define SRC_SHORT 1
-#include <reef/audio/coefficients/src/src_tiny_int16_define.h>
-#include <reef/audio/coefficients/src/src_tiny_int16_table.h>
+/* If the next defines are set to 1 the SRC is configured automatically.
+ * Setting them to zero temporarily is useful for testing needs.
+ * Setting SRC_AUTOARCH to 0 allows to manually set the code variant.
+ * Setting SRC_AUTOCOEF to 0 allows to select the coefficient type.
+ */
+#define SRC_AUTOARCH    1
+#define SRC_AUTOCOEF    1
+
+/* Manually force a code variant when SRC_AUTOARCH is set to zero. These
+ * are useful in code debugging.
+ */
+#if SRC_AUTOARCH == 0
+#define SRC_GENERIC	1
+#define SRC_HIFIEP	0
+#define SRC_HIFI3	0
+#endif
+#if SRC_AUTOCOEF == 0
+#define SRC_SHORT	0
+#endif
+
+/* Select 16 bit coefficients for specific platforms.
+ * Otherwise 32 bits is the default.
+ */
+#if SRC_AUTOCOEF == 1
+#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL \
+	|| defined CONFIG_BROADWELL || defined CONFIG_HASWELL
+#define SRC_SHORT	1     /* Use int16_t filter coefficients */
 #else
-#define SHORT_SHORT 0
-#include <reef/audio/coefficients/src/src_std_int24_define.h>
-#include <reef/audio/coefficients/src/src_std_int24_table.h>
+#define SRC_SHORT	0     /* Use int32_t filter coefficients */
+#endif
+#endif
+
+/* Select optimized code variant when xt-xcc compiler is used */
+#if SRC_AUTOARCH == 1
+#if defined __XCC__
+#include <xtensa/config/core-isa.h>
+#define SRC_GENERIC	0
+#if XCHAL_HAVE_HIFI2EP == 1
+#define SRC_HIFIEP	1
+#define SRC_HIFI3	0
+#endif
+#if XCHAL_HAVE_HIFI3 == 1
+#define SRC_HIFI3	1
+#define SRC_HIFIEP	0
+#endif
+#else
+/* GCC */
+#define SRC_GENERIC	1
+#define SRC_HIFIEP	0
+#define SRC_HIFI3	0
+#endif
 #endif
 
 #endif
diff --git a/src/audio/src_core.c b/src/audio/src_core.c
deleted file mode 100644
index d8b9a3d..0000000
--- a/src/audio/src_core.c
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
- * Copyright (c) 2016, Intel Corporation
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *   * Neither the name of the Intel Corporation nor the
- *     names of its contributors may be used to endorse or promote products
- *     derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
- *
- */
-
-/* Non optimized default C implementation guaranteed to work on any
- * architecture.
- */
-
-#include <stdint.h>
-
-#ifdef MODULE_TEST
-#include <stdio.h>
-#endif
-
-#include <reef/alloc.h>
-#include <reef/audio/format.h>
-#include <reef/math/numbers.h>
-#include "src_core.h"
-#include "src_config.h"
-
-#define trace_src(__e) trace_event(TRACE_CLASS_SRC, __e)
-#define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e)
-#define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e)
-
-/* TODO: These should be defined somewhere else. */
-#define SOF_RATES_LENGTH 15
-int sof_rates[SOF_RATES_LENGTH] = {8000, 11025, 12000, 16000, 18900,
-	22050, 24000, 32000, 44100, 48000, 64000, 88200, 96000, 176400,
-	192000};
-
-/* The FIR maximum lengths are per channel so need to multiply them */
-#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE)
-#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE)
-
-/* Calculate ceil() for integer division */
-int src_ceil_divide(int a, int b)
-{
-	int c;
-
-	c = a / b;
-	if (c * b < a)
-		c++;
-
-	return c;
-}
-
-/* Calculates the needed FIR delay line length */
-static int src_fir_delay_length(struct src_stage *s)
-{
-	return s->subfilter_length + (s->num_of_subfilters - 1) * s->idm
-		+ s->blk_in;
-}
-
-/* Calculates the FIR output delay line length */
-static int src_out_delay_length(struct src_stage *s)
-{
-
-	return 1 + (s->num_of_subfilters - 1) * s->odm;
-}
-
-/* Returns index of a matching sample rate */
-static int src_find_fs(int fs_list[], int list_length, int fs)
-{
-	int i;
-
-	for (i = 0; i < list_length; i++) {
-		if (fs_list[i] == fs)
-			return i;
-	}
-	return -EINVAL;
-}
-
-/* Match SOF and defined SRC input rates into a bit mask */
-int32_t src_input_rates(void)
-{
-	int n;
-	int b;
-	int mask = 0;
-
-	for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) {
-		b = (src_find_fs(src_in_fs, NUM_IN_FS, sof_rates[n]) >= 0)
-			? 1 : 0;
-		mask = (mask << 1) | b;
-	}
-	return mask;
-}
-
-/* Match SOF and defined SRC output rates into a bit mask */
-int32_t src_output_rates(void)
-{
-	int n;
-	int b;
-	int mask = 0;
-
-	for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) {
-		b = (src_find_fs(src_out_fs, NUM_OUT_FS, sof_rates[n]) >= 0)
-			? 1 : 0;
-		mask = (mask << 1) | b;
-	}
-	return mask;
-}
-
-/* Calculates buffers to allocate for a SRC mode */
-int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
-	int frames, int frames_is_for_source)
-{
-	struct src_stage *stage1;
-	struct src_stage *stage2;
-	int q;
-	int den;
-	int num;
-	int frames2;
-
-	if (nch > PLATFORM_MAX_CHANNELS) {
-		trace_src_error("che");
-		tracev_value(nch);
-		return -EINVAL;
-	}
-
-	a->nch = nch;
-	a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in);
-	a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out);
-
-	/* Check that both in and out rates are supported */
-	if ((a->idx_in < 0) || (a->idx_out < 0)) {
-		trace_src_error("us1");
-		tracev_value(fs_in);
-		tracev_value(fs_out);
-		return -EINVAL;
-	}
-
-	stage1 = src_table1[a->idx_out][a->idx_in];
-	stage2 = src_table2[a->idx_out][a->idx_in];
-
-	/* Check from stage1 parameter for a deleted in/out rate combination.*/
-	if (stage1->filter_length < 1) {
-		trace_src_error("us2");
-		tracev_value(fs_in);
-		tracev_value(fs_out);
-		return -EINVAL;
-	}
-
-	a->fir_s1 = nch * src_fir_delay_length(stage1);
-	a->out_s1 = nch * src_out_delay_length(stage1);
-
-	/* Find out how many additional times the SRC can be executed
-	   while having block size less or equal to max_frames.
-	 */
-	if (frames_is_for_source) {
-		/* Times that stage1 needs to run to input length of frames */
-		a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in);
-		q = frames / stage1->blk_in;
-		a->stage1_times = MAX(q, 1);
-		a->blk_in = a->stage1_times * stage1->blk_in;
-
-		/* Times that stage2 needs to run */
-		den = stage2->blk_in * stage1->blk_in;
-		num = frames * stage2->blk_out * stage1->blk_out;
-		frames2 = src_ceil_divide(num, den);
-		a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out);
-		q = frames2 / stage2->blk_out;
-		a->stage2_times = MAX(q, 1);
-		a->blk_out = a->stage2_times * stage2->blk_out;
-	} else {
-		/* Times that stage2 needs to run to output length of frames */
-		a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out);
-		q = frames / stage2->blk_out;
-		a->stage2_times = MAX(q, 1);
-		a->blk_out = a->stage2_times * stage2->blk_out;
-
-		/* Times that stage1 needs to run */
-		num = frames * stage2->blk_in * stage1->blk_in;
-		den = stage2->blk_out * stage1->blk_out;
-		frames2 = src_ceil_divide(num, den);
-		a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in);
-		q = frames2 / stage1->blk_in;
-		a->stage1_times = MAX(q, 1);
-		a->blk_in = a->stage1_times * stage1->blk_in;
-	}
-
-	if (stage2->filter_length == 1) {
-		a->fir_s2 = 0;
-		a->out_s2 = 0;
-		a->stage2_times = 0;
-		a->stage2_times_max = 0;
-		a->sbuf_length = 0;
-	} else {
-		a->fir_s2 = nch * src_fir_delay_length(stage2);
-		a->out_s2 = nch * src_out_delay_length(stage2);
-		/* 2x is an empirically tested length. Since the sink buffer
-		 * capability to receive samples varies a shorter stage 2 output
-		 * block will create a peak in internal buffer usage.
-		 */
-		a->sbuf_length = 2 * nch * stage1->blk_out * a->stage1_times_max;
-	}
-
-	a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
-	a->total = a->sbuf_length + a->src_multich;
-
-	return 0;
-}
-
-static void src_state_reset(struct src_state *state)
-{
-
-	state->fir_delay_size = 0;
-	state->out_delay_size = 0;
-	state->fir_wi = 0;
-	state->out_ri = 0;
-}
-
-static int init_stages(
-	struct src_stage *stage1, struct src_stage *stage2,
-	struct polyphase_src *src, struct src_param *p,
-	int n, int32_t *delay_lines_start)
-{
-	/* Clear FIR state */
-	src_state_reset(&src->state1);
-	src_state_reset(&src->state2);
-
-	src->number_of_stages = n;
-	src->stage1 = stage1;
-	src->stage2 = stage2;
-	if ((n == 1) && (stage1->blk_out == 0))
-		return -EINVAL;
-
-	/* Delay line sizes */
-	src->state1.fir_delay_size = p->fir_s1;
-	src->state1.out_delay_size = p->out_s1;
-	src->state1.fir_delay = delay_lines_start;
-	src->state1.out_delay =
-		src->state1.fir_delay + src->state1.fir_delay_size;
-	if (n > 1) {
-		src->state2.fir_delay_size = p->fir_s2;
-		src->state2.out_delay_size = p->out_s2;
-		src->state2.fir_delay =
-			src->state1.out_delay + src->state1.out_delay_size;
-		src->state2.out_delay =
-			src->state2.fir_delay + src->state2.fir_delay_size;
-	} else {
-		src->state2.fir_delay_size = 0;
-		src->state2.out_delay_size = 0;
-		src->state2.fir_delay = NULL;
-		src->state2.out_delay = NULL;
-	}
-
-	/* Check the sizes are less than MAX */
-	if ((src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
-		|| (src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)
-		|| (src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
-		|| (src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)) {
-		src->state1.fir_delay = NULL;
-		src->state1.out_delay = NULL;
-		src->state2.fir_delay = NULL;
-		src->state2.out_delay = NULL;
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-void src_polyphase_reset(struct polyphase_src *src)
-{
-
-	src->number_of_stages = 0;
-	src->stage1 = NULL;
-	src->stage2 = NULL;
-	src_state_reset(&src->state1);
-	src_state_reset(&src->state2);
-}
-
-int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
-	int32_t *delay_lines_start)
-{
-	struct src_stage *stage1;
-	struct src_stage *stage2;
-	int n_stages;
-	int ret;
-
-	if ((p->idx_in < 0) || (p->idx_out < 0)) {
-		return -EINVAL;
-	}
-
-	/* Get setup for 2 stage conversion */
-	stage1 = src_table1[p->idx_out][p->idx_in];
-	stage2 = src_table2[p->idx_out][p->idx_in];
-	ret = init_stages(stage1, stage2, src, p, 2, delay_lines_start);
-	if (ret < 0)
-		return -EINVAL;
-
-	/* Get number of stages used for optimize opportunity. 2nd
-	 * stage length is one if conversion needs only one stage.
-	 * If input and output rate is the same return 0 to
-	 * use a simple copy function instead of 1 stage FIR with one
-	 * tap.
-	 */
-	n_stages = (src->stage2->filter_length == 1) ? 1 : 2;
-	if (p->idx_in == p->idx_out)
-		n_stages = 0;
-
-	/* If filter length for first stage is zero this is a deleted
-	 * mode from in/out matrix. Computing of such SRC mode needs
-	 * to be prevented.
-	 */
-	if (src->stage1->filter_length == 0)
-		return -EINVAL;
-
-	return n_stages;
-}
-
-#if SRC_SHORT == 1
-
-/* Calculate a FIR filter part that does not need circular modification */
-
-static inline void fir_part(int64_t y[], int *id, int *ic,
-	const int32_t data[], const int16_t coef[], int nch_x_taps, int nch)
-{
-	int64_t tap0;
-	int64_t tap1;
-	int n;
-	int64_t a = 0;
-	int64_t b = 0;
-	int c = *ic;
-	int d = *id;
-	int d_end = d - nch_x_taps;
-
-	/* Data is Q1.31, coef is Q1.15, product is Q2.46 */
-	if (nch == 2) {
-		for (n = 0; n < (nch_x_taps >> 2); n++) {
-			tap0 = coef[c++];
-			tap1 = coef[c++];
-			b += data[d--] * tap0;
-			a += data[d--] * tap0;
-			b += data[d--] * tap1;
-			a += data[d--] * tap1;
-		}
-		if (d > d_end) {
-			tap0 = coef[c++];
-			b += data[d--] * tap0;
-			a += data[d--] * tap0;
-		}
-		y[1] += b;
-		y[0] += a;
-	} else {
-		while (d > d_end) {
-			tap0 = coef[c++];
-			for (n = nch - 1; n >= 0; n--)
-				y[n] += data[d--] * tap0;
-		}
-	}
-	*ic = c;
-	*id = d;
-}
-
-#else
-
-static inline void fir_part(int64_t y[], int *id, int *ic,
-	const int32_t data[], const int32_t coef[], int nch_x_taps, int nch)
-{
-	int64_t tap0;
-	int64_t tap1;
-	int n;
-	int64_t a = 0;
-	int64_t b = 0;
-	int c = *ic;
-	int d = *id;
-	int d_end = d - nch_x_taps;
-
-	/* Data is Q1.31, coef is Q1.23, product is Q2.54 */
-	if (nch == 2) {
-		for (n = 0; n < (nch_x_taps >> 2); n++) {
-			tap0 = coef[c++];
-			tap1 = coef[c++];
-			b += data[d--] * tap0;
-			a += data[d--] * tap0;
-			b += data[d--] * tap1;
-			a += data[d--] * tap1;
-		}
-		if (d > d_end) {
-			tap0 = coef[c++];
-			b += data[d--] * tap0;
-			a += data[d--] * tap0;
-		}
-		y[1] += b;
-		y[0] += a;
-	} else {
-		while (d > d_end) {
-			tap0 = coef[c++];
-			for (n = nch - 1; n >= 0; n--)
-				y[n] += data[d--] * tap0;
-		}
-	}
-	*ic = c;
-	*id = d;
-}
-
-#endif
-
-#if SRC_SHORT == 1
-
-static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
-	int32_t out_delay[], const int16_t coefs[], int dsm1, int taps,
-	int shift, int nch)
-{
-	int n2;
-	int i;
-	int64_t y[PLATFORM_MAX_CHANNELS];
-	int ri = ri0;
-	int wi = wi0;
-	int n1 = ri0 + 1; /* Convert to number of sequential frames */
-	int qshift = 15 + shift; /* Q2.46 -> Q2.31 */
-	int32_t rnd = 1 << (qshift - 1); /* Half LSB */
-	int nch_x_taps = nch * taps;
-
-	/* Initialize to half LSB for rounding */
-	for (i = 0; i < nch; i++)
-		y[i] = rnd;
-
-	if (n1 >= nch_x_taps) {
-		fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
-	} else {
-		n2 = nch_x_taps - n1;
-		fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
-		ri = dsm1;
-		fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
-	}
-
-	for (i = 0; i < nch; i++)
-		out_delay[wi++] = sat_int32(y[i] >> qshift);
-}
-#else
-
-static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
-	int32_t out_delay[], const int32_t coefs[], int dsm1, int taps,
-	int shift, int nch)
-{
-	int n2;
-	int i;
-	int64_t y[PLATFORM_MAX_CHANNELS];
-	int ri = ri0;
-	int wi = wi0;
-	int n1 = ri0 + 1; /* Convert to number of sequential frames */
-	int qshift = 23 + shift; /* Q2.54 -> Q2.31 */
-	int32_t rnd = 1 << (qshift - 1); /* Half LSB */
-	int nch_x_taps = nch * taps;
-
-	/* Initialize to half LSB for rounding */
-	for (i = 0; i < nch; i++)
-		y[i] = rnd;
-
-	if (n1 >= nch_x_taps) {
-		fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
-	} else {
-		n2 = nch_x_taps - n1;
-		fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
-		ri = dsm1;
-		fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
-	}
-
-	for (i = 0; i < nch; i++)
-		out_delay[wi++] = sat_int32(y[i] >> qshift);
-
-}
-
-#endif
-
-void src_polyphase_stage_cir(struct src_stage_prm * s)
-{
-	struct src_state *fir = s->state;
-	struct src_stage *cfg = s->stage;
-	int n;
-	int m;
-	int f;
-	int ci;
-	int ri;
-	int n_wrap_fir;
-	int n_wrap_buf;
-	int n_wrap_min;
-	int n_min;
-	int wi;
-	const void *coef = cfg->coefs;
-	int32_t *in_delay = fir->fir_delay;
-	int32_t *out_delay = fir->out_delay;
-	int dsm1 = fir->fir_delay_size - 1;
-	int shift = cfg->shift;
-	int nch = s->nch;
-	int rewind = -nch * (cfg->blk_in
-		+ (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
-	int nch_x_idm = cfg->idm * nch;
-	int nch_x_odm = cfg->odm * nch;
-	size_t sz = sizeof(int32_t);
-	int blk_in_bytes = nch * cfg->blk_in * sz;
-	int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
-
-
-	for (n = 0; n < s->times; n++) {
-		/* Input data */
-		m = blk_in_bytes;
-		while (m > 0) {
-			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
-			n_wrap_buf = s->x_end_addr - s->x_rptr;
-			n_wrap_min = (n_wrap_fir < n_wrap_buf)
-				? n_wrap_fir : n_wrap_buf;
-			n_min = (m < n_wrap_min) ? m : n_wrap_min;
-			while (n_min > 0) {
-				fir->fir_delay[fir->fir_wi++] = *s->x_rptr;
-				s->x_rptr++;
-				n_min -= sz;
-				m -= sz;
-			}
-			/* Check for wrap */
-			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
-			if (fir->fir_wi == fir->fir_delay_size)
-				fir->fir_wi = 0;
-		}
-
-		/* Filter */
-		ci = 0; /* Reset to 1st coefficient */
-		ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
-		if (ri < 0)
-			ri += fir->fir_delay_size;
-
-		wi = fir->out_ri;
-		for (f = 0; f < cfg->num_of_subfilters; f++) {
-			fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
-				dsm1, cfg->subfilter_length, shift, nch);
-
-			wi += nch_x_odm;
-			if (wi >= fir->out_delay_size)
-				wi -= fir->out_delay_size;
-
-			ri += nch_x_idm; /* Next sub-filter start */
-			if (ri >= fir->fir_delay_size)
-				ri -= fir->fir_delay_size;
-		}
-
-		/* Output */
-		m = blk_out_bytes;
-		while (m > 0) {
-			n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
-			n_wrap_buf = s->y_end_addr - s->y_wptr;
-			n_wrap_min = (n_wrap_fir < n_wrap_buf)
-				? n_wrap_fir : n_wrap_buf;
-			n_min = (m < n_wrap_min) ? m : n_wrap_min;
-			while (n_min > 0) {
-				*s->y_wptr = fir->out_delay[fir->out_ri++];
-				s->y_wptr++;
-				n_min -= sz;
-				m -= sz;
-			}
-			/* Check wrap */
-			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
-			if (fir->out_ri == fir->out_delay_size)
-				fir->out_ri = 0;
-		}
-	}
-}
-
-void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
-{
-	struct src_state *fir = s->state;
-	struct src_stage *cfg = s->stage;
-	int n;
-	int m;
-	int f;
-	int ci;
-	int ri;
-	int n_wrap_fir;
-	int n_wrap_buf;
-	int n_wrap_min;
-	int n_min;
-	int wi;
-	const void *coef = cfg->coefs;
-	int32_t *in_delay = fir->fir_delay;
-	int32_t *out_delay = fir->out_delay;
-	int dsm1 = fir->fir_delay_size - 1;
-	int shift = cfg->shift;
-	int nch = s->nch;
-	int rewind = -nch * (cfg->blk_in
-		+ (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
-	int nch_x_idm = cfg->idm * nch;
-	int nch_x_odm = cfg->odm * nch;
-	size_t sz = sizeof(int32_t);
-	int blk_in_bytes = nch * cfg->blk_in * sz;
-	int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
-
-	for (n = 0; n < s->times; n++) {
-		/* Input data */
-		m = blk_in_bytes;
-		while (m > 0) {
-			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
-			n_wrap_buf = s->x_end_addr - s->x_rptr;
-			n_wrap_min = (n_wrap_fir < n_wrap_buf)
-				? n_wrap_fir : n_wrap_buf;
-			n_min = (m < n_wrap_min) ? m : n_wrap_min;
-			while (n_min > 0) {
-				fir->fir_delay[fir->fir_wi++] = *s->x_rptr << 8;
-				s->x_rptr++;
-				n_min -= sz;
-				m -= sz;
-			}
-			/* Check for wrap */
-			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
-			if (fir->fir_wi == fir->fir_delay_size)
-				fir->fir_wi = 0;
-		}
-
-		/* Filter */
-		ci = 0; /* Reset to 1st coefficient */
-		ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
-		if (ri < 0)
-			ri += fir->fir_delay_size;
-
-		wi = fir->out_ri;
-		for (f = 0; f < cfg->num_of_subfilters; f++) {
-			fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
-				dsm1, cfg->subfilter_length, shift, nch);
-
-			wi += nch_x_odm;
-			if (wi >= fir->out_delay_size)
-				wi -= fir->out_delay_size;
-
-			ri += nch_x_idm; /* Next sub-filter start */
-			if (ri >= fir->fir_delay_size)
-				ri -= fir->fir_delay_size;
-		}
-
-		/* Output */
-		m = blk_out_bytes;
-		while (m > 0) {
-			n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
-			n_wrap_buf = s->y_end_addr - s->y_wptr;
-			n_wrap_min = (n_wrap_fir < n_wrap_buf)
-				? n_wrap_fir : n_wrap_buf;
-			n_min = (m < n_wrap_min) ? m : n_wrap_min;
-			while (n_min > 0) {
-				*s->y_wptr = fir->out_delay[fir->out_ri++] >> 8;
-				s->y_wptr++;
-				n_min -= sz;
-				m -= sz;
-			}
-			/* Check wrap */
-			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
-			if (fir->out_ri == fir->out_delay_size)
-				fir->out_ri = 0;
-		}
-	}
-
-}
diff --git a/src/audio/src_generic.c b/src/audio/src_generic.c
new file mode 100644
index 0000000..9caa090
--- /dev/null
+++ b/src/audio/src_generic.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the Intel Corporation nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* Default C implementation guaranteed to work on any
+ * architecture.
+ */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_GENERIC
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0,
+	int32_t *fir_start, int32_t *fir_end, const int fir_delay_length,
+	const int taps_x_nch, const int shift, const int nch)
+{
+	int64_t y0;
+	int64_t y1;
+	int32_t *data;
+	const int16_t *coef;
+	int i;
+	int j;
+	int n1;
+	int n2;
+	int frames;
+	const int qshift = 15 + shift; /* Q2.46 -> Q2.31 */
+	const int32_t rnd = 1 << (qshift - 1); /* Half LSB */
+	int32_t *d = rp;
+	int32_t *wp = wp0;
+
+	/* Check for 2ch FIR case */
+	if (nch == 2) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d - 1;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		y1 = rnd;
+		coef = (const int16_t *)cp;
+		frames = fir_end - data; /* Frames until wrap */
+		n1 = ((taps_x_nch < frames) ? taps_x_nch : frames) >> 1;
+		n2 = (taps_x_nch >> 1) - n1;
+
+		/* The FIR is calculated as Q1.15 x Q1.31 -> Q2.46. The
+		 * output shift includes the shift by 15 for Qx.46 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i++) {
+			y0 += (int64_t)(*coef) * (*data);
+			data++;
+			y1 += (int64_t)(*coef) * (*data);
+			data++;
+			coef++;
+		}
+		if (data == fir_end)
+			data = fir_start;
+
+		for (i = 0; i < n2; i++) {
+			y0 += (int64_t)(*coef) * (*data);
+			data++;
+			y1 += (int64_t)(*coef) * (*data);
+			data++;
+			coef++;
+		}
+
+		*wp = sat_int32(y1 >> qshift);
+		*(wp + 1) = sat_int32(y0 >> qshift);
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d--;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		coef = (const int16_t *)cp;
+		frames = fir_end - data + nch - j - 1; /* Frames until wrap */
+		n1 = (taps_x_nch < frames) ? taps_x_nch : frames;
+		n2 = taps_x_nch - n1;
+
+		/* The FIR is calculated as Q1.15 x Q1.31 -> Q2.46. The
+		 * output shift includes the shift by 15 for Qx.46 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i += nch) {
+			y0 += (int64_t)(*coef) * (*data);
+			coef++;
+			data += nch;
+		}
+		if (data >= fir_end)
+			data -= fir_delay_length;
+
+		for (i = 0; i < n2; i += nch) {
+			y0 += (int64_t)(*coef) * (*data);
+			coef++;
+			data += nch;
+		}
+
+		*wp = sat_int32(y0 >> qshift);
+		wp++;
+	}
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0,
+	int32_t *fir_start, int32_t *fir_end, int fir_delay_length,
+	const int taps_x_nch, const int shift, const int nch)
+{
+	int64_t y0;
+	int64_t y1;
+	int32_t *data;
+	const int32_t *coef;
+	int i;
+	int j;
+	int frames;
+	int n1;
+	int n2;
+
+	const int qshift = 23 + shift; /* Qx.54 -> Qx.31 */
+	const int32_t rnd = 1 << (qshift - 1); /* Half LSB */
+	int32_t *d = rp;
+	int32_t *wp = wp0;
+
+	/* Check for 2ch FIR case */
+	if (nch == 2) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d - 1;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		y1 = rnd;
+		coef = (const int32_t *)cp;
+		frames = fir_end - data; /* Frames until wrap */
+		n1 = ((taps_x_nch < frames) ? taps_x_nch : frames) >> 1;
+		n2 = (taps_x_nch >> 1) - n1;
+
+		/* The FIR is calculated as Q1.23 x Q1.31 -> Q2.54. The
+		 * output shift includes the shift by 23 for Qx.54 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i++) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			y1 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			coef++;
+		}
+		if (data == fir_end)
+			data = fir_start;
+
+		for (i = 0; i < n2; i++) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			y1 += (int64_t)(*coef >> 8) * (*data);
+			data++;
+			coef++;
+		}
+		*wp = sat_int32(y1 >> qshift);
+		*(wp + 1) = sat_int32(y0 >> qshift);
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Decrement data pointer to next channel start. Note that
+		 * initialization code ensures that circular wrap does not
+		 * happen mid-frame.
+		 */
+		data = d--;
+
+		/* Initialize to half LSB for rounding, prepare for FIR core */
+		y0 = rnd;
+		coef = (const int32_t *)cp;
+		frames = fir_end - data + nch - j - 1; /* Frames until wrap */
+		n1 = (taps_x_nch < frames) ? taps_x_nch : frames;
+		n2 = taps_x_nch - n1;
+
+		/* The FIR is calculated as Q1.23 x Q1.31 -> Q2.54. The
+		 * output shift includes the shift by 23 for Qx.54 to
+		 * Qx.31.
+		 */
+		for (i = 0; i < n1; i += nch) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			coef++;
+			data += nch;
+		}
+		if (data >= fir_end)
+			data -= fir_delay_length;
+
+		for (i = 0; i < n2; i += nch) {
+			y0 += (int64_t)(*coef >> 8) * (*data);
+			coef++;
+			data += nch;
+		}
+		*wp = sat_int32(y0 >> qshift);
+		wp++;
+	}
+}
+
+#endif /* 32bit coefficients version */
+
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_wrap_fir;
+	int n_min;
+	int32_t *rp;
+	int32_t *wp;
+
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_delay = fir->fir_delay;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int fir_length = fir->fir_delay_size;
+	const int rewind = nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch;
+	const int nch_x_idm = nch * cfg->idm;
+	const size_t fir_size = fir->fir_delay_size * sizeof(int32_t);
+	const int taps_x_nch = cfg->subfilter_length * nch;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+		m = blk_in_words;
+		while (m > 0) {
+			/* Number of words without circular wrap */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_wrap_fir = fir->fir_wp - fir->fir_delay + 1;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*fir->fir_wp = *s->x_rptr;
+				fir->fir_wp--;
+				s->x_rptr++;
+			}
+			/* Check for wrap */
+			src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size);
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = fir->fir_wp + rewind;
+		src_circ_inc_wrap(&rp, fir_end, fir_size);
+		wp = fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter_generic(rp, cp, wp,
+				fir_delay, fir_end, fir_length,
+				taps_x_nch, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size;
+			src_circ_inc_wrap(&wp, out_delay_end, out_size);
+			rp -= nch_x_idm; /* Next sub-filter start */
+			src_circ_dec_wrap(&rp, fir_delay, fir_size);
+		}
+
+		/* Output */
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_fir = out_delay_end - fir->out_rp;
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*s->y_wptr = *fir->out_rp;
+				s->y_wptr++;
+				fir->out_rp++;
+			}
+			/* Check wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+			src_circ_inc_wrap(&fir->out_rp, out_delay_end,
+				out_size);
+		}
+	}
+}
+
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_wrap_fir;
+	int n_min;
+	int32_t *rp;
+	int32_t *wp;
+
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_delay = fir->fir_delay;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int fir_length = fir->fir_delay_size;
+	const int rewind = nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch;
+	const int nch_x_idm = nch * cfg->idm;
+	const size_t fir_size = fir->fir_delay_size * sizeof(int32_t);
+	const int taps_x_nch = cfg->subfilter_length * nch;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Input data */
+		m = blk_in_words;
+		while (m > 0) {
+			/* Number of words without circular wrap */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_wrap_fir = fir->fir_wp - fir->fir_delay + 1;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*fir->fir_wp = *s->x_rptr << 8;
+				fir->fir_wp--;
+				s->x_rptr++;
+			}
+			/* Check for wrap */
+			src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size);
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = fir->fir_wp + rewind;
+		src_circ_inc_wrap(&rp, fir_end, fir_size);
+		wp = fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter_generic(rp, cp, wp,
+				fir_delay, fir_end, fir_length,
+				taps_x_nch, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size;
+			src_circ_inc_wrap(&wp, out_delay_end, out_size);
+			rp -= nch_x_idm; /* Next sub-filter start */
+			src_circ_dec_wrap(&rp, fir_delay, fir_size);
+		}
+
+		/* Output */
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_fir = out_delay_end - fir->out_rp;
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (n_wrap_fir < n_wrap_buf)
+				? n_wrap_fir : n_wrap_buf;
+			n_min = (m < n_min) ? m : n_min;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				*s->y_wptr = *fir->out_rp >> 8;
+				s->y_wptr++;
+				fir->out_rp++;
+			}
+			/* Check wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+			src_circ_inc_wrap(&fir->out_rp, out_delay_end,
+				out_size);
+		}
+	}
+}
+
+#endif
diff --git a/src/audio/src_hifi2ep.c b/src/audio/src_hifi2ep.c
new file mode 100644
index 0000000..0d03ffa
--- /dev/null
+++ b/src/audio/src_hifi2ep.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the Intel Corporation nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* HiFi EP optimized code parts for SRC */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_HIFIEP
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi2.h>
+
+/* HiFi EP has
+ * 4x 56 bit registers in register file Q
+ * 8x 48 bit registers in register file P
+ */
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* This function uses
+	 * 2x 56 bit registers Q,
+	 * 4x 48 bit registers P
+	 * 3x integers
+	 * 4x address pointers,
+	 */
+	ae_q56s a0;
+	ae_q56s a1;
+	ae_p24x2f data2;
+	ae_p24x2f coef2;
+	ae_p24x2f p0;
+	ae_p24x2f p1;
+	ae_p16x2s *coefp;
+	ae_p24x2f *dp = (ae_p24x2f *)rp;
+	ae_p24x2f *dp0;
+	ae_q32s *wp = wp0;
+	int i;
+	int j;
+	const int inc = sizeof(ae_p24x2f);
+
+	/* 2ch FIR case */
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p16x2s *)cp;
+		a0 = AE_ZEROQ56();
+		a1 = AE_ZEROQ56();
+
+		/* Compute the FIR filter for both channels with four
+		 * taps per loop iteration. Two coefficients are
+		 * loaded simultaneously. Each data load reads one
+		 * interleaved frame, i.e. one sample for both
+		 * channels.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			coef2 = AE_LP16X2F_I(coefp, 0);
+			coefp++;
+
+			/* Load two data samples from two channels */
+			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
+			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 successive left channel samples for
+			 * a0 and then successive right channel samples for a1.
+			 * Accumulate data2_h * coef2_h + data2_l * coef2_l. The
+			 * Q1.31 data and Q1.15 coefficients are used as 24 bits
+			 * as Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			coef2 = AE_LP16X2F_I(coefp, 0);
+			coefp++;
+			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
+			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
+			sizeof(int32_t));
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp;
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p16x2s *)cp;
+		a0 = AE_ZEROQ56();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration.  Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples */
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+
+			/* Pack p0 and p1 to data2_h and data2_l */
+			data2 = AE_SELP24_LL(p0, p1);
+
+			/* Accumulate data2_h * coef2_h + data2_l * coef2_l */
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+
+			/* Repeat for next two filter taps */
+			coef2 = *coefp++;
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		wp++;
+	}
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* This function uses
+	 * 2x 56 bit registers Q,
+	 * 4x 48 bit registers P
+	 * 3x integers
+	 * 4x address pointers,
+	 */
+	ae_q56s a0;
+	ae_q56s a1;
+	ae_p24x2f p0;
+	ae_p24x2f p1;
+	ae_p24x2f data2;
+	ae_p24x2f coef2;
+	ae_p24x2f *coefp;
+	ae_p24x2f *dp = (ae_p24x2f *)rp;
+	ae_p24x2f *dp0;
+	ae_q32s *wp = wp0;
+	int i;
+	int j;
+	const int inc = sizeof(ae_p24x2f);
+
+	/* 2ch FIR case */
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value p0.
+		 */
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p24x2f *)cp;
+		a0 = AE_ZEROQ56();
+		a1 = AE_ZEROQ56();
+
+		/* Compute the FIR filter for both channels with four
+		 * taps per loop iteration. Two coefficients are
+		 * loaded simultaneously. Each data load reads one
+		 * interleaved frame, i.e. one sample for both
+		 * channels.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			/* TODO: Ensure coefficients are 64 bits aligned */
+			coef2 = AE_LP24X2F_I(coefp, 0);
+			coefp++;
+
+			/* Load two data samples from two channels */
+			AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */
+			AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 successive left channel samples for
+			 * a0 and then successive right channel samples for a1.
+			 */
+
+			/* Accumulate to a0 and a1
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and coefficients are used as the most
+			 * significant 24 bits as Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			coef2 = AE_LP24X2F_I(coefp, 0);
+			coefp++;
+			AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */
+			AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp,
+			sizeof(int32_t));
+		return;
+	}
+
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp;
+		AE_LP24F_C(p0, dp, -sizeof(ae_p24f));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_p24x2f *)cp;
+		a0 = AE_ZEROQ56();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration.  Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples and place them to L and H of
+			 * data2.
+			 */
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LH(p0, p1);
+
+			/* Accumulate to a0
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and coefficients are used as the most
+			 * significant 24 bits as Q1.23 values.
+			 */
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+
+			/* Repeat for next two filter taps */
+			coef2 = *coefp++;
+			AE_LP24F_C(p0, dp0, inc);
+			AE_LP24F_C(p1, dp0, inc);
+			data2 = AE_SELP24_LH(p0, p1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0);
+		wp++;
+	}
+}
+#endif /* 32bit coefficients version */
+
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for 32 bit
+	 * samples: copy input to the FIR delay line, compute every
+	 * sub-filter, then copy the result to the output buffer.
+	 * Uses 1x 56 bit Q register, 0x 48 bit P registers,
+	 * 16x integers and 7x address pointers.
+	 */
+	ae_q56s q;
+	ae_q32s *rp;
+	ae_q32s *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * (nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Words of input data to feed to the filter */
+		m = blk_in_words;
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		while (m > 0) {
+			/* Words left before the input buffer wraps */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bit sample, advance read pointer */
+				q = AE_LQ32F_I((ae_q32s *)s->x_rptr++, 0);
+
+				/* Circular store, write pointer steps back */
+				AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for input read pointer wrap */
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_q32s *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to the data start. Loaded value q is discarded.
+		 */
+		AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);
+
+		/* Start FIR output writes from fir->out_rp and compute all
+		 * polyphase sub-filters.
+		 */
+		wp = (ae_q32s *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size; /* cp is void *, byte step */
+			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+				out_size);
+
+			/* Circular step of pointer rp by number of
+			 * channels x input delay multiplier, the step is
+			 * negative. Loaded value q is discarded.
+			 */
+			AE_LQ32F_C(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load followed by linear store */
+				AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
+				AE_SQ32F_I(q, (ae_q32s *)s->y_wptr, 0);
+				s->y_wptr++;
+			}
+			/* Check for output write pointer wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+		}
+	}
+}
+
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for samples
+	 * that are shifted left by 8 on input and right by 8 on
+	 * output (sign extended 24 bit values in 32 bit words).
+	 * Uses 1x 56 bit Q register, 0x 48 bit P registers,
+	 * 16x integers and 7x address pointers.
+	 */
+	ae_q56s q;
+	ae_q32s *rp;
+	ae_q32s *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * (nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Words of input data to feed to the filter */
+		m = blk_in_words;
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		while (m > 0) {
+			/* Words left before the input buffer wraps */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bits sample to accumulator
+				 * and left shift by 8, advance read
+				 * pointer.
+				 */
+				q = AE_SLLIQ56(AE_LQ32F_I(
+					(ae_q32s *)s->x_rptr++, 0), 8);
+
+				/* Store to circular buffer, the write
+				 * pointer steps backwards.
+				 */
+				AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for input read pointer wrap */
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_q32s *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to the data start. Loaded value q is discarded.
+		 */
+		AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz);
+
+		/* Start FIR output writes from fir->out_rp and compute all
+		 * polyphase sub-filters.
+		 */
+		wp = (ae_q32s *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size; /* cp is void *, byte step */
+			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+				out_size);
+
+			/* Circular step of pointer rp by number of
+			 * channels x input delay multiplier, the step is
+			 * negative. Loaded value q is discarded.
+			 */
+			AE_LQ32F_C(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load for 32 bit sample,
+				 * advance read pointer.
+				 */
+				AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz);
+
+				/* Store value as shifted right by 8 for
+				 * sign extended 24 bit value, advance pointer.
+				 */
+				AE_SQ32F_I(AE_SRAIQ56(q, 8),
+					   (ae_q32s *)s->y_wptr, 0);
+				s->y_wptr++;
+			}
+			/* Check for output write pointer wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+		}
+	}
+}
+
+#endif
diff --git a/src/audio/src_hifi3.c b/src/audio/src_hifi3.c
new file mode 100644
index 0000000..96d3c99
--- /dev/null
+++ b/src/audio/src_hifi3.c
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2016, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the Intel Corporation nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
+ *
+ */
+
+/* HiFi3 optimized code parts for SRC */
+
+#include <stdint.h>
+#include <reef/alloc.h>
+#include <reef/audio/format.h>
+#include <reef/math/numbers.h>
+
+#include "src_config.h"
+#include "src.h"
+
+#if SRC_HIFI3
+
+#include <xtensa/config/defs.h>
+#include <xtensa/tie/xt_hifi3.h>
+
+/* HiFi3 has
+ * 16x 64 bit registers in register file AE_DR
+ */
+
+#if SRC_SHORT /* 16 bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* Compute one output sample per channel with 16 bit coefficients.
+	 * rp: data read pointer in circular buffer, cp: coefficients,
+	 * wp0: output for nch samples, taps_div_4: filter length / 4,
+	 * shift: right shift applied to the sum, nch: channels count.
+	 * Uses 6x 64 bit registers, 3x integers, 5x address pointers.
+	 */
+	ae_f64 a0;
+	ae_f64 a1;
+	ae_valign u;
+	ae_f16x4 coef4;
+	ae_f32x2 d0;
+	ae_f32x2 d1;
+	ae_f32x2 data2;
+	ae_f16x4 *coefp;
+	ae_f32x2 *dp;
+	ae_f32 *dp0;
+	ae_f32 *dp1;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value d0.
+		 */
+		dp = (ae_f32x2 *)rp;
+		AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32));
+
+		/* Reset and prime coefficient load, clear accumulators */
+		coefp = (ae_f16x4 *)cp;
+		u = AE_LA64_PP(coefp); /* cp may not be 64 bit aligned */
+		a0 = AE_ZERO64();
+		a1 = AE_ZERO64();
+
+		/* Compute FIR filter for both channels with four taps per
+		 * loop iteration. Four coefficients are loaded at once. Data
+		 * is read from interleaved buffer with stride of channels.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load four coefficients */
+			AE_LA16X4_IP(coef4, u, coefp);
+
+			/* Load two data samples from two channels */
+			AE_L32X2_XC(d0, dp, inc); /* r0, l0 */
+			AE_L32X2_XC(d1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 sequential samples from a channel
+			 * and then accumulate to a0 and a1
+			 * data2_h * coef4_3 + data2_l * coef4_2.
+			 * The data is 32 bits Q1.31 and coefficient 16 bits
+			 * Q1.15. The accumulators are Q17.47.
+			 */
+			data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */
+			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+			data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */
+			AE_MULAAFD32X16_H3_L2(a1, data2, coef4);
+
+			/* Load two data samples from two channels */
+			AE_L32X2_XC(d0, dp, inc); /* r2, l2 */
+			AE_L32X2_XC(d1, dp, inc); /* r3, l3 */
+
+			/* Accumulate
+			 * data2_h * coef4_1 + data2_l * coef4_0.
+			 */
+			data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */
+			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+			data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */
+			AE_MULAAFD32X16_H1_L0(a1, data2, coef4);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			sizeof(int32_t));
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+			sizeof(int32_t));
+
+		return;
+	}
+
+	dp1 = (ae_f32 *)rp;
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp1;
+		AE_L32_XC(d0, dp1, -sizeof(ae_f32));
+
+		/* Reset and prime coefficient load, clear accumulator */
+		coefp = (ae_f16x4 *)cp;
+		u = AE_LA64_PP(coefp); /* cp may not be 64 bit aligned */
+		a0 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel, four taps per
+		 * iteration. Data is read with stride of channels count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load four coefficients */
+			AE_LA16X4_IP(coef4, u, coefp);
+
+			/* Load two data samples, place to high and
+			 * low of data2.
+			 */
+			AE_L32_XC(d0, dp0, inc);
+			AE_L32_XC(d1, dp0, inc);
+			data2 = AE_SEL32_LL(d0, d1);
+
+			/* Accumulate
+			 * data2_h * coef4_3 + data2_l* coef4_2.
+			 * The data is 32 bits Q1.31 and coefficient 16 bits
+			 * Q1.15. The accumulator is Q17.47.
+			 */
+			AE_MULAAFD32X16_H3_L2(a0, data2, coef4);
+
+			/* Repeat with next two samples */
+			AE_L32_XC(d0, dp0, inc);
+			AE_L32_XC(d1, dp0, inc);
+			data2 = AE_SEL32_LL(d0, d1);
+
+			/* Accumulate
+			 * data2_h * coef4_1 + data2_l * coef4_0.
+			 */
+			AE_MULAAFD32X16_H1_L0(a0, data2, coef4);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate Q17.47
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			sizeof(int32_t));
+	}
+}
+
+#else /* 32bit coefficients version */
+
+static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0,
+	const int taps_div_4, const int shift, const int nch)
+{
+	/* Compute one output sample per channel, the 32 bit data and
+	 * coefficients are used as 24 bit fractional values. Data is
+	 * read from rp, coefficients from cp, output is stored to wp0.
+	 * Uses 6x 64 bit registers, 3x integers, 5x address pointers.
+	 */
+	ae_f64 a0;
+	ae_f64 a1;
+	ae_f24x2 data2;
+	ae_f24x2 coef2;
+	ae_f24x2 d0;
+	ae_f24x2 d1;
+	ae_f24x2 *coefp;
+	ae_f24x2 *dp;
+	ae_f24 *dp1;
+	ae_f24 *dp0;
+	int i;
+	int j;
+	ae_f32 *wp = wp0;
+	const int inc = nch * sizeof(int32_t);
+
+	if (nch == 2) {
+		/* Move data pointer back by one sample to start from right
+		 * channel sample. Discard read value d0.
+		 */
+		dp = (ae_f24x2 *)rp;
+		AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24));
+
+		/* Reset coefficient pointer and clear accumulators */
+		coefp = (ae_f24x2 *)cp;
+		a0 = AE_ZERO64();
+		a1 = AE_ZERO64();
+
+		/* Compute FIR filter for both channels with four
+		 * taps per every loop iteration.  Two coefficients
+		 * are loaded simultaneously. Data is read
+		 * from interleaved buffer with stride of channels
+		 * count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients. Coef2_h contains tap *coefp
+			 * and coef2_l contains the next tap.
+			 */
+			/* TODO: Ensure coefficients are 64 bits aligned */
+			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+
+			/* Load two data samples from two channels */
+			AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */
+			AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */
+
+			/* Select to data2 sequential samples from one
+			 * channel, from low halves of d0 and d1 for a0 and
+			 * from high halves for a1. Then accumulate
+			 * data2_h * coef2_h + data2_l * coef2_l. The Q1.31
+			 * data and coefficients are used as the most
+			 * significant 24 bits as Q1.23 values.
+			 */
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(d0, d1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+
+			/* Repeat for next two taps */
+			AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2));
+			AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */
+			AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFP24S_HH_LL(a0, data2, coef2);
+			data2 = AE_SELP24_HH(d0, d1);
+			AE_MULAAFP24S_HH_LL(a1, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate
+		 * to Q1.31, and store 32 bit output.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			sizeof(int32_t));
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp,
+			sizeof(int32_t));
+
+		return;
+	}
+
+	dp1 = (ae_f24 *)rp;
+	for (j = 0; j < nch; j++) {
+		/* Copy pointer and advance to next ch with dummy load */
+		dp0 = dp1;
+		AE_L32F24_XC(data2, dp1, -sizeof(ae_f24));
+
+		/* Reset coefficient pointer and clear accumulator */
+		coefp = (ae_f24x2 *)cp;
+		a0 = AE_ZERO64();
+
+		/* Compute FIR filter for current channel with four
+		 * taps per every loop iteration. Data is read from
+		 * interleaved buffer with stride of channels count.
+		 */
+		for (i = 0; i < taps_div_4; i++) {
+			/* Load two coefficients */
+			coef2 = *coefp++;
+
+			/* Load two data samples, place to high and
+			 * low of data2.
+			 */
+			AE_L32F24_XC(d0, dp0, inc);
+			AE_L32F24_XC(d1, dp0, inc);
+			data2 = AE_SELP24_LL(d0, d1);
+
+			/* Accumulate to a0 data2_h * coef2_h +
+			 * data2_l * coef2_l. The Q1.31 bit data is used
+			 * as Q1.23 from MSB side bits of the 32 bit
+			 * word. The accumulator a0 is Q17.47.
+			 */
+			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+
+			/* Repeat the same for next two filter taps */
+			coef2 = *coefp++;
+			AE_L32F24_XC(d0, dp0, inc);
+			AE_L32F24_XC(d1, dp0, inc);
+			data2 = AE_SELP24_LL(d0, d1);
+			AE_MULAAFD24_HH_LL(a0, data2, coef2);
+		}
+
+		/* Scale FIR output with right shifts, round/saturate Q17.47
+		 * to Q1.31, and store 32 bit output. Advance write
+		 * pointer to next sample.
+		 */
+		AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp,
+			sizeof(int32_t));
+	}
+}
+
+#endif /* 32bit coefficients version */
+
+void src_polyphase_stage_cir(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for 32 bit
+	 * samples: copy input to the FIR delay line, compute every
+	 * sub-filter, then copy the result to the output buffer.
+	 * Uses 1x 64 bit register, 16x integers, 7x address pointers.
+	 */
+	ae_int32x2 q;
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * (nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Words of input data to feed to the filter */
+		m = blk_in_words;
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		while (m > 0) {
+			/* Words left before the input buffer wraps */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bits sample to q,
+				 * advance read pointer.
+				 */
+				AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);
+
+				/* Circular store, write pointer steps back */
+				AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for input read pointer wrap */
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to the data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Start FIR output writes from fir->out_rp and compute all
+		 * polyphase sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size; /* cp is void *, byte step */
+			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+				out_size);
+
+			/* Circular step of pointer rp by number of
+			 * channels x input delay multiplier, the step is
+			 * negative. Loaded value q is discarded.
+			 */
+			AE_L32_XC(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load followed by linear store,
+				 * advance read and write pointers.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+				AE_S32_L_XP(q, (ae_int32 *)s->y_wptr, sz);
+			}
+
+			/* Check for output write pointer wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+		}
+	}
+}
+
+void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
+{
+	/* Run s->times blocks of one polyphase SRC stage for samples
+	 * that are shifted left by 8 on input and right by 8 on
+	 * output, the output is a sign extended 24 bit value.
+	 * Uses 1x 64 bit register, 16x integers, 7x address pointers.
+	 */
+	ae_int32x2 q;
+	ae_f32 *rp;
+	ae_f32 *wp;
+	int i;
+	int n;
+	int m;
+	int n_wrap_buf;
+	int n_min;
+
+	struct src_state *fir = s->state;
+	struct src_stage *cfg = s->stage;
+	int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size];
+	int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size];
+	const void *cp; /* Can be int32_t or int16_t */
+	const size_t out_size = fir->out_delay_size * sizeof(int32_t);
+	const int nch = s->nch;
+	const int nch_x_odm = cfg->odm * nch;
+	const int blk_in_words = nch * cfg->blk_in;
+	const int blk_out_words = nch * cfg->num_of_subfilters;
+	const int sz = sizeof(int32_t);
+	const int n_sz = -sizeof(int32_t);
+	const int rewind_sz = sz * (nch * (cfg->blk_in
+		+ (cfg->num_of_subfilters - 1) * cfg->idm) - nch);
+	const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t);
+	const int taps_div_4 = cfg->subfilter_length >> 2;
+
+#if SRC_SHORT
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t);
+#else
+	const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t);
+#endif
+
+	for (n = 0; n < s->times; n++) {
+		/* Words of input data to feed to the filter */
+		m = blk_in_words;
+
+		/* Setup circular buffer for FIR input data delay */
+		AE_SETCBEGIN0(fir->fir_delay);
+		AE_SETCEND0(fir_end);
+
+		while (m > 0) {
+			/* Words left before the input buffer wraps */
+			n_wrap_buf = s->x_end_addr - s->x_rptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Load 32 bits sample to q, advance read
+				 * pointer. Store it left shifted by 8 to
+				 * circular buffer, write pointer steps back.
+				 */
+				AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz);
+				AE_S32_L_XC(AE_SLAI32(q, 8),
+					(ae_int32 *)fir->fir_wp, n_sz);
+			}
+
+			/* Check for input read pointer wrap */
+			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
+		}
+
+		/* Do filter */
+		cp = cfg->coefs; /* Reset to 1st coefficient */
+		rp = (ae_f32 *)fir->fir_wp;
+
+		/* Do circular modification to pointer rp by amount of
+		 * rewind to the data start. Loaded value q is discarded.
+		 */
+		AE_L32_XC(q, rp, rewind_sz);
+
+		/* Start FIR output writes from fir->out_rp and compute all
+		 * polyphase sub-filters.
+		 */
+		wp = (ae_f32 *)fir->out_rp;
+		for (i = 0; i < cfg->num_of_subfilters; i++) {
+			fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch);
+			wp += nch_x_odm;
+			cp += subfilter_size; /* cp is void *, byte step */
+			src_circ_inc_wrap((int32_t **)&wp, out_delay_end,
+				out_size);
+
+			/* Circular step of pointer rp by number of
+			 * channels x input delay multiplier, the step is
+			 * negative. Loaded value q is discarded.
+			 */
+			AE_L32_XC(q, rp, nch_x_idm_sz);
+		}
+
+		/* Output */
+
+		/* Setup circular buffer for SRC out delay access */
+		AE_SETCBEGIN0(fir->out_delay);
+		AE_SETCEND0(out_delay_end);
+		m = blk_out_words;
+		while (m > 0) {
+			n_wrap_buf = s->y_end_addr - s->y_wptr;
+			n_min = (m < n_wrap_buf) ? m : n_wrap_buf;
+			m -= n_min;
+			for (i = 0; i < n_min; i++) {
+				/* Circular load for 32 bit sample,
+				 * advance read pointer.
+				 */
+				AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz);
+
+				/* Store value as shifted right by 8
+				 * for sign extended 24 bit value,
+				 * advance write pointer.
+				 */
+				AE_S32_L_XP(AE_SRAI32(q, 8),
+					(ae_int32 *)s->y_wptr, sz);
+			}
+
+			/* Check for output write pointer wrap */
+			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
+		}
+	}
+}
+
+#endif
-- 
2.14.1