
This patch moves the generic common code from src_core.c/h to src.c/h and places the generic C optimized filter in src_generic.c. The HiFi EP version is in src_hifi2ep.c and the HiFi3 version is in src_hifi3.c. Use of the Xtensa optimized versions requires the xt-xcc compiler.
The unused SRC in/out rates query code is removed. The 24 bit coefficients are replaced by 32 bit coefficients, which are compatible with the Xtensa fractional integer types.
Signed-off-by: Seppo Ingalsuo seppo.ingalsuo@linux.intel.com --- src/audio/Makefile.am | 4 +- src/audio/src.c | 346 ++++++++++++++++++-- src/audio/{src_core.h => src.h} | 17 +- src/audio/src_config.h | 57 +++- src/audio/src_core.c | 676 ---------------------------------------- src/audio/src_generic.c | 435 ++++++++++++++++++++++++++ src/audio/src_hifi2ep.c | 562 +++++++++++++++++++++++++++++++++ src/audio/src_hifi3.c | 567 +++++++++++++++++++++++++++++++++ 8 files changed, 1938 insertions(+), 726 deletions(-) rename src/audio/{src_core.h => src.h} (93%) delete mode 100644 src/audio/src_core.c create mode 100644 src/audio/src_generic.c create mode 100644 src/audio/src_hifi2ep.c create mode 100644 src/audio/src_hifi3.c
diff --git a/src/audio/Makefile.am b/src/audio/Makefile.am index bccedbf..ae58289 100644 --- a/src/audio/Makefile.am +++ b/src/audio/Makefile.am @@ -1006,7 +1006,9 @@ libaudio_a_SOURCES = \ fir.c \ tone.c \ src.c \ - src_core.c \ + src_generic.c \ + src_hifi2ep.c \ + src_hifi3.c \ mixer.c \ mux.c \ volume.c \ diff --git a/src/audio/src.c b/src/audio/src.c index c7ac649..cca0cbc 100644 --- a/src/audio/src.c +++ b/src/audio/src.c @@ -43,7 +43,17 @@ #include <reef/audio/component.h> #include <reef/audio/pipeline.h> #include <uapi/ipc.h> -#include "src_core.h" + +#include "src_config.h" +#include "src.h" + +#if SRC_SHORT +#include <reef/audio/coefficients/src/src_tiny_int16_define.h> +#include <reef/audio/coefficients/src/src_tiny_int16_table.h> +#else +#include <reef/audio/coefficients/src/src_std_int32_define.h> +#include <reef/audio/coefficients/src/src_std_int32_table.h> +#endif
#ifdef MODULE_TEST #include <stdio.h> @@ -53,6 +63,10 @@ #define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e) #define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e)
+/* The FIR maximum lengths are per channel so need to multiply them */ +#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE) +#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE) + /* src component private data */ struct comp_data { struct polyphase_src src; @@ -63,14 +77,273 @@ struct comp_data { int32_t *sbuf_w_ptr; int32_t *sbuf_r_ptr; int sbuf_avail; - void (* src_func)(struct comp_dev *dev, + void (*src_func)(struct comp_dev *dev, struct comp_buffer *source, struct comp_buffer *sink, size_t *consumed, size_t *produced); - void (* polyphase_func)(struct src_stage_prm *s); + void (*polyphase_func)(struct src_stage_prm *s); };
+/* Calculate ceil() for integer division */ +int src_ceil_divide(int a, int b) +{ + int c; + + c = a / b; + if (c * b < a) + c++; + + return c; +} + +/* Calculates the needed FIR delay line length */ +static int src_fir_delay_length(struct src_stage *s) +{ + return s->subfilter_length + (s->num_of_subfilters - 1) * s->idm + + s->blk_in; +} + +/* Calculates the FIR output delay line length */ +static int src_out_delay_length(struct src_stage *s) +{ + return 1 + (s->num_of_subfilters - 1) * s->odm; +} + +/* Returns index of a matching sample rate */ +static int src_find_fs(int fs_list[], int list_length, int fs) +{ + int i; + + for (i = 0; i < list_length; i++) { + if (fs_list[i] == fs) + return i; + } + return -EINVAL; +} + +/* Calculates buffers to allocate for a SRC mode */ +int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch, + int frames, int frames_is_for_source) +{ + struct src_stage *stage1; + struct src_stage *stage2; + int q; + int den; + int num; + int frames2; + + if (nch > PLATFORM_MAX_CHANNELS) { + trace_src_error("che"); + tracev_value(nch); + return -EINVAL; + } + + a->nch = nch; + a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in); + a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out); + + /* Check that both in and out rates are supported */ + if (a->idx_in < 0 || a->idx_out < 0) { + trace_src_error("us1"); + tracev_value(fs_in); + tracev_value(fs_out); + return -EINVAL; + } + + stage1 = src_table1[a->idx_out][a->idx_in]; + stage2 = src_table2[a->idx_out][a->idx_in]; + + /* Check from stage1 parameter for a deleted in/out rate combination.*/ + if (stage1->filter_length < 1) { + trace_src_error("us2"); + tracev_value(fs_in); + tracev_value(fs_out); + return -EINVAL; + } + + a->fir_s1 = nch * src_fir_delay_length(stage1); + a->out_s1 = nch * src_out_delay_length(stage1); + + /* Find out how many additional times the SRC can be executed + * while having block size less or equal to max_frames. 
+ */ + if (frames_is_for_source) { + /* Times that stage1 needs to run to input length of frames */ + a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in); + q = frames / stage1->blk_in; + a->stage1_times = MAX(q, 1); + a->blk_in = a->stage1_times * stage1->blk_in; + + /* Times that stage2 needs to run */ + den = stage2->blk_in * stage1->blk_in; + num = frames * stage2->blk_out * stage1->blk_out; + frames2 = src_ceil_divide(num, den); + a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out); + q = frames2 / stage2->blk_out; + a->stage2_times = MAX(q, 1); + a->blk_out = a->stage2_times * stage2->blk_out; + } else { + /* Times that stage2 needs to run to output length of frames */ + a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out); + q = frames / stage2->blk_out; + a->stage2_times = MAX(q, 1); + a->blk_out = a->stage2_times * stage2->blk_out; + + /* Times that stage1 needs to run */ + num = frames * stage2->blk_in * stage1->blk_in; + den = stage2->blk_out * stage1->blk_out; + frames2 = src_ceil_divide(num, den); + a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in); + q = frames2 / stage1->blk_in; + a->stage1_times = MAX(q, 1); + a->blk_in = a->stage1_times * stage1->blk_in; + } + + if (stage2->filter_length == 1) { + a->fir_s2 = 0; + a->out_s2 = 0; + a->stage2_times = 0; + a->stage2_times_max = 0; + a->sbuf_length = 0; + } else { + a->fir_s2 = nch * src_fir_delay_length(stage2); + a->out_s2 = nch * src_out_delay_length(stage2); + /* 2x is an empirically tested length. Since the sink buffer + * capability to receive samples varies a shorter stage 2 output + * block will create a peak in internal buffer usage. 
+ */ + + /* TODO 1: Equation for needed length */ + a->sbuf_length = 2 * nch * stage1->blk_out + * a->stage1_times_max; + } + + a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2; + a->total = a->sbuf_length + a->src_multich; + + return 0; +} + +static void src_state_reset(struct src_state *state) +{ + state->fir_delay_size = 0; + state->out_delay_size = 0; +} + +static int init_stages(struct src_stage *stage1, struct src_stage *stage2, + struct polyphase_src *src, struct src_param *p, + int n, int32_t *delay_lines_start) +{ + /* Clear FIR state */ + src_state_reset(&src->state1); + src_state_reset(&src->state2); + + src->number_of_stages = n; + src->stage1 = stage1; + src->stage2 = stage2; + if (n == 1 && stage1->blk_out == 0) + return -EINVAL; + + /* Optimized SRC requires subfilter length multiple of 4 */ + if (stage1->filter_length > 1 && (stage1->subfilter_length & 0x3) > 0) + return -EINVAL; + + if (stage2->filter_length > 1 && (stage2->subfilter_length & 0x3) > 0) + return -EINVAL; + + /* Delay line sizes */ + src->state1.fir_delay_size = p->fir_s1; + src->state1.out_delay_size = p->out_s1; + src->state1.fir_delay = delay_lines_start; + src->state1.out_delay = + src->state1.fir_delay + src->state1.fir_delay_size; + /* Initialize to last ensures that circular wrap cannot happen + * mid-frame. The size is multiple of channels count. + */ + src->state1.fir_wp = &src->state1.fir_delay[p->fir_s1 - 1]; + src->state1.out_rp = src->state1.out_delay; + if (n > 1) { + src->state2.fir_delay_size = p->fir_s2; + src->state2.out_delay_size = p->out_s2; + src->state2.fir_delay = + src->state1.out_delay + src->state1.out_delay_size; + src->state2.out_delay = + src->state2.fir_delay + src->state2.fir_delay_size; + /* Initialize to last ensures that circular wrap cannot happen + * mid-frame. The size is multiple of channels count. 
+ */ + src->state2.fir_wp = &src->state2.fir_delay[p->fir_s2 - 1]; + src->state2.out_rp = src->state2.out_delay; + } else { + src->state2.fir_delay_size = 0; + src->state2.out_delay_size = 0; + src->state2.fir_delay = NULL; + src->state2.out_delay = NULL; + } + + /* Check the sizes are less than MAX */ + if (src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH || + src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH || + src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH || + src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH) { + src->state1.fir_delay = NULL; + src->state1.out_delay = NULL; + src->state2.fir_delay = NULL; + src->state2.out_delay = NULL; + return -EINVAL; + } + + return 0; +} + +void src_polyphase_reset(struct polyphase_src *src) +{ + src->number_of_stages = 0; + src->stage1 = NULL; + src->stage2 = NULL; + src_state_reset(&src->state1); + src_state_reset(&src->state2); +} + +int src_polyphase_init(struct polyphase_src *src, struct src_param *p, + int32_t *delay_lines_start) +{ + struct src_stage *stage1; + struct src_stage *stage2; + int n_stages; + int ret; + + if (p->idx_in < 0 || p->idx_out < 0) + return -EINVAL; + + /* Get setup for 2 stage conversion */ + stage1 = src_table1[p->idx_out][p->idx_in]; + stage2 = src_table2[p->idx_out][p->idx_in]; + ret = init_stages(stage1, stage2, src, p, 2, delay_lines_start); + if (ret < 0) + return -EINVAL; + + /* Get number of stages used for optimize opportunity. 2nd + * stage length is one if conversion needs only one stage. + * If input and output rate is the same return 0 to + * use a simple copy function instead of 1 stage FIR with one + * tap. + */ + n_stages = (src->stage2->filter_length == 1) ? 1 : 2; + if (p->idx_in == p->idx_out) + n_stages = 0; + + /* If filter length for first stage is zero this is a deleted + * mode from in/out matrix. Computing of such SRC mode needs + * to be prevented. 
+ */ + if (src->stage1->filter_length == 0) + return -EINVAL; + + return n_stages; +} + /* Fallback function */ static void src_fallback(struct comp_dev *dev, struct comp_buffer *source, struct comp_buffer *sink, size_t *bytes_read, size_t *bytes_written) @@ -91,8 +364,9 @@ static void src_2s_s32_default(struct comp_dev *dev, int s2_blk_in; int s2_blk_out; struct comp_data *cd = comp_get_drvdata(dev); - int32_t *dest = (int32_t *) sink->w_ptr; - int32_t *src = (int32_t *) source->r_ptr; + int32_t *dest = (int32_t *)sink->w_ptr; + int32_t *src = (int32_t *)source->r_ptr; + int32_t *sbuf_addr = cd->delay_lines; int32_t *sbuf_end_addr = &cd->delay_lines[cd->param.sbuf_length]; int32_t sbuf_size = cd->param.sbuf_length * sizeof(int32_t); int nch = dev->params.channels; @@ -107,6 +381,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
s1.x_end_addr = source->end_addr; s1.x_size = source->size; + s1.y_addr = sbuf_addr; s1.y_end_addr = sbuf_end_addr; s1.y_size = sbuf_size; s1.state = &cd->src.state1; @@ -117,6 +392,7 @@ static void src_2s_s32_default(struct comp_dev *dev,
s2.x_end_addr = sbuf_end_addr; s2.x_size = sbuf_size; + s2.y_addr = sink->addr; s2.y_end_addr = sink->end_addr; s2.y_size = sink->size; s2.state = &cd->src.state2; @@ -125,14 +401,13 @@ static void src_2s_s32_default(struct comp_dev *dev, s2.y_wptr = dest; s2.nch = nch;
- /* Test if 1st stage can be run with default block length to reach * the period length or just under it. */ s1.times = cd->param.stage1_times; s1_blk_in = s1.times * cd->src.stage1->blk_in * nch; s1_blk_out = s1.times * cd->src.stage1->blk_out * nch; - if ((avail_b >= s1_blk_in * sz) && (sbuf_free >= s1_blk_out)) { + if (avail_b >= s1_blk_in * sz && sbuf_free >= s1_blk_out) { cd->polyphase_func(&s1);
cd->sbuf_w_ptr = s1.y_wptr; @@ -147,8 +422,9 @@ static void src_2s_s32_default(struct comp_dev *dev, s1.times = 1; s1_blk_in = cd->src.stage1->blk_in * nch; s1_blk_out = cd->src.stage1->blk_out * nch; - while ((n1 < cd->param.stage1_times_max) && (avail_b >= s1_blk_in * sz) - && (sbuf_free >= s1_blk_out)) { + while (n1 < cd->param.stage1_times_max && + avail_b >= s1_blk_in * sz && + sbuf_free >= s1_blk_out) { cd->polyphase_func(&s1);
cd->sbuf_w_ptr = s1.y_wptr; @@ -163,7 +439,7 @@ static void src_2s_s32_default(struct comp_dev *dev, s2.times = cd->param.stage2_times; s2_blk_in = s2.times * cd->src.stage2->blk_in * nch; s2_blk_out = s2.times * cd->src.stage2->blk_out * nch; - if ((cd->sbuf_avail >= s2_blk_in) && (free_b >= s2_blk_out * sz)) { + if (cd->sbuf_avail >= s2_blk_in && free_b >= s2_blk_out * sz) { cd->polyphase_func(&s2);
cd->sbuf_r_ptr = s2.x_rptr; @@ -173,14 +449,13 @@ static void src_2s_s32_default(struct comp_dev *dev, n2 = s2.times; }
- /* Run one block at time the remaining 2nd stage output */ s2.times = 1; s2_blk_in = cd->src.stage2->blk_in * nch; s2_blk_out = cd->src.stage2->blk_out * nch; - while ((n2 < cd->param.stage2_times_max) - && (cd->sbuf_avail >= s2_blk_in) - && (free_b >= s2_blk_out * sz)) { + while (n2 < cd->param.stage2_times_max && + cd->sbuf_avail >= s2_blk_in && + free_b >= s2_blk_out * sz) { cd->polyphase_func(&s2);
cd->sbuf_r_ptr = s2.x_rptr; @@ -205,10 +480,10 @@ static void src_1s_s32_default(struct comp_dev *dev, int n_written = 0;
s1.times = cd->param.stage1_times; - s1.x_rptr = (int32_t *) source->r_ptr; + s1.x_rptr = (int32_t *)source->r_ptr; s1.x_end_addr = source->end_addr; s1.x_size = source->size; - s1.y_wptr = (int32_t *) sink->w_ptr; + s1.y_wptr = (int32_t *)sink->w_ptr; s1.y_end_addr = sink->end_addr; s1.y_size = sink->size; s1.state = &cd->src.state1; @@ -229,8 +504,8 @@ static void src_copy_s32_default(struct comp_dev *dev, size_t *bytes_read, size_t *bytes_written) { struct comp_data *cd = comp_get_drvdata(dev); - int32_t *src = (int32_t *) source->r_ptr; - int32_t *snk = (int32_t *) sink->w_ptr; + int32_t *src = (int32_t *)source->r_ptr; + int32_t *snk = (int32_t *)sink->w_ptr; int nch = dev->params.channels; int frames = cd->param.blk_in; int n; @@ -241,9 +516,10 @@ static void src_copy_s32_default(struct comp_dev *dev,
n = frames * nch; while (n > 0) { - n_wrap_src = (int32_t *) source->end_addr - src; - n_wrap_snk = (int32_t *) sink->end_addr - snk; - n_wrap_min = (n_wrap_src < n_wrap_snk) ? n_wrap_src : n_wrap_snk; + n_wrap_src = (int32_t *)source->end_addr - src; + n_wrap_snk = (int32_t *)sink->end_addr - snk; + n_wrap_min = (n_wrap_src < n_wrap_snk) ? + n_wrap_src : n_wrap_snk; n_copy = (n < n_wrap_min) ? n : n_wrap_min; memcpy(snk, src, n_copy * sizeof(int32_t));
@@ -253,7 +529,6 @@ static void src_copy_s32_default(struct comp_dev *dev, snk += n_copy; src_circ_inc_wrap(&src, source->end_addr, source->size); src_circ_inc_wrap(&snk, sink->end_addr, sink->size); - } *bytes_read = frames * nch * sizeof(int32_t); *bytes_written = frames * nch * sizeof(int32_t); @@ -263,7 +538,7 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp) { struct comp_dev *dev; struct sof_ipc_comp_src *src; - struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *) comp; + struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *)comp; struct comp_data *cd;
trace_src("new"); @@ -276,14 +551,14 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
dev = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM, COMP_SIZE(struct sof_ipc_comp_src)); - if (dev == NULL) + if (!dev) return NULL;
- src = (struct sof_ipc_comp_src *) &dev->comp; + src = (struct sof_ipc_comp_src *)&dev->comp; memcpy(src, ipc_src, sizeof(struct sof_ipc_comp_src));
cd = rzalloc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM, sizeof(*cd)); - if (cd == NULL) { + if (!cd) { rfree(dev); return NULL; } @@ -306,7 +581,7 @@ static void src_free(struct comp_dev *dev) trace_src("fre");
/* Free dynamically reserved buffers for SRC algorithm */ - if (cd->delay_lines != NULL) + if (cd->delay_lines) rfree(cd->delay_lines);
rfree(cd); @@ -347,7 +622,8 @@ static int src_params(struct comp_dev *dev) }
/* Calculate source and sink rates, one rate will come from IPC new - * and the other from params. */ + * and the other from params. + */ if (src->source_rate == 0) { /* params rate is source rate */ source_rate = params->rate; @@ -383,12 +659,12 @@ static int src_params(struct comp_dev *dev) }
/* free any existing delay lines. TODO reuse if same size */ - if (cd->delay_lines != NULL) + if (cd->delay_lines) rfree(cd->delay_lines);
cd->delay_lines = rballoc(RZONE_RUNTIME, SOF_MEM_CAPS_RAM, delay_lines_size); - if (cd->delay_lines == NULL) { + if (!cd->delay_lines) { trace_src_error("sr3"); trace_value(delay_lines_size); return -EINVAL; @@ -424,7 +700,6 @@ static int src_params(struct comp_dev *dev) trace_src("SFa"); cd->src_func = src_fallback; return -EINVAL; - break; }
/* Calculate period size based on config. First make sure that @@ -438,7 +713,7 @@ static int src_params(struct comp_dev *dev) * buffer_set_size will return an error if the required length would * be too long. */ - q = src_ceil_divide(cd->param.blk_out, (int) dev->frames) + 1; + q = src_ceil_divide(cd->param.blk_out, (int)dev->frames) + 1;
/* Configure downstream buffer */ sink = list_first_item(&dev->bsink_list, struct comp_buffer, @@ -459,7 +734,6 @@ static int src_params(struct comp_dev *dev) return -EINVAL; }
- return 0; }
@@ -518,7 +792,8 @@ static int src_copy(struct comp_dev *dev)
/* make sure source component buffer has enough data available and that * the sink component buffer has enough free bytes for copy. Also - * check for XRUNs */ + * check for XRUNs. + */ if (source->avail < need_source) { trace_src_error("xru"); return -EIO; /* xrun */ @@ -530,6 +805,9 @@ static int src_copy(struct comp_dev *dev)
cd->src_func(dev, source, sink, &consumed, &produced);
+ tracev_value(consumed >> 3); + tracev_value(produced >> 3); + /* Calc new free and available if data was processed. These * functions must not be called with 0 consumed/produced. */ diff --git a/src/audio/src_core.h b/src/audio/src.h similarity index 93% rename from src/audio/src_core.h rename to src/audio/src.h index 3ea6028..3208693 100644 --- a/src/audio/src_core.h +++ b/src/audio/src.h @@ -29,8 +29,8 @@ * */
-#ifndef SRC_CORE_H -#define SRC_CORE_H +#ifndef SRC_H +#define SRC_H
#define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) @@ -68,12 +68,12 @@ struct src_stage { };
struct src_state { - int fir_delay_size; - int out_delay_size; - int fir_wi; - int out_ri; + int fir_delay_size; /* samples */ + int out_delay_size; /* samples */ int32_t *fir_delay; int32_t *out_delay; + int32_t *fir_wp; + int32_t *out_rp; };
struct polyphase_src { @@ -91,6 +91,7 @@ struct src_stage_prm { int32_t *x_end_addr; size_t x_size; int32_t *y_wptr; + int32_t *y_addr; int32_t *y_end_addr; size_t y_size; struct src_state *state; @@ -100,13 +101,13 @@ struct src_stage_prm { static inline void src_circ_inc_wrap(int32_t **ptr, int32_t *end, size_t size) { if (*ptr >= end) - *ptr = (int32_t *) ((size_t) * ptr - size); + *ptr = (int32_t *)((size_t)*ptr - size); }
static inline void src_circ_dec_wrap(int32_t **ptr, int32_t *addr, size_t size) { if (*ptr < addr) - *ptr = (int32_t *) ((size_t) * ptr + size); + *ptr = (int32_t *)((size_t)*ptr + size); }
void src_polyphase_reset(struct polyphase_src *src); diff --git a/src/audio/src_config.h b/src/audio/src_config.h index 3ad4c78..65d6247 100644 --- a/src/audio/src_config.h +++ b/src/audio/src_config.h @@ -34,14 +34,57 @@
#include <config.h>
-#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL || defined CONFIG_BROADWELL || defined CONFIG_HASWELL -#define SRC_SHORT 1 -#include <reef/audio/coefficients/src/src_tiny_int16_define.h> -#include <reef/audio/coefficients/src/src_tiny_int16_table.h> +/* If the next defines are set to 1 the SRC is configured automatically. Setting + * them to zero temporarily is useful for testing needs. + * Setting SRC_AUTOARCH to 0 allows the code variant to be set manually. + * Setting SRC_AUTOCOEF to 0 allows the coefficient type to be selected. + */ +#define SRC_AUTOARCH 1 +#define SRC_AUTOCOEF 1 + +/* Manually force a code variant when SRC_AUTOARCH is set to zero. These + * are useful in code debugging. + */ +#if SRC_AUTOARCH == 0 +#define SRC_GENERIC 1 +#define SRC_HIFIEP 0 +#define SRC_HIFI3 0 +#endif +#if SRC_AUTOCOEF == 0 +#define SRC_SHORT 0 +#endif + +/* Select 16 bit coefficients for specific platforms. + * Otherwise 32 bits is the default. + */ +#if SRC_AUTOCOEF == 1 +#if defined CONFIG_BAYTRAIL || defined CONFIG_CHERRYTRAIL \ + || defined CONFIG_BROADWELL || defined CONFIG_HASWELL +#define SRC_SHORT 1 /* Use int16_t filter coefficients */ #else -#define SHORT_SHORT 0 -#include <reef/audio/coefficients/src/src_std_int24_define.h> -#include <reef/audio/coefficients/src/src_std_int24_table.h> +#define SRC_SHORT 0 /* Use int32_t filter coefficients */ +#endif +#endif + +/* Select optimized code variant when xt-xcc compiler is used */ +#if SRC_AUTOARCH == 1 +#if defined __XCC__ +#include <xtensa/config/core-isa.h> +#define SRC_GENERIC 0 +#if XCHAL_HAVE_HIFI2EP == 1 +#define SRC_HIFIEP 1 +#define SRC_HIFI3 0 +#endif +#if XCHAL_HAVE_HIFI3 == 1 +#define SRC_HIFI3 1 +#define SRC_HIFIEP 0 +#endif +#else +/* GCC */ +#define SRC_GENERIC 1 +#define SRC_HIFIEP 0 +#define SRC_HIFI3 0 +#endif #endif
#endif diff --git a/src/audio/src_core.c b/src/audio/src_core.c deleted file mode 100644 index d8b9a3d..0000000 --- a/src/audio/src_core.c +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Copyright (c) 2016, Intel Corporation - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the Intel Corporation nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * Author: Seppo Ingalsuo seppo.ingalsuo@linux.intel.com - * - */ - -/* Non optimized default C implementation guaranteed to work on any - * architecture. 
- */ - -#include <stdint.h> - -#ifdef MODULE_TEST -#include <stdio.h> -#endif - -#include <reef/alloc.h> -#include <reef/audio/format.h> -#include <reef/math/numbers.h> -#include "src_core.h" -#include "src_config.h" - -#define trace_src(__e) trace_event(TRACE_CLASS_SRC, __e) -#define tracev_src(__e) tracev_event(TRACE_CLASS_SRC, __e) -#define trace_src_error(__e) trace_error(TRACE_CLASS_SRC, __e) - -/* TODO: These should be defined somewhere else. */ -#define SOF_RATES_LENGTH 15 -int sof_rates[SOF_RATES_LENGTH] = {8000, 11025, 12000, 16000, 18900, - 22050, 24000, 32000, 44100, 48000, 64000, 88200, 96000, 176400, - 192000}; - -/* The FIR maximum lengths are per channel so need to multiply them */ -#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE) -#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE) - -/* Calculate ceil() for integer division */ -int src_ceil_divide(int a, int b) -{ - int c; - - c = a / b; - if (c * b < a) - c++; - - return c; -} - -/* Calculates the needed FIR delay line length */ -static int src_fir_delay_length(struct src_stage *s) -{ - return s->subfilter_length + (s->num_of_subfilters - 1) * s->idm - + s->blk_in; -} - -/* Calculates the FIR output delay line length */ -static int src_out_delay_length(struct src_stage *s) -{ - - return 1 + (s->num_of_subfilters - 1) * s->odm; -} - -/* Returns index of a matching sample rate */ -static int src_find_fs(int fs_list[], int list_length, int fs) -{ - int i; - - for (i = 0; i < list_length; i++) { - if (fs_list[i] == fs) - return i; - } - return -EINVAL; -} - -/* Match SOF and defined SRC input rates into a bit mask */ -int32_t src_input_rates(void) -{ - int n; - int b; - int mask = 0; - - for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) { - b = (src_find_fs(src_in_fs, NUM_IN_FS, sof_rates[n]) >= 0) - ? 
1 : 0; - mask = (mask << 1) | b; - } - return mask; -} - -/* Match SOF and defined SRC output rates into a bit mask */ -int32_t src_output_rates(void) -{ - int n; - int b; - int mask = 0; - - for (n = SOF_RATES_LENGTH - 1; n >= 0; n--) { - b = (src_find_fs(src_out_fs, NUM_OUT_FS, sof_rates[n]) >= 0) - ? 1 : 0; - mask = (mask << 1) | b; - } - return mask; -} - -/* Calculates buffers to allocate for a SRC mode */ -int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch, - int frames, int frames_is_for_source) -{ - struct src_stage *stage1; - struct src_stage *stage2; - int q; - int den; - int num; - int frames2; - - if (nch > PLATFORM_MAX_CHANNELS) { - trace_src_error("che"); - tracev_value(nch); - return -EINVAL; - } - - a->nch = nch; - a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in); - a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out); - - /* Check that both in and out rates are supported */ - if ((a->idx_in < 0) || (a->idx_out < 0)) { - trace_src_error("us1"); - tracev_value(fs_in); - tracev_value(fs_out); - return -EINVAL; - } - - stage1 = src_table1[a->idx_out][a->idx_in]; - stage2 = src_table2[a->idx_out][a->idx_in]; - - /* Check from stage1 parameter for a deleted in/out rate combination.*/ - if (stage1->filter_length < 1) { - trace_src_error("us2"); - tracev_value(fs_in); - tracev_value(fs_out); - return -EINVAL; - } - - a->fir_s1 = nch * src_fir_delay_length(stage1); - a->out_s1 = nch * src_out_delay_length(stage1); - - /* Find out how many additional times the SRC can be executed - while having block size less or equal to max_frames. 
- */ - if (frames_is_for_source) { - /* Times that stage1 needs to run to input length of frames */ - a->stage1_times_max = src_ceil_divide(frames, stage1->blk_in); - q = frames / stage1->blk_in; - a->stage1_times = MAX(q, 1); - a->blk_in = a->stage1_times * stage1->blk_in; - - /* Times that stage2 needs to run */ - den = stage2->blk_in * stage1->blk_in; - num = frames * stage2->blk_out * stage1->blk_out; - frames2 = src_ceil_divide(num, den); - a->stage2_times_max = src_ceil_divide(frames2, stage2->blk_out); - q = frames2 / stage2->blk_out; - a->stage2_times = MAX(q, 1); - a->blk_out = a->stage2_times * stage2->blk_out; - } else { - /* Times that stage2 needs to run to output length of frames */ - a->stage2_times_max = src_ceil_divide(frames, stage2->blk_out); - q = frames / stage2->blk_out; - a->stage2_times = MAX(q, 1); - a->blk_out = a->stage2_times * stage2->blk_out; - - /* Times that stage1 needs to run */ - num = frames * stage2->blk_in * stage1->blk_in; - den = stage2->blk_out * stage1->blk_out; - frames2 = src_ceil_divide(num, den); - a->stage1_times_max = src_ceil_divide(frames2, stage1->blk_in); - q = frames2 / stage1->blk_in; - a->stage1_times = MAX(q, 1); - a->blk_in = a->stage1_times * stage1->blk_in; - } - - if (stage2->filter_length == 1) { - a->fir_s2 = 0; - a->out_s2 = 0; - a->stage2_times = 0; - a->stage2_times_max = 0; - a->sbuf_length = 0; - } else { - a->fir_s2 = nch * src_fir_delay_length(stage2); - a->out_s2 = nch * src_out_delay_length(stage2); - /* 2x is an empirically tested length. Since the sink buffer - * capability to receive samples varies a shorter stage 2 output - * block will create a peak in internal buffer usage. 
- */ - a->sbuf_length = 2 * nch * stage1->blk_out * a->stage1_times_max; - } - - a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2; - a->total = a->sbuf_length + a->src_multich; - - return 0; -} - -static void src_state_reset(struct src_state *state) -{ - - state->fir_delay_size = 0; - state->out_delay_size = 0; - state->fir_wi = 0; - state->out_ri = 0; -} - -static int init_stages( - struct src_stage *stage1, struct src_stage *stage2, - struct polyphase_src *src, struct src_param *p, - int n, int32_t *delay_lines_start) -{ - /* Clear FIR state */ - src_state_reset(&src->state1); - src_state_reset(&src->state2); - - src->number_of_stages = n; - src->stage1 = stage1; - src->stage2 = stage2; - if ((n == 1) && (stage1->blk_out == 0)) - return -EINVAL; - - /* Delay line sizes */ - src->state1.fir_delay_size = p->fir_s1; - src->state1.out_delay_size = p->out_s1; - src->state1.fir_delay = delay_lines_start; - src->state1.out_delay = - src->state1.fir_delay + src->state1.fir_delay_size; - if (n > 1) { - src->state2.fir_delay_size = p->fir_s2; - src->state2.out_delay_size = p->out_s2; - src->state2.fir_delay = - src->state1.out_delay + src->state1.out_delay_size; - src->state2.out_delay = - src->state2.fir_delay + src->state2.fir_delay_size; - } else { - src->state2.fir_delay_size = 0; - src->state2.out_delay_size = 0; - src->state2.fir_delay = NULL; - src->state2.out_delay = NULL; - } - - /* Check the sizes are less than MAX */ - if ((src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH) - || (src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH) - || (src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH) - || (src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)) { - src->state1.fir_delay = NULL; - src->state1.out_delay = NULL; - src->state2.fir_delay = NULL; - src->state2.out_delay = NULL; - return -EINVAL; - } - - return 0; -} - -void src_polyphase_reset(struct polyphase_src *src) -{ - - src->number_of_stages = 0; - src->stage1 = NULL; - src->stage2 = 
NULL; - src_state_reset(&src->state1); - src_state_reset(&src->state2); -} - -int src_polyphase_init(struct polyphase_src *src, struct src_param *p, - int32_t *delay_lines_start) -{ - struct src_stage *stage1; - struct src_stage *stage2; - int n_stages; - int ret; - - if ((p->idx_in < 0) || (p->idx_out < 0)) { - return -EINVAL; - } - - /* Get setup for 2 stage conversion */ - stage1 = src_table1[p->idx_out][p->idx_in]; - stage2 = src_table2[p->idx_out][p->idx_in]; - ret = init_stages(stage1, stage2, src, p, 2, delay_lines_start); - if (ret < 0) - return -EINVAL; - - /* Get number of stages used for optimize opportunity. 2nd - * stage length is one if conversion needs only one stage. - * If input and output rate is the same return 0 to - * use a simple copy function instead of 1 stage FIR with one - * tap. - */ - n_stages = (src->stage2->filter_length == 1) ? 1 : 2; - if (p->idx_in == p->idx_out) - n_stages = 0; - - /* If filter length for first stage is zero this is a deleted - * mode from in/out matrix. Computing of such SRC mode needs - * to be prevented. 
- */ - if (src->stage1->filter_length == 0) - return -EINVAL; - - return n_stages; -} - -#if SRC_SHORT == 1 - -/* Calculate a FIR filter part that does not need circular modification */ - -static inline void fir_part(int64_t y[], int *id, int *ic, - const int32_t data[], const int16_t coef[], int nch_x_taps, int nch) -{ - int64_t tap0; - int64_t tap1; - int n; - int64_t a = 0; - int64_t b = 0; - int c = *ic; - int d = *id; - int d_end = d - nch_x_taps; - - /* Data is Q1.31, coef is Q1.15, product is Q2.46 */ - if (nch == 2) { - for (n = 0; n < (nch_x_taps >> 2); n++) { - tap0 = coef[c++]; - tap1 = coef[c++]; - b += data[d--] * tap0; - a += data[d--] * tap0; - b += data[d--] * tap1; - a += data[d--] * tap1; - } - if (d > d_end) { - tap0 = coef[c++]; - b += data[d--] * tap0; - a += data[d--] * tap0; - } - y[1] += b; - y[0] += a; - } else { - while (d > d_end) { - tap0 = coef[c++]; - for (n = nch - 1; n >= 0; n--) - y[n] += data[d--] * tap0; - } - } - *ic = c; - *id = d; -} - -#else - -static inline void fir_part(int64_t y[], int *id, int *ic, - const int32_t data[], const int32_t coef[], int nch_x_taps, int nch) -{ - int64_t tap0; - int64_t tap1; - int n; - int64_t a = 0; - int64_t b = 0; - int c = *ic; - int d = *id; - int d_end = d - nch_x_taps; - - /* Data is Q1.31, coef is Q1.23, product is Q2.54 */ - if (nch == 2) { - for (n = 0; n < (nch_x_taps >> 2); n++) { - tap0 = coef[c++]; - tap1 = coef[c++]; - b += data[d--] * tap0; - a += data[d--] * tap0; - b += data[d--] * tap1; - a += data[d--] * tap1; - } - if (d > d_end) { - tap0 = coef[c++]; - b += data[d--] * tap0; - a += data[d--] * tap0; - } - y[1] += b; - y[0] += a; - } else { - while (d > d_end) { - tap0 = coef[c++]; - for (n = nch - 1; n >= 0; n--) - y[n] += data[d--] * tap0; - } - } - *ic = c; - *id = d; -} - -#endif - -#if SRC_SHORT == 1 - -static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[], - int32_t out_delay[], const int16_t coefs[], int dsm1, int taps, - int shift, int nch) -{ - int 
n2; - int i; - int64_t y[PLATFORM_MAX_CHANNELS]; - int ri = ri0; - int wi = wi0; - int n1 = ri0 + 1; /* Convert to number of sequential frames */ - int qshift = 15 + shift; /* Q2.46 -> Q2.31 */ - int32_t rnd = 1 << (qshift - 1); /* Half LSB */ - int nch_x_taps = nch * taps; - - /* Initialize to half LSB for rounding */ - for (i = 0; i < nch; i++) - y[i] = rnd; - - if (n1 >= nch_x_taps) { - fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch); - } else { - n2 = nch_x_taps - n1; - fir_part(y, &ri, ci, in_delay, coefs, n1, nch); - ri = dsm1; - fir_part(y, &ri, ci, in_delay, coefs, n2, nch); - } - - for (i = 0; i < nch; i++) - out_delay[wi++] = sat_int32(y[i] >> qshift); -} -#else - -static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[], - int32_t out_delay[], const int32_t coefs[], int dsm1, int taps, - int shift, int nch) -{ - int n2; - int i; - int64_t y[PLATFORM_MAX_CHANNELS]; - int ri = ri0; - int wi = wi0; - int n1 = ri0 + 1; /* Convert to number of sequential frames */ - int qshift = 23 + shift; /* Q2.54 -> Q2.31 */ - int32_t rnd = 1 << (qshift - 1); /* Half LSB */ - int nch_x_taps = nch * taps; - - /* Initialize to half LSB for rounding */ - for (i = 0; i < nch; i++) - y[i] = rnd; - - if (n1 >= nch_x_taps) { - fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch); - } else { - n2 = nch_x_taps - n1; - fir_part(y, &ri, ci, in_delay, coefs, n1, nch); - ri = dsm1; - fir_part(y, &ri, ci, in_delay, coefs, n2, nch); - } - - for (i = 0; i < nch; i++) - out_delay[wi++] = sat_int32(y[i] >> qshift); - -} - -#endif - -void src_polyphase_stage_cir(struct src_stage_prm * s) -{ - struct src_state *fir = s->state; - struct src_stage *cfg = s->stage; - int n; - int m; - int f; - int ci; - int ri; - int n_wrap_fir; - int n_wrap_buf; - int n_wrap_min; - int n_min; - int wi; - const void *coef = cfg->coefs; - int32_t *in_delay = fir->fir_delay; - int32_t *out_delay = fir->out_delay; - int dsm1 = fir->fir_delay_size - 1; - int shift = cfg->shift; - int nch = 
s->nch; - int rewind = -nch * (cfg->blk_in - + (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1; - int nch_x_idm = cfg->idm * nch; - int nch_x_odm = cfg->odm * nch; - size_t sz = sizeof(int32_t); - int blk_in_bytes = nch * cfg->blk_in * sz; - int blk_out_bytes = nch * cfg->num_of_subfilters * sz; - - - for (n = 0; n < s->times; n++) { - /* Input data */ - m = blk_in_bytes; - while (m > 0) { - n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz; - n_wrap_buf = s->x_end_addr - s->x_rptr; - n_wrap_min = (n_wrap_fir < n_wrap_buf) - ? n_wrap_fir : n_wrap_buf; - n_min = (m < n_wrap_min) ? m : n_wrap_min; - while (n_min > 0) { - fir->fir_delay[fir->fir_wi++] = *s->x_rptr; - s->x_rptr++; - n_min -= sz; - m -= sz; - } - /* Check for wrap */ - src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); - if (fir->fir_wi == fir->fir_delay_size) - fir->fir_wi = 0; - } - - /* Filter */ - ci = 0; /* Reset to 1st coefficient */ - ri = fir->fir_wi + rewind; /* Newest data for last subfilter */ - if (ri < 0) - ri += fir->fir_delay_size; - - wi = fir->out_ri; - for (f = 0; f < cfg->num_of_subfilters; f++) { - fir_filter(ri, &ci, wi, in_delay, out_delay, coef, - dsm1, cfg->subfilter_length, shift, nch); - - wi += nch_x_odm; - if (wi >= fir->out_delay_size) - wi -= fir->out_delay_size; - - ri += nch_x_idm; /* Next sub-filter start */ - if (ri >= fir->fir_delay_size) - ri -= fir->fir_delay_size; - } - - /* Output */ - m = blk_out_bytes; - while (m > 0) { - n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz; - n_wrap_buf = s->y_end_addr - s->y_wptr; - n_wrap_min = (n_wrap_fir < n_wrap_buf) - ? n_wrap_fir : n_wrap_buf; - n_min = (m < n_wrap_min) ? 
m : n_wrap_min; - while (n_min > 0) { - *s->y_wptr = fir->out_delay[fir->out_ri++]; - s->y_wptr++; - n_min -= sz; - m -= sz; - } - /* Check wrap */ - src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); - if (fir->out_ri == fir->out_delay_size) - fir->out_ri = 0; - } - } -} - -void src_polyphase_stage_cir_s24(struct src_stage_prm *s) -{ - struct src_state *fir = s->state; - struct src_stage *cfg = s->stage; - int n; - int m; - int f; - int ci; - int ri; - int n_wrap_fir; - int n_wrap_buf; - int n_wrap_min; - int n_min; - int wi; - const void *coef = cfg->coefs; - int32_t *in_delay = fir->fir_delay; - int32_t *out_delay = fir->out_delay; - int dsm1 = fir->fir_delay_size - 1; - int shift = cfg->shift; - int nch = s->nch; - int rewind = -nch * (cfg->blk_in - + (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1; - int nch_x_idm = cfg->idm * nch; - int nch_x_odm = cfg->odm * nch; - size_t sz = sizeof(int32_t); - int blk_in_bytes = nch * cfg->blk_in * sz; - int blk_out_bytes = nch * cfg->num_of_subfilters * sz; - - for (n = 0; n < s->times; n++) { - /* Input data */ - m = blk_in_bytes; - while (m > 0) { - n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz; - n_wrap_buf = s->x_end_addr - s->x_rptr; - n_wrap_min = (n_wrap_fir < n_wrap_buf) - ? n_wrap_fir : n_wrap_buf; - n_min = (m < n_wrap_min) ? 
m : n_wrap_min; - while (n_min > 0) { - fir->fir_delay[fir->fir_wi++] = *s->x_rptr << 8; - s->x_rptr++; - n_min -= sz; - m -= sz; - } - /* Check for wrap */ - src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); - if (fir->fir_wi == fir->fir_delay_size) - fir->fir_wi = 0; - } - - /* Filter */ - ci = 0; /* Reset to 1st coefficient */ - ri = fir->fir_wi + rewind; /* Newest data for last subfilter */ - if (ri < 0) - ri += fir->fir_delay_size; - - wi = fir->out_ri; - for (f = 0; f < cfg->num_of_subfilters; f++) { - fir_filter(ri, &ci, wi, in_delay, out_delay, coef, - dsm1, cfg->subfilter_length, shift, nch); - - wi += nch_x_odm; - if (wi >= fir->out_delay_size) - wi -= fir->out_delay_size; - - ri += nch_x_idm; /* Next sub-filter start */ - if (ri >= fir->fir_delay_size) - ri -= fir->fir_delay_size; - } - - /* Output */ - m = blk_out_bytes; - while (m > 0) { - n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz; - n_wrap_buf = s->y_end_addr - s->y_wptr; - n_wrap_min = (n_wrap_fir < n_wrap_buf) - ? n_wrap_fir : n_wrap_buf; - n_min = (m < n_wrap_min) ? m : n_wrap_min; - while (n_min > 0) { - *s->y_wptr = fir->out_delay[fir->out_ri++] >> 8; - s->y_wptr++; - n_min -= sz; - m -= sz; - } - /* Check wrap */ - src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); - if (fir->out_ri == fir->out_delay_size) - fir->out_ri = 0; - } - } - -} diff --git a/src/audio/src_generic.c b/src/audio/src_generic.c new file mode 100644 index 0000000..9caa090 --- /dev/null +++ b/src/audio/src_generic.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2016, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the Intel Corporation nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com> + * + */ + +/* Default C implementation guaranteed to work on any + * architecture. 
+ */ + +#include <stdint.h> +#include <reef/alloc.h> +#include <reef/audio/format.h> +#include <reef/math/numbers.h> + +#include "src_config.h" +#include "src.h" + +#if SRC_GENERIC + +#if SRC_SHORT /* 16 bit coefficients version */ + +static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0, + int32_t *fir_start, int32_t *fir_end, const int fir_delay_length, + const int taps_x_nch, const int shift, const int nch) +{ + int64_t y0; + int64_t y1; + int32_t *data; + const int16_t *coef; + int i; + int j; + int n1; + int n2; + int frames; + const int qshift = 15 + shift; /* Q2.46 -> Q2.31 */ + const int32_t rnd = 1 << (qshift - 1); /* Half LSB */ + int32_t *d = rp; + int32_t *wp = wp0; + + /* Check for 2ch FIR case */ + if (nch == 2) { + /* Decrement data pointer to next channel start. Note that + * initialization code ensures that circular wrap does not + * happen mid-frame. + */ + data = d - 1; + + /* Initialize to half LSB for rounding, prepare for FIR core */ + y0 = rnd; + y1 = rnd; + coef = (const int16_t *)cp; + frames = fir_end - data; /* Frames until wrap */ + n1 = ((taps_x_nch < frames) ? taps_x_nch : frames) >> 1; + n2 = (taps_x_nch >> 1) - n1; + + /* The FIR is calculated as Q1.15 x Q1.31 -> Q2.46. The + * output shift includes the shift by 15 for Qx.46 to + * Qx.31. + */ + for (i = 0; i < n1; i++) { + y0 += (int64_t)(*coef) * (*data); + data++; + y1 += (int64_t)(*coef) * (*data); + data++; + coef++; + } + if (data == fir_end) + data = fir_start; + + for (i = 0; i < n2; i++) { + y0 += (int64_t)(*coef) * (*data); + data++; + y1 += (int64_t)(*coef) * (*data); + data++; + coef++; + } + + *wp = sat_int32(y1 >> qshift); + *(wp + 1) = sat_int32(y0 >> qshift); + return; + } + + for (j = 0; j < nch; j++) { + /* Decrement data pointer to next channel start. Note that + * initialization code ensures that circular wrap does not + * happen mid-frame. 
+ */ + data = d--; + + /* Initialize to half LSB for rounding, prepare for FIR core */ + y0 = rnd; + coef = (const int16_t *)cp; + frames = fir_end - data + nch - j - 1; /* Frames until wrap */ + n1 = (taps_x_nch < frames) ? taps_x_nch : frames; + n2 = taps_x_nch - n1; + + /* The FIR is calculated as Q1.15 x Q1.31 -> Q2.46. The + * output shift includes the shift by 15 for Qx.46 to + * Qx.31. + */ + for (i = 0; i < n1; i += nch) { + y0 += (int64_t)(*coef) * (*data); + coef++; + data += nch; + } + if (data >= fir_end) + data -= fir_delay_length; + + for (i = 0; i < n2; i += nch) { + y0 += (int64_t)(*coef) * (*data); + coef++; + data += nch; + } + + *wp = sat_int32(y0 >> qshift); + wp++; + } +} + +#else /* 32bit coefficients version */ + +static inline void fir_filter_generic(int32_t *rp, const void *cp, int32_t *wp0, + int32_t *fir_start, int32_t *fir_end, int fir_delay_length, + const int taps_x_nch, const int shift, const int nch) +{ + int64_t y0; + int64_t y1; + int32_t *data; + const int32_t *coef; + int i; + int j; + int frames; + int n1; + int n2; + + const int qshift = 23 + shift; /* Qx.54 -> Qx.31 */ + const int32_t rnd = 1 << (qshift - 1); /* Half LSB */ + int32_t *d = rp; + int32_t *wp = wp0; + + /* Check for 2ch FIR case */ + if (nch == 2) { + /* Decrement data pointer to next channel start. Note that + * initialization code ensures that circular wrap does not + * happen mid-frame. + */ + data = d - 1; + + /* Initialize to half LSB for rounding, prepare for FIR core */ + y0 = rnd; + y1 = rnd; + coef = (const int32_t *)cp; + frames = fir_end - data; /* Frames until wrap */ + n1 = ((taps_x_nch < frames) ? taps_x_nch : frames) >> 1; + n2 = (taps_x_nch >> 1) - n1; + + /* The FIR is calculated as Q1.23 x Q1.31 -> Q2.54. The + * output shift includes the shift by 23 for Qx.54 to + * Qx.31. 
+ */ + for (i = 0; i < n1; i++) { + y0 += (int64_t)(*coef >> 8) * (*data); + data++; + y1 += (int64_t)(*coef >> 8) * (*data); + data++; + coef++; + } + if (data == fir_end) + data = fir_start; + + for (i = 0; i < n2; i++) { + y0 += (int64_t)(*coef >> 8) * (*data); + data++; + y1 += (int64_t)(*coef >> 8) * (*data); + data++; + coef++; + } + *wp = sat_int32(y1 >> qshift); + *(wp + 1) = sat_int32(y0 >> qshift); + return; + } + + for (j = 0; j < nch; j++) { + /* Decrement data pointer to next channel start. Note that + * initialization code ensures that circular wrap does not + * happen mid-frame. + */ + data = d--; + + /* Initialize to half LSB for rounding, prepare for FIR core */ + y0 = rnd; + coef = (const int32_t *)cp; + frames = fir_end - data + nch - j - 1; /* Frames until wrap */ + n1 = (taps_x_nch < frames) ? taps_x_nch : frames; + n2 = taps_x_nch - n1; + + /* The FIR is calculated as Q1.23 x Q1.31 -> Q2.54. The + * output shift includes the shift by 23 for Qx.54 to + * Qx.31. + */ + for (i = 0; i < n1; i += nch) { + y0 += (int64_t)(*coef >> 8) * (*data); + coef++; + data += nch; + } + if (data >= fir_end) + data -= fir_delay_length; + + for (i = 0; i < n2; i += nch) { + y0 += (int64_t)(*coef >> 8) * (*data); + coef++; + data += nch; + } + *wp = sat_int32(y0 >> qshift); + wp++; + } +} + +#endif /* 32bit coefficients version */ + +void src_polyphase_stage_cir(struct src_stage_prm *s) +{ + int i; + int n; + int m; + int n_wrap_buf; + int n_wrap_fir; + int n_min; + int32_t *rp; + int32_t *wp; + + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_delay = fir->fir_delay; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int 
blk_out_words = nch * cfg->num_of_subfilters; + const int fir_length = fir->fir_delay_size; + const int rewind = nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch; + const int nch_x_idm = nch * cfg->idm; + const size_t fir_size = fir->fir_delay_size * sizeof(int32_t); + const int taps_x_nch = cfg->subfilter_length * nch; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data */ + m = blk_in_words; + while (m > 0) { + /* Number of words without circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_wrap_fir = fir->fir_wp - fir->fir_delay + 1; + n_min = (n_wrap_fir < n_wrap_buf) + ? n_wrap_fir : n_wrap_buf; + n_min = (m < n_min) ? m : n_min; + m -= n_min; + for (i = 0; i < n_min; i++) { + *fir->fir_wp = *s->x_rptr; + fir->fir_wp--; + s->x_rptr++; + } + /* Check for wrap */ + src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size); + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = fir->fir_wp + rewind; + src_circ_inc_wrap(&rp, fir_end, fir_size); + wp = fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter_generic(rp, cp, wp, + fir_delay, fir_end, fir_length, + taps_x_nch, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; + src_circ_inc_wrap(&wp, out_delay_end, out_size); + rp -= nch_x_idm; /* Next sub-filter start */ + src_circ_dec_wrap(&rp, fir_delay, fir_size); + } + + /* Output */ + m = blk_out_words; + while (m > 0) { + n_wrap_fir = out_delay_end - fir->out_rp; + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (n_wrap_fir < n_wrap_buf) + ? n_wrap_fir : n_wrap_buf; + n_min = (m < n_min) ? 
m : n_min; + m -= n_min; + for (i = 0; i < n_min; i++) { + *s->y_wptr = *fir->out_rp; + s->y_wptr++; + fir->out_rp++; + } + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + src_circ_inc_wrap(&fir->out_rp, out_delay_end, + out_size); + } + } +} + +void src_polyphase_stage_cir_s24(struct src_stage_prm *s) +{ + int i; + int n; + int m; + int n_wrap_buf; + int n_wrap_fir; + int n_min; + int32_t *rp; + int32_t *wp; + + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_delay = fir->fir_delay; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int blk_out_words = nch * cfg->num_of_subfilters; + const int fir_length = fir->fir_delay_size; + const int rewind = nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch; + const int nch_x_idm = nch * cfg->idm; + const size_t fir_size = fir->fir_delay_size * sizeof(int32_t); + const int taps_x_nch = cfg->subfilter_length * nch; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data */ + m = blk_in_words; + while (m > 0) { + /* Number of words without circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_wrap_fir = fir->fir_wp - fir->fir_delay + 1; + n_min = (n_wrap_fir < n_wrap_buf) + ? n_wrap_fir : n_wrap_buf; + n_min = (m < n_min) ? 
m : n_min; + m -= n_min; + for (i = 0; i < n_min; i++) { + *fir->fir_wp = *s->x_rptr << 8; + fir->fir_wp--; + s->x_rptr++; + } + /* Check for wrap */ + src_circ_dec_wrap(&fir->fir_wp, fir_delay, fir_size); + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = fir->fir_wp + rewind; + src_circ_inc_wrap(&rp, fir_end, fir_size); + wp = fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter_generic(rp, cp, wp, + fir_delay, fir_end, fir_length, + taps_x_nch, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; + src_circ_inc_wrap(&wp, out_delay_end, out_size); + rp -= nch_x_idm; /* Next sub-filter start */ + src_circ_dec_wrap(&rp, fir_delay, fir_size); + } + + /* Output */ + m = blk_out_words; + while (m > 0) { + n_wrap_fir = out_delay_end - fir->out_rp; + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (n_wrap_fir < n_wrap_buf) + ? n_wrap_fir : n_wrap_buf; + n_min = (m < n_min) ? m : n_min; + m -= n_min; + for (i = 0; i < n_min; i++) { + *s->y_wptr = *fir->out_rp >> 8; + s->y_wptr++; + fir->out_rp++; + } + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + src_circ_inc_wrap(&fir->out_rp, out_delay_end, + out_size); + } + } +} + +#endif diff --git a/src/audio/src_hifi2ep.c b/src/audio/src_hifi2ep.c new file mode 100644 index 0000000..0d03ffa --- /dev/null +++ b/src/audio/src_hifi2ep.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2017, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the Intel Corporation nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Seppo Ingalsuo seppo.ingalsuo@linux.intel.com + * + */ + +/* HiFi EP optimized code parts for SRC */ + +#include <stdint.h> +#include <reef/alloc.h> +#include <reef/audio/format.h> +#include <reef/math/numbers.h> + +#include "src_config.h" +#include "src.h" + +#if SRC_HIFIEP + +#include <xtensa/config/defs.h> +#include <xtensa/tie/xt_hifi2.h> + +/* HiFi EP has + * 4x 56 bit registers in register file Q + * 8x 48 bit registers in register file P + */ + +#if SRC_SHORT /* 16 bit coefficients version */ + +static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0, + const int taps_div_4, const int shift, const int nch) +{ + /* This function uses + * 2x 56 bit registers Q, + * 4x 48 bit registers P + * 3x integers + * 4x address pointers, + */ + ae_q56s a0; + ae_q56s a1; + ae_p24x2f data2; + ae_p24x2f coef2; + ae_p24x2f p0; + ae_p24x2f p1; + ae_p16x2s *coefp; + ae_p24x2f *dp = (ae_p24x2f *)rp; + ae_p24x2f *dp0; + ae_q32s *wp = wp0; + int i; + int j; + const int inc = sizeof(ae_p24x2f); + + /* 2ch FIR case */ + if (nch == 2) { + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value p0. + */ + AE_LP24F_C(p0, dp, -sizeof(ae_p24f)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_p16x2s *)cp; + a0 = AE_ZEROQ56(); + a1 = AE_ZEROQ56(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients. Coef2_h contains tap *coefp + * and coef2_l contains the next tap. + */ + coef2 = AE_LP16X2F_I(coefp, 0); + coefp++; + + /* Load two data samples from two channels */ + AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */ + AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */ + + /* Select to d0 successive left channel samples, to d1 + * successive right channel samples. 
Then accumulate + * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 + * data and Q1.15 coefficients are used as 24 bits as + * Q1.23 values. + */ + data2 = AE_SELP24_LL(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(p0, p1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + + /* Repeat for next two taps */ + coef2 = AE_LP16X2F_I(coefp, 0); + coefp++; + AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */ + AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */ + data2 = AE_SELP24_LL(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(p0, p1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. + */ + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0); + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp, + sizeof(int32_t)); + return; + } + + for (j = 0; j < nch; j++) { + /* Copy pointer and advance to next ch with dummy load */ + dp0 = dp; + AE_LP24F_C(p0, dp, -sizeof(ae_p24f)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_p16x2s *)cp; + a0 = AE_ZEROQ56(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients */ + coef2 = *coefp++; + + /* Load two data samples */ + AE_LP24F_C(p0, dp0, inc); + AE_LP24F_C(p1, dp0, inc); + + /* Pack p0 and p1 to data2_h and data2_l */ + data2 = AE_SELP24_LL(p0, p1); + + /* Accumulate data2_h * coef2_h + data2_l * coef2_l */ + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + + /* Repeat for next two filter taps */ + coef2 = *coefp++; + AE_LP24F_C(p0, dp0, inc); + AE_LP24F_C(p1, dp0, inc); + data2 = AE_SELP24_LL(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. 
Advance write + * pointer to next sample. + */ + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0); + wp++; + } +} + +#else /* 32bit coefficients version */ + +static inline void fir_filter(ae_q32s *rp, const void *cp, ae_q32s *wp0, + const int taps_div_4, const int shift, const int nch) +{ + /* This function uses + * 2x 56 bit registers Q, + * 4x 48 bit registers P + * 3x integers + * 4x address pointers, + */ + ae_q56s a0; + ae_q56s a1; + ae_p24x2f p0; + ae_p24x2f p1; + ae_p24x2f data2; + ae_p24x2f coef2; + ae_p24x2f *coefp; + ae_p24x2f *dp = (ae_p24x2f *)rp; + ae_p24x2f *dp0; + ae_q32s *wp = wp0; + int i; + int j; + const int inc = sizeof(ae_p24x2f); + + /* 2ch FIR case */ + if (nch == 2) { + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value p0. + */ + AE_LP24F_C(p0, dp, -sizeof(ae_p24f)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_p24x2f *)cp; + a0 = AE_ZEROQ56(); + a1 = AE_ZEROQ56(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients. Coef2_h contains tap *coefp + * and coef2_l contains the next tap. + */ + /* TODO: Ensure coefficients are 64 bits aligned */ + coef2 = AE_LP24X2F_I(coefp, 0); + coefp++; + + /* Load two data samples from two channels */ + AE_LP24X2F_C(p0, dp, inc); /* r0, l0 */ + AE_LP24X2F_C(p1, dp, inc); /* r1, l1 */ + + /* Select to d0 successive left channel samples, to d1 + * successive right channel samples. + */ + + /* Accumulate to m + * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 + * data and Q1.15 coefficients are used as 24 bits as + * Q1.23 values. 
+ */ + data2 = AE_SELP24_LL(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(p0, p1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + + /* Repeat for next two taps */ + coef2 = AE_LP24X2F_I(coefp, 0); + coefp++; + AE_LP24X2F_C(p0, dp, inc); /* r2, l2 */ + AE_LP24X2F_C(p1, dp, inc); /* r3, l3 */ + data2 = AE_SELP24_LL(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(p0, p1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. + */ + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0); + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a1, shift)), wp, + sizeof(int32_t)); + return; + } + + for (j = 0; j < nch; j++) { + /* Copy pointer and advance to next ch with dummy load */ + dp0 = dp; + AE_LP24F_C(p0, dp, -sizeof(ae_p24f)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_p24x2f *)cp; + a0 = AE_ZEROQ56(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients */ + coef2 = *coefp++; + + /* Load two data samples and place them to L and H of + * data2. + */ + AE_LP24F_C(p0, dp0, inc); + AE_LP24F_C(p1, dp0, inc); + data2 = AE_SELP24_LH(p0, p1); + + /* Accumulate to m + * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 + * data and coefficients are used as the most + * significant 24 bits as Q1.23 values. + */ + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + + /* Repeat for next two filter taps */ + coef2 = *coefp++; + AE_LP24F_C(p0, dp0, inc); + AE_LP24F_C(p1, dp0, inc); + data2 = AE_SELP24_LH(p0, p1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. Advance write + * pointer to next sample. 
+ */ + AE_SQ32F_I(AE_ROUNDSQ32SYM(AE_SRAAQ56(a0, shift)), wp, 0); + wp++; + } +} +#endif /* 32bit coefficients version */ + +void src_polyphase_stage_cir(struct src_stage_prm *s) +{ + /* This function uses + * 1x 56 bit registers Q, + * 0x 48 bit registers P, + * 16x integers + * 7x address pointers, + */ + ae_q56s q; + ae_q32s *rp; + ae_q32s *wp; + int i; + int n; + int m; + int n_wrap_buf; + int n_min; + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int blk_out_words = nch * cfg->num_of_subfilters; + const int sz = sizeof(int32_t); + const int n_sz = -sizeof(int32_t); + const int rewind_sz = sz * (nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); + const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); + const int taps_div_4 = cfg->subfilter_length >> 2; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data to filter */ + m = blk_in_words; + + /* Setup circular buffer for FIR input data delay */ + AE_SETCBEGIN0(fir->fir_delay); + AE_SETCEND0(fir_end); + + while (m > 0) { + /* Number of words until circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_min = (m < n_wrap_buf) ? 
m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Load 32 bits sample to accumulator */ + q = AE_LQ32F_I((ae_q32s *)s->x_rptr++, 0); + + /* Store to circular buffer, advance pointer */ + AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz); + } + + /* Check for wrap */ + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Do filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = (ae_q32s *)fir->fir_wp; + + /* Do circular modification to pointer rp by amount of + * rewind to to data start. Loaded value q is discarded. + */ + AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz); + + /* Reset FIR write pointer and compute all polyphase + * sub-filters. + */ + wp = (ae_q32s *)fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; + src_circ_inc_wrap((int32_t **)&wp, out_delay_end, + out_size); + + /* Circular advance pointer rp by number of + * channels x input delay multiplier. Loaded value q + * is discarded. + */ + AE_LQ32F_C(q, rp, nch_x_idm_sz); + } + + /* Output */ + + /* Setup circular buffer for SRC out delay access */ + AE_SETCBEGIN0(fir->out_delay); + AE_SETCEND0(out_delay_end); + m = blk_out_words; + while (m > 0) { + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (m < n_wrap_buf) ? 
m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Circular load followed by linear store */ + AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz); + AE_SQ32F_I(q, (ae_q32s *)s->y_wptr, 0); + s->y_wptr++; + } + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + } + } +} +/* Process s->times blocks of one polyphase SRC stage for 24 bit samples: input is shifted left by 8 into the FIR delay line, output is shifted right by 8. */ +void src_polyphase_stage_cir_s24(struct src_stage_prm *s) +{ + /* This function uses + * 1x 56 bit registers Q, + * 0x 48 bit registers P, + * 16x integers + * 7x address pointers, + */ + ae_q56s q; + ae_q32s *rp; + ae_q32s *wp; + int i; + int n; + int m; + int n_wrap_buf; + int n_min; + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int blk_out_words = nch * cfg->num_of_subfilters; + const int sz = sizeof(int32_t); + const int n_sz = -sizeof(int32_t); + const int rewind_sz = sz * (nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); + const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); + const int taps_div_4 = cfg->subfilter_length >> 2; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data to filter */ + m = blk_in_words; + + /* Setup circular buffer for FIR input data delay */ + AE_SETCBEGIN0(fir->fir_delay); + AE_SETCEND0(fir_end); + + while (m > 0) { + /* Number of words without circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_min = (m < n_wrap_buf) ? 
m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Load 32 bits sample to accumulator + * and left shift by 8, advance read + * pointer. + */ + q = AE_SLLIQ56(AE_LQ32F_I( + (ae_q32s *)s->x_rptr++, 0), 8); + + /* Store to circular buffer, advance + * write pointer. + */ + AE_SQ32F_C(q, (ae_q32s *)fir->fir_wp, n_sz); + } + + /* Check for wrap */ + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Do filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = (ae_q32s *)fir->fir_wp; + + /* Do circular modification to pointer rp by amount of + * rewind to data start. Loaded value q is discarded. + */ + AE_LQ32F_C(q, (ae_q32s *)rp, rewind_sz); + + /* Reset FIR output write pointer and compute all polyphase + * sub-filters. + */ + wp = (ae_q32s *)fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; /* Byte offset; arithmetic on void * is a GNU C extension */ + src_circ_inc_wrap((int32_t **)&wp, out_delay_end, + out_size); + + /* Circular advance pointer rp by number of + * channels x input delay multiplier. Loaded value q + * is discarded. + */ + AE_LQ32F_C(q, rp, nch_x_idm_sz); + } + + /* Output */ + + /* Setup circular buffer for SRC out delay access */ + AE_SETCBEGIN0(fir->out_delay); + AE_SETCEND0(out_delay_end); + m = blk_out_words; + while (m > 0) { + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (m < n_wrap_buf) ? m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Circular load for 32 bit sample, + * advance pointer. + */ + AE_LQ32F_C(q, (ae_q32s *)fir->out_rp, sz); + + /* Store value as shifted right by 8 for + * sign extended 24 bit value, advance pointer. 
+ */ + AE_SQ32F_I(AE_SRAIQ56(q, 8), + (ae_q32s *)s->y_wptr, 0); + s->y_wptr++; + } + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + } + } +} + +#endif diff --git a/src/audio/src_hifi3.c b/src/audio/src_hifi3.c new file mode 100644 index 0000000..96d3c99 --- /dev/null +++ b/src/audio/src_hifi3.c @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2016, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the Intel Corporation nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Seppo Ingalsuo seppo.ingalsuo@linux.intel.com + * + */ + +/* HiFi3 optimized code parts for SRC */ + +#include <stdint.h> +#include <reef/alloc.h> +#include <reef/audio/format.h> +#include <reef/math/numbers.h> + +#include "src_config.h" +#include "src.h" + +#if SRC_HIFI3 + +#include <xtensa/config/defs.h> +#include <xtensa/tie/xt_hifi3.h> + +/* HiFi3 has + * 16x 64 bit registers in register file AE_DR + */ + +#if SRC_SHORT /* 16 bit coefficients version */ +/* Compute one FIR output sample per channel with 16 bit coefficients from cp: data is read with circular addressing starting at rp, the sums are right shifted by shift and stored to consecutive words at wp0. */ +static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift, const int nch) +{ + /* This function uses + * 6x 64 bit registers + * 3x integers + * 5x address pointers, + */ + ae_f64 a0; + ae_f64 a1; + ae_valign u; /* NOTE(review): u is not primed (e.g. AE_LA64_PP) before AE_LA16X4_IP - confirm */ + ae_f16x4 coef4; + ae_f32x2 d0; + ae_f32x2 d1; + ae_f32x2 data2; + ae_f16x4 *coefp; + ae_f32x2 *dp; + ae_f32 *dp0; + ae_f32 *dp1; + int i; + int j; + ae_f32 *wp = wp0; + const int inc = nch * sizeof(int32_t); + + if (nch == 2) { + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value d0. + */ + dp = (ae_f32x2 *)rp; + AE_L32_XC(d0, (ae_f32 *)dp, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f16x4 *)cp; + a0 = AE_ZERO64(); + a1 = AE_ZERO64(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Four coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. 
+ */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA16X4_IP(coef4, u, coefp); + + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r0, l0 */ + AE_L32X2_XC(d1, dp, inc); /* r1, l1 */ + + /* Select to data2 sequential samples from a channel + * and then accumulate to a0 and a1 + * data2_h * coef4_3 + data2_l * coef4_2. + * The data is 32 bits Q1.31 and coefficient 16 bits + * Q1.15. The accumulators are Q17.47. 
+ */ + data2 = AE_SEL32_LL(d0, d1); /* l0, l1 */ + AE_MULAAFD32X16_H3_L2(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r0, r1 */ + AE_MULAAFD32X16_H3_L2(a1, data2, coef4); + + /* Load two data samples from two channels */ + AE_L32X2_XC(d0, dp, inc); /* r2, l2 */ + AE_L32X2_XC(d1, dp, inc); /* r3, l3 */ + + /* Accumulate + * data2_h * coef4_1 + data2_l * coef4_0. + */ + data2 = AE_SEL32_LL(d0, d1); /* l2, l3 */ + AE_MULAAFD32X16_H1_L0(a0, data2, coef4); + data2 = AE_SEL32_HH(d0, d1); /* r2, r3 */ + AE_MULAAFD32X16_H1_L0(a1, data2, coef4); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. + */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, + sizeof(int32_t)); + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, + sizeof(int32_t)); + + return; + } + + dp1 = (ae_f32 *)rp; + for (j = 0; j < nch; j++) { + /* Copy pointer and advance to next ch with dummy load */ + dp0 = dp1; + AE_L32_XC(d0, dp1, -sizeof(ae_f32)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f16x4 *)cp; + a0 = AE_ZERO64(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Data is read from + * interleaved buffer with stride of channels count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load four coefficients */ + AE_LA16X4_IP(coef4, u, coefp); + + /* Load two data samples, place to high and + * low of data2. + */ + AE_L32_XC(d0, dp0, inc); + AE_L32_XC(d1, dp0, inc); + data2 = AE_SEL32_LL(d0, d1); + + /* Accumulate + * data2_h * coef4_3 + data2_l * coef4_2. + * The data is 32 bits Q1.31 and coefficient 16 bits + * Q1.15. The accumulator is Q17.47. + */ + AE_MULAAFD32X16_H3_L2(a0, data2, coef4); + + /* Repeat with next two samples */ + AE_L32_XC(d0, dp0, inc); + AE_L32_XC(d1, dp0, inc); + data2 = AE_SEL32_LL(d0, d1); + + /* Accumulate + * data2_h * coef4_1 + data2_l * coef4_0. 
+ */ + AE_MULAAFD32X16_H1_L0(a0, data2, coef4); + } + + /* Scale FIR output with right shifts, round/saturate Q17.47 + * to Q1.31, and store 32 bit output. Advance write + * pointer to next sample. + */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, + sizeof(int32_t)); + } +} + +#else /* 32bit coefficients version */ +/* Compute one FIR output sample per channel with 32 bit coefficients from cp: data is read with circular addressing starting at rp, the sums are right shifted by shift and stored to consecutive words at wp0. */ +static inline void fir_filter(ae_f32 *rp, const void *cp, ae_f32 *wp0, + const int taps_div_4, const int shift, const int nch) +{ + /* This function uses + * 6x 64 bit registers + * 3x integers + * 5x address pointers, + */ + ae_f64 a0; + ae_f64 a1; + ae_f24x2 data2; + ae_f24x2 coef2; + ae_f24x2 d0; + ae_f24x2 d1; + ae_f24x2 *coefp; + ae_f24x2 *dp; + ae_f24 *dp1; + ae_f24 *dp0; + int i; + int j; + ae_f32 *wp = wp0; + const int inc = nch * sizeof(int32_t); + + if (nch == 2) { + /* Move data pointer back by one sample to start from right + * channel sample. Discard read value d0. + */ + dp = (ae_f24x2 *)rp; + AE_L32F24_XC(d0, (ae_f24 *)dp, -sizeof(ae_f24)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f24x2 *)cp; + a0 = AE_ZERO64(); + a1 = AE_ZERO64(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Two coefficients + * are loaded simultaneously. Data is read + * from interleaved buffer with stride of channels + * count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients. Coef2_h contains tap *coefp + * and coef2_l contains the next tap. 
+ */ + /* TODO: Ensure coefficients are 64 bits aligned */ + AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); + + /* Load two data samples from two channels */ + AE_L32X2F24_XC(d0, dp, inc); /* r0, l0 */ + AE_L32X2F24_XC(d1, dp, inc); /* r1, l1 */ + + /* Select to data2 successive samples of one channel + * at a time (low, then high halves of d0 and d1). Accumulate + * to a0 and a1 + * data2_h * coef2_h + data2_l * coef2_l. The Q1.31 + * data and 32 bit coefficients are used as 24 bits as + * Q1.23 values. 
+ */ + data2 = AE_SELP24_LL(d0, d1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(d0, d1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + + /* Repeat for next two taps */ + AE_L32X2F24_IP(coef2, coefp, sizeof(ae_f24x2)); + AE_L32X2F24_XC(d0, dp, inc); /* r2, l2 */ + AE_L32X2F24_XC(d1, dp, inc); /* r3, l3 */ + data2 = AE_SELP24_LL(d0, d1); + AE_MULAAFP24S_HH_LL(a0, data2, coef2); + data2 = AE_SELP24_HH(d0, d1); + AE_MULAAFP24S_HH_LL(a1, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate + * to Q1.31, and store 32 bit output. + */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, + sizeof(int32_t)); + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a1, shift)), wp, + sizeof(int32_t)); + + return; + } + + dp1 = (ae_f24 *)rp; + for (j = 0; j < nch; j++) { + /* Copy pointer and advance to next ch with dummy load */ + dp0 = dp1; + AE_L32F24_XC(data2, dp1, -sizeof(ae_f24)); + + /* Reset coefficient pointer and clear accumulator */ + coefp = (ae_f24x2 *)cp; + a0 = AE_ZERO64(); + + /* Compute FIR filter for current channel with four + * taps per every loop iteration. Data is read from + * interleaved buffer with stride of channels count. + */ + for (i = 0; i < taps_div_4; i++) { + /* Load two coefficients */ + coef2 = *coefp++; + + /* Load two data samples, place to high and + * low of data2. + */ + AE_L32F24_XC(d0, dp0, inc); + AE_L32F24_XC(d1, dp0, inc); + data2 = AE_SELP24_LL(d0, d1); + + /* Accumulate data2_h * coef2_h + + * data2_l * coef2_l to a0. The Q1.31 data is used + * as Q1.23 from MSB side bits of the 32 bit + * word. The accumulator a0 is Q17.47. + */ + AE_MULAAFD24_HH_LL(a0, data2, coef2); + + /* Repeat the same for next two filter taps */ + coef2 = *coefp++; + AE_L32F24_XC(d0, dp0, inc); + AE_L32F24_XC(d1, dp0, inc); + data2 = AE_SELP24_LL(d0, d1); + AE_MULAAFD24_HH_LL(a0, data2, coef2); + } + + /* Scale FIR output with right shifts, round/saturate Q17.47 + * to Q1.31, and store 32 bit output. 
Advance write + * pointer to next sample. + */ + AE_S32_L_XP(AE_ROUND32F48SSYM(AE_SRAA64(a0, shift)), wp, + sizeof(int32_t)); + } +} + +#endif /* 32bit coefficients version */ +/* Process s->times blocks of one polyphase SRC stage for 32 bit samples: copy input from s->x_rptr to the FIR delay line, run fir_filter() for every sub-filter, copy the output block to s->y_wptr. */ +void src_polyphase_stage_cir(struct src_stage_prm *s) +{ + /* This function uses + * 1x 64 bit registers + * 16x integers + * 7x address pointers, + */ + ae_int32x2 q; + ae_f32 *rp; + ae_f32 *wp; + int i; + int n; + int m; + int n_wrap_buf; + int n_min; + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int blk_out_words = nch * cfg->num_of_subfilters; + const int sz = sizeof(int32_t); + const int n_sz = -sizeof(int32_t); + const int rewind_sz = sz * (nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); + const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); + const int taps_div_4 = cfg->subfilter_length >> 2; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data to filter */ + m = blk_in_words; + + /* Setup circular buffer for FIR input data delay */ + AE_SETCBEGIN0(fir->fir_delay); + AE_SETCEND0(fir_end); + + while (m > 0) { + /* Number of words until circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_min = (m < n_wrap_buf) ? m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Load 32 bits sample to accumulator, + * advance pointer. 
+ */ + AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz); + + /* Store to circular buffer, advance pointer */ + AE_S32_L_XC(q, (ae_int32 *)fir->fir_wp, n_sz); + } + + /* Check for wrap */ + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Do filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = (ae_f32 *)fir->fir_wp; + + /* Do circular modification to pointer rp by amount of + * rewind to data start. Loaded value q is discarded. + */ + AE_L32_XC(q, rp, rewind_sz); + + /* Reset FIR output write pointer and compute all polyphase + * sub-filters. + */ + wp = (ae_f32 *)fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; /* Byte offset; arithmetic on void * is a GNU C extension */ + src_circ_inc_wrap((int32_t **)&wp, out_delay_end, + out_size); + + /* Circular advance pointer rp by number of + * channels x input delay multiplier. Loaded value q + * is discarded. + */ + AE_L32_XC(q, rp, nch_x_idm_sz); + } + + /* Output */ + + /* Setup circular buffer for SRC out delay access */ + AE_SETCBEGIN0(fir->out_delay); + AE_SETCEND0(out_delay_end); + m = blk_out_words; + while (m > 0) { + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (m < n_wrap_buf) ? m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Circular load followed by linear store, + * advance read and write pointers. 
+ */ + AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); + AE_S32_L_XP(q, (ae_int32 *)s->y_wptr, sz); + } + + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + } + } +} +/* Process s->times blocks of one polyphase SRC stage for 24 bit samples: input is shifted left by 8 into the FIR delay line, output is shifted right by 8. */ +void src_polyphase_stage_cir_s24(struct src_stage_prm *s) +{ + /* This function uses + * 1x 64 bit registers + * 16x integers + * 7x address pointers, + */ + ae_int32x2 q; + ae_f32 *rp; + ae_f32 *wp; + int i; + int n; + int m; + int n_wrap_buf; + int n_min; + + struct src_state *fir = s->state; + struct src_stage *cfg = s->stage; + int32_t *fir_end = &fir->fir_delay[fir->fir_delay_size]; + int32_t *out_delay_end = &fir->out_delay[fir->out_delay_size]; + const void *cp; /* Can be int32_t or int16_t */ + const size_t out_size = fir->out_delay_size * sizeof(int32_t); + const int nch = s->nch; + const int nch_x_odm = cfg->odm * nch; + const int blk_in_words = nch * cfg->blk_in; + const int blk_out_words = nch * cfg->num_of_subfilters; + const int sz = sizeof(int32_t); + const int n_sz = -sizeof(int32_t); + const int rewind_sz = sz * (nch * (cfg->blk_in + + (cfg->num_of_subfilters - 1) * cfg->idm) - nch); + const int nch_x_idm_sz = -nch * cfg->idm * sizeof(int32_t); + const int taps_div_4 = cfg->subfilter_length >> 2; + +#if SRC_SHORT + const size_t subfilter_size = cfg->subfilter_length * sizeof(int16_t); +#else + const size_t subfilter_size = cfg->subfilter_length * sizeof(int32_t); +#endif + + for (n = 0; n < s->times; n++) { + /* Input data */ + m = blk_in_words; + + /* Setup circular buffer for FIR input data delay */ + AE_SETCBEGIN0(fir->fir_delay); + AE_SETCEND0(fir_end); + + while (m > 0) { + /* Number of words without circular wrap */ + n_wrap_buf = s->x_end_addr - s->x_rptr; + n_min = (m < n_wrap_buf) ? m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Load 32 bits sample to accumulator + * and left shift by 8, advance read + * pointer. 
+ */ + AE_L32_XP(q, (ae_int32 *)s->x_rptr, sz); + AE_S32_L_XC(AE_SLAI32(q, 8), + (ae_int32 *)fir->fir_wp, n_sz); + } + + /* Check for wrap */ + src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size); + } + + /* Do filter */ + cp = cfg->coefs; /* Reset to 1st coefficient */ + rp = (ae_f32 *)fir->fir_wp; + + /* Do circular modification to pointer rp by amount of + * rewind to data start. Loaded value q is discarded. + */ + AE_L32_XC(q, rp, rewind_sz); + + /* Reset FIR output write pointer and compute all polyphase + * sub-filters. + */ + wp = (ae_f32 *)fir->out_rp; + for (i = 0; i < cfg->num_of_subfilters; i++) { + fir_filter(rp, cp, wp, taps_div_4, cfg->shift, nch); + wp += nch_x_odm; + cp += subfilter_size; /* Byte offset; arithmetic on void * is a GNU C extension */ + src_circ_inc_wrap((int32_t **)&wp, out_delay_end, + out_size); + + /* Circular advance pointer rp by number of + * channels x input delay multiplier. Loaded value q + * is discarded. + */ + AE_L32_XC(q, rp, nch_x_idm_sz); + } + + /* Output */ + + /* Setup circular buffer for SRC out delay access */ + AE_SETCBEGIN0(fir->out_delay); + AE_SETCEND0(out_delay_end); + m = blk_out_words; + while (m > 0) { + n_wrap_buf = s->y_end_addr - s->y_wptr; + n_min = (m < n_wrap_buf) ? m : n_wrap_buf; + m -= n_min; + for (i = 0; i < n_min; i++) { + /* Circular load for 32 bit sample, + * advance read pointer. + */ + AE_L32_XC(q, (ae_int32 *)fir->out_rp, sz); + + /* Store value as shifted right by 8 + * for sign extended 24 bit value, + * advance write pointer. + */ + AE_S32_L_XP(AE_SRAI32(q, 8), + (ae_int32 *)s->y_wptr, sz); + } + + /* Check wrap */ + src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size); + } + } +} + +#endif