[Sound-open-firmware] [PATCH] SRC: Use a multi-channel FIR core to optimize speed plus other cleanup

Thu Oct 19 23:10:45 CEST 2017

On Wed, 2017-10-18 at 19:56 +0300, Seppo Ingalsuo wrote:
> This patch changes the sample rate conversion processing to use a single
> multi-channel filter instead of mono filter instances called once per
> channel. The filter output is now rounded by adding 1/2 LSB. The polyphase
> filter input block sizes are multiplied in SRC initialization to reach
> exactly or nearly the period length, which reduces polyphase filter call
> overhead.
> 
> The polyphase filter is now called via function pointer for more
> flexibility with audio data formats. Currently S32_LE and S24_4LE are
> supported.
> 
> Code cleanup includes removal of redundant variables and some debug
> print code.
> 
> Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo at linux.intel.com>
> ---
>  src/audio/src.c      | 198 ++++++++++-------------
>  src/audio/src_core.c | 447 ++++++++++++++++++++++++++-------------------------
>  src/audio/src_core.h |  42 +----
>  3 files changed, 323 insertions(+), 364 deletions(-)
> 
> diff --git a/src/audio/src.c b/src/audio/src.c
> index 4c4bc6e..854cb82 100644
> --- a/src/audio/src.c
> +++ b/src/audio/src.c
> @@ -55,7 +55,7 @@
>  
>  /* src component private data */
>  struct comp_data {
> -	struct polyphase_src src[PLATFORM_MAX_CHANNELS];
> +	struct polyphase_src src;
>  	struct src_param param;
>  	int32_t *delay_lines;
>  	uint32_t sink_rate;
> @@ -63,16 +63,16 @@ struct comp_data {
>  	int32_t *sbuf_w_ptr;
>  	int32_t *sbuf_r_ptr;
>  	int sbuf_avail;
> -	int sign_extend_s24; /* Set if need to copy sign bit to b24..b31 */
>  	void (* src_func)(struct comp_dev *dev,
>  		struct comp_buffer *source,
>  		struct comp_buffer *sink,
>  		size_t *consumed,
>  		size_t *produced);
> +	void (* polyphase_func)(struct src_stage_prm *s);
>  };
>  
>  /* Fallback function */
> -static void fallback_s32(struct comp_dev *dev, struct comp_buffer *source,
> +static void src_fallback(struct comp_dev *dev, struct comp_buffer *source,
>  	struct comp_buffer *sink, size_t *bytes_read, size_t *bytes_written)
>  {
>  	*bytes_read = 0;
> @@ -84,101 +84,105 @@ static void src_2s_s32_default(struct comp_dev *dev,
>  	struct comp_buffer *source, struct comp_buffer *sink,
>  	size_t *bytes_read, size_t *bytes_written)
>  {
> -	struct polyphase_src *s;
>  	struct src_stage_prm s1;
>  	struct src_stage_prm s2;
> -	int j;
> +	int s1_blk_in;
> +	int s1_blk_out;
> +	int s2_blk_in;
> +	int s2_blk_out;
>  	struct comp_data *cd = comp_get_drvdata(dev);
>  	int32_t *dest = (int32_t *) sink->w_ptr;
>  	int32_t *src = (int32_t *) source->r_ptr;
> -	int32_t *sbuf_addr = cd->delay_lines;
>  	int32_t *sbuf_end_addr = &cd->delay_lines[cd->param.sbuf_length];
>  	int32_t sbuf_size = cd->param.sbuf_length * sizeof(int32_t);
> -	const int nch = dev->params.channels;
> -	const int s1_blk_in = cd->src[0].stage1->blk_in * nch;
> -	const int s1_blk_out = cd->src[0].stage1->blk_out * nch;
> -	const int s2_blk_in = cd->src[0].stage2->blk_in * nch;
> -	const int s2_blk_out = cd->src[0].stage2->blk_out * nch;
> +	int nch = dev->params.channels;
>  	int sbuf_free = cd->param.sbuf_length - cd->sbuf_avail;
> -	int source_check = (source->avail >> 2) - s1_blk_in; /* for int32_t */
> -	int sink_check = (sink->free >> 2) - s2_blk_out; /* For int32_t */
>  	int n_read = 0;
>  	int n_written = 0;
>  	int n1 = 0;
>  	int n2 = 0;
> +	int avail = source->avail >> 2; /* For int32_t */
> +	int free = sink->free >> 2; /* For int32_t */

Is this converting bytes to 32-bit words? Otherwise it hard-codes the format?
Please add a comment explaining what it's doing.

>  
> -	s1.times = 1;
>  	s1.x_end_addr = source->end_addr;
>  	s1.x_size = source->size;
> -	s1.x_inc = nch;
>  	s1.y_end_addr = sbuf_end_addr;
>  	s1.y_size = sbuf_size;
> -	s1.y_inc = nch;
> +	s1.state = &cd->src.state1;
> +	s1.stage = cd->src.stage1;
> +	s1.x_rptr = src;
> +	s1.y_wptr = cd->sbuf_w_ptr;
> +	s1.nch = nch;
>  
> -	s2.times = 1;
>  	s2.x_end_addr = sbuf_end_addr;
>  	s2.x_size = sbuf_size;
> -	s2.x_inc = nch;
>  	s2.y_end_addr = sink->end_addr;
>  	s2.y_size = sink->size;
> -	s2.y_inc = nch;
> +	s2.state = &cd->src.state2;
> +	s2.stage = cd->src.stage2;
> +	s2.x_rptr = cd->sbuf_r_ptr;
> +	s2.y_wptr = dest;
> +	s2.nch = nch;
> +
>  
> -	/* 1st stage runs once a long multiplied length block.
> -	 * The stage buffer much be large enough to fit one s1 output block
> -	 * plus one s2 input block plus jitter in s2 consumption.
> +	/* Test if 1st stage can be run with default block length to reach
> +	 * the period length or just under it.
>  	 */
> -	while ((n1 < cd->param.stage1_times_max)
> -		&& (n_read <= source_check)
> -		&& (sbuf_free >= s1_blk_out)) {
> -		for (j = 0; j < nch; j++) {
> -			s = &cd->src[j]; /* Point to src[] for this channel */
> -			s1.x_rptr = src++;
> -			s1.y_wptr = cd->sbuf_w_ptr++;
> -			src_circ_inc_wrap(&src, source->end_addr, source->size);
> -			src_circ_inc_wrap(&cd->sbuf_w_ptr, sbuf_end_addr, sbuf_size);
> -			s1.state = &s->state1;
> -			s1.stage = s->stage1;
> -			if (cd->sign_extend_s24)
> -				src_polyphase_stage_cir_s24(&s1);
> -			else
> -				src_polyphase_stage_cir(&s1);
> -
> -		}
> +	s1.times = cd->param.stage1_times;
> +	s1_blk_in = s1.times * cd->src.stage1->blk_in * nch;
> +	s1_blk_out = s1.times * cd->src.stage1->blk_out * nch;
> +	if ((avail >= s1_blk_in) && (sbuf_free >= s1_blk_out)) {
> +		cd->polyphase_func(&s1);
> +		cd->sbuf_w_ptr = s1.y_wptr;
> +		cd->sbuf_avail += s1_blk_out;
>  		n_read += s1_blk_in;
> +		avail -= s1_blk_in;
> +		sbuf_free -= s1_blk_out;
> +		n1 = s1.times;
> +	}
> +
> +	/* Run one block at time the remaining data for 1st stage. */
> +	s1.times = 1;
> +	s1_blk_in = cd->src.stage1->blk_in * nch;
> +	s1_blk_out = cd->src.stage1->blk_out * nch;
> +	while ((n1 < cd->param.stage1_times_max) && (avail >= s1_blk_in)
> +		&& (sbuf_free >= s1_blk_out)) {
> +		cd->polyphase_func(&s1);
> +		cd->sbuf_w_ptr = s1.y_wptr;
>  		cd->sbuf_avail += s1_blk_out;
> +		n_read += s1_blk_in;
> +		avail -= s1_blk_in;
>  		sbuf_free -= s1_blk_out;
> -		src = s1.x_rptr - nch + 1;
> -		cd->sbuf_w_ptr = s1.y_wptr - nch + 1;
> -		src_circ_dec_wrap(&src, source->addr, source->size);
> -		src_circ_dec_wrap(&cd->sbuf_w_ptr, sbuf_addr, sbuf_size);
> -		n1++;
> +		n1 += s1.times;
> +	}
> +
> +	/* Test if 2nd stage can be run with default block length. */
> +	s2.times = cd->param.stage2_times;
> +	s2_blk_in = s2.times * cd->src.stage2->blk_in * nch;
> +	s2_blk_out = s2.times * cd->src.stage2->blk_out * nch;
> +	if ((cd->sbuf_avail >= s2_blk_in) && (free >= s2_blk_out)) {
> +		cd->polyphase_func(&s2);
> +		cd->sbuf_r_ptr = s2.x_rptr;
> +		cd->sbuf_avail -= s2_blk_in;
> +		free -= s2_blk_out;
> +		n_written += s2_blk_out;
> +		n2 = s2.times;
>  	}
>  
> -	/* 2nd stage runs as many min size blocks as buffers allow */
> +
> +	/* Run one block at time the remaining 2nd stage output */
> +	s2.times = 1;
> +	s2_blk_in = cd->src.stage2->blk_in * nch;
> +	s2_blk_out = cd->src.stage2->blk_out * nch;
>  	while ((n2 < cd->param.stage2_times_max)
>  		&& (cd->sbuf_avail >= s2_blk_in)
> -		&& (n_written <= sink_check)) {
> -		for (j = 0; j < nch; j++) {
> -			s2.x_rptr = cd->sbuf_r_ptr++;
> -			s2.y_wptr = dest++;
> -			src_circ_inc_wrap(&cd->sbuf_r_ptr, sbuf_end_addr, sbuf_size);
> -			src_circ_inc_wrap(&dest, sink->end_addr, sink->size);
> -			s = &cd->src[j]; /* Point to src[] for this channel */
> -			s2.state = &s->state2;
> -			s2.stage = s->stage2;
> -			if (cd->sign_extend_s24)
> -				src_polyphase_stage_cir_s24(&s2);
> -			else
> -				src_polyphase_stage_cir(&s2);
> -
> -		}
> -		cd->sbuf_r_ptr = s2.x_rptr - nch + 1;
> -		dest = s2.y_wptr - nch + 1;
> -		src_circ_dec_wrap(&cd->sbuf_r_ptr, sbuf_addr, sbuf_size);
> -		src_circ_dec_wrap(&dest, sink->addr, sink->size);
> -		n_written += s2_blk_out;
> +		&& (free >= s2_blk_out)) {
> +		cd->polyphase_func(&s2);
> +		cd->sbuf_r_ptr = s2.x_rptr;
>  		cd->sbuf_avail -= s2_blk_in;
> -		n2++;
> +		free -= s2_blk_out;
> +		n_written += s2_blk_out;
> +		n2 += s2.times;
>  	}
>  	*bytes_read = sizeof(int32_t) * n_read;
>  	*bytes_written = sizeof(int32_t) * n_written;
> @@ -189,42 +193,27 @@ static void src_1s_s32_default(struct comp_dev *dev,
>  	struct comp_buffer *source, struct comp_buffer *sink,
>  	size_t *bytes_read, size_t *bytes_written)
>  {
> -	struct polyphase_src *s;
>  	struct src_stage_prm s1;
> -	int j;
>  	struct comp_data *cd = comp_get_drvdata(dev);
> -	int32_t *dest = (int32_t *) sink->w_ptr;
> -	int32_t *src = (int32_t *) source->r_ptr;
>  	int nch = dev->params.channels;
>  	int n_read = 0;
>  	int n_written = 0;
>  
>  	s1.times = cd->param.stage1_times;
> +	s1.x_rptr = (int32_t *) source->r_ptr;
>  	s1.x_end_addr = source->end_addr;
>  	s1.x_size = source->size;
> -	s1.x_inc = nch;
> +	s1.y_wptr = (int32_t *) sink->w_ptr;
>  	s1.y_end_addr = sink->end_addr;
>  	s1.y_size = sink->size;
> -	s1.y_inc = nch;
> -	s1.x_rptr = src + nch - 1;
> -	s1.y_wptr = dest + nch - 1;
> -
> -	for (j = 0; j < nch; j++) {
> -		s = &cd->src[j]; /* Point to src for this channel */
> -		s1.x_rptr = src++;
> -		s1.y_wptr = dest++;
> -		src_circ_inc_wrap(&src, source->end_addr, source->size);
> -		src_circ_inc_wrap(&dest, sink->end_addr, sink->size);
> -		s1.state = &s->state1;
> -		s1.stage = s->stage1;
> -		if (cd->sign_extend_s24)
> -			src_polyphase_stage_cir_s24(&s1);
> -		else
> -			src_polyphase_stage_cir(&s1);
> -
> -		n_read += cd->param.blk_in;
> -		n_written += cd->param.blk_out;
> -	}
> +	s1.state = &cd->src.state1;
> +	s1.stage = cd->src.stage1;
> +	s1.nch = dev->params.channels;
> +
> +	cd->polyphase_func(&s1);
> +
> +	n_read += nch * cd->param.blk_in;
> +	n_written += nch * cd->param.blk_out;
>  	*bytes_read = n_read * sizeof(int32_t);
>  	*bytes_written = n_written * sizeof(int32_t);
>  }
> @@ -271,7 +260,6 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
>  	struct sof_ipc_comp_src *src;
>  	struct sof_ipc_comp_src *ipc_src = (struct sof_ipc_comp_src *) comp;
>  	struct comp_data *cd;
> -	int i;
>  
>  	trace_src("new");
>  
> @@ -299,8 +287,8 @@ static struct comp_dev *src_new(struct sof_ipc_comp *comp)
>  
>  	cd->delay_lines = NULL;
>  	cd->src_func = src_2s_s32_default;
> -	for (i = 0; i < PLATFORM_MAX_CHANNELS; i++)
> -		src_polyphase_reset(&cd->src[i]);
> +	cd->polyphase_func = src_polyphase_stage_cir;
> +	src_polyphase_reset(&cd->src);
>  
>  	dev->state = COMP_STATE_READY;
>  	return dev;
> @@ -334,10 +322,8 @@ static int src_params(struct comp_dev *dev)
>  	uint32_t sink_rate;
>  	int32_t *buffer_start;
>  	int n = 0;
> -	int i;
>  	int err;
>  	int frames_is_for_source;
> -	int nch;
>  	int q;
>  
>  	trace_src("par");
> @@ -345,10 +331,10 @@ static int src_params(struct comp_dev *dev)
>  	/* SRC supports S24_4LE and S32_LE formats */
>  	switch (config->frame_fmt) {
>  	case SOF_IPC_FRAME_S24_4LE:
> -		cd->sign_extend_s24 = 1;
> +		cd->polyphase_func = src_polyphase_stage_cir_s24;
>  		break;
>  	case SOF_IPC_FRAME_S32_LE:
> -		cd->sign_extend_s24 = 0;
> +		cd->polyphase_func = src_polyphase_stage_cir;
>  		break;
>  	default:
>  		trace_src_error("sr0");
> @@ -407,11 +393,7 @@ static int src_params(struct comp_dev *dev)
>  	buffer_start = cd->delay_lines + cd->param.sbuf_length;
>  
>  	/* Initize SRC for actual sample rate */
> -	nch = MIN(params->channels, PLATFORM_MAX_CHANNELS);
> -	for (i = 0; i < nch; i++) {
> -		n = src_polyphase_init(&cd->src[i], &cd->param, buffer_start);
> -		buffer_start += cd->param.single_src;
> -	}
> +	n = src_polyphase_init(&cd->src, &cd->param, buffer_start);
>  
>  	/* Reset stage buffer */
>  	cd->sbuf_r_ptr = cd->delay_lines;
> @@ -434,7 +416,7 @@ static int src_params(struct comp_dev *dev)
>  		 * muted if copy() is run.
>  		 */
>  		trace_src("SFa");
> -		cd->src_func = fallback_s32;
> +		cd->src_func = src_fallback;
>  		return -EINVAL;
>  		break;
>  	}
> @@ -532,7 +514,6 @@ static int src_copy(struct comp_dev *dev)
>  
>  	/* Run SRC function if buffers avail and free allow */
>  	if (((int) source->avail >= need_source) && ((int) sink->free >= need_sink)) {
> -		/* Run src */
>  		cd->src_func(dev, source, sink, &consumed, &produced);
>  
>  		/* Calc new free and available if data was processed. These
> @@ -561,14 +542,12 @@ static int src_preload(struct comp_dev *dev)
>  
>  static int src_reset(struct comp_dev *dev)
>  {
> -	int i;
>  	struct comp_data *cd = comp_get_drvdata(dev);
>  
>  	trace_src("SRe");
>  
>  	cd->src_func = src_2s_s32_default;
> -	for (i = 0; i < PLATFORM_MAX_CHANNELS; i++)
> -		src_polyphase_reset(&cd->src[i]);
> +	src_polyphase_reset(&cd->src);
>  
>  	comp_set_state(dev, COMP_CMD_RESET);
>  	return 0;
> @@ -576,7 +555,8 @@ static int src_reset(struct comp_dev *dev)
>  
>  struct comp_driver comp_src = {
>  	.type = SOF_COMP_SRC,
> -	.ops = {
> +	.ops =
> +	{

Let's keep the kernel.org style, i.e. the opening brace stays on the same
line as ".ops =".

>  		.new = src_new,
>  		.free = src_free,
>  		.params = src_params,
> diff --git a/src/audio/src_core.c b/src/audio/src_core.c
> index 4ee2efd..0d8cc17 100644
> --- a/src/audio/src_core.c
> +++ b/src/audio/src_core.c
> @@ -51,6 +51,10 @@ int sof_rates[SOF_RATES_LENGTH] = {8000, 11025, 12000, 16000, 18900,
>  	22050, 24000, 32000, 44100, 48000, 64000, 88200, 96000, 176400,
>  	192000};
>  
> +/* The FIR maximum lenghts are per channel so need to multiply them */
> +#define MAX_FIR_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_FIR_DELAY_SIZE)
> +#define MAX_OUT_DELAY_SIZE_XNCH (PLATFORM_MAX_CHANNELS * MAX_OUT_DELAY_SIZE)
> +
>  /* Calculate ceil() for integer division */
>  int src_ceil_divide(int a, int b)
>  {
> @@ -131,6 +135,10 @@ int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
>  	int num;
>  	int frames2;
>  
> +	if (nch > PLATFORM_MAX_CHANNELS)
> +		return -EINVAL;

Also report this via trace_error.

> +
> +	a->nch = nch;
>  	a->idx_in = src_find_fs(src_in_fs, NUM_IN_FS, fs_in);
>  	a->idx_out = src_find_fs(src_out_fs, NUM_OUT_FS, fs_out);
>  
> @@ -146,8 +154,8 @@ int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
>  
>  	stage1 = src_table1[a->idx_out][a->idx_in];
>  	stage2 = src_table2[a->idx_out][a->idx_in];
> -	a->fir_s1 = src_fir_delay_length(stage1);
> -	a->out_s1 = src_out_delay_length(stage1);
> +	a->fir_s1 = nch * src_fir_delay_length(stage1);
> +	a->out_s1 = nch * src_out_delay_length(stage1);
>  
>  	/* Find out how many additional times the SRC can be executed
>  	   while having block size less or equal to max_frames.
> @@ -191,8 +199,8 @@ int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
>  		a->stage2_times_max = 0;
>  		a->sbuf_length = 0;
>  	} else {
> -		a->fir_s2 = src_fir_delay_length(stage2);
> -		a->out_s2 = src_out_delay_length(stage2);
> +		a->fir_s2 = nch * src_fir_delay_length(stage2);
> +		a->out_s2 = nch * src_out_delay_length(stage2);
>  		/* 2x is an empirically tested length. Since the sink buffer
>  		 * capability to receive samples varies a shorter stage 2 output
>  		 * block will create a peak in internal buffer usage.
> @@ -200,8 +208,8 @@ int src_buffer_lengths(struct src_param *a, int fs_in, int fs_out, int nch,
>  		a->sbuf_length = 2 * nch * stage1->blk_out * a->stage1_times_max;
>  	}
>  
> -	a->single_src = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
> -	a->total = a->sbuf_length + nch * a->single_src;
> +	a->src_multich = a->fir_s1 + a->fir_s2 + a->out_s1 + a->out_s2;
> +	a->total = a->sbuf_length + a->src_multich;
>  
>  	return 0;
>  }
> @@ -212,8 +220,6 @@ static void src_state_reset(struct src_state *state)
>  	state->fir_delay_size = 0;
>  	state->out_delay_size = 0;
>  	state->fir_wi = 0;
> -	state->fir_ri = 0;
> -	state->out_wi = 0;
>  	state->out_ri = 0;
>  }
>  
> @@ -229,17 +235,8 @@ static int init_stages(
>  	src->number_of_stages = n;
>  	src->stage1 = stage1;
>  	src->stage2 = stage2;
> -	src->blk_in = p->blk_in;
> -	src->blk_out = p->blk_out;
> -	if (n == 1) {
> -		src->stage1_times = p->stage1_times;
> -		src->stage2_times = 0;
> -		if (stage1->blk_out == 0)
> -			return -EINVAL;
> -	} else {
> -		src->stage1_times = p->stage1_times;
> -		src->stage2_times = p->stage2_times;
> -	}
> +	if ((n == 1) && (stage1->blk_out == 0))
> +		return -EINVAL;
>  
>  	/* Delay line sizes */
>  	src->state1.fir_delay_size = p->fir_s1;
> @@ -262,10 +259,10 @@ static int init_stages(
>  	}
>  
>  	/* Check the sizes are less than MAX */
> -	if ((src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE)
> -		|| (src->state1.out_delay_size > MAX_OUT_DELAY_SIZE)
> -		|| (src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE)
> -		|| (src->state2.out_delay_size > MAX_OUT_DELAY_SIZE)) {
> +	if ((src->state1.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
> +		|| (src->state1.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)
> +		|| (src->state2.fir_delay_size > MAX_FIR_DELAY_SIZE_XNCH)
> +		|| (src->state2.out_delay_size > MAX_OUT_DELAY_SIZE_XNCH)) {
>  		src->state1.fir_delay = NULL;
>  		src->state1.out_delay = NULL;
>  		src->state2.fir_delay = NULL;
> @@ -279,12 +276,7 @@ static int init_stages(
>  void src_polyphase_reset(struct polyphase_src *src)
>  {
>  
> -	src->mute = 0;
>  	src->number_of_stages = 0;
> -	src->blk_in = 0;
> -	src->blk_out = 0;
> -	src->stage1_times = 0;
> -	src->stage2_times = 0;
>  	src->stage1 = NULL;
>  	src->stage2 = NULL;
>  	src_state_reset(&src->state1);
> @@ -300,8 +292,6 @@ int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
>  	struct src_stage *stage2;
>  
>  	if ((p->idx_in < 0) || (p->idx_out < 0)) {
> -		src->blk_in = p->blk_in;
> -		src->blk_out = p->blk_out;
>  		return -EINVAL;
>  	}
>  
> @@ -335,141 +325,203 @@ int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
>  #if SRC_SHORT == 1
>  
>  /* Calculate a FIR filter part that does not need circular modification */
> -static inline void fir_part(int64_t *y, int ntaps, const int16_t c[], int *ic,
> -	int32_t d[], int *id)
> +
> +static inline void fir_part(int64_t y[], int *id, int *ic,
> +	const int32_t data[], const int16_t coef[], int nch_x_taps, int nch)
>  {
> +	int64_t tap0;
> +	int64_t tap1;
>  	int n;
>  	int64_t a = 0;
> +	int64_t b = 0;
> +	int c = *ic;
> +	int d = *id;
> +	int d_end = d - nch_x_taps;
>  
>  	/* Data is Q1.31, coef is Q1.15, product is Q2.46 */
> -	for (n = 0; n < (ntaps >> 1); n++) {
> -		a += (int64_t) c[*ic] * d[*id]
> -			+ (int64_t) c[*ic + 1] * d[*id - 1];
> -		*ic += 2;
> -		*id -= 2;
> +	if (nch == 2) {
> +		for (n = 0; n < (nch_x_taps >> 2); n++) {
> +			tap0 = coef[c++];
> +			tap1 = coef[c++];
> +			b += data[d--] * tap0;
> +			a += data[d--] * tap0;
> +			b += data[d--] * tap1;
> +			a += data[d--] * tap1;
> +		}
> +		if (d > d_end) {
> +			tap0 = coef[c++];
> +			b += data[d--] * tap0;
> +			a += data[d--] * tap0;
> +		}
> +		y[1] += b;
> +		y[0] += a;
> +	} else {
> +		while (d > d_end) {
> +			tap0 = coef[c++];
> +			for (n = nch - 1; n >= 0; n--)
> +				y[n] += data[d--] * tap0;
> +		}
>  	}
> -	if (ntaps & 1)
> -		a += (int64_t) c[(*ic)++] * d[(*id)--];
> -
> -	*y += a;
> +	*ic = c;
> +	*id = d;
>  }
> +
>  #else
>  
> -/* Calculate a FIR filter part that does not need circular modification */
> -static inline void fir_part(int64_t *y, int ntaps, const int32_t c[], int *ic,
> -	int32_t d[], int *id)
> +static inline void fir_part(int64_t y[], int *id, int *ic,
> +	const int32_t data[], const int32_t coef[], int nch_x_taps, int nch)
>  {
> +	int64_t tap0;
> +	int64_t tap1;
>  	int n;
>  	int64_t a = 0;
> -
> -	/* Data is Q8.24, coef is Q1.23, product is Q9.47 */
> -	for (n = 0; n < (ntaps >> 1); n++) {
> -		a += (int64_t) c[*ic] * d[*id]
> -			+ (int64_t) c[*ic + 1] * d[*id - 1];
> -		*ic += 2;
> -		*id -= 2;
> +	int64_t b = 0;
> +	int c = *ic;
> +	int d = *id;
> +	int d_end = d - nch_x_taps;
> +
> +	/* Data is Q1.31, coef is Q1.23, product is Q2.54 */
> +	if (nch == 2) {
> +		for (n = 0; n < (nch_x_taps >> 2); n++) {
> +			tap0 = coef[c++];
> +			tap1 = coef[c++];
> +			b += data[d--] * tap0;
> +			a += data[d--] * tap0;
> +			b += data[d--] * tap1;
> +			a += data[d--] * tap1;
> +		}
> +		if (d > d_end) {
> +			tap0 = coef[c++];
> +			b += data[d--] * tap0;
> +			a += data[d--] * tap0;
> +		}
> +		y[1] += b;
> +		y[0] += a;
> +	} else {
> +		while (d > d_end) {
> +			tap0 = coef[c++];
> +			for (n = nch - 1; n >= 0; n--)
> +				y[n] += data[d--] * tap0;
> +		}
>  	}
> -	if (ntaps & 1)
> -		a += (int64_t) c[(*ic)++] * d[(*id)--];
> -
> -	*y += a;
> +	*ic = c;
> +	*id = d;
>  }
> +
>  #endif
>  
>  #if SRC_SHORT == 1
>  
> -static inline int32_t fir_filter(
> -	struct src_state *fir, const int16_t coefs[],
> -	int *coefi, int filter_length, int shift)
> +static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
> +	int32_t out_delay[], const int16_t coefs[], int dsm1, int taps,
> +	int shift, int nch)
>  {
> -	int64_t y = 0;
> -	int n1;
>  	int n2;
> -
> -	n1 = fir->fir_ri + 1;
> -	if (n1 > filter_length) {
> -		/* No need to un-wrap fir read index, make sure fir_fi
> -		 * is ge 0 after FIR computation.
> -		 */
> -		fir_part(&y, filter_length, coefs, coefi, fir->fir_delay,
> -			&fir->fir_ri);
> +	int i;
> +	int64_t y[PLATFORM_MAX_CHANNELS];
> +	int ri = ri0;
> +	int wi = wi0;
> +	int n1 = ri0 + 1; /* Convert to number of sequential frames */
> +	int qshift = 15 + shift; /* Q2.46 -> Q2.31 */
> +	int32_t rnd = 1 << (qshift - 1); /* Half LSB */
> +	int nch_x_taps = nch * taps;
> +
> +	/* Initialize to half LSB for rounding */
> +	for (i = 0; i < nch; i++)
> +		y[i] = rnd;
> +
> +	if (n1 >= nch_x_taps) {
> +		fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
>  	} else {
> -		n2 = filter_length - n1;
> -		/* Part 1, loop n1 times, fir_ri becomes -1 */
> -		fir_part(&y, n1, coefs, coefi, fir->fir_delay, &fir->fir_ri);
> -
> -		/* Part 2, unwrap fir_ri, continue rest of filter */
> -		fir->fir_ri = fir->fir_delay_size - 1;
> -		fir_part(&y, n2, coefs, coefi, fir->fir_delay, &fir->fir_ri);
> +		n2 = nch_x_taps - n1;
> +		fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
> +		ri = dsm1;
> +		fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
>  	}
> -	/* Q2.46 -> Q2.31, saturate to Q1.31 */
> -	y = y >> (15 + shift);
>  
> -	return(int32_t) sat_int32(y);
> +	for (i = 0; i < nch; i++)
> +		out_delay[wi++] = sat_int32(y[i] >> qshift);
>  }
>  #else
>  
> -static inline int32_t fir_filter(
> -	struct src_state *fir, const int32_t coefs[],
> -	int *coefi, int filter_length, int shift)
> +static void fir_filter(int ri0, int *ci, int wi0, int32_t in_delay[],
> +	int32_t out_delay[], const int32_t coefs[], int dsm1, int taps,
> +	int shift, int nch)
>  {
> -	int64_t y = 0;
> -	int n1;
>  	int n2;
> -
> -	n1 = fir->fir_ri + 1;
> -	if (n1 > filter_length) {
> -		/* No need to un-wrap fir read index, make sure fir_fi
> -		 * is ge 0 after FIR computation.
> -		 */
> -		fir_part(&y, filter_length, coefs, coefi, fir->fir_delay,
> -			&fir->fir_ri);
> +	int i;
> +	int64_t y[PLATFORM_MAX_CHANNELS];
> +	int ri = ri0;
> +	int wi = wi0;
> +	int n1 = ri0 + 1; /* Convert to number of sequential frames */
> +	int qshift = 23 + shift; /* Q2.54 -> Q2.31 */
> +	int32_t rnd = 1 << (qshift - 1); /* Half LSB */
> +	int nch_x_taps = nch * taps;
> +
> +	/* Initialize to half LSB for rounding */
> +	for (i = 0; i < nch; i++)
> +		y[i] = rnd;
> +
> +	if (n1 >= nch_x_taps) {
> +		fir_part(y, &ri, ci, in_delay, coefs, nch_x_taps, nch);
>  	} else {
> -		n2 = filter_length - n1;
> -		/* Part 1, loop n1 times, fir_ri becomes -1 */
> -		fir_part(&y, n1, coefs, coefi, fir->fir_delay, &fir->fir_ri);
> -
> -		/* Part 2, unwrap fir_ri, continue rest of filter */
> -		fir->fir_ri = fir->fir_delay_size - 1;
> -		fir_part(&y, n2, coefs, coefi, fir->fir_delay, &fir->fir_ri);
> +		n2 = nch_x_taps - n1;
> +		fir_part(y, &ri, ci, in_delay, coefs, n1, nch);
> +		ri = dsm1;
> +		fir_part(y, &ri, ci, in_delay, coefs, n2, nch);
>  	}
> -	/* Q9.47 -> Q9.24, saturate to Q8.24 */
> -	y = y >> (23 + shift);
>  
> -	return(int32_t) sat_int32(y);
> +	for (i = 0; i < nch; i++)
> +		out_delay[wi++] = sat_int32(y[i] >> qshift);
> +
>  }
> +
>  #endif
>  
>  void src_polyphase_stage_cir(struct src_stage_prm * s)
>  {
> -	struct src_state *fir = s->state;
> -	struct src_stage *cfg = s->stage;
>  	int n;
>  	int m;
>  	int f;
> -	int c;
> -	int r;
> +	int ci;
> +	int ri;
>  	int n_wrap_fir;
>  	int n_wrap_buf;
>  	int n_wrap_min;
>  	int n_min;
> -	int32_t z;
> +	int wi;
> +
> +	struct src_state *fir = s->state;
> +	struct src_stage *cfg = s->stage;
> +	const void *coef = cfg->coefs;
> +	int32_t *in_delay = fir->fir_delay;
> +	int32_t *out_delay = fir->out_delay;
> +	int dsm1 = fir->fir_delay_size - 1;
> +	int shift = cfg->shift;
> +	int nch = s->nch;
> +	int rewind = -nch * (cfg->blk_in
> +		+ (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
> +	int nch_x_idm = cfg->idm * nch;
> +	int nch_x_odm = cfg->odm * nch;
> +	size_t sz = sizeof(int32_t);
> +	int blk_in_bytes = nch * cfg->blk_in * sz;
> +	int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
> +
>  
>  	for (n = 0; n < s->times; n++) {
>  		/* Input data */
> -		m = s->x_inc * cfg->blk_in;
> +		m = blk_in_bytes;
>  		while (m > 0) {
> -			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi)
> -				* s->x_inc;
> +			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
>  			n_wrap_buf = s->x_end_addr - s->x_rptr;
>  			n_wrap_min = (n_wrap_fir < n_wrap_buf)
>  				? n_wrap_fir : n_wrap_buf;
>  			n_min = (m < n_wrap_min) ? m : n_wrap_min;
>  			while (n_min > 0) {
>  				fir->fir_delay[fir->fir_wi++] = *s->x_rptr;
> -				s->x_rptr += s->x_inc;
> -				n_min -= s->x_inc;
> -				m -= s->x_inc;
> +				s->x_rptr++;
> +				n_min -= sz;
> +				m -= sz;
>  			}
>  			/* Check for wrap */
>  			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
> @@ -478,41 +530,38 @@ void src_polyphase_stage_cir(struct src_stage_prm * s)
>  		}
>  
>  		/* Filter */
> -		c = 0;
> -		r = fir->fir_wi - cfg->blk_in
> -			- (cfg->num_of_subfilters - 1) * cfg->idm;
> -		if (r < 0)
> -			r += fir->fir_delay_size;
> +		ci = 0; /* Reset to 1st coefficient */
> +		ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
> +		if (ri < 0)
> +			ri += fir->fir_delay_size;
>  
> -		fir->out_wi = fir->out_ri;
> +		wi = fir->out_ri;
>  		for (f = 0; f < cfg->num_of_subfilters; f++) {
> -			fir->fir_ri = r;
> -			z = fir_filter(fir, cfg->coefs, &c,
> -				cfg->subfilter_length, cfg->shift);
> -			r += cfg->idm;
> -			if (r >= fir->fir_delay_size)
> -				r -= fir->fir_delay_size;
> -
> -			fir->out_delay[fir->out_wi] = z;
> -			fir->out_wi += cfg->odm;
> -			if (fir->out_wi >= fir->out_delay_size)
> -				fir->out_wi -= fir->out_delay_size;
> +			fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
> +				dsm1, cfg->subfilter_length, shift, nch);
> +
> +			wi += nch_x_odm;
> +			if (wi >= fir->out_delay_size)
> +				wi -= fir->out_delay_size;
> +
> +			ri += nch_x_idm; /* Next sub-filter start */
> +			if (ri >= fir->fir_delay_size)
> +				ri -= fir->fir_delay_size;
>  		}
>  
>  		/* Output */
> -		m = s->y_inc * cfg->num_of_subfilters;
> +		m = blk_out_bytes;
>  		while (m > 0) {
> -			n_wrap_fir = (fir->out_delay_size - fir->out_ri)
> -				* s->y_inc;
> +			n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
>  			n_wrap_buf = s->y_end_addr - s->y_wptr;
>  			n_wrap_min = (n_wrap_fir < n_wrap_buf)
>  				? n_wrap_fir : n_wrap_buf;
>  			n_min = (m < n_wrap_min) ? m : n_wrap_min;
>  			while (n_min > 0) {
>  				*s->y_wptr = fir->out_delay[fir->out_ri++];
> -				s->y_wptr += s->y_inc;
> -				n_min -= s->y_inc;
> -				m -= s->y_inc;
> +				s->y_wptr++;
> +				n_min -= sz;
> +				m -= sz;
>  			}
>  			/* Check wrap */
>  			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
> @@ -522,38 +571,49 @@ void src_polyphase_stage_cir(struct src_stage_prm * s)
>  	}
>  }
>  
> -void src_polyphase_stage_cir_s24(struct src_stage_prm * s)
> +void src_polyphase_stage_cir_s24(struct src_stage_prm *s)
>  {
> -	struct src_state *fir = s->state;
> -	struct src_stage *cfg = s->stage;
>  	int n;
>  	int m;
>  	int f;
> -	int c;
> -	int r;
> +	int ci;
> +	int ri;
>  	int n_wrap_fir;
>  	int n_wrap_buf;
>  	int n_wrap_min;
>  	int n_min;
> -	int32_t z;
> -	int32_t se;
> +	int wi;
> +
> +	struct src_state *fir = s->state;
> +	struct src_stage *cfg = s->stage;

Structs should be declared before local ints; this makes it easier to
reference them against the params passed in.

> +	const void *coef = cfg->coefs;
> +	int32_t *in_delay = fir->fir_delay;
> +	int32_t *out_delay = fir->out_delay;
> +	int dsm1 = fir->fir_delay_size - 1;
> +	int shift = cfg->shift;
> +	int nch = s->nch;
> +	int rewind = -nch * (cfg->blk_in
> +		+ (cfg->num_of_subfilters - 1) * cfg->idm) + nch - 1;
> +	int nch_x_idm = cfg->idm * nch;
> +	int nch_x_odm = cfg->odm * nch;
> +	size_t sz = sizeof(int32_t);
> +	int blk_in_bytes = nch * cfg->blk_in * sz;
> +	int blk_out_bytes = nch * cfg->num_of_subfilters * sz;
>  
>  	for (n = 0; n < s->times; n++) {
>  		/* Input data */
> -		m = s->x_inc * cfg->blk_in;
> +		m = blk_in_bytes;
>  		while (m > 0) {
> -			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi)
> -				* s->x_inc;
> +			n_wrap_fir = (fir->fir_delay_size - fir->fir_wi) * sz;
>  			n_wrap_buf = s->x_end_addr - s->x_rptr;
>  			n_wrap_min = (n_wrap_fir < n_wrap_buf)
>  				? n_wrap_fir : n_wrap_buf;
>  			n_min = (m < n_wrap_min) ? m : n_wrap_min;
>  			while (n_min > 0) {
> -				se = *s->x_rptr << 8;
> -				fir->fir_delay[fir->fir_wi++] = se >> 8;
> -				s->x_rptr += s->x_inc;
> -				n_min -= s->x_inc;
> -				m -= s->x_inc;
> +				fir->fir_delay[fir->fir_wi++] = *s->x_rptr << 8;
> +				s->x_rptr++;
> +				n_min -= sz;
> +				m -= sz;
>  			}
>  			/* Check for wrap */
>  			src_circ_inc_wrap(&s->x_rptr, s->x_end_addr, s->x_size);
> @@ -562,41 +622,38 @@ void src_polyphase_stage_cir_s24(struct src_stage_prm * s)
>  		}
>  
>  		/* Filter */
> -		c = 0;
> -		r = fir->fir_wi - cfg->blk_in
> -			- (cfg->num_of_subfilters - 1) * cfg->idm;
> -		if (r < 0)
> -			r += fir->fir_delay_size;
> +		ci = 0; /* Reset to 1st coefficient */
> +		ri = fir->fir_wi + rewind; /* Newest data for last subfilter */
> +		if (ri < 0)
> +			ri += fir->fir_delay_size;
>  
> -		fir->out_wi = fir->out_ri;
> +		wi = fir->out_ri;
>  		for (f = 0; f < cfg->num_of_subfilters; f++) {
> -			fir->fir_ri = r;
> -			z = fir_filter(fir, cfg->coefs, &c,
> -				cfg->subfilter_length, cfg->shift);
> -			r += cfg->idm;
> -			if (r >= fir->fir_delay_size)
> -				r -= fir->fir_delay_size;
> -
> -			fir->out_delay[fir->out_wi] = z;
> -			fir->out_wi += cfg->odm;
> -			if (fir->out_wi >= fir->out_delay_size)
> -				fir->out_wi -= fir->out_delay_size;
> +			fir_filter(ri, &ci, wi, in_delay, out_delay, coef,
> +				dsm1, cfg->subfilter_length, shift, nch);
> +
> +			wi += nch_x_odm;
> +			if (wi >= fir->out_delay_size)
> +				wi -= fir->out_delay_size;
> +
> +			ri += nch_x_idm; /* Next sub-filter start */
> +			if (ri >= fir->fir_delay_size)
> +				ri -= fir->fir_delay_size;
>  		}
>  
>  		/* Output */
> -		m = s->y_inc * cfg->num_of_subfilters;
> +		m = blk_out_bytes;
>  		while (m > 0) {
> -			n_wrap_fir = (fir->out_delay_size - fir->out_ri)
> -				* s->y_inc;
> +			n_wrap_fir = (fir->out_delay_size - fir->out_ri) * sz;
>  			n_wrap_buf = s->y_end_addr - s->y_wptr;
>  			n_wrap_min = (n_wrap_fir < n_wrap_buf)
>  				? n_wrap_fir : n_wrap_buf;
>  			n_min = (m < n_wrap_min) ? m : n_wrap_min;
>  			while (n_min > 0) {
> -				*s->y_wptr = fir->out_delay[fir->out_ri++];
> -				s->y_wptr += s->y_inc;
> -				n_min -= s->y_inc;
> -				m -= s->y_inc;
> +				*s->y_wptr = fir->out_delay[fir->out_ri++] >> 8;
> +				s->y_wptr++;
> +				n_min -= sz;
> +				m -= sz;
>  			}
>  			/* Check wrap */
>  			src_circ_inc_wrap(&s->y_wptr, s->y_end_addr, s->y_size);
> @@ -604,47 +661,5 @@ void src_polyphase_stage_cir_s24(struct src_stage_prm * s)
>  				fir->out_ri = 0;
>  		}
>  	}
> -}
> -
>  
> -#ifdef MODULE_TEST
> -
> -void src_print_info(struct polyphase_src * src)
> -{
> -
> -	int n1;
> -	int n2;
> -
> -	n1 = src->stage1->filter_length;
> -	n2 = src->stage2->filter_length;
> -	printf("SRC stages %d\n", src->number_of_stages);
> -	printf("SRC input blk %d\n", src->blk_in);
> -	printf("SRC output blk %d\n", src->blk_out);
> -	printf("SRC stage1 %d times\n", src->stage1_times);
> -	printf("SRC stage2 %d times\n", src->stage2_times);
> -
> -	printf("SRC1 filter length %d\n", n1);
> -	printf("SRC1 subfilter length %d\n", src->stage1->subfilter_length);
> -	printf("SRC1 number of subfilters %d\n",
> -		src->stage1->num_of_subfilters);
> -	printf("SRC1 idm %d\n", src->stage1->idm);
> -	printf("SRC1 odm %d\n", src->stage1->odm);
> -	printf("SRC1 input blk %d\n", src->stage1->blk_in);
> -	printf("SRC1 output blk %d\n", src->stage1->blk_out);
> -	printf("SRC1 halfband %d\n", src->stage1->halfband);
> -	printf("SRC1 FIR delay %d\n", src->state1.fir_delay_size);
> -	printf("SRC1 out delay %d\n", src->state1.out_delay_size);
> -
> -	printf("SRC2 filter length %d\n", n2);
> -	printf("SRC2 subfilter length %d\n", src->stage2->subfilter_length);
> -	printf("SRC2 number of subfilters %d\n",
> -		src->stage2->num_of_subfilters);
> -	printf("SRC2 idm %d\n", src->stage2->idm);
> -	printf("SRC2 odm %d\n", src->stage2->odm);
> -	printf("SRC2 input blk %d\n", src->stage2->blk_in);
> -	printf("SRC2 output blk %d\n", src->stage2->blk_out);
> -	printf("SRC2 halfband %d\n", src->stage2->halfband);
> -	printf("SRC2 FIR delay %d\n", src->state2.fir_delay_size);
> -	printf("SRC2 out delay %d\n", src->state2.out_delay_size);
>  }
> -#endif
> diff --git a/src/audio/src_core.h b/src/audio/src_core.h
> index 3859e6f..3ea6028 100644
> --- a/src/audio/src_core.h
> +++ b/src/audio/src_core.h
> @@ -41,7 +41,7 @@ struct src_param {
>  	int out_s1;
>  	int out_s2;
>  	int sbuf_length;
> -	int single_src;
> +	int src_multich;
>  	int total;
>  	int blk_in;
>  	int blk_out;
> @@ -51,6 +51,7 @@ struct src_param {
>  	int stage2_times_max;
>  	int idx_in;
>  	int idx_out;
> +	int nch;
>  };
>  
>  struct src_stage {
> @@ -70,20 +71,13 @@ struct src_state {
>  	int fir_delay_size;
>  	int out_delay_size;
>  	int fir_wi;
> -	int fir_ri;
> -	int out_wi;
>  	int out_ri;
>  	int32_t *fir_delay;
>  	int32_t *out_delay;
>  };
>  
>  struct polyphase_src {
> -	int mute;
>  	int number_of_stages;
> -	int blk_in;
> -	int blk_out;
> -	int stage1_times;
> -	int stage2_times;
>  	struct src_stage *stage1;
>  	struct src_stage *stage2;
>  	struct src_state state1;
> @@ -91,15 +85,14 @@ struct polyphase_src {
>  };
>  
>  struct src_stage_prm {
> +	int nch;
>  	int times;
>  	int32_t *x_rptr;
>  	int32_t *x_end_addr;
>  	size_t x_size;
> -	int x_inc;
>  	int32_t *y_wptr;
>  	int32_t *y_end_addr;
>  	size_t y_size;
> -	int y_inc;
>  	struct src_state *state;
>  	struct src_stage *stage;
>  };
> @@ -116,31 +109,6 @@ static inline void src_circ_dec_wrap(int32_t **ptr, int32_t *addr, size_t size)
>  		*ptr = (int32_t *) ((size_t) * ptr + size);
>  }
>  
> -static inline void src_polyphase_mute(struct polyphase_src *src)
> -{
> -	src->mute = 1;
> -}
> -
> -static inline void src_polyphase_unmute(struct polyphase_src *src)
> -{
> -	src->mute = 0;
> -}
> -
> -static inline int src_polyphase_getmute(struct polyphase_src *src)
> -{
> -	return src->mute;
> -}
> -
> -static inline int src_polyphase_get_blk_in(struct polyphase_src *src)
> -{
> -	return src->blk_in;
> -}
> -
> -static inline int src_polyphase_get_blk_out(struct polyphase_src *src)
> -{
> -	return src->blk_out;
> -}
> -
>  void src_polyphase_reset(struct polyphase_src *src);
>  
>  int src_polyphase_init(struct polyphase_src *src, struct src_param *p,
> @@ -162,8 +130,4 @@ int32_t src_output_rates(void);
>  
>  int src_ceil_divide(int a, int b);
>  
> -#ifdef MODULE_TEST
> -void src_print_info(struct polyphase_src *src);
> -#endif
> -
>  #endif