[RFC PATCH] soundwire: use driver callbacks directly with proper locking
Vinod Koul
vkoul at kernel.org
Tue Apr 12 12:53:27 CEST 2022
On 07-04-22, 17:39, Pierre-Louis Bossart wrote:
> In the SoundWire probe, we store a pointer from the driver ops into
> the 'slave' structure. This can lead to kernel oopses when unbinding
> codec drivers, e.g. with the following sequence to remove machine
> driver and codec driver.
>
> /sbin/modprobe -r snd_soc_sof_sdw
> /sbin/modprobe -r snd_soc_rt711
>
> The full details can be found in the BugLink below, for reference the
> two following examples show different cases of driver ops/callbacks
> being invoked after the driver .remove().
>
> kernel: BUG: kernel NULL pointer dereference, address: 0000000000000150
> kernel: Workqueue: events cdns_update_slave_status_work [soundwire_cadence]
> kernel: RIP: 0010:mutex_lock+0x19/0x30
> kernel: Call Trace:
> kernel: ? sdw_handle_slave_status+0x426/0xe00 [soundwire_bus 94ff184bf398570c3f8ff7efe9e32529f532e4ae]
> kernel: ? newidle_balance+0x26a/0x400
> kernel: ? cdns_update_slave_status_work+0x1e9/0x200 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82]
>
> kernel: BUG: unable to handle page fault for address: ffffffffc07654c8
> kernel: Workqueue: pm pm_runtime_work
> kernel: RIP: 0010:sdw_bus_prep_clk_stop+0x6f/0x160 [soundwire_bus]
> kernel: Call Trace:
> kernel: <TASK>
> kernel: sdw_cdns_clock_stop+0xb5/0x1b0 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82]
> kernel: intel_suspend_runtime+0x5f/0x120 [soundwire_intel aca858f7c87048d3152a4a41bb68abb9b663a1dd]
> kernel: ? dpm_sysfs_remove+0x60/0x60
>
> This was not detected earlier in Intel tests since the tests first
> remove the parent PCI device and shut down the bus. The sequence
> above is a corner case which keeps the bus operational but without a
> driver bound.
>
> This patch removes the use the 'slave' ops and uses proper locking to
> make sure there are no live callbacks based on a dangling pointer
> executed during or after the driver unbinding sequence. In one
> specific case, indicated with comments in the code, the device lock is
> already taken at a higher level when starting the resume operations.
>
> The issue with the ops pointer has been there since December 2017, but
> there were so many changes in the bus handling code that this patch
> will not apply cleanly all the way to this initial commit. The changes
> can be easily backported though.
>
> Thanks to Dan Williams for his suggestions on an earlier version of
> this patch.
>
> BugLink: https://github.com/thesofproject/linux/issues/3531
> Fixes: 56d4fe31af77 ("soundwire: Add MIPI DisCo property helpers")
> Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart at linux.intel.com>
> ---
>
> This is a follow-up on the initial discussion in https://lore.kernel.org/alsa-devel/d0559e97-c4a0-b817-428c-d3e305390270@linux.intel.com/
>
> I could use feedback on whether using device_lock() is appropriate and
Looking at other uses of device_lock() it seems apt to me
> test results on non-Intel platforms. Thanks!
> Pierre
>
> drivers/soundwire/bus.c | 78 ++++++++++++++++++++++++++++--------
> drivers/soundwire/bus_type.c | 6 +--
> drivers/soundwire/stream.c | 57 +++++++++++++++++---------
> 3 files changed, 102 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
> index 8b7a680f388e..545b379a119e 100644
> --- a/drivers/soundwire/bus.c
> +++ b/drivers/soundwire/bus.c
> @@ -7,6 +7,7 @@
> #include <linux/pm_runtime.h>
> #include <linux/soundwire/sdw_registers.h>
> #include <linux/soundwire/sdw.h>
> +#include <linux/soundwire/sdw_type.h>
> #include "bus.h"
> #include "sysfs_local.h"
>
> @@ -846,12 +847,18 @@ static int sdw_slave_clk_stop_callback(struct sdw_slave *slave,
> enum sdw_clk_stop_mode mode,
> enum sdw_clk_stop_type type)
> {
> - int ret;
> + struct device *dev = &slave->dev;
> + struct sdw_driver *drv;
>
> - if (slave->ops && slave->ops->clk_stop) {
> - ret = slave->ops->clk_stop(slave, mode, type);
> - if (ret < 0)
> - return ret;
> + /*
> + * this function can only be called from a pm_runtime
> + * sequence where the device is already locked
> + */
If this is guaranteed..
> +
> + if (dev->driver) {
do we need to check this? Did you find a case where this was not valid
while device is locked, maybe do this while holding the lock (kind of
moot to process the calls if driver is gone)
> + drv = drv_to_sdw_driver(dev->driver);
> + if (drv && drv->ops && drv->ops->clk_stop)
> + return drv->ops->clk_stop(slave, mode, type);
> }
>
> return 0;
> @@ -1616,14 +1623,25 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave)
> }
>
> /* Update the Slave driver */
> - if (slave_notify && slave->ops &&
> - slave->ops->interrupt_callback) {
> - slave_intr.sdca_cascade = sdca_cascade;
> - slave_intr.control_port = clear;
> - memcpy(slave_intr.port, &port_status,
> - sizeof(slave_intr.port));
> -
> - slave->ops->interrupt_callback(slave, &slave_intr);
> + if (slave_notify) {
> + struct device *dev = &slave->dev;
> + struct sdw_driver *drv;
> +
> + device_lock(dev);
device is locked
> +
> + if (dev->driver) {
Same here as well
> + drv = drv_to_sdw_driver(dev->driver);
> + if (drv && drv->ops && drv->ops->interrupt_callback) {
> + slave_intr.sdca_cascade = sdca_cascade;
> + slave_intr.control_port = clear;
> + memcpy(slave_intr.port, &port_status,
> + sizeof(slave_intr.port));
> +
> + drv->ops->interrupt_callback(slave, &slave_intr);
> + }
> + }
> +
> + device_unlock(dev);
> }
>
> /* Ack interrupt */
> @@ -1697,7 +1715,12 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave)
> static int sdw_update_slave_status(struct sdw_slave *slave,
> enum sdw_slave_status status)
> {
> + struct device *dev = &slave->dev;
> + struct sdw_driver *drv;
> unsigned long time;
> + int ret = 0;
> +
> + device_lock_assert(dev);
>
> if (!slave->probed) {
> /*
> @@ -1716,10 +1739,13 @@ static int sdw_update_slave_status(struct sdw_slave *slave,
> }
> }
>
> - if (!slave->ops || !slave->ops->update_status)
> - return 0;
> + if (dev->driver) {
> + drv = drv_to_sdw_driver(dev->driver);
> + if (drv && drv->ops && drv->ops->update_status)
> + ret = drv->ops->update_status(slave, status);
> + }
>
> - return slave->ops->update_status(slave, status);
> + return ret;
> }
>
> /**
> @@ -1828,7 +1854,10 @@ int sdw_handle_slave_status(struct sdw_bus *bus,
> break;
> }
>
> + device_lock(&slave->dev);
> ret = sdw_update_slave_status(slave, status[i]);
> + device_unlock(&slave->dev);
> +
> if (ret < 0)
> dev_err(&slave->dev,
> "Update Slave status failed:%d\n", ret);
> @@ -1860,6 +1889,7 @@ EXPORT_SYMBOL(sdw_handle_slave_status);
> void sdw_clear_slave_status(struct sdw_bus *bus, u32 request)
> {
> struct sdw_slave *slave;
> + bool lock;
> int i;
>
> /* Check all non-zero devices */
> @@ -1878,7 +1908,23 @@ void sdw_clear_slave_status(struct sdw_bus *bus, u32 request)
> if (slave->status != SDW_SLAVE_UNATTACHED) {
> sdw_modify_slave_status(slave, SDW_SLAVE_UNATTACHED);
> slave->first_interrupt_done = false;
> +
> + lock = device_trylock(&slave->dev);
should we proceed if we dont get a lock? also why the trylock variant.
We can do the lock, this is wq context
> +
> + /*
> + * this bus/manager-level function can only be called from
> + * a resume sequence. If the peripheral device (child of the
> + * manager device) is locked, this indicates a resume operation
> + * initiated by the device core to deal with .remove() or .shutdown()
> + * at the peripheral level. With the parent-child order enforced
> + * by PM frameworks on resume, the peripheral resume has not started
> + * yet, so it's safe to assume the lock will not be released while
> + * the update_status callback is invoked.
> + */
> sdw_update_slave_status(slave, SDW_SLAVE_UNATTACHED);
> +
> + if (lock)
> + device_unlock(&slave->dev);
> }
>
> /* keep track of request, used in pm_runtime resume */
> diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c
> index 893296f3fe39..91f39c8c119a 100644
> --- a/drivers/soundwire/bus_type.c
> +++ b/drivers/soundwire/bus_type.c
> @@ -98,8 +98,6 @@ static int sdw_drv_probe(struct device *dev)
> if (!id)
> return -ENODEV;
>
> - slave->ops = drv->ops;
> -
> /*
> * attach to power domain but don't turn on (last arg)
> */
> @@ -118,8 +116,8 @@ static int sdw_drv_probe(struct device *dev)
> }
>
> /* device is probed so let's read the properties now */
> - if (slave->ops && slave->ops->read_prop)
> - slave->ops->read_prop(slave);
> + if (drv->ops && drv->ops->read_prop)
> + drv->ops->read_prop(slave);
>
> /* init the sysfs as we have properties now */
> ret = sdw_slave_sysfs_init(slave);
> diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
> index f273459b2023..7862b4403d14 100644
> --- a/drivers/soundwire/stream.c
> +++ b/drivers/soundwire/stream.c
> @@ -13,6 +13,7 @@
> #include <linux/slab.h>
> #include <linux/soundwire/sdw_registers.h>
> #include <linux/soundwire/sdw.h>
> +#include <linux/soundwire/sdw_type.h>
> #include <sound/soc.h>
> #include "bus.h"
>
> @@ -401,20 +402,26 @@ static int sdw_do_port_prep(struct sdw_slave_runtime *s_rt,
> struct sdw_prepare_ch prep_ch,
> enum sdw_port_prep_ops cmd)
> {
> - const struct sdw_slave_ops *ops = s_rt->slave->ops;
> - int ret;
> + struct device *dev = &s_rt->slave->dev;
> + struct sdw_driver *drv;
> + int ret = 0;
>
> - if (ops->port_prep) {
> - ret = ops->port_prep(s_rt->slave, &prep_ch, cmd);
> - if (ret < 0) {
> - dev_err(&s_rt->slave->dev,
> - "Slave Port Prep cmd %d failed: %d\n",
> - cmd, ret);
> - return ret;
> + device_lock(dev);
> +
> + if (dev->driver) {
> + drv = drv_to_sdw_driver(dev->driver);
> + if (drv && drv->ops && drv->ops->port_prep) {
> + ret = drv->ops->port_prep(s_rt->slave, &prep_ch, cmd);
> + if (ret < 0)
> + dev_err(&s_rt->slave->dev,
> + "Slave Port Prep cmd %d failed: %d\n",
> + cmd, ret);
> }
> }
>
> - return 0;
> + device_unlock(dev);
> +
> + return ret;
> }
>
> static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
> @@ -578,7 +585,7 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt)
> struct sdw_slave_runtime *s_rt;
> struct sdw_bus *bus = m_rt->bus;
> struct sdw_slave *slave;
> - int ret = 0;
> + int ret;
>
> if (bus->ops->set_bus_conf) {
> ret = bus->ops->set_bus_conf(bus, &bus->params);
> @@ -587,19 +594,31 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt)
> }
>
> list_for_each_entry(s_rt, &m_rt->slave_rt_list, m_rt_node) {
> - slave = s_rt->slave;
> + struct sdw_driver *drv;
> + struct device *dev;
>
> - if (slave->ops->bus_config) {
> - ret = slave->ops->bus_config(slave, &bus->params);
> - if (ret < 0) {
> - dev_err(bus->dev, "Notify Slave: %d failed\n",
> - slave->dev_num);
> - return ret;
> + slave = s_rt->slave;
> + dev = &slave->dev;
> +
> + device_lock(dev);
> +
> + if (dev->driver) {
> + drv = drv_to_sdw_driver(dev->driver);
> + if (drv && drv->ops && drv->ops->bus_config) {
> + ret = drv->ops->bus_config(slave, &bus->params);
> + if (ret < 0) {
> + device_unlock(dev);
> + dev_err(bus->dev, "Notify Slave: %d failed\n",
> + slave->dev_num);
> + return ret;
> + }
> }
> }
> +
> + device_unlock(dev);
> }
>
> - return ret;
> + return 0;
> }
>
> /**
> --
> 2.30.2
--
~Vinod
More information about the Alsa-devel
mailing list