From: Doug Chapman <dchapman@redhat.com> Date: Thu, 6 Dec 2007 16:14:01 -0500 Subject: [cpufreq] rewrite lock to eliminate hotplug issues Message-id: 1196975641.1083.3.camel@athlon O-Subject: [RHEL 5.2 PATCH 1/5] hang at 'Disabling ondemand cpu frequency scaling' Bugzilla: 253416 Patch 1/5 for BZ 253416 Backport of: commit 5a01f2e8f3ac134e24144d74bb48a60236f7024d Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Mon Feb 5 16:12:44 2007 -0800 [CPUFREQ] Rewrite lock in cpufreq to eliminate cpufreq/hotplug related issues Yet another attempt to resolve cpufreq and hotplug locking issues. Patchset has 3 patches: * Rewrite the lock infrastructure of cpufreq using a per cpu rwsem. * Minor restructuring of work callback in ondemand driver. * Use the new cpufreq rwsem infrastructure in ondemand work. This patch: Convert policy->lock to rwsem and move it to per_cpu area. This rwsem will protect against both changing/accessing policy related parameters and CPU hot plug/unplug. Acked-by: Brian Maly <bmaly@redhat.com> Acked-by: Prarit Bhargava <prarit@redhat.com> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b3df613..66b274d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -40,8 +40,67 @@ static struct cpufreq_driver *cpufreq_driver; static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS]; static DEFINE_SPINLOCK(cpufreq_driver_lock); +/* + * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure + * all cpufreq/hotplug/workqueue/etc related lock issues. + * + * The rules for this semaphore: + * - Any routine that wants to read from the policy structure will + * do a down_read on this semaphore. + * - Any routine that will write to the policy structure and/or may take away + * the policy altogether (eg. CPU hotplug), will hold this lock in write + * mode before doing so. + * + * Additional rules: + * - All holders of the lock should check to make sure that the CPU they + * are concerned with are online after they get the lock. + * - Governor routines that can be called in cpufreq hotplug path should not + * take this sem as top level hotplug notifier handler takes this. + */ +static DEFINE_PER_CPU(int, policy_cpu); +static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem); + +#define lock_policy_rwsem(mode, cpu) \ +int lock_policy_rwsem_##mode \ +(int cpu) \ +{ \ + int policy_cpu = per_cpu(policy_cpu, cpu); \ + BUG_ON(policy_cpu == -1); \ + down_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ + if (unlikely(!cpu_online(cpu))) { \ + up_##mode(&per_cpu(cpu_policy_rwsem, policy_cpu)); \ + return -1; \ + } \ + \ + return 0; \ +} + +lock_policy_rwsem(read, cpu); +EXPORT_SYMBOL_GPL(lock_policy_rwsem_read); + +lock_policy_rwsem(write, cpu); +EXPORT_SYMBOL_GPL(lock_policy_rwsem_write); + +void unlock_policy_rwsem_read(int cpu) +{ + int policy_cpu = per_cpu(policy_cpu, cpu); + BUG_ON(policy_cpu == -1); + up_read(&per_cpu(cpu_policy_rwsem, policy_cpu)); +} +EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read); + +void unlock_policy_rwsem_write(int cpu) +{ + int policy_cpu = per_cpu(policy_cpu, cpu); + BUG_ON(policy_cpu == -1); + up_write(&per_cpu(cpu_policy_rwsem, policy_cpu)); +} +EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write); + + /* internal prototypes */ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); +static unsigned int __cpufreq_get(unsigned int cpu); static void handle_update(void *data); /** @@ -395,10 +454,8 @@ static ssize_t store_##file_name \ return -EINVAL; \ \ lock_cpu_hotplug(); \ - mutex_lock(&policy->lock); \ ret = __cpufreq_set_policy(policy, &new_policy); \ policy->user_policy.object = policy->object; \ - mutex_unlock(&policy->lock); \ unlock_cpu_hotplug(); \ \ return ret ? ret : count; \ @@ -412,7 +469,7 @@ store_one(scaling_max_freq,max); */ static ssize_t show_cpuinfo_cur_freq (struct cpufreq_policy * policy, char *buf) { - unsigned int cur_freq = cpufreq_get(policy->cpu); + unsigned int cur_freq = __cpufreq_get(policy->cpu); if (!cur_freq) return sprintf(buf, "<unknown>"); return sprintf(buf, "%u\n", cur_freq); @@ -459,12 +516,10 @@ static ssize_t store_scaling_governor (struct cpufreq_policy * policy, /* Do not use cpufreq_set_policy here or the user_policy.max will be wrongly overridden */ - mutex_lock(&policy->lock); ret = __cpufreq_set_policy(policy, &new_policy); policy->user_policy.policy = policy->policy; policy->user_policy.governor = policy->governor; - mutex_unlock(&policy->lock); unlock_cpu_hotplug(); @@ -568,7 +623,14 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf) policy = cpufreq_cpu_get(policy->cpu); if (!policy) return -EINVAL; + + if (lock_policy_rwsem_read(policy->cpu) < 0) + return -EINVAL; + ret = fattr->show ? fattr->show(policy,buf) : -EIO; + + unlock_policy_rwsem_read(policy->cpu); + cpufreq_cpu_put(policy); return ret; } @@ -582,7 +644,14 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr, policy = cpufreq_cpu_get(policy->cpu); if (!policy) return -EINVAL; + + if (lock_policy_rwsem_write(policy->cpu) < 0) + return -EINVAL; + ret = fattr->store ? fattr->store(policy,buf,count) : -EIO; + + unlock_policy_rwsem_write(policy->cpu); + cpufreq_cpu_put(policy); return ret; } @@ -656,8 +725,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) policy->cpu = cpu; policy->cpus = cpumask_of_cpu(cpu); - mutex_init(&policy->lock); - mutex_lock(&policy->lock); + /* Initially set CPU itself as the policy_cpu */ + per_cpu(policy_cpu, cpu) = cpu; + lock_policy_rwsem_write(cpu); + init_completion(&policy->kobj_unregister); INIT_WORK(&policy->update, handle_update, (void *)(long)cpu); @@ -667,7 +738,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) ret = cpufreq_driver->init(policy); if (ret) { dprintk("initialization failed\n"); - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out; } @@ -681,6 +752,14 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) */ managed_policy = cpufreq_cpu_get(j); if (unlikely(managed_policy)) { + + /* Set proper policy_cpu */ + unlock_policy_rwsem_write(cpu); + per_cpu(policy_cpu, cpu) = managed_policy->cpu; + + if (lock_policy_rwsem_write(cpu) < 0) + goto err_out_driver_exit; + spin_lock_irqsave(&cpufreq_driver_lock, flags); managed_policy->cpus = policy->cpus; cpufreq_cpu_data[cpu] = managed_policy; @@ -691,8 +770,8 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) &managed_policy->kobj, "cpufreq"); cpufreq_debug_enable_ratelimit(); - mutex_unlock(&policy->lock); ret = 0; + unlock_policy_rwsem_write(cpu); goto err_out_driver_exit; /* call driver->exit() */ } } @@ -706,7 +785,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) ret = kobject_register(&policy->kobj); if (ret) { - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); goto err_out_driver_exit; } /* set up files for this cpu device */ @@ -721,8 +800,10 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); spin_lock_irqsave(&cpufreq_driver_lock, flags); - for_each_cpu_mask(j, policy->cpus) + for_each_cpu_mask(j, policy->cpus) { cpufreq_cpu_data[j] = policy; + per_cpu(policy_cpu, j) = policy->cpu; + } spin_unlock_irqrestore(&cpufreq_driver_lock, flags); /* symlink affected CPUs */ @@ -741,7 +822,7 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) policy->governor = NULL; /* to assure that the starting sequence is * run in cpufreq_set_policy */ - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(cpu); /* set default policy */ ret = cpufreq_set_policy(&new_policy); @@ -782,11 +863,13 @@ module_out: /** - * cpufreq_remove_dev - remove a CPU device + * __cpufreq_remove_dev - remove a CPU device * * Removes the cpufreq interface for a CPU device. + * Caller should already have policy_rwsem in write mode for this CPU. + * This routine frees the rwsem before returning. */ -static int cpufreq_remove_dev (struct sys_device * sys_dev) +static int __cpufreq_remove_dev (struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long flags; @@ -805,6 +888,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) if (!data) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return -EINVAL; } cpufreq_cpu_data[cpu] = NULL; @@ -821,6 +905,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) sysfs_remove_link(&sys_dev->kobj, "cpufreq"); cpufreq_cpu_put(data); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return 0; } #endif @@ -829,6 +914,7 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) if (!kobject_get(&data->kobj)) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); cpufreq_debug_enable_ratelimit(); + unlock_policy_rwsem_write(cpu); return -EFAULT; } @@ -862,10 +948,10 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) spin_unlock_irqrestore(&cpufreq_driver_lock, flags); #endif - mutex_lock(&data->lock); if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); - mutex_unlock(&data->lock); + + unlock_policy_rwsem_write(cpu); kobject_unregister(&data->kobj); @@ -888,6 +974,17 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) return 0; } +static int cpufreq_remove_dev (struct sys_device * sys_dev) +{ + unsigned int cpu = sys_dev->id; + int retval; + if (unlikely(lock_policy_rwsem_write(cpu))) + BUG(); + + retval = __cpufreq_remove_dev(sys_dev); + return retval; +} + static void handle_update(void *data) { @@ -933,9 +1030,12 @@ unsigned int cpufreq_quick_get(unsigned int cpu) unsigned int ret = 0; if (policy) { - mutex_lock(&policy->lock); + if (unlikely(lock_policy_rwsem_read(cpu))) + return ret; + ret = policy->cur; - mutex_unlock(&policy->lock); + + unlock_policy_rwsem_read(cpu); cpufreq_cpu_put(policy); } @@ -944,24 +1044,13 @@ unsigned int cpufreq_quick_get(unsigned int cpu) EXPORT_SYMBOL(cpufreq_quick_get); -/** - * cpufreq_get - get the current CPU frequency (in kHz) - * @cpu: CPU number - * - * Get the CPU current (static) CPU frequency - */ -unsigned int cpufreq_get(unsigned int cpu) +static unsigned int __cpufreq_get(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy = cpufreq_cpu_data[cpu]; unsigned int ret = 0; - if (!policy) - return 0; - if (!cpufreq_driver->get) - goto out; - - mutex_lock(&policy->lock); + return (ret); ret = cpufreq_driver->get(cpu); @@ -973,16 +1062,40 @@ unsigned int cpufreq_get(unsigned int cpu) } } - mutex_unlock(&policy->lock); + return (ret); +} -out: - cpufreq_cpu_put(policy); - return (ret); +/** + * cpufreq_get - get the current CPU frequency (in kHz) + * @cpu: CPU number + * + * Get the CPU current (static) CPU frequency + */ +unsigned int cpufreq_get(unsigned int cpu) +{ + unsigned int ret_freq = 0; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + + if (!policy) + goto out; + + if (unlikely(lock_policy_rwsem_read(cpu))) + goto out_policy; + + ret_freq = __cpufreq_get(cpu); + + unlock_policy_rwsem_read(cpu); + +out_policy: + cpufreq_cpu_put(policy); +out: + return (ret_freq); } EXPORT_SYMBOL(cpufreq_get); + /** * cpufreq_suspend - let the low level driver prepare for suspend */ @@ -1256,11 +1369,12 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, return -EINVAL; lock_cpu_hotplug(); - mutex_lock(&policy->lock); + if (unlikely(lock_policy_rwsem_write(policy->cpu))) + return -EINVAL; ret = __cpufreq_driver_target(policy, target_freq, relation); - mutex_unlock(&policy->lock); + unlock_policy_rwsem_write(policy->cpu); unlock_cpu_hotplug(); cpufreq_cpu_put(policy); @@ -1348,9 +1462,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) if (!cpu_policy) return -EINVAL; - mutex_lock(&cpu_policy->lock); memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy)); - mutex_unlock(&cpu_policy->lock); cpufreq_cpu_put(cpu_policy); return 0; @@ -1462,8 +1574,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) lock_cpu_hotplug(); - /* lock this CPU */ - mutex_lock(&data->lock); + if (unlikely(lock_policy_rwsem_write(policy->cpu))) + return -EINVAL; + ret = __cpufreq_set_policy(data, policy); data->user_policy.min = data->min; @@ -1471,7 +1584,7 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) data->user_policy.policy = data->policy; data->user_policy.governor = data->governor; - mutex_unlock(&data->lock); + unlock_policy_rwsem_write(policy->cpu); unlock_cpu_hotplug(); cpufreq_cpu_put(data); @@ -1498,7 +1611,8 @@ int cpufreq_update_policy(unsigned int cpu) return -ENODEV; lock_cpu_hotplug(); - mutex_lock(&data->lock); + if (unlikely(lock_policy_rwsem_write(cpu))) + return -EINVAL; dprintk("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); @@ -1522,7 +1636,8 @@ int cpufreq_update_policy(unsigned int cpu) ret = __cpufreq_set_policy(data, &policy); - mutex_unlock(&data->lock); + unlock_policy_rwsem_write(cpu); + unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; @@ -1551,14 +1666,18 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, * hardware-managed P-State to switch other related * threads to min or higher speeds if possible. */ + if (unlikely(lock_policy_rwsem_write(cpu))) + BUG(); + policy = cpufreq_cpu_data[cpu]; if (policy) { - cpufreq_driver_target(policy, policy->min, + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_H); } + __cpufreq_remove_dev(sys_dev); break; - case CPU_DEAD: - cpufreq_remove_dev(sys_dev); + case CPU_DOWN_FAILED: + cpufreq_add_dev(sys_dev); break; } } @@ -1671,3 +1790,16 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) return 0; } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); + +static int __init cpufreq_core_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(policy_cpu, cpu) = -1; + init_rwsem(&per_cpu(cpu_policy_rwsem, cpu)); + } + return 0; +} + +core_initcall(cpufreq_core_init); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4ea39fe..84fc92e 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -84,9 +84,6 @@ struct cpufreq_policy { unsigned int policy; /* see above */ struct cpufreq_governor *governor; /* see below */ - struct mutex lock; /* CPU ->setpolicy or ->target may - only be called once a time */ - struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ @@ -175,6 +172,11 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); +int lock_policy_rwsem_read(int cpu); +int lock_policy_rwsem_write(int cpu); +void unlock_policy_rwsem_read(int cpu); +void unlock_policy_rwsem_write(int cpu); + /********************************************************************* * CPUFREQ DRIVER INTERFACE *