@ -13,22 +13,17 @@
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/smp.h>
# include <linux/init.h>
# include <linux/interrupt.h>
# include <linux/ctype.h>
# include <linux/cpufreq.h>
# include <linux/sysctl.h>
# include <linux/types.h>
# include <linux/fs.h>
# include <linux/sysfs.h>
# include <linux/cpu.h>
# include <linux/kmod.h>
# include <linux/workqueue.h>
# include <linux/jiffies.h>
# include <linux/kernel_stat.h>
# include <linux/percpu.h>
# include <linux/mutex.h>
# include <linux/hrtimer.h>
# include <linux/tick.h>
# include <linux/ktime.h>
# include <linux/sched.h>
/*
* dbs is used in this file as a shortform for demandbased switching
* It helps to keep variable names smaller , simpler
@ -43,14 +38,14 @@
* latency of the processor . The governor will work on any processor with
* transition latency <= 10 ms, using an appropriate sampling
* rate.
* For CPUs with transition latency > 10 ms (mostly drivers with
* CPUFREQ_ETERNAL), this governor will not work.
* All times here are in uS .
*/
static unsigned int def_sampling_rate;
#define MIN_SAMPLING_RATE_RATIO			(2)
/* for correct statistics, we need at least 10 ticks between each measure */
#define MIN_STAT_SAMPLING_RATE			\
	(MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
/* lowest rate derived from the default: def_sampling_rate / ratio */
#define MIN_SAMPLING_RATE			\
	(def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
@ -75,12 +70,15 @@ static unsigned int minimum_sampling_rate(void)
static void do_dbs_timer ( struct work_struct * work ) ;
/*
 * Per-CPU governor state; instantiated once per CPU through
 * DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info).
 */
struct cpu_dbs_info_s {
	/* idle/wall snapshots taken from get_cpu_idle_time() at the last sample */
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_wall;
	/* kstat nice time at the last sample, recorded when ignore_nice is set */
	cputime64_t prev_cpu_nice;
	struct cpufreq_policy *cur_policy;
	/*
	 * NOTE(review): these two counters are still referenced by some lines
	 * of this file next to prev_cpu_idle; confirm whether they can be
	 * dropped once every user has moved to prev_cpu_idle/prev_cpu_wall.
	 */
	unsigned int prev_cpu_idle_up;
	unsigned int prev_cpu_idle_down;
	/* per-CPU sampling work; its handler is do_dbs_timer() */
	struct delayed_work work;
	/* samples seen since the last reset; compared to sampling_down_factor */
	unsigned int down_skip;
	unsigned int requested_freq;
	/* CPU this work item is queued on and whose policy rwsem is taken */
	int cpu;
	/* declared once only: cleared by dbs_timer_exit(), tested by do_dbs_timer() */
	unsigned int enable:1;
};
static DEFINE_PER_CPU ( struct cpu_dbs_info_s , cpu_dbs_info ) ;
@ -95,18 +93,17 @@ static unsigned int dbs_enable; /* number of CPUs using this policy */
* is recursive for the same process . - Venki
*/
static DEFINE_MUTEX ( dbs_mutex ) ;
static DECLARE_DELAYED_WORK ( dbs_work , do_dbs_timer ) ;
struct dbs_tuners {
static struct workqueue_struct * kconservative_wq ;
static struct dbs_tuners {
unsigned int sampling_rate ;
unsigned int sampling_down_factor ;
unsigned int up_threshold ;
unsigned int down_threshold ;
unsigned int ignore_nice ;
unsigned int freq_step ;
} ;
static struct dbs_tuners dbs_tuners_ins = {
} dbs_tuners_ins = {
. up_threshold = DEF_FREQUENCY_UP_THRESHOLD ,
. down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD ,
. sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR ,
@ -114,18 +111,37 @@ static struct dbs_tuners dbs_tuners_ins = {
. freq_step = 5 ,
} ;
/*
 * Jiffy-based idle time for @cpu.
 *
 * Idle time is derived as "wall time - busy time", where busy time is the
 * sum of the user, system, irq, softirq, steal and nice kstat counters of
 * @cpu.
 *
 * @cpu:  CPU whose kstat counters are read.
 * @wall: optional out-parameter; when non-NULL it receives the current wall
 *        time (get_jiffies_64() converted to cputime64_t).
 *
 * Returns the computed idle time as cputime64_t.
 */
static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
						  cputime64_t *wall)
{
	cputime64_t idle_time;
	cputime64_t cur_wall_time;
	cputime64_t busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
				  kstat_cpu(cpu).cpustat.system);

	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
	/* nice time is counted as busy here, not as idle */
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);

	idle_time = cputime64_sub(cur_wall_time, busy_time);
	if (wall)
		*wall = cur_wall_time;

	return idle_time;
}
/*
 * Idle time for @cpu, with the matching wall time stored through @wall.
 *
 * get_cpu_idle_time_us() is tried first; if it answers -1ULL the jiffy-based
 * accounting in get_cpu_idle_time_jiffy() is used instead.
 */
static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	u64 idle_us = get_cpu_idle_time_us(cpu, wall);

	if (idle_us != -1ULL)
		return idle_us;

	/* -1ULL: no usable answer from the first source, fall back */
	return get_cpu_idle_time_jiffy(cpu, wall);
}
/* keep track of frequency transitions */
@ -186,8 +202,8 @@ static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
return sprintf ( buf , " %u \n " , MIN_SAMPLING_RATE ) ;
}
/*
 * Declare a freq_attr called _name with mode 0444, whose show handler is
 * show_<_name> and which has no store handler (NULL).
 */
#define define_one_ro(_name)		\
static struct freq_attr _name =		\
__ATTR(_name, 0444, show_##_name, NULL)
define_one_ro ( sampling_rate_max ) ;
@ -213,6 +229,7 @@ static ssize_t store_sampling_down_factor(struct cpufreq_policy *unused,
unsigned int input ;
int ret ;
ret = sscanf ( buf , " %u " , & input ) ;
if ( ret ! = 1 | | input > MAX_SAMPLING_DOWN_FACTOR | | input < 1 )
return - EINVAL ;
@ -230,11 +247,10 @@ static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
int ret ;
ret = sscanf ( buf , " %u " , & input ) ;
mutex_lock ( & dbs_mutex ) ;
if ( ret ! = 1 ) {
mutex_unlock ( & dbs_mutex ) ;
if ( ret ! = 1 )
return - EINVAL ;
}
mutex_lock ( & dbs_mutex ) ;
dbs_tuners_ins . sampling_rate = max ( input , minimum_sampling_rate ( ) ) ;
mutex_unlock ( & dbs_mutex ) ;
@ -250,7 +266,7 @@ static ssize_t store_up_threshold(struct cpufreq_policy *unused,
mutex_lock ( & dbs_mutex ) ;
if ( ret ! = 1 | | input > 100 | |
input < = dbs_tuners_ins . down_threshold ) {
input < = dbs_tuners_ins . down_threshold ) {
mutex_unlock ( & dbs_mutex ) ;
return - EINVAL ;
}
@ -269,7 +285,9 @@ static ssize_t store_down_threshold(struct cpufreq_policy *unused,
ret = sscanf ( buf , " %u " , & input ) ;
mutex_lock ( & dbs_mutex ) ;
if ( ret ! = 1 | | input > 100 | | input > = dbs_tuners_ins . up_threshold ) {
/* cannot be lower than 11 otherwise freq will not fall */
if ( ret ! = 1 | | input < 11 | | input > 100 | |
input > = dbs_tuners_ins . up_threshold ) {
mutex_unlock ( & dbs_mutex ) ;
return - EINVAL ;
}
@ -302,12 +320,14 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
}
dbs_tuners_ins . ignore_nice = input ;
/* we need to re-evaluate prev_cpu_idle_up and prev_cpu_idle_down */
/* we need to re-evaluate prev_cpu_idle */
for_each_online_cpu ( j ) {
struct cpu_dbs_info_s * j_dbs_info ;
j_dbs_info = & per_cpu ( cpu_dbs_info , j ) ;
j_dbs_info - > prev_cpu_idle_up = get_cpu_idle_time ( j ) ;
j_dbs_info - > prev_cpu_idle_down = j_dbs_info - > prev_cpu_idle_up ;
struct cpu_dbs_info_s * dbs_info ;
dbs_info = & per_cpu ( cpu_dbs_info , j ) ;
dbs_info - > prev_cpu_idle = get_cpu_idle_time ( j ,
& dbs_info - > prev_cpu_wall ) ;
if ( dbs_tuners_ins . ignore_nice )
dbs_info - > prev_cpu_nice = kstat_cpu ( j ) . cpustat . nice ;
}
mutex_unlock ( & dbs_mutex ) ;
@ -319,7 +339,6 @@ static ssize_t store_freq_step(struct cpufreq_policy *policy,
{
unsigned int input ;
int ret ;
ret = sscanf ( buf , " %u " , & input ) ;
if ( ret ! = 1 )
@ -367,55 +386,78 @@ static struct attribute_group dbs_attr_group = {
/************************** sysfs end ************************/
static void dbs_check_cpu ( int cpu )
static void dbs_check_cpu ( struct cpu_dbs_info_s * this_dbs_info )
{
unsigned int idle_ticks , up_idle_ticks , down_idle_ticks ;
unsigned int tmp_idle_ticks , total_idle_ticks ;
unsigned int load = 0 ;
unsigned int freq_target ;
unsigned int freq_down_sampling_rate ;
struct cpu_dbs_info_s * this_dbs_info = & per_cpu ( cpu_dbs_info , cpu ) ;
struct cpufreq_policy * policy ;
if ( ! this_dbs_info - > enable )
return ;
struct cpufreq_policy * policy ;
unsigned int j ;
policy = this_dbs_info - > cur_policy ;
/*
 * The default safe range is 20% to 80%
 * Every sampling_rate, we check, if current idle time is less
 * than 20% (default), then we try to increase frequency
 * Every sampling_rate * sampling_down_factor, we check, if current
 * idle time is more than 80%, then we try to decrease frequency
 *
 * Any frequency increase takes it to the maximum frequency.
 * Frequency reduction happens at minimum steps of
 * 5% (default) of maximum frequency
 */
/* Check for frequency increase */
idle_ticks = UINT_MAX ;
/* Get Absolute Load */
for_each_cpu ( j , policy - > cpus ) {
struct cpu_dbs_info_s * j_dbs_info ;
cputime64_t cur_wall_time , cur_idle_time ;
unsigned int idle_time , wall_time ;
/* Check for frequency increase */
total_idle_ticks = get_cpu_idle_time ( cpu ) ;
tmp_idle_ticks = total_idle_ticks -
this_dbs_info - > prev_cpu_idle_up ;
this_dbs_info - > prev_cpu_idle_up = total_idle_ticks ;
j_dbs_info = & per_cpu ( cpu_dbs_info , j ) ;
cur_idle_time = get_cpu_idle_time ( j , & cur_wall_time ) ;
wall_time = ( unsigned int ) cputime64_sub ( cur_wall_time ,
j_dbs_info - > prev_cpu_wall ) ;
j_dbs_info - > prev_cpu_wall = cur_wall_time ;
if ( tmp_idle_ticks < idle_ticks )
idle_ticks = tmp_idle_ticks ;
idle_time = ( unsigned int ) cputime64_sub ( cur_idle_time ,
j_dbs_info - > prev_cpu_idle ) ;
j_dbs_info - > prev_cpu_idle = cur_idle_time ;
/* Scale idle ticks by 100 and compare with up and down ticks */
idle_ticks * = 100 ;
up_idle_ticks = ( 100 - dbs_tuners_ins . up_threshold ) *
usecs_to_jiffies ( dbs_tuners_ins . sampling_rate ) ;
if ( dbs_tuners_ins . ignore_nice ) {
cputime64_t cur_nice ;
unsigned long cur_nice_jiffies ;
cur_nice = cputime64_sub ( kstat_cpu ( j ) . cpustat . nice ,
j_dbs_info - > prev_cpu_nice ) ;
/*
* Assumption : nice time between sampling periods will
* be less than 2 ^ 32 jiffies for 32 bit sys
*/
cur_nice_jiffies = ( unsigned long )
cputime64_to_jiffies64 ( cur_nice ) ;
j_dbs_info - > prev_cpu_nice = kstat_cpu ( j ) . cpustat . nice ;
idle_time + = jiffies_to_usecs ( cur_nice_jiffies ) ;
}
if ( unlikely ( ! wall_time | | wall_time < idle_time ) )
continue ;
load = 100 * ( wall_time - idle_time ) / wall_time ;
}
/*
* break out if we ' cannot ' reduce the speed as the user might
* want freq_step to be zero
*/
if ( dbs_tuners_ins . freq_step = = 0 )
return ;
if ( idle_ticks < up_idle_ticks ) {
/* Check for frequency increase */
if ( load > dbs_tuners_ins . up_threshold ) {
this_dbs_info - > down_skip = 0 ;
this_dbs_info - > prev_cpu_idle_down =
this_dbs_info - > prev_cpu_idle_up ;
/* if we are already at full speed then break out early */
if ( this_dbs_info - > requested_freq = = policy - > max )
@ -436,49 +478,24 @@ static void dbs_check_cpu(int cpu)
return ;
}
/* Check for frequency decrease */
this_dbs_info - > down_skip + + ;
if ( this_dbs_info - > down_skip < dbs_tuners_ins . sampling_down_factor )
return ;
/* Check for frequency decrease */
total_idle_ticks = this_dbs_info - > prev_cpu_idle_up ;
tmp_idle_ticks = total_idle_ticks -
this_dbs_info - > prev_cpu_idle_down ;
this_dbs_info - > prev_cpu_idle_down = total_idle_ticks ;
if ( tmp_idle_ticks < idle_ticks )
idle_ticks = tmp_idle_ticks ;
/* Scale idle ticks by 100 and compare with up and down ticks */
idle_ticks * = 100 ;
this_dbs_info - > down_skip = 0 ;
freq_down_sampling_rate = dbs_tuners_ins . sampling_rate *
dbs_tuners_ins . sampling_down_factor ;
down_idle_ticks = ( 100 - dbs_tuners_ins . down_threshold ) *
usecs_to_jiffies ( freq_down_sampling_rate ) ;
if ( idle_ticks > down_idle_ticks ) {
/*
* if we are already at the lowest speed then break out early
* or if we ' cannot ' reduce the speed as the user might want
* freq_target to be zero
*/
if ( this_dbs_info - > requested_freq = = policy - > min
| | dbs_tuners_ins . freq_step = = 0 )
return ;
/*
* The optimal frequency is the frequency that is the lowest that
* can support the current CPU usage without triggering the up
* policy . To be safe , we focus 10 points under the threshold .
*/
if ( load < ( dbs_tuners_ins . down_threshold - 10 ) ) {
freq_target = ( dbs_tuners_ins . freq_step * policy - > max ) / 100 ;
/* max freq cannot be less than 100. But who knows.... */
if ( unlikely ( freq_target = = 0 ) )
freq_target = 5 ;
this_dbs_info - > requested_freq - = freq_target ;
if ( this_dbs_info - > requested_freq < policy - > min )
this_dbs_info - > requested_freq = policy - > min ;
/*
* if we cannot reduce the frequency anymore , break out early
*/
if ( policy - > cur = = policy - > min )
return ;
__cpufreq_driver_target ( policy , this_dbs_info - > requested_freq ,
CPUFREQ_RELATION_H ) ;
return ;
@ -487,27 +504,45 @@ static void dbs_check_cpu(int cpu)
static void do_dbs_timer ( struct work_struct * work )
{
int i ;
mutex_lock ( & dbs_mutex ) ;
for_each_online_cpu ( i )
dbs_check_cpu ( i ) ;
schedule_delayed_work ( & dbs_work ,
usecs_to_jiffies ( dbs_tuners_ins . sampling_rate ) ) ;
mutex_unlock ( & dbs_mutex ) ;
struct cpu_dbs_info_s * dbs_info =
container_of ( work , struct cpu_dbs_info_s , work . work ) ;
unsigned int cpu = dbs_info - > cpu ;
/* We want all CPUs to do sampling nearly on same jiffy */
int delay = usecs_to_jiffies ( dbs_tuners_ins . sampling_rate ) ;
delay - = jiffies % delay ;
if ( lock_policy_rwsem_write ( cpu ) < 0 )
return ;
if ( ! dbs_info - > enable ) {
unlock_policy_rwsem_write ( cpu ) ;
return ;
}
dbs_check_cpu ( dbs_info ) ;
queue_delayed_work_on ( cpu , kconservative_wq , & dbs_info - > work , delay ) ;
unlock_policy_rwsem_write ( cpu ) ;
}
static inline void dbs_timer_init ( void )
static inline void dbs_timer_init ( struct cpu_dbs_info_s * dbs_info )
{
init_timer_deferrable ( & dbs_work . timer ) ;
schedule_delayed_work ( & dbs_work ,
usecs_to_jiffies ( dbs_tuners_ins . sampling_rate ) ) ;
return ;
/* We want all CPUs to do sampling nearly on same jiffy */
int delay = usecs_to_jiffies ( dbs_tuners_ins . sampling_rate ) ;
delay - = jiffies % delay ;
dbs_info - > enable = 1 ;
INIT_DELAYED_WORK_DEFERRABLE ( & dbs_info - > work , do_dbs_timer ) ;
queue_delayed_work_on ( dbs_info - > cpu , kconservative_wq , & dbs_info - > work ,
delay ) ;
}
static inline void dbs_timer_exit ( void )
static inline void dbs_timer_exit ( struct cpu_dbs_info_s * dbs_info )
{
cancel_delayed_work ( & dbs_work ) ;
return ;
dbs_info - > enable = 0 ;
cancel_delayed_work ( & dbs_info - > work ) ;
}
static int cpufreq_governor_dbs ( struct cpufreq_policy * policy ,
@ -541,11 +576,13 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
j_dbs_info = & per_cpu ( cpu_dbs_info , j ) ;
j_dbs_info - > cur_policy = policy ;
j_dbs_info - > prev_cpu_idle_up = get_cpu_idle_time ( cpu ) ;
j_dbs_info - > prev_cpu_idle_down
= j_dbs_info - > prev_cpu_idle_up ;
j_dbs_info - > prev_cpu_idle = get_cpu_idle_time ( j ,
& j_dbs_info - > prev_cpu_wall ) ;
if ( dbs_tuners_ins . ignore_nice ) {
j_dbs_info - > prev_cpu_nice =
kstat_cpu ( j ) . cpustat . nice ;
}
}
this_dbs_info - > enable = 1 ;
this_dbs_info - > down_skip = 0 ;
this_dbs_info - > requested_freq = policy - > cur ;
@ -567,30 +604,30 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
dbs_tuners_ins . sampling_rate = def_sampling_rate ;
dbs_timer_init ( ) ;
cpufreq_register_notifier (
& dbs_cpufreq_notifier_block ,
CPUFREQ_TRANSITION_NOTIFIER ) ;
}
dbs_timer_init ( this_dbs_info ) ;
mutex_unlock ( & dbs_mutex ) ;
break ;
case CPUFREQ_GOV_STOP :
mutex_lock ( & dbs_mutex ) ;
this_dbs_info - > enable = 0 ;
dbs_timer_exit ( this_dbs_info ) ;
sysfs_remove_group ( & policy - > kobj , & dbs_attr_group ) ;
dbs_enable - - ;
/*
* Stop the timerschedule work , when this governor
* is used for first time
*/
if ( dbs_enable = = 0 ) {
dbs_timer_exit ( ) ;
if ( dbs_enable = = 0 )
cpufreq_unregister_notifier (
& dbs_cpufreq_notifier_block ,
CPUFREQ_TRANSITION_NOTIFIER ) ;
}
mutex_unlock ( & dbs_mutex ) ;
@ -607,6 +644,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
this_dbs_info - > cur_policy ,
policy - > min , CPUFREQ_RELATION_L ) ;
mutex_unlock ( & dbs_mutex ) ;
break ;
}
return 0 ;
@ -624,15 +662,25 @@ struct cpufreq_governor cpufreq_gov_conservative = {
/*
 * Module init: create the kconservative workqueue, then register the
 * governor.
 *
 * Returns -EFAULT if the workqueue cannot be created, otherwise the result
 * of cpufreq_register_governor().  If registration fails the workqueue is
 * destroyed again, so nothing is left behind on error.
 */
static int __init cpufreq_gov_dbs_init(void)
{
	int err;

	/* must exist before the governor can queue any per-CPU work on it */
	kconservative_wq = create_workqueue("kconservative");
	if (!kconservative_wq) {
		printk(KERN_ERR "Creation of kconservative failed\n");
		return -EFAULT;
	}

	err = cpufreq_register_governor(&cpufreq_gov_conservative);
	if (err)
		destroy_workqueue(kconservative_wq);

	return err;
}
/*
 * Module exit: flush scheduled work, unregister the governor, then destroy
 * the kconservative workqueue.
 */
static void __exit cpufreq_gov_dbs_exit(void)
{
	/* wait until no scheduled work is running any more */
	flush_scheduled_work();

	cpufreq_unregister_governor(&cpufreq_gov_conservative);

	destroy_workqueue(kconservative_wq);
}