@@ -39,6 +39,7 @@
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = {
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
@@ -159,6 +161,10 @@ struct mem_cgroup_per_zone {
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
@@ -167,6 +173,26 @@ struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;

@@ -405,6 +431,7 @@ static bool move_file(void)
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,

@@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
	return mem_cgroup_zoneinfo(memcg, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
				tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	mctz = soft_limit_tree_from_page(page);
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * memcg is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}
/*
 * Implementation Note: reading percpu statistics for memcg.
 *

@@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
	case MEM_CGROUP_TARGET_THRESH:
		next = val + THRESHOLDS_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		next = val + SOFTLIMIT_EVENTS_TARGET;
		break;
	case MEM_CGROUP_TARGET_NUMAINFO:
		next = val + NUMAINFO_EVENTS_TARGET;
		break;
@@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);

@@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
		preempt_enable();

		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
@@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
	return total;
}

#if MAX_NUMNODES > 1
/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg

@@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
	return false;
}
#if MAX_NUMNODES > 1

/*
 * Always updating the nodemask is not very good - even if we have an empty

@@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
	return node;
}
/*
 * Check all nodes whether they contain reclaimable pages or not.
 * For a quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information. We need to do a double check.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	int nid;

	/*
	 * quick check...making use of scan_node.
	 * We can skip unused nodes.
	 */
	if (!nodes_empty(memcg->scan_nodes)) {
		for (nid = first_node(memcg->scan_nodes);
		     nid < MAX_NUMNODES;
		     nid = next_node(nid, memcg->scan_nodes)) {

			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
				return true;
		}
	}
	/*
	 * Check rest of nodes.
	 */
	for_each_node_state(nid, N_MEMORY) {
		if (node_isset(nid, memcg->scan_nodes))
			continue;
		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
			return true;
	}
	return false;
}

#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif
/*

@@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
	unlock_page_cgroup(pc);

	/*
	 * "charge_statistics" updated event counter.
	 * "charge_statistics" updated event counter. Then, check it.
	 * Insert ancestors (and ancestors' ancestors) into the softlimit RB-tree
	 * if they exceed the soft limit.
	 */
	memcg_check_events(memcg, page);
}
@@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		lruvec_init(&mz->lruvec);
		mz->usage_in_excess = 0;
		mz->on_tree = false;
		mz->memcg = memcg;
	}
	memcg->nodeinfo[node] = pn;
@@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
	int node;
	size_t size = memcg_size();

	mem_cgroup_remove_from_trees(memcg);
	free_css_id(&mem_cgroup_subsys, &memcg->css);

	for_each_node(node)

@@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
static void __init mem_cgroup_soft_limit_tree_init(void)
{
	struct mem_cgroup_tree_per_node *rtpn;
	struct mem_cgroup_tree_per_zone *rtpz;
	int tmp, node, zone;

	for_each_node(node) {
		tmp = node;
		if (!node_state(node, N_NORMAL_MEMORY))
			tmp = -1;
		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
		BUG_ON(!rtpn);

		soft_limit_tree.rb_tree_per_node[node] = rtpn;

		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			rtpz = &rtpn->rb_tree_per_zone[zone];
			rtpz->rb_root = RB_ROOT;
			spin_lock_init(&rtpz->lock);
		}
	}
}
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{

@@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void)
{
	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	enable_swap_cgroup();
	mem_cgroup_soft_limit_tree_init();
	memcg_stock_init();
	return 0;
}