|
|
|
/*
|
|
|
|
* linux/mm/mmzone.c
|
|
|
|
*
|
|
|
|
* management codes for pgdats, zones and page flags
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
|
|
|
|
struct pglist_data *first_online_pgdat(void)
|
|
|
|
{
|
|
|
|
return NODE_DATA(first_online_node);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
|
|
|
|
{
|
|
|
|
int nid = next_online_node(pgdat->node_id);
|
|
|
|
|
|
|
|
if (nid == MAX_NUMNODES)
|
|
|
|
return NULL;
|
|
|
|
return NODE_DATA(nid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* next_zone - helper magic for for_each_zone()
|
|
|
|
*/
|
|
|
|
struct zone *next_zone(struct zone *zone)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat = zone->zone_pgdat;
|
|
|
|
|
|
|
|
if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
|
|
|
|
zone++;
|
|
|
|
else {
|
|
|
|
pgdat = next_online_pgdat(pgdat);
|
|
|
|
if (pgdat)
|
|
|
|
zone = pgdat->node_zones;
|
|
|
|
else
|
|
|
|
zone = NULL;
|
|
|
|
}
|
|
|
|
return zone;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * zref_in_nodemask - test whether a zoneref's node is set in a nodemask
 * @zref:  zonelist entry to test
 * @nodes: nodemask to test against (dereferenced only with CONFIG_NUMA)
 *
 * Returns the node_isset() result for the node index of @zref when
 * CONFIG_NUMA is defined; otherwise always returns 1.
 */
static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
#ifndef CONFIG_NUMA
	/* No filtering without CONFIG_NUMA: every entry counts as a match. */
	return 1;
#else
	int nid = zonelist_node_idx(zref);

	return node_isset(nid, *nodes);
#endif /* !CONFIG_NUMA */
}
|
|
|
|
|
|
|
|
/* Returns the next zone at or below highest_zoneidx in a zonelist */
|
|
|
|
struct zoneref *__next_zones_zonelist(struct zoneref *z,
|
|
|
|
enum zone_type highest_zoneidx,
|
|
|
|
nodemask_t *nodes)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Find the next suitable zone to use for the allocation.
|
|
|
|
* Only filter based on nodemask if it's set
|
|
|
|
*/
|
|
|
|
if (unlikely(nodes == NULL))
|
|
|
|
while (zonelist_zone_idx(z) > highest_zoneidx)
|
|
|
|
z++;
|
|
|
|
else
|
|
|
|
while (zonelist_zone_idx(z) > highest_zoneidx ||
|
|
|
|
(z->zone && !zref_in_nodemask(z, nodes)))
|
|
|
|
z++;
|
|
|
|
|
|
|
|
return z;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
/*
 * memmap_valid_within - sanity-check a struct page against its expected
 * pfn and zone
 * @pfn:  page frame number @page is expected to map back to
 * @page: struct page being checked
 * @zone: zone @page is expected to belong to
 *
 * Returns true only if page_to_pfn(@page) equals @pfn and
 * page_zone(@page) equals @zone; the zone is not looked up when the pfn
 * already mismatches.
 */
bool memmap_valid_within(unsigned long pfn,
					struct page *page, struct zone *zone)
{
	return page_to_pfn(page) == pfn && page_zone(page) == zone;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
|
|
|
|
|
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
IP: __mod_zone_page_state+0x9/0x60
Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
Call Trace:
__pagevec_lru_add_fn+0xdf/0x140
pagevec_lru_move_fn+0xb1/0x100
__pagevec_lru_add+0x1c/0x30
lru_add_drain_cpu+0xa3/0x130
lru_add_drain+0x2f/0x40
...
The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.
So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.
Ah, there was one exception. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.
Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
12 years ago
|
|
|
void lruvec_init(struct lruvec *lruvec)
|
|
|
|
{
|
|
|
|
enum lru_list lru;
|
|
|
|
|
|
|
|
memset(lruvec, 0, sizeof(struct lruvec));
|
|
|
|
|
|
|
|
for_each_lru(lru)
|
|
|
|
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
/*
 * page_cpupid_xchg_last - replace the last-cpupid field kept in page->flags
 * @page:   page whose flags word is updated
 * @cpupid: new value; only the bits covered by LAST_CPUPID_MASK are stored
 *
 * The field is rewritten with a cmpxchg() retry loop so that concurrent
 * changes to other bits of page->flags are not lost.
 *
 * Returns the cpupid that was stored in the field before the update.  The
 * return value is extracted from the very same snapshot of page->flags
 * that cmpxchg() validates, rather than from a second, separate read of
 * page->flags, so it always matches the value that was actually replaced.
 *
 * NOTE(review): this open-codes page_cpupid_last() on the snapshot and
 * assumes that helper is the plain shift-and-mask of page->flags in this
 * configuration - confirm against include/linux/mm.h.
 */
int page_cpupid_xchg_last(struct page *page, int cpupid)
{
	unsigned long old_flags, flags;
	int last_cpupid;

	do {
		/* One snapshot per attempt; everything below works on it. */
		old_flags = flags = page->flags;
		last_cpupid = (old_flags >> LAST_CPUPID_PGSHIFT) &
			      LAST_CPUPID_MASK;

		/* Clear the old field, then insert the masked new value. */
		flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
		flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));

	return last_cpupid;
}
#endif
|