|
|
|
#ifndef __LINUX_NODEMASK_H
|
|
|
|
#define __LINUX_NODEMASK_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Nodemasks provide a bitmap suitable for representing the
|
|
|
|
* set of Node's in a system, one bit position per Node number.
|
|
|
|
*
|
|
|
|
* See detailed comments in the file linux/bitmap.h describing the
|
|
|
|
* data type on which these nodemasks are based.
|
|
|
|
*
|
|
|
|
* For details of nodemask_scnprintf() and nodemask_parse_user(),
|
|
|
|
* see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c.
|
|
|
|
* For details of nodelist_scnprintf() and nodelist_parse(), see
|
|
|
|
* bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c.
|
|
|
|
* For details of node_remap(), see bitmap_bitremap in lib/bitmap.c.
|
|
|
|
* For details of nodes_remap(), see bitmap_remap in lib/bitmap.c.
|
|
|
|
*
|
|
|
|
* The available nodemask operations are:
|
|
|
|
*
|
|
|
|
* void node_set(node, mask) turn on bit 'node' in mask
|
|
|
|
* void node_clear(node, mask) turn off bit 'node' in mask
|
|
|
|
* void nodes_setall(mask) set all bits
|
|
|
|
* void nodes_clear(mask) clear all bits
|
|
|
|
* int node_isset(node, mask) true iff bit 'node' set in mask
|
|
|
|
* int node_test_and_set(node, mask) test and set bit 'node' in mask
|
|
|
|
*
|
|
|
|
* void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection]
|
|
|
|
* void nodes_or(dst, src1, src2) dst = src1 | src2 [union]
|
|
|
|
* void nodes_xor(dst, src1, src2) dst = src1 ^ src2
|
|
|
|
* void nodes_andnot(dst, src1, src2) dst = src1 & ~src2
|
|
|
|
* void nodes_complement(dst, src) dst = ~src
|
|
|
|
*
|
|
|
|
* int nodes_equal(mask1, mask2) Does mask1 == mask2?
|
|
|
|
* int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect?
|
|
|
|
* int nodes_subset(mask1, mask2) Is mask1 a subset of mask2?
|
|
|
|
* int nodes_empty(mask) Is mask empty (no bits set)?
|
|
|
|
* int nodes_full(mask) Is mask full (all bits set)?
|
|
|
|
* int nodes_weight(mask) Hamming weight - number of set bits
|
|
|
|
*
|
|
|
|
* void nodes_shift_right(dst, src, n) Shift right
|
|
|
|
* void nodes_shift_left(dst, src, n) Shift left
|
|
|
|
*
|
|
|
|
* int first_node(mask) Number lowest set bit, or MAX_NUMNODES
|
|
|
|
* int next_node(node, mask) Next node past 'node', or MAX_NUMNODES
|
|
|
|
* int first_unset_node(mask) First node not set in mask, or
|
|
|
|
* MAX_NUMNODES.
|
|
|
|
*
|
|
|
|
* nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set
|
|
|
|
* NODE_MASK_ALL Initializer - all bits set
|
|
|
|
* NODE_MASK_NONE Initializer - no bits set
|
|
|
|
* unsigned long *nodes_addr(mask) Array of unsigned long's in mask
|
|
|
|
*
|
|
|
|
* int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
|
|
|
|
* int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask
|
|
|
|
* int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
|
|
|
|
* int nodelist_parse(buf, map) Parse ascii string as nodelist
|
|
|
|
* int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit)
|
|
|
|
* void nodes_remap(dst, src, old, new) *dst = map(old, new)(src)
|
|
|
|
*
|
|
|
|
* for_each_node_mask(node, mask) for-loop node over mask
|
|
|
|
*
|
|
|
|
* int num_online_nodes() Number of online Nodes
|
|
|
|
* int num_possible_nodes() Number of all possible Nodes
|
|
|
|
*
|
|
|
|
* int node_online(node) Is some node online?
|
|
|
|
* int node_possible(node) Is some node possible?
|
|
|
|
*
|
|
|
|
* int any_online_node(mask) First online node in mask
|
|
|
|
*
|
|
|
|
* node_set_online(node) set bit 'node' in node_online_map
|
|
|
|
* node_set_offline(node) clear bit 'node' in node_online_map
|
|
|
|
*
|
|
|
|
* for_each_node(node) for-loop node over node_possible_map
|
|
|
|
* for_each_online_node(node) for-loop node over node_online_map
|
|
|
|
*
|
|
|
|
* Subtlety:
|
|
|
|
* 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
|
|
|
|
* to generate slightly worse code. So use a simple one-line #define
|
|
|
|
* for node_isset(), instead of wrapping an inline inside a macro, the
|
|
|
|
* way we do the other calls.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/bitmap.h>
|
|
|
|
#include <linux/numa.h>
|
|
|
|
|
|
|
|
/*
 * A nodemask is a bitmap of MAX_NUMNODES bits, one bit position per
 * node number, wrapped in a struct so that it is a distinct type.
 */
typedef struct {
	DECLARE_BITMAP(bits, MAX_NUMNODES);
} nodemask_t;

/* Referenced through typeof() by nodemask_of_node(). */
extern nodemask_t _unused_nodemask_arg_;
|
|
|
|
|
|
|
|
#define node_set(node, dst) __node_set((node), &(dst))
|
|
|
|
static inline void __node_set(int node, volatile nodemask_t *dstp)
|
|
|
|
{
|
|
|
|
set_bit(node, dstp->bits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define node_clear(node, dst) __node_clear((node), &(dst))
|
|
|
|
static inline void __node_clear(int node, volatile nodemask_t *dstp)
|
|
|
|
{
|
|
|
|
clear_bit(node, dstp->bits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_setall(nodemask_t *dstp, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_fill(dstp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_clear(nodemask_t *dstp, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_zero(dstp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * node_isset - true iff bit 'nid' is set in 'mask'.
 *
 * Deliberately a plain one-line macro with no type-checking inline
 * wrapper - see Subtlety (1) in the comment at the top of this file.
 */
#define node_isset(nid, mask)	test_bit((nid), (mask).bits)
|
|
|
|
|
|
|
|
#define node_test_and_set(node, nodemask) \
|
|
|
|
__node_test_and_set((node), &(nodemask))
|
|
|
|
static inline int __node_test_and_set(int node, nodemask_t *addr)
|
|
|
|
{
|
|
|
|
return test_and_set_bit(node, addr->bits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_and(dst, src1, src2) \
|
|
|
|
__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_or(dst, src1, src2) \
|
|
|
|
__nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_xor(dst, src1, src2) \
|
|
|
|
__nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_andnot(dst, src1, src2) \
|
|
|
|
__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_complement(dst, src) \
|
|
|
|
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_complement(nodemask_t *dstp,
|
|
|
|
const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_complement(dstp->bits, srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_equal(src1, src2) \
|
|
|
|
__nodes_equal(&(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_equal(const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_equal(src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_intersects(src1, src2) \
|
|
|
|
__nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_intersects(const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_intersects(src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_subset(src1, src2) \
|
|
|
|
__nodes_subset(&(src1), &(src2), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_subset(const nodemask_t *src1p,
|
|
|
|
const nodemask_t *src2p, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_subset(src1p->bits, src2p->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_empty(const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_empty(srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_full(const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_full(srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
|
|
|
|
static inline int __nodes_weight(const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_weight(srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_shift_right(dst, src, n) \
|
|
|
|
__nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_shift_right(nodemask_t *dstp,
|
|
|
|
const nodemask_t *srcp, int n, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_shift_left(dst, src, n) \
|
|
|
|
__nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_shift_left(nodemask_t *dstp,
|
|
|
|
const nodemask_t *srcp, int n, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FIXME: better would be to fix all architectures to never return
|
|
|
|
> MAX_NUMNODES, then the silly min_ts could be dropped. */
|
|
|
|
|
|
|
|
#define first_node(src) __first_node(&(src))
|
|
|
|
static inline int __first_node(const nodemask_t *srcp)
|
|
|
|
{
|
|
|
|
return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
|
|
|
|
}
|
|
|
|
|
|
|
|
#define next_node(n, src) __next_node((n), &(src))
|
|
|
|
static inline int __next_node(int n, const nodemask_t *srcp)
|
|
|
|
{
|
|
|
|
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * nodemask_of_node - evaluate to a nodemask with bit 'node' set.
 *
 * When the whole mask is exactly one unsigned long, its single word is
 * written directly; otherwise the mask is cleared with nodes_clear()
 * and the bit is set with node_set().
 */
#define nodemask_of_node(node)						\
({									\
	typeof(_unused_nodemask_arg_) m;				\
									\
	if (sizeof(m) != sizeof(unsigned long)) {			\
		nodes_clear(m);						\
		node_set((node), m);					\
	} else {							\
		m.bits[0] = 1UL << (node);				\
	}								\
	m;								\
})
|
|
|
|
|
|
|
|
#define first_unset_node(mask) __first_unset_node(&(mask))
|
|
|
|
static inline int __first_unset_node(const nodemask_t *maskp)
|
|
|
|
{
|
|
|
|
return min_t(int,MAX_NUMNODES,
|
|
|
|
find_first_zero_bit(maskp->bits, MAX_NUMNODES));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Value stored in the final word of NODE_MASK_ALL. */
#define NODE_MASK_LAST_WORD	BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

/*
 * NODE_MASK_ALL - initializer with all bits set.
 *
 * With more than one word, every word but the last is ~0UL; the last
 * word (the only word in the single-word case) is NODE_MASK_LAST_WORD.
 */
#if MAX_NUMNODES > BITS_PER_LONG

#define NODE_MASK_ALL							\
((nodemask_t) { {							\
	[0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,			\
	[BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD		\
} })

#else /* MAX_NUMNODES <= BITS_PER_LONG */

#define NODE_MASK_ALL							\
((nodemask_t) { {							\
	[BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD		\
} })

#endif
|
|
|
|
|
|
|
|
/* NODE_MASK_NONE - initializer with no bits set: every word is 0UL. */
#define NODE_MASK_NONE							\
((nodemask_t) { {							\
	[0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL			\
} })
|
|
|
|
|
|
|
|
/* nodes_addr - the array of unsigned longs backing the mask. */
#define nodes_addr(mask)	((mask).bits)
|
|
|
|
|
|
|
|
#define nodemask_scnprintf(buf, len, src) \
|
|
|
|
__nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
|
|
|
|
static inline int __nodemask_scnprintf(char *buf, int len,
|
|
|
|
const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_scnprintf(buf, len, srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodemask_parse_user(ubuf, ulen, dst) \
|
|
|
|
__nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
|
|
|
|
static inline int __nodemask_parse_user(const char __user *buf, int len,
|
|
|
|
nodemask_t *dstp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_parse_user(buf, len, dstp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodelist_scnprintf(buf, len, src) \
|
|
|
|
__nodelist_scnprintf((buf), (len), &(src), MAX_NUMNODES)
|
|
|
|
static inline int __nodelist_scnprintf(char *buf, int len,
|
|
|
|
const nodemask_t *srcp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
|
|
|
|
static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_parselist(buf, dstp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define node_remap(oldbit, old, new) \
|
|
|
|
__node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
|
|
|
|
static inline int __node_remap(int oldbit,
|
|
|
|
const nodemask_t *oldp, const nodemask_t *newp, int nbits)
|
|
|
|
{
|
|
|
|
return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define nodes_remap(dst, src, old, new) \
|
|
|
|
__nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
|
|
|
|
static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
|
|
|
|
const nodemask_t *oldp, const nodemask_t *newp, int nbits)
|
|
|
|
{
|
|
|
|
bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * for_each_node_mask - for-loop 'node' over every node set in 'mask'.
 *
 * Both variants expand to a single 'for' statement, so the macro can be
 * used anywhere a loop header can, including as the body of an
 * unbraced if/else.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)			\
	for ((node) = first_node(mask);			\
		(node) < MAX_NUMNODES;			\
		(node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
/*
 * The emptiness test lives in the loop condition rather than in a
 * leading 'if', so that a following 'else' cannot bind to the macro
 * (dangling else).  As in the multi-node variant, 'node' is assigned
 * even when the mask is empty.
 */
#define for_each_node_mask(node, mask)			\
	for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */
|
|
|
|
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator names one state; NR_NODE_STATES is the number of
 * distinct states.  Without CONFIG_HIGHMEM, N_HIGH_MEMORY is an alias
 * of N_NORMAL_MEMORY and does not take a slot of its own.
 */
enum node_states {
	N_POSSIBLE,		/* The node could become online at some point */
	N_ONLINE,		/* The node is online */
	N_NORMAL_MEMORY,	/* The node has regular memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* The node has regular or high memory */
#else
	N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
	N_CPU,			/* The node has one or more cpus */
	NR_NODE_STATES
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following particular system nodemasks and operations
|
|
|
|
* on them manage all possible and online nodes.
|
|
|
|
*/
|
|
|
|
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
/* One nodemask per node state, indexed by enum node_states. */
extern nodemask_t node_states[NR_NODE_STATES];
|
|
|
|
|
|
|
|
#if MAX_NUMNODES > 1
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
static inline int node_state(int node, enum node_states state)
|
|
|
|
{
|
|
|
|
return node_isset(node, node_states[state]);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void node_set_state(int node, enum node_states state)
|
|
|
|
{
|
|
|
|
__node_set(node, &node_states[state]);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void node_clear_state(int node, enum node_states state)
|
|
|
|
{
|
|
|
|
__node_clear(node, &node_states[state]);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int num_node_state(enum node_states state)
|
|
|
|
{
|
|
|
|
return nodes_weight(node_states[state]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * for_each_node_state - for-loop '__node' over the nodes set in the
 * nodemask kept for '__state'.
 */
#define for_each_node_state(__node, __state)	\
	for_each_node_mask((__node), node_states[__state])
|
|
|
|
|
|
|
|
/* First node set in node_states[N_ONLINE], or MAX_NUMNODES if none. */
#define first_online_node	first_node(node_states[N_ONLINE])
/* Next node past 'nid' set in node_states[N_ONLINE], or MAX_NUMNODES. */
#define next_online_node(nid)	next_node((nid), node_states[N_ONLINE])
|
|
|
|
|
|
|
|
/*
 * Defined elsewhere; the MAX_NUMNODES == 1 build uses the constant 1
 * instead.  NOTE(review): presumably the number of node ids in use -
 * confirm at the definition.
 */
extern int nr_node_ids;
|
|
|
|
#else
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
|
|
|
|
/*
 * node_state - single-node variant of the node-state test.
 * @node:  node number to query
 * @state: not consulted by this variant
 *
 * Returns non-zero only when @node is 0, whatever @state is.
 *
 * NOTE(review): presumably this is the branch built when only one node
 * can exist; the enclosing #if is outside this view - confirm.
 */
static inline int node_state(int node, enum node_states state)
{
	return node == 0;
}
|
|
|
|
|
|
|
|
/*
 * node_set_state - no-op in this variant.
 * @node:  ignored
 * @state: ignored
 *
 * The body is intentionally empty; nothing is recorded.
 */
static inline void node_set_state(int node, enum node_states state)
{
}
|
|
|
|
|
|
|
|
/*
 * node_clear_state - no-op in this variant.
 * @node:  ignored
 * @state: ignored
 *
 * The body is intentionally empty; nothing is cleared.
 */
static inline void node_clear_state(int node, enum node_states state)
{
}
|
|
|
|
|
|
|
|
/*
 * num_node_state - number of nodes in a given state, single-node variant.
 * @state: not consulted by this variant
 *
 * Always returns 1.
 */
static inline int num_node_state(enum node_states state)
{
	return 1;
}
|
|
|
|
|
|
|
|
/*
 * for_each_node_state - run the loop body exactly once with @node == 0.
 * @node:    lvalue used as the loop cursor
 * @__state: not used by this variant
 *
 * When the loop finishes without a break, @node is left holding 1.
 */
#define for_each_node_state(node, __state) \
	for ( (node) = 0; (node) == 0; (node) = 1)
|
|
|
|
|
|
|
|
/*
 * Single-node constants.
 *
 * first_online_node     - always node 0.
 * next_online_node(nid) - always expands to MAX_NUMNODES; @nid is not used.
 * nr_node_ids           - fixed at 1.
 */
#define first_online_node	0
#define next_online_node(nid)	(MAX_NUMNODES)
#define nr_node_ids		1
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
|
|
|
|
#endif
|
|
|
|
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
/*
 * Older map names, kept as aliases for the matching node_states[] entries
 * so that existing users of node_online_map / node_possible_map still build.
 */
#define node_online_map 	node_states[N_ONLINE]
#define node_possible_map 	node_states[N_POSSIBLE]
|
|
|
|
|
|
|
|
/*
 * any_online_node - pick an online node out of @mask.
 * @mask: nodemask to search
 *
 * Walks @mask with for_each_node_mask() and stops at the first node visited
 * for which node_online() is true; that node number is the value of the
 * expression.  If no node in @mask is online, the value is whatever
 * for_each_node_mask() leaves in its cursor once the walk is exhausted.
 * NOTE(review): presumably MAX_NUMNODES - confirm against for_each_node_mask().
 *
 * Written as a statement expression that declares a local called 'node'.
 */
#define any_online_node(mask)			\
({						\
	int node;				\
	for_each_node_mask(node, (mask))	\
		if (node_online(node))		\
			break;			\
	node;					\
})
|
|
|
|
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
/*
 * Convenience wrappers that fix the state argument of the generic
 * node-state helpers to N_ONLINE or N_POSSIBLE.
 */
#define num_online_nodes()	num_node_state(N_ONLINE)
#define num_possible_nodes()	num_node_state(N_POSSIBLE)
#define node_online(node)	node_state((node), N_ONLINE)
#define node_possible(node)	node_state((node), N_POSSIBLE)

/* Mark @node online / offline by setting or clearing it in N_ONLINE. */
#define node_set_online(node)	   node_set_state((node), N_ONLINE)
#define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
|
|
|
|
|
Memoryless nodes: Generic management of nodemasks for various purposes
Why do we need to support memoryless nodes?
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> For fujitsu, problem is called "empty" node.
>
> When ACPI's SRAT table includes "possible nodes", ia64 bootstrap(acpi_numa_init)
> creates nodes, which includes no memory, no cpu.
>
> I tried to remove empty-node in past, but that was denied.
> It was because we can hot-add cpu to the empty node.
> (node-hotplug triggered by cpu is not implemented now. and it will be ugly.)
>
>
> For HP, (Lee can comment on this later), they have memory-less-node.
> As far as I hear, HP's machine can have the following configuration.
>
> (example)
> Node0: CPU0 memory AAA MB
> Node1: CPU1 memory AAA MB
> Node2: CPU2 memory AAA MB
> Node3: CPU3 memory AAA MB
> Node4: Memory XXX GB
>
> AAA is very small value (below 16MB) and will be omitted by ia64 bootstrap.
> After boot, only Node 4 has valid memory (but have no cpu.)
>
> Maybe this is memory-interleave by firmware config.
Christoph Lameter <clameter@sgi.com> wrote:
> Future SGI platforms (actually also current one can have but nothing like
> that is deployed to my knowledge) have nodes with only cpus. Current SGI
> platforms have nodes with just I/O that we so far cannot manage in the
> core. So the arch code maps them to the nearest memory node.
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> For the HP platforms, we can configure each cell with from 0% to 100%
> "cell local memory". When we configure with <100% CLM, the "missing
> percentages" are interleaved by hardware on a cache-line granularity to
> improve bandwidth at the expense of latency for numa-challenged
> applications [and OSes, but not our problem ;-)]. When we boot Linux on
> such a config, all of the real nodes have no memory--it all resides in a
> single interleaved pseudo-node.
>
> When we boot Linux on a 100% CLM configuration [== NUMA], we still have
> the interleaved pseudo-node. It contains a few hundred MB stolen from
> the real nodes to contain the DMA zone. [Interleaved memory resides at
> phys addr 0]. The memoryless-nodes patches, along with the zoneorder
> patches, support this config as well.
>
> Also, when we boot a NUMA config with the "mem=" command line,
> specifying less memory than actually exists, Linux takes the excluded
> memory "off the top" rather than distributing it across the nodes. This
> can result in memoryless nodes, as well.
>
This patch:
Preparation for memoryless node patches.
Provide a generic way to keep nodemasks describing various characteristics of
NUMA nodes.
Remove the node_online_map and the node_possible map and realize the same
functionality using two node states: N_POSSIBLE and N_ONLINE.
[Lee.Schermerhorn@hp.com: Initialize N_*_MEMORY and N_CPU masks for non-NUMA config]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Tested-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Bob Picco <bob.picco@hp.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@skynet.ie>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
18 years ago
|
|
|
/*
 * Iterators built on for_each_node_state():
 *   for_each_node(node)        - iterate with the state fixed to N_POSSIBLE
 *   for_each_online_node(node) - iterate with the state fixed to N_ONLINE
 */
#define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
|
|
|
|
|
|
|
|
#endif /* __LINUX_NODEMASK_H */
|