From 7289e6ddb633aaee6ccea2bd2e410654c47b29a6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 14:58:19 -0800 Subject: [PATCH 1/8] fib_trie: Only resize tnodes once instead of on each leaf removal in fib_table_flush This change makes it so that we only call resize on the tnodes, instead of from each of the leaves. By doing this we can significantly reduce the amount of time spent resizing as we can update all of the leaves in the tnode first before we make any determinations about resizing. As a result we can simply free the tnode in the case that all of the leaves from a given tnode are flushed instead of resizing with each leaf removed. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 141 ++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f48534577f8d..d8b68b4de532 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1399,25 +1399,6 @@ found: } EXPORT_SYMBOL_GPL(fib_table_lookup); -/* - * Remove the leaf and return parent. - */ -static void trie_leaf_remove(struct trie *t, struct tnode *l) -{ - struct tnode *tp = node_parent(l); - - pr_debug("entering trie_leaf_remove(%p)\n", l); - - if (tp) { - put_child(tp, get_index(l->key, tp), NULL); - trie_rebalance(t, tp); - } else { - RCU_INIT_POINTER(t->trie, NULL); - } - - node_free(l); -} - /* * Caller must hold RTNL. */ @@ -1483,8 +1464,18 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!plen) tb->tb_num_default--; - if (hlist_empty(&l->leaf)) - trie_leaf_remove(t, l); + if (hlist_empty(&l->leaf)) { + struct tnode *tp = node_parent(l); + + if (tp) { + put_child(tp, get_index(l->key, tp), NULL); + trie_rebalance(t, tp); + } else { + RCU_INIT_POINTER(t->trie, NULL); + } + + node_free(l); + } if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); @@ -1494,33 +1485,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) return 0; } -static int trie_flush_leaf(struct tnode *l) -{ - struct hlist_node *tmp; - unsigned char slen = 0; - struct fib_alias *fa; - int found = 0; - - hlist_for_each_entry_safe(fa, tmp, &l->leaf, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags & RTNH_F_DEAD)) { - hlist_del_rcu(&fa->fa_list); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); - found++; - - continue; - } - - slen = fa->fa_slen; - } - - l->slen = slen; - - return found; -} - /* Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ @@ -1588,30 +1552,81 @@ static struct tnode *trie_leafindex(struct trie *t, int index) */ int fib_table_flush(struct fib_table *tb) { - struct trie *t = (struct trie *) tb->tb_data; - struct tnode *l, *ll = NULL; + struct trie *t = (struct trie *)tb->tb_data; + struct hlist_node *tmp; + struct fib_alias *fa; + struct tnode *n, *pn; + unsigned long cindex; + unsigned char slen; int found = 0; - for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { - found += trie_flush_leaf(l); + n = rcu_dereference(t->trie); + if (!n) + goto flush_complete; + + pn = NULL; + cindex = 0; + + while (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; +backtrace: + /* walk trie in reverse order */ + do { + while (!(cindex--)) { + t_key pkey = pn->key; + + n = pn; + pn = node_parent(n); + + /* resize completed node */ + resize(t, n); + + /* if we got the root we are done */ + if (!pn) + goto flush_complete; - if (ll) { - if (hlist_empty(&ll->leaf)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + cindex = get_index(pkey, pn); + } + + /* grab the next available node */ + n = tnode_get_child(pn, cindex); + } while (!n); + } + + /* track slen in case any prefixes survive */ + slen = 0; + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + hlist_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + found++; + + continue; } - ll = l; + slen = fa->fa_slen; } - if (ll) { - if (hlist_empty(&ll->leaf)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, t, n->key, NULL); + node_free(n); + } else { + leaf_pull_suffix(n); } + /* if trie is leaf only loop is completed */ + if (pn) + goto backtrace; +flush_complete: pr_debug("trie_flush found=%d\n", found); return found; } From 8be33e955cb959dabc1a6eef0b7356fe8cf73fa6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 14:59:19 -0800 Subject: [PATCH 2/8] fib_trie: Fib walk rcu should take a tnode and key instead of a trie and a leaf This change makes it so that leaf_walk_rcu takes a tnode and a key instead of the trie and a leaf. The main idea behind this is to avoid using the leaf parent pointer as that can have additional overhead in the future as I am trying to reduce the size of a leaf down to 16 bytes on 64b systems and 12b on 32b systems. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 216 ++++++++++++++++++++++++-------------------- 1 file changed, 120 insertions(+), 96 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d8b68b4de532..bf488cee524a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1485,71 +1485,71 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) return 0; } -/* Scan for the next right leaf starting at node p->child[idx] - * Since we have back pointer, no recursion necessary. - */ -static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) +/* Scan for the next leaf starting at the provided key value */ +static struct tnode *leaf_walk_rcu(struct tnode **tn, t_key key) { - do { - unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0; - - while (idx < tnode_child_length(p)) { - c = tnode_get_child_rcu(p, idx++); - if (!c) - continue; - - if (IS_LEAF(c)) - return c; - - /* Rescan start scanning in new node */ - p = c; - idx = 0; - } + struct tnode *pn, *n = *tn; + unsigned long cindex; - /* Node empty, walk back up to parent */ - c = p; - } while ((p = node_parent_rcu(c)) != NULL); + /* record parent node for backtracing */ + pn = n; + cindex = n ? get_index(key, n) : 0; - return NULL; /* Root of trie */ -} + /* this loop is meant to try and find the key in the trie */ + while (n) { + unsigned long idx = get_index(key, n); -static struct tnode *trie_firstleaf(struct trie *t) -{ - struct tnode *n = rcu_dereference_rtnl(t->trie); + /* guarantee forward progress on the keys */ + if (IS_LEAF(n) && (n->key >= key)) + goto found; + if (idx >= (1ul << n->bits)) + break; - if (!n) - return NULL; + /* record parent and next child index */ + pn = n; + cindex = idx; - if (IS_LEAF(n)) /* trie is just a leaf */ - return n; + /* descend into the next child */ + n = tnode_get_child_rcu(pn, cindex++); + } - return leaf_walk_rcu(n, NULL); -} + /* this loop will search for the next leaf with a greater key */ + while (pn) { + /* if we exhausted the parent node we will need to climb */ + if (cindex >= (1ul << pn->bits)) { + t_key pkey = pn->key; -static struct tnode *trie_nextleaf(struct tnode *l) -{ - struct tnode *p = node_parent_rcu(l); + pn = node_parent_rcu(pn); + if (!pn) + break; - if (!p) - return NULL; /* trie with just one leaf */ + cindex = get_index(pkey, pn) + 1; + continue; + } - return leaf_walk_rcu(p, l); -} + /* grab the next available node */ + n = tnode_get_child_rcu(pn, cindex++); + if (!n) + continue; -static struct tnode *trie_leafindex(struct trie *t, int index) -{ - struct tnode *l = trie_firstleaf(t); + /* no need to compare keys since we bumped the index */ + if (IS_LEAF(n)) + goto found; - while (l && index-- > 0) - l = trie_nextleaf(l); + /* Rescan start scanning in new node */ + pn = n; + cindex = 0; + } - return l; + *tn = pn; + return NULL; /* Root of trie */ +found: + /* if we are at the limit for keys just return NULL for the tnode */ + *tn = (n->key == KEY_MAX) ? NULL : pn; + return n; } - -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; @@ -1680,42 +1680,42 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - struct tnode *l; - struct trie *t = (struct trie *) tb->tb_data; - t_key key = cb->args[2]; - int count = cb->args[3]; - - rcu_read_lock(); + struct trie *t = (struct trie *)tb->tb_data; + struct tnode *l, *tp; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ - if (count == 0) - l = trie_firstleaf(t); - else { - /* Normally, continue from last key, but if that is missing - * fallback to using slow rescan - */ - l = fib_find_node(t, key); - if (!l) - l = trie_leafindex(t, count); - } + int count = cb->args[2]; + t_key key = cb->args[3]; - while (l) { - cb->args[2] = l->key; + rcu_read_lock(); + + tp = rcu_dereference_rtnl(t->trie); + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { - cb->args[3] = count; + cb->args[3] = key; + cb->args[2] = count; rcu_read_unlock(); return -1; } ++count; - l = trie_nextleaf(l); + key = l->key + 1; + memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); + + /* stop loop if key wrapped back to 0 */ + if (key < l->key) + break; } - cb->args[3] = count; + rcu_read_unlock(); + cb->args[3] = key; + cb->args[2] = count; + return skb->len; } @@ -2186,31 +2186,46 @@ static const struct file_operations fib_trie_fops = { struct fib_route_iter { struct seq_net_private p; - struct trie *main_trie; + struct fib_table *main_tb; + struct tnode *tnode; loff_t pos; t_key key; }; static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { - struct tnode *l = NULL; - struct trie *t = iter->main_trie; + struct fib_table *tb = iter->main_tb; + struct tnode *l, **tp = &iter->tnode; + struct trie *t; + t_key key; - /* use cache location of last found key */ - if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) + /* use cache location of next-to-find key */ + if (iter->pos > 0 && pos >= iter->pos) { pos -= iter->pos; - else { + key = iter->key; + } else { + t = (struct trie *)tb->tb_data; + iter->tnode = rcu_dereference_rtnl(t->trie); iter->pos = 0; - l = trie_firstleaf(t); + key = 0; } - while (l && pos-- > 0) { + while ((l = leaf_walk_rcu(tp, key)) != NULL) { + key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + + if (pos-- <= 0) + break; + + l = NULL; + + /* handle unlikely case of a key wrap */ + if (!key) + break; } if (l) - iter->key = pos; /* remember it */ + iter->key = key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2222,37 +2237,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; + struct trie *t; rcu_read_lock(); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; - iter->main_trie = (struct trie *) tb->tb_data; - if (*pos == 0) - return SEQ_START_TOKEN; - else - return fib_route_get_idx(iter, *pos - 1); + iter->main_tb = tb; + + if (*pos != 0) + return fib_route_get_idx(iter, *pos); + + t = (struct trie *)tb->tb_data; + iter->tnode = rcu_dereference_rtnl(t->trie); + iter->pos = 0; + iter->key = 0; + + return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; - struct tnode *l = v; + struct tnode *l = NULL; + t_key key = iter->key; ++*pos; - if (v == SEQ_START_TOKEN) { - iter->pos = 0; - l = trie_firstleaf(iter->main_trie); - } else { + + /* only allow key of 0 for start of sequence */ + if ((v == SEQ_START_TOKEN) || key) + l = leaf_walk_rcu(&iter->tnode, key); + + if (l) { + iter->key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + } else { + iter->pos = 0; } - if (l) - iter->key = l->key; - else - iter->pos = 0; return l; } From d4a975e83f4de2e454d7f937b36ce13b010c65ce Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:01:59 -0800 Subject: [PATCH 3/8] fib_trie: Fib find node should return parent This change makes it so that the parent pointer is returned by reference in fib_find_node. By doing this I can use it to find the parent node when I am performing an insertion and I don't have to look for it again in fib_insert_node. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index bf488cee524a..5d0f145dbafe 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -912,9 +912,9 @@ static void fib_insert_alias(struct tnode *l, struct fib_alias *fa, } /* rcu_read_lock needs to be hold by caller from readside */ -static struct tnode *fib_find_node(struct trie *t, u32 key) +static struct tnode *fib_find_node(struct trie *t, struct tnode **tn, u32 key) { - struct tnode *n = rcu_dereference_rtnl(t->trie); + struct tnode *pn = NULL, *n = rcu_dereference_rtnl(t->trie); while (n) { unsigned long index = get_index(key, n); @@ -924,21 +924,30 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) - return NULL; + if (index >= (1ul << n->bits)) { + n = NULL; + break; + } /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) break; + pn = n; n = tnode_get_child_rcu(n, index); } + *tn = pn; + return n; } @@ -1071,15 +1080,15 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { - struct trie *t = (struct trie *) tb->tb_data; + struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; + struct tnode *l, *tp; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; u8 tos = cfg->fc_tos; - u32 key, mask; + u32 key; int err; - struct tnode *l; if (plen > KEYLENGTH) return -EINVAL; @@ -1088,9 +1097,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl(inet_make_mask(plen)); - - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; fi = fib_create_info(cfg); @@ -1099,7 +1106,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) goto err; } - l = fib_find_node(t, key); + l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority) : NULL; /* Now fa, if non-NULL, points to the first fib alias @@ -1406,22 +1413,21 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; + struct tnode *l, *tp; u8 plen = cfg->fc_dst_len; - u8 tos = cfg->fc_tos; u8 slen = KEYLENGTH - plen; - struct tnode *l; - u32 key, mask; + u8 tos = cfg->fc_tos; + u32 key; if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); - mask = ntohl(inet_make_mask(plen)); - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - l = fib_find_node(t, key); + l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; From d5d6487cb8f019ab663df4c03519cd69e4362795 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:02:18 -0800 Subject: [PATCH 4/8] fib_trie: Update insert and delete to make use of tp from find_node This change makes it so that the insert and delete functions make use of the tnode pointer returned in the fib_find_node call. By doing this we will not have to rely on the parent pointer in the leaf which will be going away soon. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 237 ++++++++++++++++++-------------------------- 1 file changed, 95 insertions(+), 142 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5d0f145dbafe..5be88df02b27 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -300,7 +300,7 @@ static inline void empty_child_dec(struct tnode *n) n->empty_children-- ? : n->full_children--; } -static struct tnode *leaf_new(t_key key) +static struct tnode *leaf_new(t_key key, struct fib_alias *fa) { struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (l) { @@ -310,12 +310,14 @@ static struct tnode *leaf_new(t_key key) * as the nodes are searched */ l->key = key; - l->slen = 0; + l->slen = fa->fa_slen; l->pos = 0; /* set bits to 0 indicating we are not a tnode */ l->bits = 0; + /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); + hlist_add_head(&fa->fa_list, &l->leaf); } return l; } @@ -842,10 +844,8 @@ static void resize(struct trie *t, struct tnode *tn) } } -static void leaf_pull_suffix(struct tnode *l) +static void leaf_pull_suffix(struct tnode *tp, struct tnode *l) { - struct tnode *tp = node_parent(l); - while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { if (update_suffix(tp) > l->slen) break; @@ -853,10 +853,8 @@ static void leaf_pull_suffix(struct tnode *l) } } -static void leaf_push_suffix(struct tnode *l) +static void leaf_push_suffix(struct tnode *tn, struct tnode *l) { - struct tnode *tn = node_parent(l); - /* if this is a new leaf then tn will be NULL and we can sort * out parent suffix lengths as a part of trie_rebalance */ @@ -866,51 +864,6 @@ static void leaf_push_suffix(struct tnode *l) } } -static void fib_remove_alias(struct tnode *l, struct fib_alias *old) -{ - /* record the location of the previous list_info entry */ - struct hlist_node **pprev = old->fa_list.pprev; - struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); - - /* remove the fib_alias from the list */ - hlist_del_rcu(&old->fa_list); - - /* only access fa if it is pointing at the last valid hlist_node */ - if (hlist_empty(&l->leaf) || (*pprev)) - return; - - /* update the trie with the latest suffix length */ - l->slen = fa->fa_slen; - leaf_pull_suffix(l); -} - -static void fib_insert_alias(struct tnode *l, struct fib_alias *fa, - struct fib_alias *new) -{ - if (fa) { - hlist_add_before_rcu(&new->fa_list, &fa->fa_list); - } else { - struct fib_alias *last; - - hlist_for_each_entry(last, &l->leaf, fa_list) { - if (new->fa_slen < last->fa_slen) - break; - fa = last; - } - - if (fa) - hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); - else - hlist_add_head_rcu(&new->fa_list, &l->leaf); - } - - /* if we added to the tail node then we need to update slen */ - if (l->slen < new->fa_slen) { - l->slen = new->fa_slen; - leaf_push_suffix(l); - } -} - /* rcu_read_lock needs to be hold by caller from readside */ static struct tnode *fib_find_node(struct trie *t, struct tnode **tn, u32 key) { @@ -980,61 +933,28 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) { struct tnode *tp; - while ((tp = node_parent(tn)) != NULL) { + while (tn) { + tp = node_parent(tn); resize(t, tn); tn = tp; } - - /* Handle last (top) tnode */ - if (IS_TNODE(tn)) - resize(t, tn); } /* only used from updater-side */ - -static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) +static int fib_insert_node(struct trie *t, struct tnode *tp, + struct fib_alias *new, t_key key) { - struct tnode *l, *n, *tp = NULL; - - n = rtnl_dereference(t->trie); - - /* If we point to NULL, stop. Either the tree is empty and we should - * just put a new leaf in if, or we have reached an empty child slot, - * and we should just put our new leaf in that. - * - * If we hit a node with a key that does't match then we should stop - * and create a new tnode to replace that node and insert ourselves - * and the other node into the new tnode. - */ - while (n) { - unsigned long index = get_index(key, n); - - /* This bit of code is a bit tricky but it combines multiple - * checks into a single check. The prefix consists of the - * prefix plus zeros for the "bits" in the prefix. The index - * is the difference between the key and this value. From - * this we can actually derive several pieces of data. - * if !(index >> bits) - * we know the value is child index - * else - * we have a mismatch in skip bits and failed - */ - if (index >> n->bits) - break; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) { - /* Case 1: n is a leaf, and prefixes match*/ - return n; - } - - tp = n; - n = tnode_get_child_rcu(n, index); - } + struct tnode *n, *l; - l = leaf_new(key); + l = leaf_new(key, new); if (!l) - return NULL; + return -ENOMEM; + + /* retrieve child from parent node */ + if (tp) + n = tnode_get_child(tp, get_index(key, tp)); + else + n = rcu_dereference_rtnl(t->trie); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * @@ -1048,7 +968,7 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) { node_free(l); - return NULL; + return -ENOMEM; } /* initialize routes out of node */ @@ -1064,20 +984,47 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) } /* Case 3: n is NULL, and will just insert a new leaf */ - if (tp) { - NODE_INIT_PARENT(l, tp); - put_child(tp, get_index(key, tp), l); - trie_rebalance(t, tp); + NODE_INIT_PARENT(l, tp); + put_child_root(tp, t, key, l); + trie_rebalance(t, tp); + + return 0; +} + +static int fib_insert_alias(struct trie *t, struct tnode *tp, + struct tnode *l, struct fib_alias *new, + struct fib_alias *fa, t_key key) +{ + if (!l) + return fib_insert_node(t, tp, new, key); + + if (fa) { + hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { - rcu_assign_pointer(t->trie, l); + struct fib_alias *last; + + hlist_for_each_entry(last, &l->leaf, fa_list) { + if (new->fa_slen < last->fa_slen) + break; + fa = last; + } + + if (fa) + hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); + else + hlist_add_head_rcu(&new->fa_list, &l->leaf); } - return l; + /* if we added to the tail node then we need to update slen */ + if (l->slen < new->fa_slen) { + l->slen = new->fa_slen; + leaf_push_suffix(tp, l); + } + + return 0; } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *)tb->tb_data; @@ -1205,19 +1152,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_slen = slen; /* Insert new entry to the list. */ - if (!l) { - l = fib_insert_node(t, key, plen); - if (unlikely(!l)) { - err = -ENOMEM; - goto out_free_new_fa; - } - } + err = fib_insert_alias(t, tp, l, new_fa, fa, key); + if (err) + goto out_free_new_fa; if (!plen) tb->tb_num_default++; - fib_insert_alias(l, fa, new_fa); - rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1406,9 +1347,36 @@ found: } EXPORT_SYMBOL_GPL(fib_table_lookup); -/* - * Caller must hold RTNL. - */ +static void fib_remove_alias(struct trie *t, struct tnode *tp, + struct tnode *l, struct fib_alias *old) +{ + /* record the location of the previous list_info entry */ + struct hlist_node **pprev = old->fa_list.pprev; + struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); + + /* remove the fib_alias from the list */ + hlist_del_rcu(&old->fa_list); + + /* if we emptied the list this leaf will be freed and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + if (hlist_empty(&l->leaf)) { + put_child_root(tp, t, l->key, NULL); + node_free(l); + trie_rebalance(t, tp); + return; + } + + /* only access fa if it is pointing at the last valid hlist_node */ + if (*pprev) + return; + + /* update the trie with the latest suffix length */ + l->slen = fa->fa_slen; + leaf_pull_suffix(tp, l); +} + +/* Caller must hold RTNL. */ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; @@ -1432,7 +1400,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) return -ESRCH; fa = fib_find_alias(&l->leaf, slen, tos, 0); - if (!fa) return -ESRCH; @@ -1461,33 +1428,19 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, + rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); - fib_remove_alias(l, fa); - if (!plen) tb->tb_num_default--; - if (hlist_empty(&l->leaf)) { - struct tnode *tp = node_parent(l); - - if (tp) { - put_child(tp, get_index(l->key, tp), NULL); - trie_rebalance(t, tp); - } else { - RCU_INIT_POINTER(t->trie, NULL); - } - - node_free(l); - } + fib_remove_alias(t, tp, l, fa_to_delete); - if (fa->fa_state & FA_S_ACCESSED) + if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); + fib_release_info(fa_to_delete->fa_info); + alias_free_mem_rcu(fa_to_delete); return 0; } @@ -1626,7 +1579,7 @@ backtrace: put_child_root(pn, t, n->key, NULL); node_free(n); } else { - leaf_pull_suffix(n); + leaf_pull_suffix(pn, n); } /* if trie is leaf only loop is completed */ From 41b489fd6ce03e96e90fcffdb69b168065ae2e40 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:02:33 -0800 Subject: [PATCH 5/8] fib_trie: move leaf and tnode to occupy the same spot in the key vector If we are going to compact the leaf and tnode we first need to make sure the fields are all in the same place. In that regard I am moving the leaf pointer which represents the fib_alias hash list to occupy what is currently the first key_vector pointer. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 51 ++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5be88df02b27..2233ebf2aae8 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -94,24 +94,27 @@ typedef unsigned int t_key; #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) struct tnode { + struct rcu_head rcu; + + t_key empty_children; /* KEYLENGTH bits needed */ + t_key full_children; /* KEYLENGTH bits needed */ + struct tnode __rcu *parent; + t_key key; - unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; - struct tnode __rcu *parent; - struct rcu_head rcu; union { - /* The fields in this struct are valid if bits > 0 (TNODE) */ - struct { - t_key empty_children; /* KEYLENGTH bits needed */ - t_key full_children; /* KEYLENGTH bits needed */ - struct tnode __rcu *child[0]; - }; - /* This list pointer if valid if bits == 0 (LEAF) */ + /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; + /* This array is valid if (pos | bits) > 0 (TNODE) */ + struct tnode __rcu *tnode[0]; }; }; +#define TNODE_SIZE(n) offsetof(struct tnode, tnode[n]) +#define LEAF_SIZE TNODE_SIZE(1) + #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; @@ -180,14 +183,14 @@ static inline unsigned long tnode_child_length(const struct tnode *tn) static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned long i) { - return rtnl_dereference(tn->child[i]); + return rtnl_dereference(tn->tnode[i]); } /* caller must hold RCU read lock or RTNL */ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned long i) { - return rcu_dereference_rtnl(tn->child[i]); + return rcu_dereference_rtnl(tn->tnode[i]); } /* To understand this stuff, an understanding of keys and all their bits is @@ -266,7 +269,7 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) } #define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) + ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct tnode *)) static void __node_free_rcu(struct rcu_head *head) { @@ -324,7 +327,7 @@ static struct tnode *leaf_new(t_key key, struct fib_alias *fa) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = offsetof(struct tnode, child[1ul << bits]); + size_t sz = TNODE_SIZE(1ul << bits); struct tnode *tn = tnode_alloc(sz); unsigned int shift = pos + bits; @@ -343,7 +346,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) tn->empty_children = 1ul << bits; } - pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), + pr_debug("AT %p s=%zu %zu\n", tn, TNODE_SIZE(0), sizeof(struct tnode *) << bits); return tn; } @@ -384,7 +387,7 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) if (n && (tn->slen < n->slen)) tn->slen = n->slen; - rcu_assign_pointer(tn->child[i], n); + rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct tnode *tn) @@ -435,7 +438,7 @@ static void tnode_free(struct tnode *tn) while (head) { head = head->next; - tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu); @@ -788,7 +791,7 @@ static void resize(struct trie *t, struct tnode *tn) * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ - cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; + cptr = tp ? &tp->tnode[get_index(tn->key, tp)] : &t->trie; BUG_ON(tn != rtnl_dereference(*cptr)); /* Double as long as the resulting node has a number of @@ -1241,7 +1244,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ - struct tnode __rcu **cptr = n->child; + struct tnode __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of @@ -1287,7 +1290,7 @@ backtrace: cindex &= cindex - 1; /* grab pointer for next child node */ - cptr = &pn->child[cindex]; + cptr = &pn->tnode[cindex]; } } @@ -1685,7 +1688,7 @@ void __init fib_trie_init(void) 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", - sizeof(struct tnode), + LEAF_SIZE, 0, SLAB_PANIC, NULL); } @@ -1843,13 +1846,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct tnode) * stat->leaves; + bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); - bytes += sizeof(struct tnode) * stat->tnodes; + bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) @@ -1918,7 +1921,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct tnode), sizeof(struct tnode)); + LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; From a7e53531234dc206bb75abb5305a72665dd4d75d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:02:44 -0800 Subject: [PATCH 6/8] fib_trie: Make fib_table rcu safe The fib_table was wrapped in several places with an rcu_read_lock/rcu_read_unlock however after looking over the code I found several spots where the tables were being accessed as just standard pointers without any protections. This change fixes that so that all of the proper protections are in place when accessing the table to take RCU replacement or removal of the table into account. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ip_fib.h | 70 ++++++++++++++++++++++++---------------- include/net/netns/ipv4.h | 7 ++-- net/ipv4/fib_frontend.c | 52 ++++++++++++++++++++--------- net/ipv4/fib_trie.c | 21 ++++++++---- 4 files changed, 98 insertions(+), 52 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index cba4b7c32935..825cb2800908 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -185,6 +185,7 @@ struct fib_table { u32 tb_id; int tb_default; int tb_num_default; + struct rcu_head rcu; unsigned long tb_data[0]; }; @@ -206,12 +207,16 @@ void fib_free_table(struct fib_table *tb); static inline struct fib_table *fib_get_table(struct net *net, u32 id) { + struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; - return hlist_entry(ptr->first, struct fib_table, tb_hlist); + + tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); + + return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) @@ -222,15 +227,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id) static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res) { - int err = -ENETUNREACH; + struct fib_table *tb; + int err; rcu_read_lock(); - if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res, - FIB_LOOKUP_NOREF) || - !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res, - FIB_LOOKUP_NOREF)) - err = 0; + for (err = 0; !err; err = -ENETUNREACH) { + tb = fib_get_table(net, RT_TABLE_LOCAL); + if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + break; + tb = fib_get_table(net, RT_TABLE_MAIN); + if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + break; + } rcu_read_unlock(); @@ -249,28 +258,33 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { - if (!net->ipv4.fib_has_custom_rules) { - int err = -ENETUNREACH; - - rcu_read_lock(); - - res->tclassid = 0; - if ((net->ipv4.fib_local && - !fib_table_lookup(net->ipv4.fib_local, flp, res, - FIB_LOOKUP_NOREF)) || - (net->ipv4.fib_main && - !fib_table_lookup(net->ipv4.fib_main, flp, res, - FIB_LOOKUP_NOREF)) || - (net->ipv4.fib_default && - !fib_table_lookup(net->ipv4.fib_default, flp, res, - FIB_LOOKUP_NOREF))) - err = 0; - - rcu_read_unlock(); - - return err; + struct fib_table *tb; + int err; + + if (net->ipv4.fib_has_custom_rules) + return __fib_lookup(net, flp, res); + + rcu_read_lock(); + + res->tclassid = 0; + + for (err = 0; !err; err = -ENETUNREACH) { + tb = rcu_dereference_rtnl(net->ipv4.fib_local); + if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + break; + + tb = rcu_dereference_rtnl(net->ipv4.fib_main); + if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + break; + + tb = rcu_dereference_rtnl(net->ipv4.fib_default); + if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF)) + break; } - return __fib_lookup(net, flp, res); + + rcu_read_unlock(); + + return err; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b26c6c3fd7c..db1db158a00e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -7,6 +7,7 @@ #include #include +#include struct tcpm_hash_bucket; struct ctl_table_header; @@ -38,9 +39,9 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - struct fib_table *fib_local; - struct fib_table *fib_main; - struct fib_table *fib_default; + struct fib_table __rcu *fib_local; + struct fib_table __rcu *fib_main; + struct fib_table __rcu *fib_default; #endif #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57be71dd6a9e..220c4b4af4cf 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -89,17 +89,14 @@ struct fib_table *fib_new_table(struct net *net, u32 id) switch (id) { case RT_TABLE_LOCAL: - net->ipv4.fib_local = tb; + rcu_assign_pointer(net->ipv4.fib_local, tb); break; - case RT_TABLE_MAIN: - net->ipv4.fib_main = tb; + rcu_assign_pointer(net->ipv4.fib_main, tb); break; - case RT_TABLE_DEFAULT: - net->ipv4.fib_default = tb; + rcu_assign_pointer(net->ipv4.fib_default, tb); break; - default: break; } @@ -132,13 +129,14 @@ struct fib_table *fib_get_table(struct net *net, u32 id) static void fib_flush(struct net *net) { int flushed = 0; - struct fib_table *tb; - struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *tmp; + struct fib_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(tb); } @@ -665,10 +663,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_e = cb->args[1]; + rcu_read_lock(); + for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -682,6 +682,8 @@ next: } } out: + rcu_read_unlock(); + cb->args[1] = e; cb->args[0] = h; @@ -1117,14 +1119,34 @@ static void ip_fib_net_exit(struct net *net) rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { - struct fib_table *tb; - struct hlist_head *head; + struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; + struct fib_table *tb; + + /* this is done in two passes as flushing the table could + * cause it to be reallocated in order to accommodate new + * tnodes at the root as the table shrinks. + */ + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) + fib_table_flush(tb); - head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { +#ifdef CONFIG_IP_MULTIPLE_TABLES + switch (tb->tb_id) { + case RT_TABLE_LOCAL: + RCU_INIT_POINTER(net->ipv4.fib_local, NULL); + break; + case RT_TABLE_MAIN: + RCU_INIT_POINTER(net->ipv4.fib_main, NULL); + break; + case RT_TABLE_DEFAULT: + RCU_INIT_POINTER(net->ipv4.fib_default, NULL); + break; + default: + break; + } +#endif hlist_del(&tb->tb_hlist); - fib_table_flush(tb); fib_free_table(tb); } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2233ebf2aae8..3642b17c8726 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -193,6 +193,13 @@ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, return rcu_dereference_rtnl(tn->tnode[i]); } +static inline struct fib_table *trie_get_table(struct trie *t) +{ + unsigned long *tb_data = (unsigned long *)t; + + return container_of(tb_data, struct fib_table, tb_data[0]); +} + /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. @@ -1593,8 +1600,9 @@ flush_complete: return found; } -void fib_free_table(struct fib_table *tb) +static void __trie_free_rcu(struct rcu_head *head) { + struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; @@ -1603,6 +1611,11 @@ void fib_free_table(struct fib_table *tb) kfree(tb); } +void fib_free_table(struct fib_table *tb) +{ + call_rcu(&tb->rcu, __trie_free_rcu); +} + static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { @@ -1639,6 +1652,7 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, return skb->len; } +/* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { @@ -1650,15 +1664,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, int count = cb->args[2]; t_key key = cb->args[3]; - rcu_read_lock(); - tp = rcu_dereference_rtnl(t->trie); while ((l = leaf_walk_rcu(&tp, key)) != NULL) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { cb->args[3] = key; cb->args[2] = count; - rcu_read_unlock(); return -1; } @@ -1673,8 +1684,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, break; } - rcu_read_unlock(); - cb->args[3] = key; cb->args[2] = count; From 71e8b67d0fdd2fe22a657bb98716c5cf0e31e828 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:04:03 -0800 Subject: [PATCH 7/8] fib_trie: Update last spot w/ idx >> n->bits code and explanation This change updates the fib_table_lookup function so that it is in sync with the fib_find_node function in terms of the explanation for the index check based on the bits value. I have also updated it from doing a mask to just doing a compare as I have found that seems to provide more options to the compiler as I have seen it turn this into a shift of the value and test under some circumstances. In addition I addressed one minor issue in which we kept computing the key ^ n->key when checking the fib aliases. I pulled the xor out of the loop in order to reduce the number of memory reads in the lookup. As a result we should save a couple cycles since the xor is only done once much earlier in the lookup. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3642b17c8726..08676c56efc3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1201,6 +1201,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, const t_key key = ntohl(flp->daddr); struct tnode *n, *pn; struct fib_alias *fa; + unsigned long index; t_key cindex; n = rcu_dereference(t->trie); @@ -1216,19 +1217,23 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { - unsigned long index = get_index(key, n); + index = get_index(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) + if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ @@ -1302,14 +1307,17 @@ backtrace: } found: + /* this line carries forward the xor from earlier in the function */ + index = key ^ n->key; + /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; int nhsel, err; - if (((key ^ n->key) >= (1ul << fa->fa_slen)) && + if ((index >= (1ul << fa->fa_slen)) && ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) - continue; + continue; if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; if (fi->fib_dead) From 1de3d87bcd2c2e631500b9e4f0c40b38ce0d0d57 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 4 Mar 2015 15:04:46 -0800 Subject: [PATCH 8/8] fib_trie: Prevent allocating tnode if bits is too big for size_t This patch adds code to prevent us from attempting to allocate a tnode with a size larger than what can be represented by size_t. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 08676c56efc3..fae34ad4bb1a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -277,6 +277,8 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) #define TNODE_KMALLOC_MAX \ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct tnode *)) +#define TNODE_VMALLOC_MAX \ + ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct tnode *)) static void __node_free_rcu(struct rcu_head *head) { @@ -292,8 +294,17 @@ static void __node_free_rcu(struct rcu_head *head) #define node_free(n) call_rcu(&n->rcu, __node_free_rcu) -static struct tnode *tnode_alloc(size_t size) +static struct tnode *tnode_alloc(int bits) { + size_t size; + + /* verify bits is within bounds */ + if (bits > TNODE_VMALLOC_MAX) + return NULL; + + /* determine size and verify it is non-zero and didn't overflow */ + size = TNODE_SIZE(1ul << bits); + if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else @@ -334,8 +345,7 @@ static struct tnode *leaf_new(t_key key, struct fib_alias *fa) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = TNODE_SIZE(1ul << bits); - struct tnode *tn = tnode_alloc(sz); + struct tnode *tn = tnode_alloc(bits); unsigned int shift = pos + bits; /* verify bits and pos their msb bits clear and values are valid */