net: fix a potential rcu_read_lock() imbalance in rt6_fill_node()
[~shefty/rdma-dev.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
158 static struct dst_ops ip6_dst_ops_template = {
159         .family                 =       AF_INET6,
160         .protocol               =       cpu_to_be16(ETH_P_IPV6),
161         .gc                     =       ip6_dst_gc,
162         .gc_thresh              =       1024,
163         .check                  =       ip6_dst_check,
164         .default_advmss         =       ip6_default_advmss,
165         .mtu                    =       ip6_mtu,
166         .cow_metrics            =       ipv6_cow_metrics,
167         .destroy                =       ip6_dst_destroy,
168         .ifdown                 =       ip6_dst_ifdown,
169         .negative_advice        =       ip6_negative_advice,
170         .link_failure           =       ip6_link_failure,
171         .update_pmtu            =       ip6_rt_update_pmtu,
172         .local_out              =       __ip6_local_out,
173         .neigh_lookup           =       ip6_neigh_lookup,
174 };
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
184 {
185 }
186
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
188                                          unsigned long old)
189 {
190         return NULL;
191 }
192
193 static struct dst_ops ip6_dst_blackhole_ops = {
194         .family                 =       AF_INET6,
195         .protocol               =       cpu_to_be16(ETH_P_IPV6),
196         .destroy                =       ip6_dst_destroy,
197         .check                  =       ip6_dst_check,
198         .mtu                    =       ip6_blackhole_mtu,
199         .default_advmss         =       ip6_default_advmss,
200         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
201         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
202         .neigh_lookup           =       ip6_neigh_lookup,
203 };
204
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206         [RTAX_HOPLIMIT - 1] = 255,
207 };
208
209 static struct rt6_info ip6_null_entry_template = {
210         .dst = {
211                 .__refcnt       = ATOMIC_INIT(1),
212                 .__use          = 1,
213                 .obsolete       = -1,
214                 .error          = -ENETUNREACH,
215                 .input          = ip6_pkt_discard,
216                 .output         = ip6_pkt_discard_out,
217         },
218         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
219         .rt6i_protocol  = RTPROT_KERNEL,
220         .rt6i_metric    = ~(u32) 0,
221         .rt6i_ref       = ATOMIC_INIT(1),
222 };
223
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route with error
 * -EACCES handled by ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Template for the "blackhole" route entry: a reject route with error
 * -EINVAL whose input and output are both dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288         if (peer) {
289                 rt->rt6i_peer = NULL;
290                 inet_putpeer(peer);
291         }
292 }
293
294 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
295
296 static u32 rt6_peer_genid(void)
297 {
298         return atomic_read(&__rt6_peer_genid);
299 }
300
301 void rt6_bind_peer(struct rt6_info *rt, int create)
302 {
303         struct inet_peer *peer;
304
305         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
306         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
307                 inet_putpeer(peer);
308         else
309                 rt->rt6i_peer_genid = rt6_peer_genid();
310 }
311
312 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
313                            int how)
314 {
315         struct rt6_info *rt = (struct rt6_info *)dst;
316         struct inet6_dev *idev = rt->rt6i_idev;
317         struct net_device *loopback_dev =
318                 dev_net(dev)->loopback_dev;
319
320         if (dev != loopback_dev && idev && idev->dev == dev) {
321                 struct inet6_dev *loopback_idev =
322                         in6_dev_get(loopback_dev);
323                 if (loopback_idev) {
324                         rt->rt6i_idev = loopback_idev;
325                         in6_dev_put(idev);
326                 }
327         }
328 }
329
330 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
331 {
332         return (rt->rt6i_flags & RTF_EXPIRES) &&
333                 time_after(jiffies, rt->dst.expires);
334 }
335
336 static inline int rt6_need_strict(const struct in6_addr *daddr)
337 {
338         return ipv6_addr_type(daddr) &
339                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
340 }
341
342 /*
343  *      Route lookup. Any table->tb6_lock is implied.
344  */
345
346 static inline struct rt6_info *rt6_device_match(struct net *net,
347                                                     struct rt6_info *rt,
348                                                     const struct in6_addr *saddr,
349                                                     int oif,
350                                                     int flags)
351 {
352         struct rt6_info *local = NULL;
353         struct rt6_info *sprt;
354
355         if (!oif && ipv6_addr_any(saddr))
356                 goto out;
357
358         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
359                 struct net_device *dev = sprt->dst.dev;
360
361                 if (oif) {
362                         if (dev->ifindex == oif)
363                                 return sprt;
364                         if (dev->flags & IFF_LOOPBACK) {
365                                 if (!sprt->rt6i_idev ||
366                                     sprt->rt6i_idev->dev->ifindex != oif) {
367                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
368                                                 continue;
369                                         if (local && (!oif ||
370                                                       local->rt6i_idev->dev->ifindex == oif))
371                                                 continue;
372                                 }
373                                 local = sprt;
374                         }
375                 } else {
376                         if (ipv6_chk_addr(net, saddr, dev,
377                                           flags & RT6_LOOKUP_F_IFACE))
378                                 return sprt;
379                 }
380         }
381
382         if (oif) {
383                 if (local)
384                         return local;
385
386                 if (flags & RT6_LOOKUP_F_IFACE)
387                         return net->ipv6.ip6_null_entry;
388         }
389 out:
390         return rt;
391 }
392
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a neighbour entry that is not in a NUD_VALID state and the
 * last update of that entry is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation for the neighbour's
 * address to its solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.  The rate limit is the check-then-set of neigh->updated,
 * so it is done under the neighbour lock taken for WRITING: with only
 * the read lock several CPUs could pass the check at the same time,
 * each send a probe, and race on the store to neigh->updated.
 *
 * A NULL @rt or a route without neighbour is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	int send_probe = 0;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	/* Re-check the state under the lock before claiming the probe slot. */
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	/* The solicitation itself is sent with the neighbour lock dropped. */
	if (send_probe) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
432
433 /*
434  * Default Router Selection (RFC 2461 6.3.6)
435  */
436 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
437 {
438         struct net_device *dev = rt->dst.dev;
439         if (!oif || dev->ifindex == oif)
440                 return 2;
441         if ((dev->flags & IFF_LOOPBACK) &&
442             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
443                 return 1;
444         return 0;
445 }
446
447 static inline int rt6_check_neigh(struct rt6_info *rt)
448 {
449         struct neighbour *neigh;
450         int m;
451
452         rcu_read_lock();
453         neigh = dst_get_neighbour_noref(&rt->dst);
454         if (rt->rt6i_flags & RTF_NONEXTHOP ||
455             !(rt->rt6i_flags & RTF_GATEWAY))
456                 m = 1;
457         else if (neigh) {
458                 read_lock_bh(&neigh->lock);
459                 if (neigh->nud_state & NUD_VALID)
460                         m = 2;
461 #ifdef CONFIG_IPV6_ROUTER_PREF
462                 else if (neigh->nud_state & NUD_FAILED)
463                         m = 0;
464 #endif
465                 else
466                         m = 1;
467                 read_unlock_bh(&neigh->lock);
468         } else
469                 m = 0;
470         rcu_read_unlock();
471         return m;
472 }
473
474 static int rt6_score_route(struct rt6_info *rt, int oif,
475                            int strict)
476 {
477         int m, n;
478
479         m = rt6_check_dev(rt, oif);
480         if (!m && (strict & RT6_LOOKUP_F_IFACE))
481                 return -1;
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
484 #endif
485         n = rt6_check_neigh(rt);
486         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
487                 return -1;
488         return m;
489 }
490
491 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
492                                    int *mpri, struct rt6_info *match)
493 {
494         int m;
495
496         if (rt6_check_expired(rt))
497                 goto out;
498
499         m = rt6_score_route(rt, oif, strict);
500         if (m < 0)
501                 goto out;
502
503         if (m > *mpri) {
504                 if (strict & RT6_LOOKUP_F_REACHABLE)
505                         rt6_probe(match);
506                 *mpri = m;
507                 match = rt;
508         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
509                 rt6_probe(rt);
510         }
511
512 out:
513         return match;
514 }
515
516 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
517                                      struct rt6_info *rr_head,
518                                      u32 metric, int oif, int strict)
519 {
520         struct rt6_info *rt, *match;
521         int mpri = -1;
522
523         match = NULL;
524         for (rt = rr_head; rt && rt->rt6i_metric == metric;
525              rt = rt->dst.rt6_next)
526                 match = find_match(rt, oif, strict, &mpri, match);
527         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
528              rt = rt->dst.rt6_next)
529                 match = find_match(rt, oif, strict, &mpri, match);
530
531         return match;
532 }
533
534 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
535 {
536         struct rt6_info *match, *rt0;
537         struct net *net;
538
539         rt0 = fn->rr_ptr;
540         if (!rt0)
541                 fn->rr_ptr = rt0 = fn->leaf;
542
543         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
544
545         if (!match &&
546             (strict & RT6_LOOKUP_F_REACHABLE)) {
547                 struct rt6_info *next = rt0->dst.rt6_next;
548
549                 /* no entries matched; do round-robin */
550                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
551                         next = fn->leaf;
552
553                 if (next != rt0)
554                         fn->rr_ptr = next;
555         }
556
557         net = dev_net(rt0->dst.dev);
558         return match ? match : net->ipv6.ip6_null_entry;
559 }
560
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option (@opt, @len bytes) received on
 * @dev from the router @gwaddr.
 *
 * The option is validated (size, option length versus prefix length,
 * preference).  A zero lifetime deletes an existing matching route; a
 * non-zero lifetime adds the route when it is missing, or refreshes the
 * preference of the existing one.  The route's expiry is then set from
 * the lifetime, or cleared when the lifetime is not finite.
 *
 * Returns 0, or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	struct rt6_info *rt;
	unsigned long lifetime;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	else if (lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->dst.expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
634
/*
 * BACKTRACK - climb the fib6 tree when the lookup produced the null entry.
 *
 * Expects the expanding function to provide the variables "rt" and "fn"
 * and the labels "restart" and "out".  If rt is the namespace's
 * ip6_null_entry, fn is moved towards the root (descending into a
 * parent's source subtree via fib6_lookup() when there is one that is
 * not fn itself).  It jumps to "restart" at the first node flagged
 * RTN_RTINFO and to "out" once a node flagged RTN_TL_ROOT is reached.
 */
#define BACKTRACK(__net, saddr) \
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
652
653 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
654                                              struct fib6_table *table,
655                                              struct flowi6 *fl6, int flags)
656 {
657         struct fib6_node *fn;
658         struct rt6_info *rt;
659
660         read_lock_bh(&table->tb6_lock);
661         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
662 restart:
663         rt = fn->leaf;
664         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
665         BACKTRACK(net, &fl6->saddr);
666 out:
667         dst_use(&rt->dst, jiffies);
668         read_unlock_bh(&table->tb6_lock);
669         return rt;
670
671 }
672
673 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
674                                     int flags)
675 {
676         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
677 }
678 EXPORT_SYMBOL_GPL(ip6_route_lookup);
679
680 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
681                             const struct in6_addr *saddr, int oif, int strict)
682 {
683         struct flowi6 fl6 = {
684                 .flowi6_oif = oif,
685                 .daddr = *daddr,
686         };
687         struct dst_entry *dst;
688         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
689
690         if (saddr) {
691                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
692                 flags |= RT6_LOOKUP_F_HAS_SADDR;
693         }
694
695         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
696         if (dst->error == 0)
697                 return (struct rt6_info *) dst;
698
699         dst_release(dst);
700
701         return NULL;
702 }
703
704 EXPORT_SYMBOL(rt6_lookup);
705
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to it, it may be destroyed.
 */
711
712 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
713 {
714         int err;
715         struct fib6_table *table;
716
717         table = rt->rt6i_table;
718         write_lock_bh(&table->tb6_lock);
719         err = fib6_add(&table->tb6_root, rt, info);
720         write_unlock_bh(&table->tb6_lock);
721
722         return err;
723 }
724
725 int ip6_ins_rt(struct rt6_info *rt)
726 {
727         struct nl_info info = {
728                 .nl_net = dev_net(rt->dst.dev),
729         };
730         return __ip6_ins_rt(rt, &info);
731 }
732
733 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
734                                       const struct in6_addr *daddr,
735                                       const struct in6_addr *saddr)
736 {
737         struct rt6_info *rt;
738
739         /*
740          *      Clone the route.
741          */
742
743         rt = ip6_rt_copy(ort, daddr);
744
745         if (rt) {
746                 int attempts = !in_softirq();
747
748                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
749                         if (ort->rt6i_dst.plen != 128 &&
750                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
751                                 rt->rt6i_flags |= RTF_ANYCAST;
752                         rt->rt6i_gateway = *daddr;
753                 }
754
755                 rt->rt6i_flags |= RTF_CACHE;
756
757 #ifdef CONFIG_IPV6_SUBTREES
758                 if (rt->rt6i_src.plen && saddr) {
759                         rt->rt6i_src.addr = *saddr;
760                         rt->rt6i_src.plen = 128;
761                 }
762 #endif
763
764         retry:
765                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
766                         struct net *net = dev_net(rt->dst.dev);
767                         int saved_rt_min_interval =
768                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
769                         int saved_rt_elasticity =
770                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
771
772                         if (attempts-- > 0) {
773                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
774                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
775
776                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
777
778                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
779                                         saved_rt_elasticity;
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
781                                         saved_rt_min_interval;
782                                 goto retry;
783                         }
784
785                         if (net_ratelimit())
786                                 printk(KERN_WARNING
787                                        "ipv6: Neighbour table overflow.\n");
788                         dst_free(&rt->dst);
789                         return NULL;
790                 }
791         }
792
793         return rt;
794 }
795
796 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
797                                         const struct in6_addr *daddr)
798 {
799         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
800
801         if (rt) {
802                 rt->rt6i_flags |= RTF_CACHE;
803                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
804         }
805         return rt;
806 }
807
808 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
809                                       struct flowi6 *fl6, int flags)
810 {
811         struct fib6_node *fn;
812         struct rt6_info *rt, *nrt;
813         int strict = 0;
814         int attempts = 3;
815         int err;
816         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
817
818         strict |= flags & RT6_LOOKUP_F_IFACE;
819
820 relookup:
821         read_lock_bh(&table->tb6_lock);
822
823 restart_2:
824         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
825
826 restart:
827         rt = rt6_select(fn, oif, strict | reachable);
828
829         BACKTRACK(net, &fl6->saddr);
830         if (rt == net->ipv6.ip6_null_entry ||
831             rt->rt6i_flags & RTF_CACHE)
832                 goto out;
833
834         dst_hold(&rt->dst);
835         read_unlock_bh(&table->tb6_lock);
836
837         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
838                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
839         else if (!(rt->dst.flags & DST_HOST))
840                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
841         else
842                 goto out2;
843
844         dst_release(&rt->dst);
845         rt = nrt ? : net->ipv6.ip6_null_entry;
846
847         dst_hold(&rt->dst);
848         if (nrt) {
849                 err = ip6_ins_rt(nrt);
850                 if (!err)
851                         goto out2;
852         }
853
854         if (--attempts <= 0)
855                 goto out2;
856
857         /*
858          * Race condition! In the gap, when table->tb6_lock was
859          * released someone could insert this route.  Relookup.
860          */
861         dst_release(&rt->dst);
862         goto relookup;
863
864 out:
865         if (reachable) {
866                 reachable = 0;
867                 goto restart_2;
868         }
869         dst_hold(&rt->dst);
870         read_unlock_bh(&table->tb6_lock);
871 out2:
872         rt->dst.lastuse = jiffies;
873         rt->dst.__use++;
874
875         return rt;
876 }
877
878 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
879                                             struct flowi6 *fl6, int flags)
880 {
881         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
882 }
883
884 void ip6_route_input(struct sk_buff *skb)
885 {
886         const struct ipv6hdr *iph = ipv6_hdr(skb);
887         struct net *net = dev_net(skb->dev);
888         int flags = RT6_LOOKUP_F_HAS_SADDR;
889         struct flowi6 fl6 = {
890                 .flowi6_iif = skb->dev->ifindex,
891                 .daddr = iph->daddr,
892                 .saddr = iph->saddr,
893                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
894                 .flowi6_mark = skb->mark,
895                 .flowi6_proto = iph->nexthdr,
896         };
897
898         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
899                 flags |= RT6_LOOKUP_F_IFACE;
900
901         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
902 }
903
904 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
905                                              struct flowi6 *fl6, int flags)
906 {
907         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
908 }
909
910 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
911                                     struct flowi6 *fl6)
912 {
913         int flags = 0;
914
915         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
916                 flags |= RT6_LOOKUP_F_IFACE;
917
918         if (!ipv6_addr_any(&fl6->saddr))
919                 flags |= RT6_LOOKUP_F_HAS_SADDR;
920         else if (sk)
921                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
922
923         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
924 }
925
926 EXPORT_SYMBOL(ip6_route_output);
927
928 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
929 {
930         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
931         struct dst_entry *new = NULL;
932
933         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
934         if (rt) {
935                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
936
937                 new = &rt->dst;
938
939                 new->__use = 1;
940                 new->input = dst_discard;
941                 new->output = dst_discard;
942
943                 if (dst_metrics_read_only(&ort->dst))
944                         new->_metrics = ort->dst._metrics;
945                 else
946                         dst_copy_metrics(new, &ort->dst);
947                 rt->rt6i_idev = ort->rt6i_idev;
948                 if (rt->rt6i_idev)
949                         in6_dev_hold(rt->rt6i_idev);
950                 rt->dst.expires = 0;
951
952                 rt->rt6i_gateway = ort->rt6i_gateway;
953                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
954                 rt->rt6i_metric = 0;
955
956                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
957 #ifdef CONFIG_IPV6_SUBTREES
958                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
959 #endif
960
961                 dst_free(new);
962         }
963
964         dst_release(dst_orig);
965         return new ? new : ERR_PTR(-ENOMEM);
966 }
967
968 /*
969  *      Destination cache support functions
970  */
971
972 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
973 {
974         struct rt6_info *rt;
975
976         rt = (struct rt6_info *) dst;
977
978         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
979                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
980                         if (!rt->rt6i_peer)
981                                 rt6_bind_peer(rt, 0);
982                         rt->rt6i_peer_genid = rt6_peer_genid();
983                 }
984                 return dst;
985         }
986         return NULL;
987 }
988
989 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
990 {
991         struct rt6_info *rt = (struct rt6_info *) dst;
992
993         if (rt) {
994                 if (rt->rt6i_flags & RTF_CACHE) {
995                         if (rt6_check_expired(rt)) {
996                                 ip6_del_rt(rt);
997                                 dst = NULL;
998                         }
999                 } else {
1000                         dst_release(dst);
1001                         dst = NULL;
1002                 }
1003         }
1004         return dst;
1005 }
1006
1007 static void ip6_link_failure(struct sk_buff *skb)
1008 {
1009         struct rt6_info *rt;
1010
1011         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1012
1013         rt = (struct rt6_info *) skb_dst(skb);
1014         if (rt) {
1015                 if (rt->rt6i_flags & RTF_CACHE) {
1016                         dst_set_expires(&rt->dst, 0);
1017                         rt->rt6i_flags |= RTF_EXPIRES;
1018                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1019                         rt->rt6i_node->fn_sernum = -1;
1020         }
1021 }
1022
1023 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1024 {
1025         struct rt6_info *rt6 = (struct rt6_info*)dst;
1026
1027         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1028                 rt6->rt6i_flags |= RTF_MODIFIED;
1029                 if (mtu < IPV6_MIN_MTU) {
1030                         u32 features = dst_metric(dst, RTAX_FEATURES);
1031                         mtu = IPV6_MIN_MTU;
1032                         features |= RTAX_FEATURE_ALLFRAG;
1033                         dst_metric_set(dst, RTAX_FEATURES, features);
1034                 }
1035                 dst_metric_set(dst, RTAX_MTU, mtu);
1036         }
1037 }
1038
1039 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1040 {
1041         struct net_device *dev = dst->dev;
1042         unsigned int mtu = dst_mtu(dst);
1043         struct net *net = dev_net(dev);
1044
1045         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1046
1047         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1048                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1049
1050         /*
1051          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1052          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1053          * IPV6_MAXPLEN is also valid and means: "any MSS,
1054          * rely only on pmtu discovery"
1055          */
1056         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1057                 mtu = IPV6_MAXPLEN;
1058         return mtu;
1059 }
1060
1061 static unsigned int ip6_mtu(const struct dst_entry *dst)
1062 {
1063         struct inet6_dev *idev;
1064         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1065
1066         if (mtu)
1067                 return mtu;
1068
1069         mtu = IPV6_MIN_MTU;
1070
1071         rcu_read_lock();
1072         idev = __in6_dev_get(dst->dev);
1073         if (idev)
1074                 mtu = idev->cnf.mtu6;
1075         rcu_read_unlock();
1076
1077         return mtu;
1078 }
1079
/*
 * Head of a singly linked list, chained through dst->next, of the dst
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
1080 static struct dst_entry *icmp6_dst_gc_list;
/* Held (with bottom halves disabled) around every access to the list above. */
1081 static DEFINE_SPINLOCK(icmp6_dst_lock);
1082
1083 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1084                                   struct neighbour *neigh,
1085                                   struct flowi6 *fl6)
1086 {
1087         struct dst_entry *dst;
1088         struct rt6_info *rt;
1089         struct inet6_dev *idev = in6_dev_get(dev);
1090         struct net *net = dev_net(dev);
1091
1092         if (unlikely(!idev))
1093                 return ERR_PTR(-ENODEV);
1094
1095         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1096         if (unlikely(!rt)) {
1097                 in6_dev_put(idev);
1098                 dst = ERR_PTR(-ENOMEM);
1099                 goto out;
1100         }
1101
1102         if (neigh)
1103                 neigh_hold(neigh);
1104         else {
1105                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1106                 if (IS_ERR(neigh)) {
1107                         in6_dev_put(idev);
1108                         dst_free(&rt->dst);
1109                         return ERR_CAST(neigh);
1110                 }
1111         }
1112
1113         rt->dst.flags |= DST_HOST;
1114         rt->dst.output  = ip6_output;
1115         dst_set_neighbour(&rt->dst, neigh);
1116         atomic_set(&rt->dst.__refcnt, 1);
1117         rt->rt6i_dst.addr = fl6->daddr;
1118         rt->rt6i_dst.plen = 128;
1119         rt->rt6i_idev     = idev;
1120         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1121
1122         spin_lock_bh(&icmp6_dst_lock);
1123         rt->dst.next = icmp6_dst_gc_list;
1124         icmp6_dst_gc_list = &rt->dst;
1125         spin_unlock_bh(&icmp6_dst_lock);
1126
1127         fib6_force_start_gc(net);
1128
1129         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1130
1131 out:
1132         return dst;
1133 }
1134
1135 int icmp6_dst_gc(void)
1136 {
1137         struct dst_entry *dst, **pprev;
1138         int more = 0;
1139
1140         spin_lock_bh(&icmp6_dst_lock);
1141         pprev = &icmp6_dst_gc_list;
1142
1143         while ((dst = *pprev) != NULL) {
1144                 if (!atomic_read(&dst->__refcnt)) {
1145                         *pprev = dst->next;
1146                         dst_free(dst);
1147                 } else {
1148                         pprev = &dst->next;
1149                         ++more;
1150                 }
1151         }
1152
1153         spin_unlock_bh(&icmp6_dst_lock);
1154
1155         return more;
1156 }
1157
1158 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1159                             void *arg)
1160 {
1161         struct dst_entry *dst, **pprev;
1162
1163         spin_lock_bh(&icmp6_dst_lock);
1164         pprev = &icmp6_dst_gc_list;
1165         while ((dst = *pprev) != NULL) {
1166                 struct rt6_info *rt = (struct rt6_info *) dst;
1167                 if (func(rt, arg)) {
1168                         *pprev = dst->next;
1169                         dst_free(dst);
1170                 } else {
1171                         pprev = &dst->next;
1172                 }
1173         }
1174         spin_unlock_bh(&icmp6_dst_lock);
1175 }
1176
1177 static int ip6_dst_gc(struct dst_ops *ops)
1178 {
1179         unsigned long now = jiffies;
1180         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1181         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1182         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1183         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1184         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1185         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1186         int entries;
1187
1188         entries = dst_entries_get_fast(ops);
1189         if (time_after(rt_last_gc + rt_min_interval, now) &&
1190             entries <= rt_max_size)
1191                 goto out;
1192
1193         net->ipv6.ip6_rt_gc_expire++;
1194         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1195         net->ipv6.ip6_rt_last_gc = now;
1196         entries = dst_entries_get_slow(ops);
1197         if (entries < ops->gc_thresh)
1198                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1199 out:
1200         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1201         return entries > rt_max_size;
1202 }
1203
1204 /* Clean host part of a prefix. Not necessary in radix tree,
1205    but results in cleaner routing tables.
1206
1207    Remove it only when all the things will work!
1208  */
1209
1210 int ip6_dst_hoplimit(struct dst_entry *dst)
1211 {
1212         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1213         if (hoplimit == 0) {
1214                 struct net_device *dev = dst->dev;
1215                 struct inet6_dev *idev;
1216
1217                 rcu_read_lock();
1218                 idev = __in6_dev_get(dev);
1219                 if (idev)
1220                         hoplimit = idev->cnf.hop_limit;
1221                 else
1222                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1223                 rcu_read_unlock();
1224         }
1225         return hoplimit;
1226 }
1227 EXPORT_SYMBOL(ip6_dst_hoplimit);
1228
1229 /*
1230  *
1231  */
1232
1233 int ip6_route_add(struct fib6_config *cfg)
1234 {
1235         int err;
1236         struct net *net = cfg->fc_nlinfo.nl_net;
1237         struct rt6_info *rt = NULL;
1238         struct net_device *dev = NULL;
1239         struct inet6_dev *idev = NULL;
1240         struct fib6_table *table;
1241         int addr_type;
1242
1243         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1244                 return -EINVAL;
1245 #ifndef CONFIG_IPV6_SUBTREES
1246         if (cfg->fc_src_len)
1247                 return -EINVAL;
1248 #endif
1249         if (cfg->fc_ifindex) {
1250                 err = -ENODEV;
1251                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1252                 if (!dev)
1253                         goto out;
1254                 idev = in6_dev_get(dev);
1255                 if (!idev)
1256                         goto out;
1257         }
1258
1259         if (cfg->fc_metric == 0)
1260                 cfg->fc_metric = IP6_RT_PRIO_USER;
1261
1262         err = -ENOBUFS;
1263         if (cfg->fc_nlinfo.nlh &&
1264             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1265                 table = fib6_get_table(net, cfg->fc_table);
1266                 if (!table) {
1267                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1268                         table = fib6_new_table(net, cfg->fc_table);
1269                 }
1270         } else {
1271                 table = fib6_new_table(net, cfg->fc_table);
1272         }
1273
1274         if (!table)
1275                 goto out;
1276
1277         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1278
1279         if (!rt) {
1280                 err = -ENOMEM;
1281                 goto out;
1282         }
1283
1284         rt->dst.obsolete = -1;
1285         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1286                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1287                                 0;
1288
1289         if (cfg->fc_protocol == RTPROT_UNSPEC)
1290                 cfg->fc_protocol = RTPROT_BOOT;
1291         rt->rt6i_protocol = cfg->fc_protocol;
1292
1293         addr_type = ipv6_addr_type(&cfg->fc_dst);
1294
1295         if (addr_type & IPV6_ADDR_MULTICAST)
1296                 rt->dst.input = ip6_mc_input;
1297         else if (cfg->fc_flags & RTF_LOCAL)
1298                 rt->dst.input = ip6_input;
1299         else
1300                 rt->dst.input = ip6_forward;
1301
1302         rt->dst.output = ip6_output;
1303
1304         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1305         rt->rt6i_dst.plen = cfg->fc_dst_len;
1306         if (rt->rt6i_dst.plen == 128)
1307                rt->dst.flags |= DST_HOST;
1308
1309         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1310                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1311                 if (!metrics) {
1312                         err = -ENOMEM;
1313                         goto out;
1314                 }
1315                 dst_init_metrics(&rt->dst, metrics, 0);
1316         }
1317 #ifdef CONFIG_IPV6_SUBTREES
1318         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1319         rt->rt6i_src.plen = cfg->fc_src_len;
1320 #endif
1321
1322         rt->rt6i_metric = cfg->fc_metric;
1323
1324         /* We cannot add true routes via loopback here,
1325            they would result in kernel looping; promote them to reject routes
1326          */
1327         if ((cfg->fc_flags & RTF_REJECT) ||
1328             (dev && (dev->flags & IFF_LOOPBACK) &&
1329              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1330              !(cfg->fc_flags & RTF_LOCAL))) {
1331                 /* hold loopback dev/idev if we haven't done so. */
1332                 if (dev != net->loopback_dev) {
1333                         if (dev) {
1334                                 dev_put(dev);
1335                                 in6_dev_put(idev);
1336                         }
1337                         dev = net->loopback_dev;
1338                         dev_hold(dev);
1339                         idev = in6_dev_get(dev);
1340                         if (!idev) {
1341                                 err = -ENODEV;
1342                                 goto out;
1343                         }
1344                 }
1345                 rt->dst.output = ip6_pkt_discard_out;
1346                 rt->dst.input = ip6_pkt_discard;
1347                 rt->dst.error = -ENETUNREACH;
1348                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1349                 goto install_route;
1350         }
1351
1352         if (cfg->fc_flags & RTF_GATEWAY) {
1353                 const struct in6_addr *gw_addr;
1354                 int gwa_type;
1355
1356                 gw_addr = &cfg->fc_gateway;
1357                 rt->rt6i_gateway = *gw_addr;
1358                 gwa_type = ipv6_addr_type(gw_addr);
1359
1360                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1361                         struct rt6_info *grt;
1362
1363                         /* IPv6 strictly inhibits using not link-local
1364                            addresses as nexthop address.
1365                            Otherwise, router will not able to send redirects.
1366                            It is very good, but in some (rare!) circumstances
1367                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1368                            some exceptions. --ANK
1369                          */
1370                         err = -EINVAL;
1371                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1372                                 goto out;
1373
1374                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1375
1376                         err = -EHOSTUNREACH;
1377                         if (!grt)
1378                                 goto out;
1379                         if (dev) {
1380                                 if (dev != grt->dst.dev) {
1381                                         dst_release(&grt->dst);
1382                                         goto out;
1383                                 }
1384                         } else {
1385                                 dev = grt->dst.dev;
1386                                 idev = grt->rt6i_idev;
1387                                 dev_hold(dev);
1388                                 in6_dev_hold(grt->rt6i_idev);
1389                         }
1390                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1391                                 err = 0;
1392                         dst_release(&grt->dst);
1393
1394                         if (err)
1395                                 goto out;
1396                 }
1397                 err = -EINVAL;
1398                 if (!dev || (dev->flags & IFF_LOOPBACK))
1399                         goto out;
1400         }
1401
1402         err = -ENODEV;
1403         if (!dev)
1404                 goto out;
1405
1406         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1407                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1408                         err = -EINVAL;
1409                         goto out;
1410                 }
1411                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1412                 rt->rt6i_prefsrc.plen = 128;
1413         } else
1414                 rt->rt6i_prefsrc.plen = 0;
1415
1416         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1417                 err = rt6_bind_neighbour(rt, dev);
1418                 if (err)
1419                         goto out;
1420         }
1421
1422         rt->rt6i_flags = cfg->fc_flags;
1423
1424 install_route:
1425         if (cfg->fc_mx) {
1426                 struct nlattr *nla;
1427                 int remaining;
1428
1429                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1430                         int type = nla_type(nla);
1431
1432                         if (type) {
1433                                 if (type > RTAX_MAX) {
1434                                         err = -EINVAL;
1435                                         goto out;
1436                                 }
1437
1438                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1439                         }
1440                 }
1441         }
1442
1443         rt->dst.dev = dev;
1444         rt->rt6i_idev = idev;
1445         rt->rt6i_table = table;
1446
1447         cfg->fc_nlinfo.nl_net = dev_net(dev);
1448
1449         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1450
1451 out:
1452         if (dev)
1453                 dev_put(dev);
1454         if (idev)
1455                 in6_dev_put(idev);
1456         if (rt)
1457                 dst_free(&rt->dst);
1458         return err;
1459 }
1460
1461 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1462 {
1463         int err;
1464         struct fib6_table *table;
1465         struct net *net = dev_net(rt->dst.dev);
1466
1467         if (rt == net->ipv6.ip6_null_entry)
1468                 return -ENOENT;
1469
1470         table = rt->rt6i_table;
1471         write_lock_bh(&table->tb6_lock);
1472
1473         err = fib6_del(rt, info);
1474         dst_release(&rt->dst);
1475
1476         write_unlock_bh(&table->tb6_lock);
1477
1478         return err;
1479 }
1480
1481 int ip6_del_rt(struct rt6_info *rt)
1482 {
1483         struct nl_info info = {
1484                 .nl_net = dev_net(rt->dst.dev),
1485         };
1486         return __ip6_del_rt(rt, &info);
1487 }
1488
1489 static int ip6_route_del(struct fib6_config *cfg)
1490 {
1491         struct fib6_table *table;
1492         struct fib6_node *fn;
1493         struct rt6_info *rt;
1494         int err = -ESRCH;
1495
1496         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1497         if (!table)
1498                 return err;
1499
1500         read_lock_bh(&table->tb6_lock);
1501
1502         fn = fib6_locate(&table->tb6_root,
1503                          &cfg->fc_dst, cfg->fc_dst_len,
1504                          &cfg->fc_src, cfg->fc_src_len);
1505
1506         if (fn) {
1507                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1508                         if (cfg->fc_ifindex &&
1509                             (!rt->dst.dev ||
1510                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1511                                 continue;
1512                         if (cfg->fc_flags & RTF_GATEWAY &&
1513                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1514                                 continue;
1515                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1516                                 continue;
1517                         dst_hold(&rt->dst);
1518                         read_unlock_bh(&table->tb6_lock);
1519
1520                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1521                 }
1522         }
1523         read_unlock_bh(&table->tb6_lock);
1524
1525         return err;
1526 }
1527
1528 /*
1529  *      Handle redirects
1530  */
/*
 * Lookup key used for redirect handling: a flowi6 extended with the
 * gateway address that candidate routes are compared against.
 *
 * fl6 has to stay the first member: ip6_route_redirect() passes &rdfl.fl6
 * to the lookup, and __ip6_route_redirect() casts that struct flowi6
 * pointer back to a struct ip6rd_flowi pointer.
 */
1531 struct ip6rd_flowi {
1532         struct flowi6 fl6;
1533         struct in6_addr gateway;
1534 };
1535
1536 static struct rt6_info *__ip6_route_redirect(struct net *net,
1537                                              struct fib6_table *table,
1538                                              struct flowi6 *fl6,
1539                                              int flags)
1540 {
1541         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1542         struct rt6_info *rt;
1543         struct fib6_node *fn;
1544
1545         /*
1546          * Get the "current" route for this destination and
1547          * check if the redirect has come from approriate router.
1548          *
1549          * RFC 2461 specifies that redirects should only be
1550          * accepted if they come from the nexthop to the target.
1551          * Due to the way the routes are chosen, this notion
1552          * is a bit fuzzy and one might need to check all possible
1553          * routes.
1554          */
1555
1556         read_lock_bh(&table->tb6_lock);
1557         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1558 restart:
1559         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1560                 /*
1561                  * Current route is on-link; redirect is always invalid.
1562                  *
1563                  * Seems, previous statement is not true. It could
1564                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1565                  * But then router serving it might decide, that we should
1566                  * know truth 8)8) --ANK (980726).
1567                  */
1568                 if (rt6_check_expired(rt))
1569                         continue;
1570                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1571                         continue;
1572                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1573                         continue;
1574                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1575                         continue;
1576                 break;
1577         }
1578
1579         if (!rt)
1580                 rt = net->ipv6.ip6_null_entry;
1581         BACKTRACK(net, &fl6->saddr);
1582 out:
1583         dst_hold(&rt->dst);
1584
1585         read_unlock_bh(&table->tb6_lock);
1586
1587         return rt;
1588 };
1589
1590 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1591                                            const struct in6_addr *src,
1592                                            const struct in6_addr *gateway,
1593                                            struct net_device *dev)
1594 {
1595         int flags = RT6_LOOKUP_F_HAS_SADDR;
1596         struct net *net = dev_net(dev);
1597         struct ip6rd_flowi rdfl = {
1598                 .fl6 = {
1599                         .flowi6_oif = dev->ifindex,
1600                         .daddr = *dest,
1601                         .saddr = *src,
1602                 },
1603         };
1604
1605         rdfl.gateway = *gateway;
1606
1607         if (rt6_need_strict(dest))
1608                 flags |= RT6_LOOKUP_F_IFACE;
1609
1610         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1611                                                    flags, __ip6_route_redirect);
1612 }
1613
1614 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1615                   const struct in6_addr *saddr,
1616                   struct neighbour *neigh, u8 *lladdr, int on_link)
1617 {
1618         struct rt6_info *rt, *nrt = NULL;
1619         struct netevent_redirect netevent;
1620         struct net *net = dev_net(neigh->dev);
1621
1622         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1623
1624         if (rt == net->ipv6.ip6_null_entry) {
1625                 if (net_ratelimit())
1626                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1627                                "for redirect target\n");
1628                 goto out;
1629         }
1630
1631         /*
1632          *      We have finally decided to accept it.
1633          */
1634
1635         neigh_update(neigh, lladdr, NUD_STALE,
1636                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1637                      NEIGH_UPDATE_F_OVERRIDE|
1638                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1639                                      NEIGH_UPDATE_F_ISROUTER))
1640                      );
1641
1642         /*
1643          * Redirect received -> path was valid.
1644          * Look, redirects are sent only in response to data packets,
1645          * so that this nexthop apparently is reachable. --ANK
1646          */
1647         dst_confirm(&rt->dst);
1648
1649         /* Duplicate redirect: silently ignore. */
1650         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1651                 goto out;
1652
1653         nrt = ip6_rt_copy(rt, dest);
1654         if (!nrt)
1655                 goto out;
1656
1657         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1658         if (on_link)
1659                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1660
1661         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1662         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1663
1664         if (ip6_ins_rt(nrt))
1665                 goto out;
1666
1667         netevent.old = &rt->dst;
1668         netevent.new = &nrt->dst;
1669         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1670
1671         if (rt->rt6i_flags & RTF_CACHE) {
1672                 ip6_del_rt(rt);
1673                 return;
1674         }
1675
1676 out:
1677         dst_release(&rt->dst);
1678 }
1679
1680 /*
1681  *      Handle ICMP "packet too big" messages
1682  *      i.e. Path MTU discovery
1683  */
1684
1685 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1686                              struct net *net, u32 pmtu, int ifindex)
1687 {
1688         struct rt6_info *rt, *nrt;
1689         int allfrag = 0;
1690 again:
1691         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1692         if (!rt)
1693                 return;
1694
1695         if (rt6_check_expired(rt)) {
1696                 ip6_del_rt(rt);
1697                 goto again;
1698         }
1699
1700         if (pmtu >= dst_mtu(&rt->dst))
1701                 goto out;
1702
1703         if (pmtu < IPV6_MIN_MTU) {
1704                 /*
1705                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1706                  * MTU (1280) and a fragment header should always be included
1707                  * after a node receiving Too Big message reporting PMTU is
1708                  * less than the IPv6 Minimum Link MTU.
1709                  */
1710                 pmtu = IPV6_MIN_MTU;
1711                 allfrag = 1;
1712         }
1713
1714         /* New mtu received -> path was valid.
1715            They are sent only in response to data packets,
1716            so that this nexthop apparently is reachable. --ANK
1717          */
1718         dst_confirm(&rt->dst);
1719
1720         /* Host route. If it is static, it would be better
1721            not to override it, but add new one, so that
1722            when cache entry will expire old pmtu
1723            would return automatically.
1724          */
1725         if (rt->rt6i_flags & RTF_CACHE) {
1726                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1727                 if (allfrag) {
1728                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1729                         features |= RTAX_FEATURE_ALLFRAG;
1730                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1731                 }
1732                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1733                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1734                 goto out;
1735         }
1736
1737         /* Network route.
1738            Two cases are possible:
1739            1. It is connected route. Action: COW
1740            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1741          */
1742         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1743                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1744         else
1745                 nrt = rt6_alloc_clone(rt, daddr);
1746
1747         if (nrt) {
1748                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1749                 if (allfrag) {
1750                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1751                         features |= RTAX_FEATURE_ALLFRAG;
1752                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1753                 }
1754
1755                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1756                  * happened within 5 mins, the recommended timer is 10 mins.
1757                  * Here this route expiration time is set to ip6_rt_mtu_expires
1758                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1759                  * and detecting PMTU increase will be automatically happened.
1760                  */
1761                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1762                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1763
1764                 ip6_ins_rt(nrt);
1765         }
1766 out:
1767         dst_release(&rt->dst);
1768 }
1769
1770 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1771                         struct net_device *dev, u32 pmtu)
1772 {
1773         struct net *net = dev_net(dev);
1774
1775         /*
1776          * RFC 1981 states that a node "MUST reduce the size of the packets it
1777          * is sending along the path" that caused the Packet Too Big message.
1778          * Since it's not possible in the general case to determine which
1779          * interface was used to send the original packet, we update the MTU
1780          * on the interface that will be used to send future packets. We also
1781          * update the MTU on the interface that received the Packet Too Big in
1782          * case the original packet was forced out that interface with
1783          * SO_BINDTODEVICE or similar. This is the next best thing to the
1784          * correct behaviour, which would be to update the MTU on all
1785          * interfaces.
1786          */
1787         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1788         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1789 }
1790
1791 /*
1792  *      Misc support functions
1793  */
1794
1795 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1796                                     const struct in6_addr *dest)
1797 {
1798         struct net *net = dev_net(ort->dst.dev);
1799         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1800                                             ort->dst.dev, 0);
1801
1802         if (rt) {
1803                 rt->dst.input = ort->dst.input;
1804                 rt->dst.output = ort->dst.output;
1805                 rt->dst.flags |= DST_HOST;
1806
1807                 rt->rt6i_dst.addr = *dest;
1808                 rt->rt6i_dst.plen = 128;
1809                 dst_copy_metrics(&rt->dst, &ort->dst);
1810                 rt->dst.error = ort->dst.error;
1811                 rt->rt6i_idev = ort->rt6i_idev;
1812                 if (rt->rt6i_idev)
1813                         in6_dev_hold(rt->rt6i_idev);
1814                 rt->dst.lastuse = jiffies;
1815                 rt->dst.expires = 0;
1816
1817                 rt->rt6i_gateway = ort->rt6i_gateway;
1818                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1819                 rt->rt6i_metric = 0;
1820
1821 #ifdef CONFIG_IPV6_SUBTREES
1822                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1823 #endif
1824                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1825                 rt->rt6i_table = ort->rt6i_table;
1826         }
1827         return rt;
1828 }
1829
1830 #ifdef CONFIG_IPV6_ROUTE_INFO
1831 static struct rt6_info *rt6_get_route_info(struct net *net,
1832                                            const struct in6_addr *prefix, int prefixlen,
1833                                            const struct in6_addr *gwaddr, int ifindex)
1834 {
1835         struct fib6_node *fn;
1836         struct rt6_info *rt = NULL;
1837         struct fib6_table *table;
1838
1839         table = fib6_get_table(net, RT6_TABLE_INFO);
1840         if (!table)
1841                 return NULL;
1842
1843         write_lock_bh(&table->tb6_lock);
1844         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1845         if (!fn)
1846                 goto out;
1847
1848         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1849                 if (rt->dst.dev->ifindex != ifindex)
1850                         continue;
1851                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1852                         continue;
1853                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1854                         continue;
1855                 dst_hold(&rt->dst);
1856                 break;
1857         }
1858 out:
1859         write_unlock_bh(&table->tb6_lock);
1860         return rt;
1861 }
1862
1863 static struct rt6_info *rt6_add_route_info(struct net *net,
1864                                            const struct in6_addr *prefix, int prefixlen,
1865                                            const struct in6_addr *gwaddr, int ifindex,
1866                                            unsigned pref)
1867 {
1868         struct fib6_config cfg = {
1869                 .fc_table       = RT6_TABLE_INFO,
1870                 .fc_metric      = IP6_RT_PRIO_USER,
1871                 .fc_ifindex     = ifindex,
1872                 .fc_dst_len     = prefixlen,
1873                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1874                                   RTF_UP | RTF_PREF(pref),
1875                 .fc_nlinfo.pid = 0,
1876                 .fc_nlinfo.nlh = NULL,
1877                 .fc_nlinfo.nl_net = net,
1878         };
1879
1880         cfg.fc_dst = *prefix;
1881         cfg.fc_gateway = *gwaddr;
1882
1883         /* We should treat it as a default route if prefix length is 0. */
1884         if (!prefixlen)
1885                 cfg.fc_flags |= RTF_DEFAULT;
1886
1887         ip6_route_add(&cfg);
1888
1889         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1890 }
1891 #endif
1892
1893 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1894 {
1895         struct rt6_info *rt;
1896         struct fib6_table *table;
1897
1898         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1899         if (!table)
1900                 return NULL;
1901
1902         write_lock_bh(&table->tb6_lock);
1903         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1904                 if (dev == rt->dst.dev &&
1905                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1906                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1907                         break;
1908         }
1909         if (rt)
1910                 dst_hold(&rt->dst);
1911         write_unlock_bh(&table->tb6_lock);
1912         return rt;
1913 }
1914
1915 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1916                                      struct net_device *dev,
1917                                      unsigned int pref)
1918 {
1919         struct fib6_config cfg = {
1920                 .fc_table       = RT6_TABLE_DFLT,
1921                 .fc_metric      = IP6_RT_PRIO_USER,
1922                 .fc_ifindex     = dev->ifindex,
1923                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1924                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1925                 .fc_nlinfo.pid = 0,
1926                 .fc_nlinfo.nlh = NULL,
1927                 .fc_nlinfo.nl_net = dev_net(dev),
1928         };
1929
1930         cfg.fc_gateway = *gwaddr;
1931
1932         ip6_route_add(&cfg);
1933
1934         return rt6_get_dflt_router(gwaddr, dev);
1935 }
1936
1937 void rt6_purge_dflt_routers(struct net *net)
1938 {
1939         struct rt6_info *rt;
1940         struct fib6_table *table;
1941
1942         /* NOTE: Keep consistent with rt6_get_dflt_router */
1943         table = fib6_get_table(net, RT6_TABLE_DFLT);
1944         if (!table)
1945                 return;
1946
1947 restart:
1948         read_lock_bh(&table->tb6_lock);
1949         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1950                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1951                         dst_hold(&rt->dst);
1952                         read_unlock_bh(&table->tb6_lock);
1953                         ip6_del_rt(rt);
1954                         goto restart;
1955                 }
1956         }
1957         read_unlock_bh(&table->tb6_lock);
1958 }
1959
1960 static void rtmsg_to_fib6_config(struct net *net,
1961                                  struct in6_rtmsg *rtmsg,
1962                                  struct fib6_config *cfg)
1963 {
1964         memset(cfg, 0, sizeof(*cfg));
1965
1966         cfg->fc_table = RT6_TABLE_MAIN;
1967         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1968         cfg->fc_metric = rtmsg->rtmsg_metric;
1969         cfg->fc_expires = rtmsg->rtmsg_info;
1970         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1971         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1972         cfg->fc_flags = rtmsg->rtmsg_flags;
1973
1974         cfg->fc_nlinfo.nl_net = net;
1975
1976         cfg->fc_dst = rtmsg->rtmsg_dst;
1977         cfg->fc_src = rtmsg->rtmsg_src;
1978         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1979 }
1980
1981 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1982 {
1983         struct fib6_config cfg;
1984         struct in6_rtmsg rtmsg;
1985         int err;
1986
1987         switch(cmd) {
1988         case SIOCADDRT:         /* Add a route */
1989         case SIOCDELRT:         /* Delete a route */
1990                 if (!capable(CAP_NET_ADMIN))
1991                         return -EPERM;
1992                 err = copy_from_user(&rtmsg, arg,
1993                                      sizeof(struct in6_rtmsg));
1994                 if (err)
1995                         return -EFAULT;
1996
1997                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1998
1999                 rtnl_lock();
2000                 switch (cmd) {
2001                 case SIOCADDRT:
2002                         err = ip6_route_add(&cfg);
2003                         break;
2004                 case SIOCDELRT:
2005                         err = ip6_route_del(&cfg);
2006                         break;
2007                 default:
2008                         err = -EINVAL;
2009                 }
2010                 rtnl_unlock();
2011
2012                 return err;
2013         }
2014
2015         return -EINVAL;
2016 }
2017
2018 /*
2019  *      Drop the packet on the floor
2020  */
2021
2022 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2023 {
2024         int type;
2025         struct dst_entry *dst = skb_dst(skb);
2026         switch (ipstats_mib_noroutes) {
2027         case IPSTATS_MIB_INNOROUTES:
2028                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2029                 if (type == IPV6_ADDR_ANY) {
2030                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2031                                       IPSTATS_MIB_INADDRERRORS);
2032                         break;
2033                 }
2034                 /* FALLTHROUGH */
2035         case IPSTATS_MIB_OUTNOROUTES:
2036                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2037                               ipstats_mib_noroutes);
2038                 break;
2039         }
2040         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2041         kfree_skb(skb);
2042         return 0;
2043 }
2044
2045 static int ip6_pkt_discard(struct sk_buff *skb)
2046 {
2047         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2048 }
2049
2050 static int ip6_pkt_discard_out(struct sk_buff *skb)
2051 {
2052         skb->dev = skb_dst(skb)->dev;
2053         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2054 }
2055
2056 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2057
2058 static int ip6_pkt_prohibit(struct sk_buff *skb)
2059 {
2060         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2061 }
2062
2063 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2064 {
2065         skb->dev = skb_dst(skb)->dev;
2066         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2067 }
2068
2069 #endif
2070
2071 /*
2072  *      Allocate a dst for local (unicast / anycast) address.
2073  */
2074
2075 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2076                                     const struct in6_addr *addr,
2077                                     bool anycast)
2078 {
2079         struct net *net = dev_net(idev->dev);
2080         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2081                                             net->loopback_dev, 0);
2082         int err;
2083
2084         if (!rt) {
2085                 if (net_ratelimit())
2086                         pr_warning("IPv6:  Maximum number of routes reached,"
2087                                    " consider increasing route/max_size.\n");
2088                 return ERR_PTR(-ENOMEM);
2089         }
2090
2091         in6_dev_hold(idev);
2092
2093         rt->dst.flags |= DST_HOST;
2094         rt->dst.input = ip6_input;
2095         rt->dst.output = ip6_output;
2096         rt->rt6i_idev = idev;
2097         rt->dst.obsolete = -1;
2098
2099         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2100         if (anycast)
2101                 rt->rt6i_flags |= RTF_ANYCAST;
2102         else
2103                 rt->rt6i_flags |= RTF_LOCAL;
2104         err = rt6_bind_neighbour(rt, rt->dst.dev);
2105         if (err) {
2106                 dst_free(&rt->dst);
2107                 return ERR_PTR(err);
2108         }
2109
2110         rt->rt6i_dst.addr = *addr;
2111         rt->rt6i_dst.plen = 128;
2112         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2113
2114         atomic_set(&rt->dst.__refcnt, 1);
2115
2116         return rt;
2117 }
2118
2119 int ip6_route_get_saddr(struct net *net,
2120                         struct rt6_info *rt,
2121                         const struct in6_addr *daddr,
2122                         unsigned int prefs,
2123                         struct in6_addr *saddr)
2124 {
2125         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2126         int err = 0;
2127         if (rt->rt6i_prefsrc.plen)
2128                 *saddr = rt->rt6i_prefsrc.addr;
2129         else
2130                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2131                                          daddr, prefs, saddr);
2132         return err;
2133 }
2134
2135 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
        struct net_device *dev;         /* only routes on this device; NULL matches any */
        struct net *net;                /* namespace whose ip6_null_entry is skipped */
        struct in6_addr *addr;          /* address compared against each route's prefsrc */
};
2141
2142 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2143 {
2144         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2145         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2146         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2147
2148         if (((void *)rt->dst.dev == dev || !dev) &&
2149             rt != net->ipv6.ip6_null_entry &&
2150             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2151                 /* remove prefsrc entry */
2152                 rt->rt6i_prefsrc.plen = 0;
2153         }
2154         return 0;
2155 }
2156
2157 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2158 {
2159         struct net *net = dev_net(ifp->idev->dev);
2160         struct arg_dev_net_ip adni = {
2161                 .dev = ifp->idev->dev,
2162                 .net = net,
2163                 .addr = &ifp->addr,
2164         };
2165         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2166 }
2167
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device to match; NULL matches every route */
        struct net *net;                /* namespace whose ip6_null_entry is spared */
};
2172
2173 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2174 {
2175         const struct arg_dev_net *adn = arg;
2176         const struct net_device *dev = adn->dev;
2177
2178         if ((rt->dst.dev == dev || !dev) &&
2179             rt != adn->net->ipv6.ip6_null_entry)
2180                 return -1;
2181
2182         return 0;
2183 }
2184
2185 void rt6_ifdown(struct net *net, struct net_device *dev)
2186 {
2187         struct arg_dev_net adn = {
2188                 .dev = dev,
2189                 .net = net,
2190         };
2191
2192         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2193         icmp6_clean_all(fib6_ifdown, &adn);
2194 }
2195
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* the new MTU value */
};
2201
2202 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2203 {
2204         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2205         struct inet6_dev *idev;
2206
2207         /* In IPv6 pmtu discovery is not optional,
2208            so that RTAX_MTU lock cannot disable it.
2209            We still use this lock to block changes
2210            caused by addrconf/ndisc.
2211         */
2212
2213         idev = __in6_dev_get(arg->dev);
2214         if (!idev)
2215                 return 0;
2216
2217         /* For administrative MTU increase, there is no way to discover
2218            IPv6 PMTU increase, so PMTU increase should be updated here.
2219            Since RFC 1981 doesn't include administrative MTU increase
2220            update PMTU increase is a MUST. (i.e. jumbo frame)
2221          */
2222         /*
2223            If new MTU is less than route PMTU, this new MTU will be the
2224            lowest MTU in the path, update the route PMTU to reflect PMTU
2225            decreases; if new MTU is greater than route PMTU, and the
2226            old MTU is the lowest MTU in the path, update the route PMTU
2227            to reflect the increase. In this case if the other nodes' MTU
2228            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2229            PMTU discouvery.
2230          */
2231         if (rt->dst.dev == arg->dev &&
2232             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2233             (dst_mtu(&rt->dst) >= arg->mtu ||
2234              (dst_mtu(&rt->dst) < arg->mtu &&
2235               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2236                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2237         }
2238         return 0;
2239 }
2240
2241 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2242 {
2243         struct rt6_mtu_change_arg arg = {
2244                 .dev = dev,
2245                 .mtu = mtu,
2246         };
2247
2248         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2249 }
2250
2251 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2252         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2253         [RTA_OIF]               = { .type = NLA_U32 },
2254         [RTA_IIF]               = { .type = NLA_U32 },
2255         [RTA_PRIORITY]          = { .type = NLA_U32 },
2256         [RTA_METRICS]           = { .type = NLA_NESTED },
2257 };
2258
2259 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2260                               struct fib6_config *cfg)
2261 {
2262         struct rtmsg *rtm;
2263         struct nlattr *tb[RTA_MAX+1];
2264         int err;
2265
2266         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2267         if (err < 0)
2268                 goto errout;
2269
2270         err = -EINVAL;
2271         rtm = nlmsg_data(nlh);
2272         memset(cfg, 0, sizeof(*cfg));
2273
2274         cfg->fc_table = rtm->rtm_table;
2275         cfg->fc_dst_len = rtm->rtm_dst_len;
2276         cfg->fc_src_len = rtm->rtm_src_len;
2277         cfg->fc_flags = RTF_UP;
2278         cfg->fc_protocol = rtm->rtm_protocol;
2279
2280         if (rtm->rtm_type == RTN_UNREACHABLE)
2281                 cfg->fc_flags |= RTF_REJECT;
2282
2283         if (rtm->rtm_type == RTN_LOCAL)
2284                 cfg->fc_flags |= RTF_LOCAL;
2285
2286         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2287         cfg->fc_nlinfo.nlh = nlh;
2288         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2289
2290         if (tb[RTA_GATEWAY]) {
2291                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2292                 cfg->fc_flags |= RTF_GATEWAY;
2293         }
2294
2295         if (tb[RTA_DST]) {
2296                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2297
2298                 if (nla_len(tb[RTA_DST]) < plen)
2299                         goto errout;
2300
2301                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2302         }
2303
2304         if (tb[RTA_SRC]) {
2305                 int plen = (rtm->rtm_src_len + 7) >> 3;
2306
2307                 if (nla_len(tb[RTA_SRC]) < plen)
2308                         goto errout;
2309
2310                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2311         }
2312
2313         if (tb[RTA_PREFSRC])
2314                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2315
2316         if (tb[RTA_OIF])
2317                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2318
2319         if (tb[RTA_PRIORITY])
2320                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2321
2322         if (tb[RTA_METRICS]) {
2323                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2324                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2325         }
2326
2327         if (tb[RTA_TABLE])
2328                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2329
2330         err = 0;
2331 errout:
2332         return err;
2333 }
2334
2335 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2336 {
2337         struct fib6_config cfg;
2338         int err;
2339
2340         err = rtm_to_fib6_config(skb, nlh, &cfg);
2341         if (err < 0)
2342                 return err;
2343
2344         return ip6_route_del(&cfg);
2345 }
2346
2347 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2348 {
2349         struct fib6_config cfg;
2350         int err;
2351
2352         err = rtm_to_fib6_config(skb, nlh, &cfg);
2353         if (err < 0)
2354                 return err;
2355
2356         return ip6_route_add(&cfg);
2357 }
2358
2359 static inline size_t rt6_nlmsg_size(void)
2360 {
2361         return NLMSG_ALIGN(sizeof(struct rtmsg))
2362                + nla_total_size(16) /* RTA_SRC */
2363                + nla_total_size(16) /* RTA_DST */
2364                + nla_total_size(16) /* RTA_GATEWAY */
2365                + nla_total_size(16) /* RTA_PREFSRC */
2366                + nla_total_size(4) /* RTA_TABLE */
2367                + nla_total_size(4) /* RTA_IIF */
2368                + nla_total_size(4) /* RTA_OIF */
2369                + nla_total_size(4) /* RTA_PRIORITY */
2370                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2371                + nla_total_size(sizeof(struct rta_cacheinfo));
2372 }
2373
/*
 * Append one rtnetlink route message describing @rt to @skb.
 *
 * @dst/@src: when non-NULL they are reported as RTA_DST/RTA_SRC with a
 *            prefix length of 128 instead of the route's own keys
 *            (RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif:      non-zero: reported as RTA_IIF, or, for a multicast
 *            destination with CONFIG_IPV6_MROUTE, handed to
 *            ip6mr_get_route().
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the header.
 * @prefix:   non-zero: only routes with RTF_PREFIX_RT are emitted.
 * @nowait:   forwarded to ip6mr_get_route() and used to interpret its
 *            result.
 *
 * Returns 1 if the route was skipped because of @prefix, the result of
 * nlmsg_end() on success, 0 if ip6mr_get_route() returned 0 with
 * !@nowait, or -EMSGSIZE if the message did not fit (any partially
 * built message is cancelled, except when nlmsg_put() itself failed).
 *
 * Note: the NLA_PUT*() macros jump to nla_put_failure on error.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
{
        const struct inet_peer *peer;
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;
        struct neighbour *n;
        u32 ts, tsage;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Fixed rtmsg header. */
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id is reported both in the header and as RTA_TABLE. */
        rtm->rtm_table = table;
        NLA_PUT_U32(skb, RTA_TABLE, table);
        /* Route type: reject > local > loopback device > plain unicast. */
        if (rt->rt6i_flags & RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* Start from the stored protocol; certain route flags override it. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags & RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* Destination: the caller's exact address, else the route's prefix. */
        if (dst) {
                NLA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                NLA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        /* with nowait only -EMSGSIZE is fatal */
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, iif);
        } else if (dst) {
                /* No input interface: report the source address that would be used. */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        /* The route's configured preferred source, if any. */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;

        /*
         * The neighbour is obtained without taking a reference, so it is
         * only used inside the RCU read-side section.  nla_put() is called
         * directly (not NLA_PUT) so the failure path can drop the RCU lock
         * before jumping out.
         */
        rcu_read_lock();
        n = dst_get_neighbour_noref(&rt->dst);
        if (n) {
                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
                        rcu_read_unlock();
                        goto nla_put_failure;
                }
        }
        rcu_read_unlock();

        if (rt->dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);

        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

        /* Remaining lifetime in jiffies, 0 if not expiring, capped at INT_MAX. */
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                expires = 0;
        else if (rt->dst.expires - jiffies < INT_MAX)
                expires = rt->dst.expires - jiffies;
        else
                expires = INT_MAX;

        /* TCP timestamp info from the inet_peer, when one is attached. */
        peer = rt->rt6i_peer;
        ts = tsage = 0;
        if (peer && peer->tcp_ts_stamp) {
                ts = peer->tcp_ts;
                tsage = get_seconds() - peer->tcp_ts_stamp;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
                               expires, rt->dst.error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Roll back the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2514
2515 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2516 {
2517         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2518         int prefix;
2519
2520         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2521                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2522                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2523         } else
2524                 prefix = 0;
2525
2526         return rt6_fill_node(arg->net,
2527                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2528                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2529                      prefix, 0, NLM_F_MULTI);
2530 }
2531
2532 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2533 {
2534         struct net *net = sock_net(in_skb->sk);
2535         struct nlattr *tb[RTA_MAX+1];
2536         struct rt6_info *rt;
2537         struct sk_buff *skb;
2538         struct rtmsg *rtm;
2539         struct flowi6 fl6;
2540         int err, iif = 0;
2541
2542         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2543         if (err < 0)
2544                 goto errout;
2545
2546         err = -EINVAL;
2547         memset(&fl6, 0, sizeof(fl6));
2548
2549         if (tb[RTA_SRC]) {
2550                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2551                         goto errout;
2552
2553                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2554         }
2555
2556         if (tb[RTA_DST]) {
2557                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2558                         goto errout;
2559
2560                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2561         }
2562
2563         if (tb[RTA_IIF])
2564                 iif = nla_get_u32(tb[RTA_IIF]);
2565
2566         if (tb[RTA_OIF])
2567                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2568
2569         if (iif) {
2570                 struct net_device *dev;
2571                 dev = __dev_get_by_index(net, iif);
2572                 if (!dev) {
2573                         err = -ENODEV;
2574                         goto errout;
2575                 }
2576         }
2577
2578         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2579         if (!skb) {
2580                 err = -ENOBUFS;
2581                 goto errout;
2582         }
2583
2584         /* Reserve room for dummy headers, this skb can pass
2585            through good chunk of routing engine.
2586          */
2587         skb_reset_mac_header(skb);
2588         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2589
2590         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2591         skb_dst_set(skb, &rt->dst);
2592
2593         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2594                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2595                             nlh->nlmsg_seq, 0, 0, 0);
2596         if (err < 0) {
2597                 kfree_skb(skb);
2598                 goto errout;
2599         }
2600
2601         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2602 errout:
2603         return err;
2604 }
2605
2606 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2607 {
2608         struct sk_buff *skb;
2609         struct net *net = info->nl_net;
2610         u32 seq;
2611         int err;
2612
2613         err = -ENOBUFS;
2614         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2615
2616         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2617         if (!skb)
2618                 goto errout;
2619
2620         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2621                                 event, info->pid, seq, 0, 0, 0);
2622         if (err < 0) {
2623                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2624                 WARN_ON(err == -EMSGSIZE);
2625                 kfree_skb(skb);
2626                 goto errout;
2627         }
2628         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2629                     info->nlh, gfp_any());
2630         return;
2631 errout:
2632         if (err < 0)
2633                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2634 }
2635
2636 static int ip6_route_dev_notify(struct notifier_block *this,
2637                                 unsigned long event, void *data)
2638 {
2639         struct net_device *dev = (struct net_device *)data;
2640         struct net *net = dev_net(dev);
2641
2642         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2643                 net->ipv6.ip6_null_entry->dst.dev = dev;
2644                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2645 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2646                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2647                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2648                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2649                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2650 #endif
2651         }
2652
2653         return NOTIFY_OK;
2654 }
2655
2656 /*
2657  *      /proc
2658  */
2659
2660 #ifdef CONFIG_PROC_FS
2661
/*
 *	Bookkeeping for writing route entries into a caller-supplied
 *	buffer.  NOTE(review): nothing in the /proc code that follows
 *	references this structure (it works on a seq_file) -- confirm
 *	whether it still has users before relying on the field notes.
 */
struct rt6_proc_arg {
	char *buffer;	/* presumably the destination buffer */
	int offset;	/* presumably the requested start offset */
	int length;	/* presumably the capacity of the buffer */
	int skip;	/* presumably the amount of output to skip */
	int len;	/* presumably the amount written so far */
};
2670
2671 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2672 {
2673         struct seq_file *m = p_arg;
2674         struct neighbour *n;
2675
2676         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2677
2678 #ifdef CONFIG_IPV6_SUBTREES
2679         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2680 #else
2681         seq_puts(m, "00000000000000000000000000000000 00 ");
2682 #endif
2683         rcu_read_lock();
2684         n = dst_get_neighbour_noref(&rt->dst);
2685         if (n) {
2686                 seq_printf(m, "%pi6", n->primary_key);
2687         } else {
2688                 seq_puts(m, "00000000000000000000000000000000");
2689         }
2690         rcu_read_unlock();
2691         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2692                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2693                    rt->dst.__use, rt->rt6i_flags,
2694                    rt->dst.dev ? rt->dst.dev->name : "");
2695         return 0;
2696 }
2697
2698 static int ipv6_route_show(struct seq_file *m, void *v)
2699 {
2700         struct net *net = (struct net *)m->private;
2701         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2702         return 0;
2703 }
2704
2705 static int ipv6_route_open(struct inode *inode, struct file *file)
2706 {
2707         return single_open_net(inode, file, ipv6_route_show);
2708 }
2709
2710 static const struct file_operations ipv6_route_proc_fops = {
2711         .owner          = THIS_MODULE,
2712         .open           = ipv6_route_open,
2713         .read           = seq_read,
2714         .llseek         = seq_lseek,
2715         .release        = single_release_net,
2716 };
2717
2718 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2719 {
2720         struct net *net = (struct net *)seq->private;
2721         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2722                    net->ipv6.rt6_stats->fib_nodes,
2723                    net->ipv6.rt6_stats->fib_route_nodes,
2724                    net->ipv6.rt6_stats->fib_rt_alloc,
2725                    net->ipv6.rt6_stats->fib_rt_entries,
2726                    net->ipv6.rt6_stats->fib_rt_cache,
2727                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2728                    net->ipv6.rt6_stats->fib_discarded_routes);
2729
2730         return 0;
2731 }
2732
2733 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2734 {
2735         return single_open_net(inode, file, rt6_stats_seq_show);
2736 }
2737
2738 static const struct file_operations rt6_stats_seq_fops = {
2739         .owner   = THIS_MODULE,
2740         .open    = rt6_stats_seq_open,
2741         .read    = seq_read,
2742         .llseek  = seq_lseek,
2743         .release = single_release_net,
2744 };
2745 #endif  /* CONFIG_PROC_FS */
2746
2747 #ifdef CONFIG_SYSCTL
2748
2749 static
2750 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2751                               void __user *buffer, size_t *lenp, loff_t *ppos)
2752 {
2753         struct net *net;
2754         int delay;
2755         if (!write)
2756                 return -EINVAL;
2757
2758         net = (struct net *)ctl->extra1;
2759         delay = net->ipv6.sysctl.flush_delay;
2760         proc_dointvec(ctl, write, buffer, lenp, ppos);
2761         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2762         return 0;
2763 }
2764
2765 ctl_table ipv6_route_table_template[] = {
2766         {
2767                 .procname       =       "flush",
2768                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2769                 .maxlen         =       sizeof(int),
2770                 .mode           =       0200,
2771                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2772         },
2773         {
2774                 .procname       =       "gc_thresh",
2775                 .data           =       &ip6_dst_ops_template.gc_thresh,
2776                 .maxlen         =       sizeof(int),
2777                 .mode           =       0644,
2778                 .proc_handler   =       proc_dointvec,
2779         },
2780         {
2781                 .procname       =       "max_size",
2782                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2783                 .maxlen         =       sizeof(int),
2784                 .mode           =       0644,
2785                 .proc_handler   =       proc_dointvec,
2786         },
2787         {
2788                 .procname       =       "gc_min_interval",
2789                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2790                 .maxlen         =       sizeof(int),
2791                 .mode           =       0644,
2792                 .proc_handler   =       proc_dointvec_jiffies,
2793         },
2794         {
2795                 .procname       =       "gc_timeout",
2796                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2797                 .maxlen         =       sizeof(int),
2798                 .mode           =       0644,
2799                 .proc_handler   =       proc_dointvec_jiffies,
2800         },
2801         {
2802                 .procname       =       "gc_interval",
2803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2804                 .maxlen         =       sizeof(int),
2805                 .mode           =       0644,
2806                 .proc_handler   =       proc_dointvec_jiffies,
2807         },
2808         {
2809                 .procname       =       "gc_elasticity",
2810                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2811                 .maxlen         =       sizeof(int),
2812                 .mode           =       0644,
2813                 .proc_handler   =       proc_dointvec,
2814         },
2815         {
2816                 .procname       =       "mtu_expires",
2817                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2818                 .maxlen         =       sizeof(int),
2819                 .mode           =       0644,
2820                 .proc_handler   =       proc_dointvec_jiffies,
2821         },
2822         {
2823                 .procname       =       "min_adv_mss",
2824                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2825                 .maxlen         =       sizeof(int),
2826                 .mode           =       0644,
2827                 .proc_handler   =       proc_dointvec,
2828         },
2829         {
2830                 .procname       =       "gc_min_interval_ms",
2831                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2832                 .maxlen         =       sizeof(int),
2833                 .mode           =       0644,
2834                 .proc_handler   =       proc_dointvec_ms_jiffies,
2835         },
2836         { }
2837 };
2838
2839 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2840 {
2841         struct ctl_table *table;
2842
2843         table = kmemdup(ipv6_route_table_template,
2844                         sizeof(ipv6_route_table_template),
2845                         GFP_KERNEL);
2846
2847         if (table) {
2848                 table[0].data = &net->ipv6.sysctl.flush_delay;
2849                 table[0].extra1 = net;
2850                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2851                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2852                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2853                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2854                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2855                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2856                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2857                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2858                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2859         }
2860
2861         return table;
2862 }
2863 #endif
2864
2865 static int __net_init ip6_route_net_init(struct net *net)
2866 {
2867         int ret = -ENOMEM;
2868
2869         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2870                sizeof(net->ipv6.ip6_dst_ops));
2871
2872         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2873                 goto out_ip6_dst_ops;
2874
2875         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2876                                            sizeof(*net->ipv6.ip6_null_entry),
2877                                            GFP_KERNEL);
2878         if (!net->ipv6.ip6_null_entry)
2879                 goto out_ip6_dst_entries;
2880         net->ipv6.ip6_null_entry->dst.path =
2881                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2882         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2883         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2884                          ip6_template_metrics, true);
2885
2886 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2887         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2888                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2889                                                GFP_KERNEL);
2890         if (!net->ipv6.ip6_prohibit_entry)
2891                 goto out_ip6_null_entry;
2892         net->ipv6.ip6_prohibit_entry->dst.path =
2893                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2894         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2895         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2896                          ip6_template_metrics, true);
2897
2898         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2899                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2900                                                GFP_KERNEL);
2901         if (!net->ipv6.ip6_blk_hole_entry)
2902                 goto out_ip6_prohibit_entry;
2903         net->ipv6.ip6_blk_hole_entry->dst.path =
2904                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2905         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2906         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2907                          ip6_template_metrics, true);
2908 #endif
2909
2910         net->ipv6.sysctl.flush_delay = 0;
2911         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2912         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2913         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2914         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2915         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2916         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2917         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2918
2919 #ifdef CONFIG_PROC_FS
2920         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2921         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2922 #endif
2923         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2924
2925         ret = 0;
2926 out:
2927         return ret;
2928
2929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2930 out_ip6_prohibit_entry:
2931         kfree(net->ipv6.ip6_prohibit_entry);
2932 out_ip6_null_entry:
2933         kfree(net->ipv6.ip6_null_entry);
2934 #endif
2935 out_ip6_dst_entries:
2936         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2937 out_ip6_dst_ops:
2938         goto out;
2939 }
2940
2941 static void __net_exit ip6_route_net_exit(struct net *net)
2942 {
2943 #ifdef CONFIG_PROC_FS
2944         proc_net_remove(net, "ipv6_route");
2945         proc_net_remove(net, "rt6_stats");
2946 #endif
2947         kfree(net->ipv6.ip6_null_entry);
2948 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2949         kfree(net->ipv6.ip6_prohibit_entry);
2950         kfree(net->ipv6.ip6_blk_hole_entry);
2951 #endif
2952         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2953 }
2954
2955 static struct pernet_operations ip6_route_net_ops = {
2956         .init = ip6_route_net_init,
2957         .exit = ip6_route_net_exit,
2958 };
2959
2960 static struct notifier_block ip6_route_dev_notifier = {
2961         .notifier_call = ip6_route_dev_notify,
2962         .priority = 0,
2963 };
2964
2965 int __init ip6_route_init(void)
2966 {
2967         int ret;
2968
2969         ret = -ENOMEM;
2970         ip6_dst_ops_template.kmem_cachep =
2971                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2972                                   SLAB_HWCACHE_ALIGN, NULL);
2973         if (!ip6_dst_ops_template.kmem_cachep)
2974                 goto out;
2975
2976         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2977         if (ret)
2978                 goto out_kmem_cache;
2979
2980         ret = register_pernet_subsys(&ip6_route_net_ops);
2981         if (ret)
2982                 goto out_dst_entries;
2983
2984         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2985
2986         /* Registering of the loopback is done before this portion of code,
2987          * the loopback reference in rt6_info will not be taken, do it
2988          * manually for init_net */
2989         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2990         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2991   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2992         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2993         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2994         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2995         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2996   #endif
2997         ret = fib6_init();
2998         if (ret)
2999                 goto out_register_subsys;
3000
3001         ret = xfrm6_init();
3002         if (ret)
3003                 goto out_fib6_init;
3004
3005         ret = fib6_rules_init();
3006         if (ret)
3007                 goto xfrm6_init;
3008
3009         ret = -ENOBUFS;
3010         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3011             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3012             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3013                 goto fib6_rules_init;
3014
3015         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3016         if (ret)
3017                 goto fib6_rules_init;
3018
3019 out:
3020         return ret;
3021
3022 fib6_rules_init:
3023         fib6_rules_cleanup();
3024 xfrm6_init:
3025         xfrm6_fini();
3026 out_fib6_init:
3027         fib6_gc_cleanup();
3028 out_register_subsys:
3029         unregister_pernet_subsys(&ip6_route_net_ops);
3030 out_dst_entries:
3031         dst_entries_destroy(&ip6_dst_blackhole_ops);
3032 out_kmem_cache:
3033         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3034         goto out;
3035 }
3036
3037 void ip6_route_cleanup(void)
3038 {
3039         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3040         fib6_rules_cleanup();
3041         xfrm6_fini();
3042         fib6_gc_cleanup();
3043         unregister_pernet_subsys(&ip6_route_net_ops);
3044         dst_entries_destroy(&ip6_dst_blackhole_ops);
3045         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3046 }