]> git.openfabrics.org - ~shefty/rdma-dev.git/blob - net/ipv6/route.c
ipv6: fix icmp6_dst_alloc()
[~shefty/rdma-dev.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
125 {
126         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
127         if (n)
128                 return n;
129         return neigh_create(&nd_tbl, daddr, dst->dev);
130 }
131
132 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
133 {
134         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
135         if (!n) {
136                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
137                 if (IS_ERR(n))
138                         return PTR_ERR(n);
139         }
140         dst_set_neighbour(&rt->dst, n);
141
142         return 0;
143 }
144
145 static struct dst_ops ip6_dst_ops_template = {
146         .family                 =       AF_INET6,
147         .protocol               =       cpu_to_be16(ETH_P_IPV6),
148         .gc                     =       ip6_dst_gc,
149         .gc_thresh              =       1024,
150         .check                  =       ip6_dst_check,
151         .default_advmss         =       ip6_default_advmss,
152         .mtu                    =       ip6_mtu,
153         .cow_metrics            =       ipv6_cow_metrics,
154         .destroy                =       ip6_dst_destroy,
155         .ifdown                 =       ip6_dst_ifdown,
156         .negative_advice        =       ip6_negative_advice,
157         .link_failure           =       ip6_link_failure,
158         .update_pmtu            =       ip6_rt_update_pmtu,
159         .local_out              =       __ip6_local_out,
160         .neigh_lookup           =       ip6_neigh_lookup,
161 };
162
163 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
164 {
165         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
166
167         return mtu ? : dst->dev->mtu;
168 }
169
170 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
171 {
172 }
173
174 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
175                                          unsigned long old)
176 {
177         return NULL;
178 }
179
180 static struct dst_ops ip6_dst_blackhole_ops = {
181         .family                 =       AF_INET6,
182         .protocol               =       cpu_to_be16(ETH_P_IPV6),
183         .destroy                =       ip6_dst_destroy,
184         .check                  =       ip6_dst_check,
185         .mtu                    =       ip6_blackhole_mtu,
186         .default_advmss         =       ip6_default_advmss,
187         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
188         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
189         .neigh_lookup           =       ip6_neigh_lookup,
190 };
191
192 static const u32 ip6_template_metrics[RTAX_MAX] = {
193         [RTAX_HOPLIMIT - 1] = 255,
194 };
195
196 static struct rt6_info ip6_null_entry_template = {
197         .dst = {
198                 .__refcnt       = ATOMIC_INIT(1),
199                 .__use          = 1,
200                 .obsolete       = -1,
201                 .error          = -ENETUNREACH,
202                 .input          = ip6_pkt_discard,
203                 .output         = ip6_pkt_discard_out,
204         },
205         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
206         .rt6i_protocol  = RTPROT_KERNEL,
207         .rt6i_metric    = ~(u32) 0,
208         .rt6i_ref       = ATOMIC_INIT(1),
209 };
210
211 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
212
213 static int ip6_pkt_prohibit(struct sk_buff *skb);
214 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
215
216 static struct rt6_info ip6_prohibit_entry_template = {
217         .dst = {
218                 .__refcnt       = ATOMIC_INIT(1),
219                 .__use          = 1,
220                 .obsolete       = -1,
221                 .error          = -EACCES,
222                 .input          = ip6_pkt_prohibit,
223                 .output         = ip6_pkt_prohibit_out,
224         },
225         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
226         .rt6i_protocol  = RTPROT_KERNEL,
227         .rt6i_metric    = ~(u32) 0,
228         .rt6i_ref       = ATOMIC_INIT(1),
229 };
230
231 static struct rt6_info ip6_blk_hole_entry_template = {
232         .dst = {
233                 .__refcnt       = ATOMIC_INIT(1),
234                 .__use          = 1,
235                 .obsolete       = -1,
236                 .error          = -EINVAL,
237                 .input          = dst_discard,
238                 .output         = dst_discard,
239         },
240         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
241         .rt6i_protocol  = RTPROT_KERNEL,
242         .rt6i_metric    = ~(u32) 0,
243         .rt6i_ref       = ATOMIC_INIT(1),
244 };
245
246 #endif
247
248 /* allocate dst with ip6_dst_ops */
249 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
250                                              struct net_device *dev,
251                                              int flags)
252 {
253         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
254
255         if (rt)
256                 memset(&rt->rt6i_table, 0,
257                        sizeof(*rt) - sizeof(struct dst_entry));
258
259         return rt;
260 }
261
262 static void ip6_dst_destroy(struct dst_entry *dst)
263 {
264         struct rt6_info *rt = (struct rt6_info *)dst;
265         struct inet6_dev *idev = rt->rt6i_idev;
266         struct inet_peer *peer = rt->rt6i_peer;
267
268         if (!(rt->dst.flags & DST_HOST))
269                 dst_destroy_metrics_generic(dst);
270
271         if (idev) {
272                 rt->rt6i_idev = NULL;
273                 in6_dev_put(idev);
274         }
275         if (peer) {
276                 rt->rt6i_peer = NULL;
277                 inet_putpeer(peer);
278         }
279 }
280
281 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
282
283 static u32 rt6_peer_genid(void)
284 {
285         return atomic_read(&__rt6_peer_genid);
286 }
287
288 void rt6_bind_peer(struct rt6_info *rt, int create)
289 {
290         struct inet_peer *peer;
291
292         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
293         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
294                 inet_putpeer(peer);
295         else
296                 rt->rt6i_peer_genid = rt6_peer_genid();
297 }
298
299 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
300                            int how)
301 {
302         struct rt6_info *rt = (struct rt6_info *)dst;
303         struct inet6_dev *idev = rt->rt6i_idev;
304         struct net_device *loopback_dev =
305                 dev_net(dev)->loopback_dev;
306
307         if (dev != loopback_dev && idev && idev->dev == dev) {
308                 struct inet6_dev *loopback_idev =
309                         in6_dev_get(loopback_dev);
310                 if (loopback_idev) {
311                         rt->rt6i_idev = loopback_idev;
312                         in6_dev_put(idev);
313                 }
314         }
315 }
316
317 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
318 {
319         return (rt->rt6i_flags & RTF_EXPIRES) &&
320                 time_after(jiffies, rt->dst.expires);
321 }
322
323 static inline int rt6_need_strict(const struct in6_addr *daddr)
324 {
325         return ipv6_addr_type(daddr) &
326                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
327 }
328
329 /*
330  *      Route lookup. Any table->tb6_lock is implied.
331  */
332
333 static inline struct rt6_info *rt6_device_match(struct net *net,
334                                                     struct rt6_info *rt,
335                                                     const struct in6_addr *saddr,
336                                                     int oif,
337                                                     int flags)
338 {
339         struct rt6_info *local = NULL;
340         struct rt6_info *sprt;
341
342         if (!oif && ipv6_addr_any(saddr))
343                 goto out;
344
345         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
346                 struct net_device *dev = sprt->dst.dev;
347
348                 if (oif) {
349                         if (dev->ifindex == oif)
350                                 return sprt;
351                         if (dev->flags & IFF_LOOPBACK) {
352                                 if (!sprt->rt6i_idev ||
353                                     sprt->rt6i_idev->dev->ifindex != oif) {
354                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
355                                                 continue;
356                                         if (local && (!oif ||
357                                                       local->rt6i_idev->dev->ifindex == oif))
358                                                 continue;
359                                 }
360                                 local = sprt;
361                         }
362                 } else {
363                         if (ipv6_chk_addr(net, saddr, dev,
364                                           flags & RT6_LOOKUP_F_IFACE))
365                                 return sprt;
366                 }
367         }
368
369         if (oif) {
370                 if (local)
371                         return local;
372
373                 if (flags & RT6_LOOKUP_F_IFACE)
374                         return net->ipv6.ip6_null_entry;
375         }
376 out:
377         return rt;
378 }
379
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the neighbour attached to @rt is not in a NUD_VALID state and more
 * than rtr_probe_interval jiffies have passed since neigh->updated, stamp
 * the neighbour and send a neighbour solicitation to the solicited-node
 * multicast address of the neighbour's key.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the interval check against neigh->updated is what limits it here.
 *
 * A NULL @rt, or a route without a neighbour, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	int send_probe = 0;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/* re-check the state under the neighbour lock before stamping it */
	read_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	read_unlock_bh(&neigh->lock);

	if (send_probe) {
		struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
		struct in6_addr mcaddr;

		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	}
out:
	rcu_read_unlock();
}
#else
/* Router preference support is compiled out: probing does nothing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
419
420 /*
421  * Default Router Selection (RFC 2461 6.3.6)
422  */
423 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
424 {
425         struct net_device *dev = rt->dst.dev;
426         if (!oif || dev->ifindex == oif)
427                 return 2;
428         if ((dev->flags & IFF_LOOPBACK) &&
429             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
430                 return 1;
431         return 0;
432 }
433
434 static inline int rt6_check_neigh(struct rt6_info *rt)
435 {
436         struct neighbour *neigh;
437         int m;
438
439         rcu_read_lock();
440         neigh = dst_get_neighbour_noref(&rt->dst);
441         if (rt->rt6i_flags & RTF_NONEXTHOP ||
442             !(rt->rt6i_flags & RTF_GATEWAY))
443                 m = 1;
444         else if (neigh) {
445                 read_lock_bh(&neigh->lock);
446                 if (neigh->nud_state & NUD_VALID)
447                         m = 2;
448 #ifdef CONFIG_IPV6_ROUTER_PREF
449                 else if (neigh->nud_state & NUD_FAILED)
450                         m = 0;
451 #endif
452                 else
453                         m = 1;
454                 read_unlock_bh(&neigh->lock);
455         } else
456                 m = 0;
457         rcu_read_unlock();
458         return m;
459 }
460
461 static int rt6_score_route(struct rt6_info *rt, int oif,
462                            int strict)
463 {
464         int m, n;
465
466         m = rt6_check_dev(rt, oif);
467         if (!m && (strict & RT6_LOOKUP_F_IFACE))
468                 return -1;
469 #ifdef CONFIG_IPV6_ROUTER_PREF
470         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
471 #endif
472         n = rt6_check_neigh(rt);
473         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
474                 return -1;
475         return m;
476 }
477
478 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
479                                    int *mpri, struct rt6_info *match)
480 {
481         int m;
482
483         if (rt6_check_expired(rt))
484                 goto out;
485
486         m = rt6_score_route(rt, oif, strict);
487         if (m < 0)
488                 goto out;
489
490         if (m > *mpri) {
491                 if (strict & RT6_LOOKUP_F_REACHABLE)
492                         rt6_probe(match);
493                 *mpri = m;
494                 match = rt;
495         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
496                 rt6_probe(rt);
497         }
498
499 out:
500         return match;
501 }
502
503 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
504                                      struct rt6_info *rr_head,
505                                      u32 metric, int oif, int strict)
506 {
507         struct rt6_info *rt, *match;
508         int mpri = -1;
509
510         match = NULL;
511         for (rt = rr_head; rt && rt->rt6i_metric == metric;
512              rt = rt->dst.rt6_next)
513                 match = find_match(rt, oif, strict, &mpri, match);
514         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
515              rt = rt->dst.rt6_next)
516                 match = find_match(rt, oif, strict, &mpri, match);
517
518         return match;
519 }
520
521 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
522 {
523         struct rt6_info *match, *rt0;
524         struct net *net;
525
526         rt0 = fn->rr_ptr;
527         if (!rt0)
528                 fn->rr_ptr = rt0 = fn->leaf;
529
530         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531
532         if (!match &&
533             (strict & RT6_LOOKUP_F_REACHABLE)) {
534                 struct rt6_info *next = rt0->dst.rt6_next;
535
536                 /* no entries matched; do round-robin */
537                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
538                         next = fn->leaf;
539
540                 if (next != rt0)
541                         fn->rr_ptr = next;
542         }
543
544         net = dev_net(rt0->dst.dev);
545         return match ? match : net->ipv6.ip6_null_entry;
546 }
547
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from router @gwaddr.
 *
 * @opt points at the option and @len is the number of bytes available.
 * The option is rejected with -EINVAL when it is too short, when its
 * length field is inconsistent with the prefix length, or when the route
 * preference is ICMPV6_ROUTER_PREF_INVALID.
 *
 * Otherwise the matching route-info route is deleted (zero lifetime),
 * created (none exists yet and the lifetime is non-zero) or has its
 * preference refreshed, and its expiry is set from the lifetime.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int min_length;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* the option carries the full 128-bit prefix */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* a zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->dst.expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
621
/*
 * BACKTRACK - climb the fib6 tree after a lookup hit the null entry.
 *
 * Expects the caller to provide the local variables `rt` and `fn` and the
 * labels `out` and `restart`.  While `rt` is the namespace's null entry,
 * the macro moves `fn` to its parent (or into the parent's source subtree
 * when there is one that `fn` is not already part of) and jumps to
 * `restart` at the first node flagged RTN_RTINFO.  Reaching a node flagged
 * RTN_TL_ROOT jumps to `out`.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn),	\
						 NULL, saddr);		\
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
639
640 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
641                                              struct fib6_table *table,
642                                              struct flowi6 *fl6, int flags)
643 {
644         struct fib6_node *fn;
645         struct rt6_info *rt;
646
647         read_lock_bh(&table->tb6_lock);
648         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
649 restart:
650         rt = fn->leaf;
651         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
652         BACKTRACK(net, &fl6->saddr);
653 out:
654         dst_use(&rt->dst, jiffies);
655         read_unlock_bh(&table->tb6_lock);
656         return rt;
657
658 }
659
660 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
661                                     int flags)
662 {
663         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
664 }
665 EXPORT_SYMBOL_GPL(ip6_route_lookup);
666
667 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
668                             const struct in6_addr *saddr, int oif, int strict)
669 {
670         struct flowi6 fl6 = {
671                 .flowi6_oif = oif,
672                 .daddr = *daddr,
673         };
674         struct dst_entry *dst;
675         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
676
677         if (saddr) {
678                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
679                 flags |= RT6_LOOKUP_F_HAS_SADDR;
680         }
681
682         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
683         if (dst->error == 0)
684                 return (struct rt6_info *) dst;
685
686         dst_release(dst);
687
688         return NULL;
689 }
690
691 EXPORT_SYMBOL(rt6_lookup);
692
693 /* ip6_ins_rt is called with FREE table->tb6_lock.
694    It takes new route entry, the addition fails by any reason the
695    route is freed. In any case, if caller does not hold it, it may
696    be destroyed.
697  */
698
699 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
700 {
701         int err;
702         struct fib6_table *table;
703
704         table = rt->rt6i_table;
705         write_lock_bh(&table->tb6_lock);
706         err = fib6_add(&table->tb6_root, rt, info);
707         write_unlock_bh(&table->tb6_lock);
708
709         return err;
710 }
711
712 int ip6_ins_rt(struct rt6_info *rt)
713 {
714         struct nl_info info = {
715                 .nl_net = dev_net(rt->dst.dev),
716         };
717         return __ip6_ins_rt(rt, &info);
718 }
719
720 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
721                                       const struct in6_addr *daddr,
722                                       const struct in6_addr *saddr)
723 {
724         struct rt6_info *rt;
725
726         /*
727          *      Clone the route.
728          */
729
730         rt = ip6_rt_copy(ort, daddr);
731
732         if (rt) {
733                 int attempts = !in_softirq();
734
735                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
736                         if (ort->rt6i_dst.plen != 128 &&
737                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
738                                 rt->rt6i_flags |= RTF_ANYCAST;
739                         rt->rt6i_gateway = *daddr;
740                 }
741
742                 rt->rt6i_flags |= RTF_CACHE;
743
744 #ifdef CONFIG_IPV6_SUBTREES
745                 if (rt->rt6i_src.plen && saddr) {
746                         rt->rt6i_src.addr = *saddr;
747                         rt->rt6i_src.plen = 128;
748                 }
749 #endif
750
751         retry:
752                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
753                         struct net *net = dev_net(rt->dst.dev);
754                         int saved_rt_min_interval =
755                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
756                         int saved_rt_elasticity =
757                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
758
759                         if (attempts-- > 0) {
760                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
761                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762
763                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
764
765                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766                                         saved_rt_elasticity;
767                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
768                                         saved_rt_min_interval;
769                                 goto retry;
770                         }
771
772                         if (net_ratelimit())
773                                 printk(KERN_WARNING
774                                        "ipv6: Neighbour table overflow.\n");
775                         dst_free(&rt->dst);
776                         return NULL;
777                 }
778         }
779
780         return rt;
781 }
782
783 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
784                                         const struct in6_addr *daddr)
785 {
786         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
787
788         if (rt) {
789                 rt->rt6i_flags |= RTF_CACHE;
790                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
791         }
792         return rt;
793 }
794
795 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
796                                       struct flowi6 *fl6, int flags)
797 {
798         struct fib6_node *fn;
799         struct rt6_info *rt, *nrt;
800         int strict = 0;
801         int attempts = 3;
802         int err;
803         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
804
805         strict |= flags & RT6_LOOKUP_F_IFACE;
806
807 relookup:
808         read_lock_bh(&table->tb6_lock);
809
810 restart_2:
811         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
812
813 restart:
814         rt = rt6_select(fn, oif, strict | reachable);
815
816         BACKTRACK(net, &fl6->saddr);
817         if (rt == net->ipv6.ip6_null_entry ||
818             rt->rt6i_flags & RTF_CACHE)
819                 goto out;
820
821         dst_hold(&rt->dst);
822         read_unlock_bh(&table->tb6_lock);
823
824         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
825                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
826         else if (!(rt->dst.flags & DST_HOST))
827                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
828         else
829                 goto out2;
830
831         dst_release(&rt->dst);
832         rt = nrt ? : net->ipv6.ip6_null_entry;
833
834         dst_hold(&rt->dst);
835         if (nrt) {
836                 err = ip6_ins_rt(nrt);
837                 if (!err)
838                         goto out2;
839         }
840
841         if (--attempts <= 0)
842                 goto out2;
843
844         /*
845          * Race condition! In the gap, when table->tb6_lock was
846          * released someone could insert this route.  Relookup.
847          */
848         dst_release(&rt->dst);
849         goto relookup;
850
851 out:
852         if (reachable) {
853                 reachable = 0;
854                 goto restart_2;
855         }
856         dst_hold(&rt->dst);
857         read_unlock_bh(&table->tb6_lock);
858 out2:
859         rt->dst.lastuse = jiffies;
860         rt->dst.__use++;
861
862         return rt;
863 }
864
865 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
866                                             struct flowi6 *fl6, int flags)
867 {
868         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
869 }
870
871 void ip6_route_input(struct sk_buff *skb)
872 {
873         const struct ipv6hdr *iph = ipv6_hdr(skb);
874         struct net *net = dev_net(skb->dev);
875         int flags = RT6_LOOKUP_F_HAS_SADDR;
876         struct flowi6 fl6 = {
877                 .flowi6_iif = skb->dev->ifindex,
878                 .daddr = iph->daddr,
879                 .saddr = iph->saddr,
880                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
881                 .flowi6_mark = skb->mark,
882                 .flowi6_proto = iph->nexthdr,
883         };
884
885         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
886                 flags |= RT6_LOOKUP_F_IFACE;
887
888         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
889 }
890
891 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
892                                              struct flowi6 *fl6, int flags)
893 {
894         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
895 }
896
897 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
898                                     struct flowi6 *fl6)
899 {
900         int flags = 0;
901
902         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
903                 flags |= RT6_LOOKUP_F_IFACE;
904
905         if (!ipv6_addr_any(&fl6->saddr))
906                 flags |= RT6_LOOKUP_F_HAS_SADDR;
907         else if (sk)
908                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
909
910         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
911 }
912
913 EXPORT_SYMBOL(ip6_route_output);
914
915 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
916 {
917         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
918         struct dst_entry *new = NULL;
919
920         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
921         if (rt) {
922                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
923
924                 new = &rt->dst;
925
926                 new->__use = 1;
927                 new->input = dst_discard;
928                 new->output = dst_discard;
929
930                 if (dst_metrics_read_only(&ort->dst))
931                         new->_metrics = ort->dst._metrics;
932                 else
933                         dst_copy_metrics(new, &ort->dst);
934                 rt->rt6i_idev = ort->rt6i_idev;
935                 if (rt->rt6i_idev)
936                         in6_dev_hold(rt->rt6i_idev);
937                 rt->dst.expires = 0;
938
939                 rt->rt6i_gateway = ort->rt6i_gateway;
940                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
941                 rt->rt6i_metric = 0;
942
943                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
944 #ifdef CONFIG_IPV6_SUBTREES
945                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
946 #endif
947
948                 dst_free(new);
949         }
950
951         dst_release(dst_orig);
952         return new ? new : ERR_PTR(-ENOMEM);
953 }
954
955 /*
956  *      Destination cache support functions
957  */
958
959 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
960 {
961         struct rt6_info *rt;
962
963         rt = (struct rt6_info *) dst;
964
965         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
966                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
967                         if (!rt->rt6i_peer)
968                                 rt6_bind_peer(rt, 0);
969                         rt->rt6i_peer_genid = rt6_peer_genid();
970                 }
971                 return dst;
972         }
973         return NULL;
974 }
975
976 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
977 {
978         struct rt6_info *rt = (struct rt6_info *) dst;
979
980         if (rt) {
981                 if (rt->rt6i_flags & RTF_CACHE) {
982                         if (rt6_check_expired(rt)) {
983                                 ip6_del_rt(rt);
984                                 dst = NULL;
985                         }
986                 } else {
987                         dst_release(dst);
988                         dst = NULL;
989                 }
990         }
991         return dst;
992 }
993
994 static void ip6_link_failure(struct sk_buff *skb)
995 {
996         struct rt6_info *rt;
997
998         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
999
1000         rt = (struct rt6_info *) skb_dst(skb);
1001         if (rt) {
1002                 if (rt->rt6i_flags & RTF_CACHE) {
1003                         dst_set_expires(&rt->dst, 0);
1004                         rt->rt6i_flags |= RTF_EXPIRES;
1005                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1006                         rt->rt6i_node->fn_sernum = -1;
1007         }
1008 }
1009
1010 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1011 {
1012         struct rt6_info *rt6 = (struct rt6_info*)dst;
1013
1014         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1015                 rt6->rt6i_flags |= RTF_MODIFIED;
1016                 if (mtu < IPV6_MIN_MTU) {
1017                         u32 features = dst_metric(dst, RTAX_FEATURES);
1018                         mtu = IPV6_MIN_MTU;
1019                         features |= RTAX_FEATURE_ALLFRAG;
1020                         dst_metric_set(dst, RTAX_FEATURES, features);
1021                 }
1022                 dst_metric_set(dst, RTAX_MTU, mtu);
1023         }
1024 }
1025
1026 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1027 {
1028         struct net_device *dev = dst->dev;
1029         unsigned int mtu = dst_mtu(dst);
1030         struct net *net = dev_net(dev);
1031
1032         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1033
1034         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1035                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1036
1037         /*
1038          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1039          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1040          * IPV6_MAXPLEN is also valid and means: "any MSS,
1041          * rely only on pmtu discovery"
1042          */
1043         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1044                 mtu = IPV6_MAXPLEN;
1045         return mtu;
1046 }
1047
1048 static unsigned int ip6_mtu(const struct dst_entry *dst)
1049 {
1050         struct inet6_dev *idev;
1051         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1052
1053         if (mtu)
1054                 return mtu;
1055
1056         mtu = IPV6_MIN_MTU;
1057
1058         rcu_read_lock();
1059         idev = __in6_dev_get(dst->dev);
1060         if (idev)
1061                 mtu = idev->cnf.mtu6;
1062         rcu_read_unlock();
1063
1064         return mtu;
1065 }
1066
1067 static struct dst_entry *icmp6_dst_gc_list;
1068 static DEFINE_SPINLOCK(icmp6_dst_lock);
1069
1070 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1071                                   struct neighbour *neigh,
1072                                   struct flowi6 *fl6)
1073 {
1074         struct dst_entry *dst;
1075         struct rt6_info *rt;
1076         struct inet6_dev *idev = in6_dev_get(dev);
1077         struct net *net = dev_net(dev);
1078
1079         if (unlikely(!idev))
1080                 return ERR_PTR(-ENODEV);
1081
1082         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1083         if (unlikely(!rt)) {
1084                 in6_dev_put(idev);
1085                 dst = ERR_PTR(-ENOMEM);
1086                 goto out;
1087         }
1088
1089         if (neigh)
1090                 neigh_hold(neigh);
1091         else {
1092                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1093                 if (IS_ERR(neigh)) {
1094                         in6_dev_put(idev);
1095                         dst_free(&rt->dst);
1096                         return ERR_CAST(neigh);
1097                 }
1098         }
1099
1100         rt->dst.flags |= DST_HOST;
1101         rt->dst.output  = ip6_output;
1102         dst_set_neighbour(&rt->dst, neigh);
1103         atomic_set(&rt->dst.__refcnt, 1);
1104         rt->rt6i_dst.addr = fl6->daddr;
1105         rt->rt6i_dst.plen = 128;
1106         rt->rt6i_idev     = idev;
1107         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1108
1109         spin_lock_bh(&icmp6_dst_lock);
1110         rt->dst.next = icmp6_dst_gc_list;
1111         icmp6_dst_gc_list = &rt->dst;
1112         spin_unlock_bh(&icmp6_dst_lock);
1113
1114         fib6_force_start_gc(net);
1115
1116         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1117
1118 out:
1119         return dst;
1120 }
1121
1122 int icmp6_dst_gc(void)
1123 {
1124         struct dst_entry *dst, **pprev;
1125         int more = 0;
1126
1127         spin_lock_bh(&icmp6_dst_lock);
1128         pprev = &icmp6_dst_gc_list;
1129
1130         while ((dst = *pprev) != NULL) {
1131                 if (!atomic_read(&dst->__refcnt)) {
1132                         *pprev = dst->next;
1133                         dst_free(dst);
1134                 } else {
1135                         pprev = &dst->next;
1136                         ++more;
1137                 }
1138         }
1139
1140         spin_unlock_bh(&icmp6_dst_lock);
1141
1142         return more;
1143 }
1144
1145 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1146                             void *arg)
1147 {
1148         struct dst_entry *dst, **pprev;
1149
1150         spin_lock_bh(&icmp6_dst_lock);
1151         pprev = &icmp6_dst_gc_list;
1152         while ((dst = *pprev) != NULL) {
1153                 struct rt6_info *rt = (struct rt6_info *) dst;
1154                 if (func(rt, arg)) {
1155                         *pprev = dst->next;
1156                         dst_free(dst);
1157                 } else {
1158                         pprev = &dst->next;
1159                 }
1160         }
1161         spin_unlock_bh(&icmp6_dst_lock);
1162 }
1163
1164 static int ip6_dst_gc(struct dst_ops *ops)
1165 {
1166         unsigned long now = jiffies;
1167         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1168         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1169         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1170         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1171         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1172         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1173         int entries;
1174
1175         entries = dst_entries_get_fast(ops);
1176         if (time_after(rt_last_gc + rt_min_interval, now) &&
1177             entries <= rt_max_size)
1178                 goto out;
1179
1180         net->ipv6.ip6_rt_gc_expire++;
1181         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1182         net->ipv6.ip6_rt_last_gc = now;
1183         entries = dst_entries_get_slow(ops);
1184         if (entries < ops->gc_thresh)
1185                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1186 out:
1187         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1188         return entries > rt_max_size;
1189 }
1190
1191 /* Clean host part of a prefix. Not necessary in radix tree,
1192    but results in cleaner routing tables.
1193
1194    Remove it only when all the things will work!
1195  */
1196
1197 int ip6_dst_hoplimit(struct dst_entry *dst)
1198 {
1199         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1200         if (hoplimit == 0) {
1201                 struct net_device *dev = dst->dev;
1202                 struct inet6_dev *idev;
1203
1204                 rcu_read_lock();
1205                 idev = __in6_dev_get(dev);
1206                 if (idev)
1207                         hoplimit = idev->cnf.hop_limit;
1208                 else
1209                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1210                 rcu_read_unlock();
1211         }
1212         return hoplimit;
1213 }
1214 EXPORT_SYMBOL(ip6_dst_hoplimit);
1215
1216 /*
1217  *
1218  */
1219
/*
 * ip6_route_add - build a route from @cfg and insert it.
 *
 * Validates the prefix lengths, resolves the device and inet6_dev the
 * route will use (from cfg->fc_ifindex, from the loopback device for
 * reject routes, or from the route towards the gateway), fills in a
 * freshly allocated rt6_info and hands it to __ip6_ins_rt().
 *
 * Side effects on @cfg: fc_metric 0 is replaced by IP6_RT_PRIO_USER,
 * fc_protocol RTPROT_UNSPEC by RTPROT_BOOT, and fc_nlinfo.nl_net is
 * overwritten with the device's namespace before insertion.
 *
 * Returns the result of __ip6_ins_rt() when the route gets that far,
 * otherwise a negative errno.  On the error path the dev/idev
 * references taken here are dropped and the rt6_info is freed.
 */
1220 int ip6_route_add(struct fib6_config *cfg)
1221 {
1222         int err;
1223         struct net *net = cfg->fc_nlinfo.nl_net;
1224         struct rt6_info *rt = NULL;
1225         struct net_device *dev = NULL;
1226         struct inet6_dev *idev = NULL;
1227         struct fib6_table *table;
1228         int addr_type;
1229
1230         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1231                 return -EINVAL;
             /* Source prefixes are only accepted when subtrees are built in. */
1232 #ifndef CONFIG_IPV6_SUBTREES
1233         if (cfg->fc_src_len)
1234                 return -EINVAL;
1235 #endif
             /*
              * An explicit interface index: take references on the device
              * and its inet6_dev.  Both are dropped at "out" on failure.
              */
1236         if (cfg->fc_ifindex) {
1237                 err = -ENODEV;
1238                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1239                 if (!dev)
1240                         goto out;
1241                 idev = in6_dev_get(dev);
1242                 if (!idev)
1243                         goto out;
1244         }
1245
1246         if (cfg->fc_metric == 0)
1247                 cfg->fc_metric = IP6_RT_PRIO_USER;
1248
             /*
              * Pick the table.  A netlink request without NLM_F_CREATE first
              * tries fib6_get_table(); if the table is missing a warning is
              * logged and fib6_new_table() is used anyway.  -ENOBUFS is the
              * error reported when no table could be obtained.
              */
1249         err = -ENOBUFS;
1250         if (cfg->fc_nlinfo.nlh &&
1251             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1252                 table = fib6_get_table(net, cfg->fc_table);
1253                 if (!table) {
1254                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1255                         table = fib6_new_table(net, cfg->fc_table);
1256                 }
1257         } else {
1258                 table = fib6_new_table(net, cfg->fc_table);
1259         }
1260
1261         if (!table)
1262                 goto out;
1263
             /* The device is attached later (rt->dst.dev = dev below). */
1264         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1265
1266         if (!rt) {
1267                 err = -ENOMEM;
1268                 goto out;
1269         }
1270
1271         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is unset. */
1272         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1273                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1274                                 0;
1275
1276         if (cfg->fc_protocol == RTPROT_UNSPEC)
1277                 cfg->fc_protocol = RTPROT_BOOT;
1278         rt->rt6i_protocol = cfg->fc_protocol;
1279
1280         addr_type = ipv6_addr_type(&cfg->fc_dst);
1281
             /* Input handler: multicast, local delivery, or forwarding. */
1282         if (addr_type & IPV6_ADDR_MULTICAST)
1283                 rt->dst.input = ip6_mc_input;
1284         else if (cfg->fc_flags & RTF_LOCAL)
1285                 rt->dst.input = ip6_input;
1286         else
1287                 rt->dst.input = ip6_forward;
1288
1289         rt->dst.output = ip6_output;
1290
             /* Store the destination masked down to its prefix length. */
1291         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1292         rt->rt6i_dst.plen = cfg->fc_dst_len;
1293         if (rt->rt6i_dst.plen == 128)
1294                rt->dst.flags |= DST_HOST;
1295
             /*
              * Non-host routes that come with metric attributes get their
              * own zero-filled metrics array; the attributes themselves are
              * applied at install_route.
              */
1296         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1297                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1298                 if (!metrics) {
1299                         err = -ENOMEM;
1300                         goto out;
1301                 }
1302                 dst_init_metrics(&rt->dst, metrics, 0);
1303         }
1304 #ifdef CONFIG_IPV6_SUBTREES
1305         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1306         rt->rt6i_src.plen = cfg->fc_src_len;
1307 #endif
1308
1309         rt->rt6i_metric = cfg->fc_metric;
1310
1311         /* We cannot add true routes via loopback here,
1312            they would result in kernel looping; promote them to reject routes
1313          */
1314         if ((cfg->fc_flags & RTF_REJECT) ||
1315             (dev && (dev->flags & IFF_LOOPBACK) &&
1316              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1317              !(cfg->fc_flags & RTF_LOCAL))) {
1318                 /* hold loopback dev/idev if we haven't done so. */
1319                 if (dev != net->loopback_dev) {
                             /* swap the references over to the loopback device */
1320                         if (dev) {
1321                                 dev_put(dev);
1322                                 in6_dev_put(idev);
1323                         }
1324                         dev = net->loopback_dev;
1325                         dev_hold(dev);
1326                         idev = in6_dev_get(dev);
1327                         if (!idev) {
1328                                 err = -ENODEV;
1329                                 goto out;
1330                         }
1331                 }
                     /* Reject routes discard in both directions with -ENETUNREACH. */
1332                 rt->dst.output = ip6_pkt_discard_out;
1333                 rt->dst.input = ip6_pkt_discard;
1334                 rt->dst.error = -ENETUNREACH;
1335                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1336                 goto install_route;
1337         }
1338
1339         if (cfg->fc_flags & RTF_GATEWAY) {
1340                 const struct in6_addr *gw_addr;
1341                 int gwa_type;
1342
1343                 gw_addr = &cfg->fc_gateway;
1344                 rt->rt6i_gateway = *gw_addr;
1345                 gwa_type = ipv6_addr_type(gw_addr);
1346
1347                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1348                         struct rt6_info *grt;
1349
1350                         /* IPv6 strictly inhibits using not link-local
1351                            addresses as nexthop address.
1352                            Otherwise, router will not able to send redirects.
1353                            It is very good, but in some (rare!) circumstances
1354                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1355                            some exceptions. --ANK
1356                          */
1357                         err = -EINVAL;
1358                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1359                                 goto out;
1360
                             /* Find the route that currently reaches the gateway. */
1361                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1362
1363                         err = -EHOSTUNREACH;
1364                         if (!grt)
1365                                 goto out;
1366                         if (dev) {
                                     /* the gateway must be reached via the requested device */
1367                                 if (dev != grt->dst.dev) {
1368                                         dst_release(&grt->dst);
1369                                         goto out;
1370                                 }
1371                         } else {
                                     /* no device given: adopt (and hold) the gateway route's */
1372                                 dev = grt->dst.dev;
1373                                 idev = grt->rt6i_idev;
1374                                 dev_hold(dev);
1375                                 in6_dev_hold(grt->rt6i_idev);
1376                         }
                             /*
                              * Accepted only if the gateway route is not itself a
                              * gateway route; otherwise err stays -EHOSTUNREACH.
                              */
1377                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1378                                 err = 0;
1379                         dst_release(&grt->dst);
1380
1381                         if (err)
1382                                 goto out;
1383                 }
1384                 err = -EINVAL;
1385                 if (!dev || (dev->flags & IFF_LOOPBACK))
1386                         goto out;
1387         }
1388
1389         err = -ENODEV;
1390         if (!dev)
1391                 goto out;
1392
             /* A preferred source address is accepted only if ipv6_chk_addr() passes for dev. */
1393         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1394                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1395                         err = -EINVAL;
1396                         goto out;
1397                 }
1398                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1399                 rt->rt6i_prefsrc.plen = 128;
1400         } else
1401                 rt->rt6i_prefsrc.plen = 0;
1402
1403         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1404                 err = rt6_bind_neighbour(rt, dev);
1405                 if (err)
1406                         goto out;
1407         }
1408
1409         rt->rt6i_flags = cfg->fc_flags;
1410
1411 install_route:
             /* Apply the metric attributes; a type above RTAX_MAX is an error, type 0 is skipped. */
1412         if (cfg->fc_mx) {
1413                 struct nlattr *nla;
1414                 int remaining;
1415
1416                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1417                         int type = nla_type(nla);
1418
1419                         if (type) {
1420                                 if (type > RTAX_MAX) {
1421                                         err = -EINVAL;
1422                                         goto out;
1423                                 }
1424
1425                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1426                         }
1427                 }
1428         }
1429
             /* The dev/idev references taken above are handed to the route here. */
1430         rt->dst.dev = dev;
1431         rt->rt6i_idev = idev;
1432         rt->rt6i_table = table;
1433
1434         cfg->fc_nlinfo.nl_net = dev_net(dev);
1435
1436         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1437
             /* Error exit: undo whatever was acquired so far. */
1438 out:
1439         if (dev)
1440                 dev_put(dev);
1441         if (idev)
1442                 in6_dev_put(idev);
1443         if (rt)
1444                 dst_free(&rt->dst);
1445         return err;
1446 }
1447
1448 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1449 {
1450         int err;
1451         struct fib6_table *table;
1452         struct net *net = dev_net(rt->dst.dev);
1453
1454         if (rt == net->ipv6.ip6_null_entry)
1455                 return -ENOENT;
1456
1457         table = rt->rt6i_table;
1458         write_lock_bh(&table->tb6_lock);
1459
1460         err = fib6_del(rt, info);
1461         dst_release(&rt->dst);
1462
1463         write_unlock_bh(&table->tb6_lock);
1464
1465         return err;
1466 }
1467
1468 int ip6_del_rt(struct rt6_info *rt)
1469 {
1470         struct nl_info info = {
1471                 .nl_net = dev_net(rt->dst.dev),
1472         };
1473         return __ip6_del_rt(rt, &info);
1474 }
1475
1476 static int ip6_route_del(struct fib6_config *cfg)
1477 {
1478         struct fib6_table *table;
1479         struct fib6_node *fn;
1480         struct rt6_info *rt;
1481         int err = -ESRCH;
1482
1483         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1484         if (!table)
1485                 return err;
1486
1487         read_lock_bh(&table->tb6_lock);
1488
1489         fn = fib6_locate(&table->tb6_root,
1490                          &cfg->fc_dst, cfg->fc_dst_len,
1491                          &cfg->fc_src, cfg->fc_src_len);
1492
1493         if (fn) {
1494                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1495                         if (cfg->fc_ifindex &&
1496                             (!rt->dst.dev ||
1497                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1498                                 continue;
1499                         if (cfg->fc_flags & RTF_GATEWAY &&
1500                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1501                                 continue;
1502                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1503                                 continue;
1504                         dst_hold(&rt->dst);
1505                         read_unlock_bh(&table->tb6_lock);
1506
1507                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1508                 }
1509         }
1510         read_unlock_bh(&table->tb6_lock);
1511
1512         return err;
1513 }
1514
1515 /*
1516  *      Handle redirects
1517  */
/*
 * Lookup key for redirect handling: an ordinary flowi6 extended with
 * the address of the router that sent the redirect.
 *
 * __ip6_route_redirect() receives a pointer to the embedded fl6 and
 * casts it back to struct ip6rd_flowi, so fl6 must remain the first
 * member.
 */
1518 struct ip6rd_flowi {
1519         struct flowi6 fl6;
1520         struct in6_addr gateway;
1521 };
1522
1523 static struct rt6_info *__ip6_route_redirect(struct net *net,
1524                                              struct fib6_table *table,
1525                                              struct flowi6 *fl6,
1526                                              int flags)
1527 {
1528         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1529         struct rt6_info *rt;
1530         struct fib6_node *fn;
1531
1532         /*
1533          * Get the "current" route for this destination and
1534          * check if the redirect has come from approriate router.
1535          *
1536          * RFC 2461 specifies that redirects should only be
1537          * accepted if they come from the nexthop to the target.
1538          * Due to the way the routes are chosen, this notion
1539          * is a bit fuzzy and one might need to check all possible
1540          * routes.
1541          */
1542
1543         read_lock_bh(&table->tb6_lock);
1544         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1545 restart:
1546         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1547                 /*
1548                  * Current route is on-link; redirect is always invalid.
1549                  *
1550                  * Seems, previous statement is not true. It could
1551                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1552                  * But then router serving it might decide, that we should
1553                  * know truth 8)8) --ANK (980726).
1554                  */
1555                 if (rt6_check_expired(rt))
1556                         continue;
1557                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1558                         continue;
1559                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1560                         continue;
1561                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1562                         continue;
1563                 break;
1564         }
1565
1566         if (!rt)
1567                 rt = net->ipv6.ip6_null_entry;
1568         BACKTRACK(net, &fl6->saddr);
1569 out:
1570         dst_hold(&rt->dst);
1571
1572         read_unlock_bh(&table->tb6_lock);
1573
1574         return rt;
1575 };
1576
1577 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1578                                            const struct in6_addr *src,
1579                                            const struct in6_addr *gateway,
1580                                            struct net_device *dev)
1581 {
1582         int flags = RT6_LOOKUP_F_HAS_SADDR;
1583         struct net *net = dev_net(dev);
1584         struct ip6rd_flowi rdfl = {
1585                 .fl6 = {
1586                         .flowi6_oif = dev->ifindex,
1587                         .daddr = *dest,
1588                         .saddr = *src,
1589                 },
1590         };
1591
1592         rdfl.gateway = *gateway;
1593
1594         if (rt6_need_strict(dest))
1595                 flags |= RT6_LOOKUP_F_IFACE;
1596
1597         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1598                                                    flags, __ip6_route_redirect);
1599 }
1600
1601 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1602                   const struct in6_addr *saddr,
1603                   struct neighbour *neigh, u8 *lladdr, int on_link)
1604 {
1605         struct rt6_info *rt, *nrt = NULL;
1606         struct netevent_redirect netevent;
1607         struct net *net = dev_net(neigh->dev);
1608
1609         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1610
1611         if (rt == net->ipv6.ip6_null_entry) {
1612                 if (net_ratelimit())
1613                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1614                                "for redirect target\n");
1615                 goto out;
1616         }
1617
1618         /*
1619          *      We have finally decided to accept it.
1620          */
1621
1622         neigh_update(neigh, lladdr, NUD_STALE,
1623                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1624                      NEIGH_UPDATE_F_OVERRIDE|
1625                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1626                                      NEIGH_UPDATE_F_ISROUTER))
1627                      );
1628
1629         /*
1630          * Redirect received -> path was valid.
1631          * Look, redirects are sent only in response to data packets,
1632          * so that this nexthop apparently is reachable. --ANK
1633          */
1634         dst_confirm(&rt->dst);
1635
1636         /* Duplicate redirect: silently ignore. */
1637         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1638                 goto out;
1639
1640         nrt = ip6_rt_copy(rt, dest);
1641         if (!nrt)
1642                 goto out;
1643
1644         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1645         if (on_link)
1646                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1647
1648         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1649         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1650
1651         if (ip6_ins_rt(nrt))
1652                 goto out;
1653
1654         netevent.old = &rt->dst;
1655         netevent.new = &nrt->dst;
1656         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1657
1658         if (rt->rt6i_flags & RTF_CACHE) {
1659                 ip6_del_rt(rt);
1660                 return;
1661         }
1662
1663 out:
1664         dst_release(&rt->dst);
1665 }
1666
1667 /*
1668  *      Handle ICMP "packet too big" messages
1669  *      i.e. Path MTU discovery
1670  */
1671
1672 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1673                              struct net *net, u32 pmtu, int ifindex)
1674 {
1675         struct rt6_info *rt, *nrt;
1676         int allfrag = 0;
1677 again:
1678         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1679         if (!rt)
1680                 return;
1681
1682         if (rt6_check_expired(rt)) {
1683                 ip6_del_rt(rt);
1684                 goto again;
1685         }
1686
1687         if (pmtu >= dst_mtu(&rt->dst))
1688                 goto out;
1689
1690         if (pmtu < IPV6_MIN_MTU) {
1691                 /*
1692                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1693                  * MTU (1280) and a fragment header should always be included
1694                  * after a node receiving Too Big message reporting PMTU is
1695                  * less than the IPv6 Minimum Link MTU.
1696                  */
1697                 pmtu = IPV6_MIN_MTU;
1698                 allfrag = 1;
1699         }
1700
1701         /* New mtu received -> path was valid.
1702            They are sent only in response to data packets,
1703            so that this nexthop apparently is reachable. --ANK
1704          */
1705         dst_confirm(&rt->dst);
1706
1707         /* Host route. If it is static, it would be better
1708            not to override it, but add new one, so that
1709            when cache entry will expire old pmtu
1710            would return automatically.
1711          */
1712         if (rt->rt6i_flags & RTF_CACHE) {
1713                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1714                 if (allfrag) {
1715                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1716                         features |= RTAX_FEATURE_ALLFRAG;
1717                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1718                 }
1719                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1720                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1721                 goto out;
1722         }
1723
1724         /* Network route.
1725            Two cases are possible:
1726            1. It is connected route. Action: COW
1727            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1728          */
1729         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1730                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1731         else
1732                 nrt = rt6_alloc_clone(rt, daddr);
1733
1734         if (nrt) {
1735                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1736                 if (allfrag) {
1737                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1738                         features |= RTAX_FEATURE_ALLFRAG;
1739                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1740                 }
1741
1742                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1743                  * happened within 5 mins, the recommended timer is 10 mins.
1744                  * Here this route expiration time is set to ip6_rt_mtu_expires
1745                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1746                  * and detecting PMTU increase will be automatically happened.
1747                  */
1748                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1749                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1750
1751                 ip6_ins_rt(nrt);
1752         }
1753 out:
1754         dst_release(&rt->dst);
1755 }
1756
1757 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1758                         struct net_device *dev, u32 pmtu)
1759 {
1760         struct net *net = dev_net(dev);
1761
1762         /*
1763          * RFC 1981 states that a node "MUST reduce the size of the packets it
1764          * is sending along the path" that caused the Packet Too Big message.
1765          * Since it's not possible in the general case to determine which
1766          * interface was used to send the original packet, we update the MTU
1767          * on the interface that will be used to send future packets. We also
1768          * update the MTU on the interface that received the Packet Too Big in
1769          * case the original packet was forced out that interface with
1770          * SO_BINDTODEVICE or similar. This is the next best thing to the
1771          * correct behaviour, which would be to update the MTU on all
1772          * interfaces.
1773          */
1774         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1775         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1776 }
1777
1778 /*
1779  *      Misc support functions
1780  */
1781
1782 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1783                                     const struct in6_addr *dest)
1784 {
1785         struct net *net = dev_net(ort->dst.dev);
1786         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1787                                             ort->dst.dev, 0);
1788
1789         if (rt) {
1790                 rt->dst.input = ort->dst.input;
1791                 rt->dst.output = ort->dst.output;
1792                 rt->dst.flags |= DST_HOST;
1793
1794                 rt->rt6i_dst.addr = *dest;
1795                 rt->rt6i_dst.plen = 128;
1796                 dst_copy_metrics(&rt->dst, &ort->dst);
1797                 rt->dst.error = ort->dst.error;
1798                 rt->rt6i_idev = ort->rt6i_idev;
1799                 if (rt->rt6i_idev)
1800                         in6_dev_hold(rt->rt6i_idev);
1801                 rt->dst.lastuse = jiffies;
1802                 rt->dst.expires = 0;
1803
1804                 rt->rt6i_gateway = ort->rt6i_gateway;
1805                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1806                 rt->rt6i_metric = 0;
1807
1808 #ifdef CONFIG_IPV6_SUBTREES
1809                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1810 #endif
1811                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1812                 rt->rt6i_table = ort->rt6i_table;
1813         }
1814         return rt;
1815 }
1816
1817 #ifdef CONFIG_IPV6_ROUTE_INFO
1818 static struct rt6_info *rt6_get_route_info(struct net *net,
1819                                            const struct in6_addr *prefix, int prefixlen,
1820                                            const struct in6_addr *gwaddr, int ifindex)
1821 {
1822         struct fib6_node *fn;
1823         struct rt6_info *rt = NULL;
1824         struct fib6_table *table;
1825
1826         table = fib6_get_table(net, RT6_TABLE_INFO);
1827         if (!table)
1828                 return NULL;
1829
1830         write_lock_bh(&table->tb6_lock);
1831         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1832         if (!fn)
1833                 goto out;
1834
1835         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1836                 if (rt->dst.dev->ifindex != ifindex)
1837                         continue;
1838                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1839                         continue;
1840                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1841                         continue;
1842                 dst_hold(&rt->dst);
1843                 break;
1844         }
1845 out:
1846         write_unlock_bh(&table->tb6_lock);
1847         return rt;
1848 }
1849
1850 static struct rt6_info *rt6_add_route_info(struct net *net,
1851                                            const struct in6_addr *prefix, int prefixlen,
1852                                            const struct in6_addr *gwaddr, int ifindex,
1853                                            unsigned pref)
1854 {
1855         struct fib6_config cfg = {
1856                 .fc_table       = RT6_TABLE_INFO,
1857                 .fc_metric      = IP6_RT_PRIO_USER,
1858                 .fc_ifindex     = ifindex,
1859                 .fc_dst_len     = prefixlen,
1860                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1861                                   RTF_UP | RTF_PREF(pref),
1862                 .fc_nlinfo.pid = 0,
1863                 .fc_nlinfo.nlh = NULL,
1864                 .fc_nlinfo.nl_net = net,
1865         };
1866
1867         cfg.fc_dst = *prefix;
1868         cfg.fc_gateway = *gwaddr;
1869
1870         /* We should treat it as a default route if prefix length is 0. */
1871         if (!prefixlen)
1872                 cfg.fc_flags |= RTF_DEFAULT;
1873
1874         ip6_route_add(&cfg);
1875
1876         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1877 }
1878 #endif
1879
1880 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1881 {
1882         struct rt6_info *rt;
1883         struct fib6_table *table;
1884
1885         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1886         if (!table)
1887                 return NULL;
1888
1889         write_lock_bh(&table->tb6_lock);
1890         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1891                 if (dev == rt->dst.dev &&
1892                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1893                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1894                         break;
1895         }
1896         if (rt)
1897                 dst_hold(&rt->dst);
1898         write_unlock_bh(&table->tb6_lock);
1899         return rt;
1900 }
1901
1902 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1903                                      struct net_device *dev,
1904                                      unsigned int pref)
1905 {
1906         struct fib6_config cfg = {
1907                 .fc_table       = RT6_TABLE_DFLT,
1908                 .fc_metric      = IP6_RT_PRIO_USER,
1909                 .fc_ifindex     = dev->ifindex,
1910                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1911                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1912                 .fc_nlinfo.pid = 0,
1913                 .fc_nlinfo.nlh = NULL,
1914                 .fc_nlinfo.nl_net = dev_net(dev),
1915         };
1916
1917         cfg.fc_gateway = *gwaddr;
1918
1919         ip6_route_add(&cfg);
1920
1921         return rt6_get_dflt_router(gwaddr, dev);
1922 }
1923
1924 void rt6_purge_dflt_routers(struct net *net)
1925 {
1926         struct rt6_info *rt;
1927         struct fib6_table *table;
1928
1929         /* NOTE: Keep consistent with rt6_get_dflt_router */
1930         table = fib6_get_table(net, RT6_TABLE_DFLT);
1931         if (!table)
1932                 return;
1933
1934 restart:
1935         read_lock_bh(&table->tb6_lock);
1936         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1937                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1938                         dst_hold(&rt->dst);
1939                         read_unlock_bh(&table->tb6_lock);
1940                         ip6_del_rt(rt);
1941                         goto restart;
1942                 }
1943         }
1944         read_unlock_bh(&table->tb6_lock);
1945 }
1946
1947 static void rtmsg_to_fib6_config(struct net *net,
1948                                  struct in6_rtmsg *rtmsg,
1949                                  struct fib6_config *cfg)
1950 {
1951         memset(cfg, 0, sizeof(*cfg));
1952
1953         cfg->fc_table = RT6_TABLE_MAIN;
1954         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1955         cfg->fc_metric = rtmsg->rtmsg_metric;
1956         cfg->fc_expires = rtmsg->rtmsg_info;
1957         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1958         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1959         cfg->fc_flags = rtmsg->rtmsg_flags;
1960
1961         cfg->fc_nlinfo.nl_net = net;
1962
1963         cfg->fc_dst = rtmsg->rtmsg_dst;
1964         cfg->fc_src = rtmsg->rtmsg_src;
1965         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1966 }
1967
1968 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1969 {
1970         struct fib6_config cfg;
1971         struct in6_rtmsg rtmsg;
1972         int err;
1973
1974         switch(cmd) {
1975         case SIOCADDRT:         /* Add a route */
1976         case SIOCDELRT:         /* Delete a route */
1977                 if (!capable(CAP_NET_ADMIN))
1978                         return -EPERM;
1979                 err = copy_from_user(&rtmsg, arg,
1980                                      sizeof(struct in6_rtmsg));
1981                 if (err)
1982                         return -EFAULT;
1983
1984                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1985
1986                 rtnl_lock();
1987                 switch (cmd) {
1988                 case SIOCADDRT:
1989                         err = ip6_route_add(&cfg);
1990                         break;
1991                 case SIOCDELRT:
1992                         err = ip6_route_del(&cfg);
1993                         break;
1994                 default:
1995                         err = -EINVAL;
1996                 }
1997                 rtnl_unlock();
1998
1999                 return err;
2000         }
2001
2002         return -EINVAL;
2003 }
2004
2005 /*
2006  *      Drop the packet on the floor
2007  */
2008
2009 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2010 {
2011         int type;
2012         struct dst_entry *dst = skb_dst(skb);
2013         switch (ipstats_mib_noroutes) {
2014         case IPSTATS_MIB_INNOROUTES:
2015                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2016                 if (type == IPV6_ADDR_ANY) {
2017                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2018                                       IPSTATS_MIB_INADDRERRORS);
2019                         break;
2020                 }
2021                 /* FALLTHROUGH */
2022         case IPSTATS_MIB_OUTNOROUTES:
2023                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2024                               ipstats_mib_noroutes);
2025                 break;
2026         }
2027         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2028         kfree_skb(skb);
2029         return 0;
2030 }
2031
2032 static int ip6_pkt_discard(struct sk_buff *skb)
2033 {
2034         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2035 }
2036
2037 static int ip6_pkt_discard_out(struct sk_buff *skb)
2038 {
2039         skb->dev = skb_dst(skb)->dev;
2040         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2041 }
2042
2043 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2044
2045 static int ip6_pkt_prohibit(struct sk_buff *skb)
2046 {
2047         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2048 }
2049
2050 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2051 {
2052         skb->dev = skb_dst(skb)->dev;
2053         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2054 }
2055
2056 #endif
2057
2058 /*
2059  *      Allocate a dst for local (unicast / anycast) address.
2060  */
2061
2062 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2063                                     const struct in6_addr *addr,
2064                                     bool anycast)
2065 {
2066         struct net *net = dev_net(idev->dev);
2067         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2068                                             net->loopback_dev, 0);
2069         int err;
2070
2071         if (!rt) {
2072                 if (net_ratelimit())
2073                         pr_warning("IPv6:  Maximum number of routes reached,"
2074                                    " consider increasing route/max_size.\n");
2075                 return ERR_PTR(-ENOMEM);
2076         }
2077
2078         in6_dev_hold(idev);
2079
2080         rt->dst.flags |= DST_HOST;
2081         rt->dst.input = ip6_input;
2082         rt->dst.output = ip6_output;
2083         rt->rt6i_idev = idev;
2084         rt->dst.obsolete = -1;
2085
2086         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2087         if (anycast)
2088                 rt->rt6i_flags |= RTF_ANYCAST;
2089         else
2090                 rt->rt6i_flags |= RTF_LOCAL;
2091         err = rt6_bind_neighbour(rt, rt->dst.dev);
2092         if (err) {
2093                 dst_free(&rt->dst);
2094                 return ERR_PTR(err);
2095         }
2096
2097         rt->rt6i_dst.addr = *addr;
2098         rt->rt6i_dst.plen = 128;
2099         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2100
2101         atomic_set(&rt->dst.__refcnt, 1);
2102
2103         return rt;
2104 }
2105
2106 int ip6_route_get_saddr(struct net *net,
2107                         struct rt6_info *rt,
2108                         const struct in6_addr *daddr,
2109                         unsigned int prefs,
2110                         struct in6_addr *saddr)
2111 {
2112         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2113         int err = 0;
2114         if (rt->rt6i_prefsrc.plen)
2115                 *saddr = rt->rt6i_prefsrc.addr;
2116         else
2117                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2118                                          daddr, prefs, saddr);
2119         return err;
2120 }
2121
2122 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
        struct net_device *dev;         /* device to match; NULL matches any */
        struct net *net;                /* namespace; its null entry is skipped */
        struct in6_addr *addr;          /* address compared with each prefsrc */
};
2128
2129 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2130 {
2131         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2132         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2133         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2134
2135         if (((void *)rt->dst.dev == dev || !dev) &&
2136             rt != net->ipv6.ip6_null_entry &&
2137             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2138                 /* remove prefsrc entry */
2139                 rt->rt6i_prefsrc.plen = 0;
2140         }
2141         return 0;
2142 }
2143
2144 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2145 {
2146         struct net *net = dev_net(ifp->idev->dev);
2147         struct arg_dev_net_ip adni = {
2148                 .dev = ifp->idev->dev,
2149                 .net = net,
2150                 .addr = &ifp->addr,
2151         };
2152         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2153 }
2154
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device to match; NULL matches any */
        struct net *net;                /* namespace; its null entry is skipped */
};
2159
2160 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2161 {
2162         const struct arg_dev_net *adn = arg;
2163         const struct net_device *dev = adn->dev;
2164
2165         if ((rt->dst.dev == dev || !dev) &&
2166             rt != adn->net->ipv6.ip6_null_entry)
2167                 return -1;
2168
2169         return 0;
2170 }
2171
2172 void rt6_ifdown(struct net *net, struct net_device *dev)
2173 {
2174         struct arg_dev_net adn = {
2175                 .dev = dev,
2176                 .net = net,
2177         };
2178
2179         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2180         icmp6_clean_all(fib6_ifdown, &adn);
2181 }
2182
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* the new MTU */
};
2188
2189 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2190 {
2191         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2192         struct inet6_dev *idev;
2193
2194         /* In IPv6 pmtu discovery is not optional,
2195            so that RTAX_MTU lock cannot disable it.
2196            We still use this lock to block changes
2197            caused by addrconf/ndisc.
2198         */
2199
2200         idev = __in6_dev_get(arg->dev);
2201         if (!idev)
2202                 return 0;
2203
2204         /* For administrative MTU increase, there is no way to discover
2205            IPv6 PMTU increase, so PMTU increase should be updated here.
2206            Since RFC 1981 doesn't include administrative MTU increase
2207            update PMTU increase is a MUST. (i.e. jumbo frame)
2208          */
2209         /*
2210            If new MTU is less than route PMTU, this new MTU will be the
2211            lowest MTU in the path, update the route PMTU to reflect PMTU
2212            decreases; if new MTU is greater than route PMTU, and the
2213            old MTU is the lowest MTU in the path, update the route PMTU
2214            to reflect the increase. In this case if the other nodes' MTU
2215            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2216            PMTU discouvery.
2217          */
2218         if (rt->dst.dev == arg->dev &&
2219             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2220             (dst_mtu(&rt->dst) >= arg->mtu ||
2221              (dst_mtu(&rt->dst) < arg->mtu &&
2222               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2223                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2224         }
2225         return 0;
2226 }
2227
2228 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2229 {
2230         struct rt6_mtu_change_arg arg = {
2231                 .dev = dev,
2232                 .mtu = mtu,
2233         };
2234
2235         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2236 }
2237
2238 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2239         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2240         [RTA_OIF]               = { .type = NLA_U32 },
2241         [RTA_IIF]               = { .type = NLA_U32 },
2242         [RTA_PRIORITY]          = { .type = NLA_U32 },
2243         [RTA_METRICS]           = { .type = NLA_NESTED },
2244 };
2245
2246 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2247                               struct fib6_config *cfg)
2248 {
2249         struct rtmsg *rtm;
2250         struct nlattr *tb[RTA_MAX+1];
2251         int err;
2252
2253         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2254         if (err < 0)
2255                 goto errout;
2256
2257         err = -EINVAL;
2258         rtm = nlmsg_data(nlh);
2259         memset(cfg, 0, sizeof(*cfg));
2260
2261         cfg->fc_table = rtm->rtm_table;
2262         cfg->fc_dst_len = rtm->rtm_dst_len;
2263         cfg->fc_src_len = rtm->rtm_src_len;
2264         cfg->fc_flags = RTF_UP;
2265         cfg->fc_protocol = rtm->rtm_protocol;
2266
2267         if (rtm->rtm_type == RTN_UNREACHABLE)
2268                 cfg->fc_flags |= RTF_REJECT;
2269
2270         if (rtm->rtm_type == RTN_LOCAL)
2271                 cfg->fc_flags |= RTF_LOCAL;
2272
2273         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2274         cfg->fc_nlinfo.nlh = nlh;
2275         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2276
2277         if (tb[RTA_GATEWAY]) {
2278                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2279                 cfg->fc_flags |= RTF_GATEWAY;
2280         }
2281
2282         if (tb[RTA_DST]) {
2283                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2284
2285                 if (nla_len(tb[RTA_DST]) < plen)
2286                         goto errout;
2287
2288                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2289         }
2290
2291         if (tb[RTA_SRC]) {
2292                 int plen = (rtm->rtm_src_len + 7) >> 3;
2293
2294                 if (nla_len(tb[RTA_SRC]) < plen)
2295                         goto errout;
2296
2297                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2298         }
2299
2300         if (tb[RTA_PREFSRC])
2301                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2302
2303         if (tb[RTA_OIF])
2304                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2305
2306         if (tb[RTA_PRIORITY])
2307                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2308
2309         if (tb[RTA_METRICS]) {
2310                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2311                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2312         }
2313
2314         if (tb[RTA_TABLE])
2315                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2316
2317         err = 0;
2318 errout:
2319         return err;
2320 }
2321
2322 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2323 {
2324         struct fib6_config cfg;
2325         int err;
2326
2327         err = rtm_to_fib6_config(skb, nlh, &cfg);
2328         if (err < 0)
2329                 return err;
2330
2331         return ip6_route_del(&cfg);
2332 }
2333
2334 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2335 {
2336         struct fib6_config cfg;
2337         int err;
2338
2339         err = rtm_to_fib6_config(skb, nlh, &cfg);
2340         if (err < 0)
2341                 return err;
2342
2343         return ip6_route_add(&cfg);
2344 }
2345
2346 static inline size_t rt6_nlmsg_size(void)
2347 {
2348         return NLMSG_ALIGN(sizeof(struct rtmsg))
2349                + nla_total_size(16) /* RTA_SRC */
2350                + nla_total_size(16) /* RTA_DST */
2351                + nla_total_size(16) /* RTA_GATEWAY */
2352                + nla_total_size(16) /* RTA_PREFSRC */
2353                + nla_total_size(4) /* RTA_TABLE */
2354                + nla_total_size(4) /* RTA_IIF */
2355                + nla_total_size(4) /* RTA_OIF */
2356                + nla_total_size(4) /* RTA_PRIORITY */
2357                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2358                + nla_total_size(sizeof(struct rta_cacheinfo));
2359 }
2360
2361 static int rt6_fill_node(struct net *net,
2362                          struct sk_buff *skb, struct rt6_info *rt,
2363                          struct in6_addr *dst, struct in6_addr *src,
2364                          int iif, int type, u32 pid, u32 seq,
2365                          int prefix, int nowait, unsigned int flags)
2366 {
2367         const struct inet_peer *peer;
2368         struct rtmsg *rtm;
2369         struct nlmsghdr *nlh;
2370         long expires;
2371         u32 table;
2372         struct neighbour *n;
2373         u32 ts, tsage;
2374
2375         if (prefix) {   /* user wants prefix routes only */
2376                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2377                         /* success since this is not a prefix route */
2378                         return 1;
2379                 }
2380         }
2381
2382         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2383         if (!nlh)
2384                 return -EMSGSIZE;
2385
2386         rtm = nlmsg_data(nlh);
2387         rtm->rtm_family = AF_INET6;
2388         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2389         rtm->rtm_src_len = rt->rt6i_src.plen;
2390         rtm->rtm_tos = 0;
2391         if (rt->rt6i_table)
2392                 table = rt->rt6i_table->tb6_id;
2393         else
2394                 table = RT6_TABLE_UNSPEC;
2395         rtm->rtm_table = table;
2396         NLA_PUT_U32(skb, RTA_TABLE, table);
2397         if (rt->rt6i_flags & RTF_REJECT)
2398                 rtm->rtm_type = RTN_UNREACHABLE;
2399         else if (rt->rt6i_flags & RTF_LOCAL)
2400                 rtm->rtm_type = RTN_LOCAL;
2401         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2402                 rtm->rtm_type = RTN_LOCAL;
2403         else
2404                 rtm->rtm_type = RTN_UNICAST;
2405         rtm->rtm_flags = 0;
2406         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2407         rtm->rtm_protocol = rt->rt6i_protocol;
2408         if (rt->rt6i_flags & RTF_DYNAMIC)
2409                 rtm->rtm_protocol = RTPROT_REDIRECT;
2410         else if (rt->rt6i_flags & RTF_ADDRCONF)
2411                 rtm->rtm_protocol = RTPROT_KERNEL;
2412         else if (rt->rt6i_flags & RTF_DEFAULT)
2413                 rtm->rtm_protocol = RTPROT_RA;
2414
2415         if (rt->rt6i_flags & RTF_CACHE)
2416                 rtm->rtm_flags |= RTM_F_CLONED;
2417
2418         if (dst) {
2419                 NLA_PUT(skb, RTA_DST, 16, dst);
2420                 rtm->rtm_dst_len = 128;
2421         } else if (rtm->rtm_dst_len)
2422                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2423 #ifdef CONFIG_IPV6_SUBTREES
2424         if (src) {
2425                 NLA_PUT(skb, RTA_SRC, 16, src);
2426                 rtm->rtm_src_len = 128;
2427         } else if (rtm->rtm_src_len)
2428                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2429 #endif
2430         if (iif) {
2431 #ifdef CONFIG_IPV6_MROUTE
2432                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2433                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2434                         if (err <= 0) {
2435                                 if (!nowait) {
2436                                         if (err == 0)
2437                                                 return 0;
2438                                         goto nla_put_failure;
2439                                 } else {
2440                                         if (err == -EMSGSIZE)
2441                                                 goto nla_put_failure;
2442                                 }
2443                         }
2444                 } else
2445 #endif
2446                         NLA_PUT_U32(skb, RTA_IIF, iif);
2447         } else if (dst) {
2448                 struct in6_addr saddr_buf;
2449                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2450                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2451         }
2452
2453         if (rt->rt6i_prefsrc.plen) {
2454                 struct in6_addr saddr_buf;
2455                 saddr_buf = rt->rt6i_prefsrc.addr;
2456                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2457         }
2458
2459         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2460                 goto nla_put_failure;
2461
2462         rcu_read_lock();
2463         n = dst_get_neighbour_noref(&rt->dst);
2464         if (n)
2465                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2466         rcu_read_unlock();
2467
2468         if (rt->dst.dev)
2469                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2470
2471         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2472
2473         if (!(rt->rt6i_flags & RTF_EXPIRES))
2474                 expires = 0;
2475         else if (rt->dst.expires - jiffies < INT_MAX)
2476                 expires = rt->dst.expires - jiffies;
2477         else
2478                 expires = INT_MAX;
2479
2480         peer = rt->rt6i_peer;
2481         ts = tsage = 0;
2482         if (peer && peer->tcp_ts_stamp) {
2483                 ts = peer->tcp_ts;
2484                 tsage = get_seconds() - peer->tcp_ts_stamp;
2485         }
2486
2487         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2488                                expires, rt->dst.error) < 0)
2489                 goto nla_put_failure;
2490
2491         return nlmsg_end(skb, nlh);
2492
2493 nla_put_failure:
2494         nlmsg_cancel(skb, nlh);
2495         return -EMSGSIZE;
2496 }
2497
2498 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2499 {
2500         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2501         int prefix;
2502
2503         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2504                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2505                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2506         } else
2507                 prefix = 0;
2508
2509         return rt6_fill_node(arg->net,
2510                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2511                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2512                      prefix, 0, NLM_F_MULTI);
2513 }
2514
2515 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2516 {
2517         struct net *net = sock_net(in_skb->sk);
2518         struct nlattr *tb[RTA_MAX+1];
2519         struct rt6_info *rt;
2520         struct sk_buff *skb;
2521         struct rtmsg *rtm;
2522         struct flowi6 fl6;
2523         int err, iif = 0;
2524
2525         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2526         if (err < 0)
2527                 goto errout;
2528
2529         err = -EINVAL;
2530         memset(&fl6, 0, sizeof(fl6));
2531
2532         if (tb[RTA_SRC]) {
2533                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2534                         goto errout;
2535
2536                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2537         }
2538
2539         if (tb[RTA_DST]) {
2540                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2541                         goto errout;
2542
2543                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2544         }
2545
2546         if (tb[RTA_IIF])
2547                 iif = nla_get_u32(tb[RTA_IIF]);
2548
2549         if (tb[RTA_OIF])
2550                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2551
2552         if (iif) {
2553                 struct net_device *dev;
2554                 dev = __dev_get_by_index(net, iif);
2555                 if (!dev) {
2556                         err = -ENODEV;
2557                         goto errout;
2558                 }
2559         }
2560
2561         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2562         if (!skb) {
2563                 err = -ENOBUFS;
2564                 goto errout;
2565         }
2566
2567         /* Reserve room for dummy headers, this skb can pass
2568            through good chunk of routing engine.
2569          */
2570         skb_reset_mac_header(skb);
2571         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2572
2573         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2574         skb_dst_set(skb, &rt->dst);
2575
2576         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2577                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2578                             nlh->nlmsg_seq, 0, 0, 0);
2579         if (err < 0) {
2580                 kfree_skb(skb);
2581                 goto errout;
2582         }
2583
2584         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2585 errout:
2586         return err;
2587 }
2588
2589 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2590 {
2591         struct sk_buff *skb;
2592         struct net *net = info->nl_net;
2593         u32 seq;
2594         int err;
2595
2596         err = -ENOBUFS;
2597         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2598
2599         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2600         if (!skb)
2601                 goto errout;
2602
2603         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2604                                 event, info->pid, seq, 0, 0, 0);
2605         if (err < 0) {
2606                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2607                 WARN_ON(err == -EMSGSIZE);
2608                 kfree_skb(skb);
2609                 goto errout;
2610         }
2611         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2612                     info->nlh, gfp_any());
2613         return;
2614 errout:
2615         if (err < 0)
2616                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2617 }
2618
2619 static int ip6_route_dev_notify(struct notifier_block *this,
2620                                 unsigned long event, void *data)
2621 {
2622         struct net_device *dev = (struct net_device *)data;
2623         struct net *net = dev_net(dev);
2624
2625         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2626                 net->ipv6.ip6_null_entry->dst.dev = dev;
2627                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2628 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2629                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2630                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2631                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2632                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2633 #endif
2634         }
2635
2636         return NOTIFY_OK;
2637 }
2638
2639 /*
2640  *      /proc
2641  */
2642
2643 #ifdef CONFIG_PROC_FS
2644
/*
 * Bookkeeping for a buffer-based /proc dump.
 *
 * NOTE(review): nothing in the seq_file-based handlers nearby references
 * this structure; it looks like a leftover -- confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2653
2654 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2655 {
2656         struct seq_file *m = p_arg;
2657         struct neighbour *n;
2658
2659         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2660
2661 #ifdef CONFIG_IPV6_SUBTREES
2662         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2663 #else
2664         seq_puts(m, "00000000000000000000000000000000 00 ");
2665 #endif
2666         rcu_read_lock();
2667         n = dst_get_neighbour_noref(&rt->dst);
2668         if (n) {
2669                 seq_printf(m, "%pi6", n->primary_key);
2670         } else {
2671                 seq_puts(m, "00000000000000000000000000000000");
2672         }
2673         rcu_read_unlock();
2674         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2675                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2676                    rt->dst.__use, rt->rt6i_flags,
2677                    rt->dst.dev ? rt->dst.dev->name : "");
2678         return 0;
2679 }
2680
2681 static int ipv6_route_show(struct seq_file *m, void *v)
2682 {
2683         struct net *net = (struct net *)m->private;
2684         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2685         return 0;
2686 }
2687
2688 static int ipv6_route_open(struct inode *inode, struct file *file)
2689 {
2690         return single_open_net(inode, file, ipv6_route_show);
2691 }
2692
2693 static const struct file_operations ipv6_route_proc_fops = {
2694         .owner          = THIS_MODULE,
2695         .open           = ipv6_route_open,
2696         .read           = seq_read,
2697         .llseek         = seq_lseek,
2698         .release        = single_release_net,
2699 };
2700
2701 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2702 {
2703         struct net *net = (struct net *)seq->private;
2704         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2705                    net->ipv6.rt6_stats->fib_nodes,
2706                    net->ipv6.rt6_stats->fib_route_nodes,
2707                    net->ipv6.rt6_stats->fib_rt_alloc,
2708                    net->ipv6.rt6_stats->fib_rt_entries,
2709                    net->ipv6.rt6_stats->fib_rt_cache,
2710                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2711                    net->ipv6.rt6_stats->fib_discarded_routes);
2712
2713         return 0;
2714 }
2715
2716 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2717 {
2718         return single_open_net(inode, file, rt6_stats_seq_show);
2719 }
2720
2721 static const struct file_operations rt6_stats_seq_fops = {
2722         .owner   = THIS_MODULE,
2723         .open    = rt6_stats_seq_open,
2724         .read    = seq_read,
2725         .llseek  = seq_lseek,
2726         .release = single_release_net,
2727 };
2728 #endif  /* CONFIG_PROC_FS */
2729
2730 #ifdef CONFIG_SYSCTL
2731
2732 static
2733 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2734                               void __user *buffer, size_t *lenp, loff_t *ppos)
2735 {
2736         struct net *net;
2737         int delay;
2738         if (!write)
2739                 return -EINVAL;
2740
2741         net = (struct net *)ctl->extra1;
2742         delay = net->ipv6.sysctl.flush_delay;
2743         proc_dointvec(ctl, write, buffer, lenp, ppos);
2744         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2745         return 0;
2746 }
2747
2748 ctl_table ipv6_route_table_template[] = {
2749         {
2750                 .procname       =       "flush",
2751                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2752                 .maxlen         =       sizeof(int),
2753                 .mode           =       0200,
2754                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2755         },
2756         {
2757                 .procname       =       "gc_thresh",
2758                 .data           =       &ip6_dst_ops_template.gc_thresh,
2759                 .maxlen         =       sizeof(int),
2760                 .mode           =       0644,
2761                 .proc_handler   =       proc_dointvec,
2762         },
2763         {
2764                 .procname       =       "max_size",
2765                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2766                 .maxlen         =       sizeof(int),
2767                 .mode           =       0644,
2768                 .proc_handler   =       proc_dointvec,
2769         },
2770         {
2771                 .procname       =       "gc_min_interval",
2772                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2773                 .maxlen         =       sizeof(int),
2774                 .mode           =       0644,
2775                 .proc_handler   =       proc_dointvec_jiffies,
2776         },
2777         {
2778                 .procname       =       "gc_timeout",
2779                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2780                 .maxlen         =       sizeof(int),
2781                 .mode           =       0644,
2782                 .proc_handler   =       proc_dointvec_jiffies,
2783         },
2784         {
2785                 .procname       =       "gc_interval",
2786                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2787                 .maxlen         =       sizeof(int),
2788                 .mode           =       0644,
2789                 .proc_handler   =       proc_dointvec_jiffies,
2790         },
2791         {
2792                 .procname       =       "gc_elasticity",
2793                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2794                 .maxlen         =       sizeof(int),
2795                 .mode           =       0644,
2796                 .proc_handler   =       proc_dointvec,
2797         },
2798         {
2799                 .procname       =       "mtu_expires",
2800                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2801                 .maxlen         =       sizeof(int),
2802                 .mode           =       0644,
2803                 .proc_handler   =       proc_dointvec_jiffies,
2804         },
2805         {
2806                 .procname       =       "min_adv_mss",
2807                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2808                 .maxlen         =       sizeof(int),
2809                 .mode           =       0644,
2810                 .proc_handler   =       proc_dointvec,
2811         },
2812         {
2813                 .procname       =       "gc_min_interval_ms",
2814                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2815                 .maxlen         =       sizeof(int),
2816                 .mode           =       0644,
2817                 .proc_handler   =       proc_dointvec_ms_jiffies,
2818         },
2819         { }
2820 };
2821
2822 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2823 {
2824         struct ctl_table *table;
2825
2826         table = kmemdup(ipv6_route_table_template,
2827                         sizeof(ipv6_route_table_template),
2828                         GFP_KERNEL);
2829
2830         if (table) {
2831                 table[0].data = &net->ipv6.sysctl.flush_delay;
2832                 table[0].extra1 = net;
2833                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2834                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2835                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2836                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2837                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2838                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2839                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2840                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2841                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2842         }
2843
2844         return table;
2845 }
2846 #endif
2847
2848 static int __net_init ip6_route_net_init(struct net *net)
2849 {
2850         int ret = -ENOMEM;
2851
2852         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2853                sizeof(net->ipv6.ip6_dst_ops));
2854
2855         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2856                 goto out_ip6_dst_ops;
2857
2858         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2859                                            sizeof(*net->ipv6.ip6_null_entry),
2860                                            GFP_KERNEL);
2861         if (!net->ipv6.ip6_null_entry)
2862                 goto out_ip6_dst_entries;
2863         net->ipv6.ip6_null_entry->dst.path =
2864                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2865         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2866         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2867                          ip6_template_metrics, true);
2868
2869 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2870         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2871                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2872                                                GFP_KERNEL);
2873         if (!net->ipv6.ip6_prohibit_entry)
2874                 goto out_ip6_null_entry;
2875         net->ipv6.ip6_prohibit_entry->dst.path =
2876                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2877         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2879                          ip6_template_metrics, true);
2880
2881         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2882                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2883                                                GFP_KERNEL);
2884         if (!net->ipv6.ip6_blk_hole_entry)
2885                 goto out_ip6_prohibit_entry;
2886         net->ipv6.ip6_blk_hole_entry->dst.path =
2887                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2888         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2889         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2890                          ip6_template_metrics, true);
2891 #endif
2892
2893         net->ipv6.sysctl.flush_delay = 0;
2894         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2895         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2896         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2897         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2898         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2899         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2900         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2901
2902 #ifdef CONFIG_PROC_FS
2903         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2904         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2905 #endif
2906         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2907
2908         ret = 0;
2909 out:
2910         return ret;
2911
2912 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2913 out_ip6_prohibit_entry:
2914         kfree(net->ipv6.ip6_prohibit_entry);
2915 out_ip6_null_entry:
2916         kfree(net->ipv6.ip6_null_entry);
2917 #endif
2918 out_ip6_dst_entries:
2919         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2920 out_ip6_dst_ops:
2921         goto out;
2922 }
2923
2924 static void __net_exit ip6_route_net_exit(struct net *net)
2925 {
2926 #ifdef CONFIG_PROC_FS
2927         proc_net_remove(net, "ipv6_route");
2928         proc_net_remove(net, "rt6_stats");
2929 #endif
2930         kfree(net->ipv6.ip6_null_entry);
2931 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2932         kfree(net->ipv6.ip6_prohibit_entry);
2933         kfree(net->ipv6.ip6_blk_hole_entry);
2934 #endif
2935         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2936 }
2937
2938 static struct pernet_operations ip6_route_net_ops = {
2939         .init = ip6_route_net_init,
2940         .exit = ip6_route_net_exit,
2941 };
2942
2943 static struct notifier_block ip6_route_dev_notifier = {
2944         .notifier_call = ip6_route_dev_notify,
2945         .priority = 0,
2946 };
2947
2948 int __init ip6_route_init(void)
2949 {
2950         int ret;
2951
2952         ret = -ENOMEM;
2953         ip6_dst_ops_template.kmem_cachep =
2954                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2955                                   SLAB_HWCACHE_ALIGN, NULL);
2956         if (!ip6_dst_ops_template.kmem_cachep)
2957                 goto out;
2958
2959         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2960         if (ret)
2961                 goto out_kmem_cache;
2962
2963         ret = register_pernet_subsys(&ip6_route_net_ops);
2964         if (ret)
2965                 goto out_dst_entries;
2966
2967         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2968
2969         /* Registering of the loopback is done before this portion of code,
2970          * the loopback reference in rt6_info will not be taken, do it
2971          * manually for init_net */
2972         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2973         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2974   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2975         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2976         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2977         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2978         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979   #endif
2980         ret = fib6_init();
2981         if (ret)
2982                 goto out_register_subsys;
2983
2984         ret = xfrm6_init();
2985         if (ret)
2986                 goto out_fib6_init;
2987
2988         ret = fib6_rules_init();
2989         if (ret)
2990                 goto xfrm6_init;
2991
2992         ret = -ENOBUFS;
2993         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2994             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2995             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2996                 goto fib6_rules_init;
2997
2998         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2999         if (ret)
3000                 goto fib6_rules_init;
3001
3002 out:
3003         return ret;
3004
3005 fib6_rules_init:
3006         fib6_rules_cleanup();
3007 xfrm6_init:
3008         xfrm6_fini();
3009 out_fib6_init:
3010         fib6_gc_cleanup();
3011 out_register_subsys:
3012         unregister_pernet_subsys(&ip6_route_net_ops);
3013 out_dst_entries:
3014         dst_entries_destroy(&ip6_dst_blackhole_ops);
3015 out_kmem_cache:
3016         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3017         goto out;
3018 }
3019
3020 void ip6_route_cleanup(void)
3021 {
3022         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3023         fib6_rules_cleanup();
3024         xfrm6_fini();
3025         fib6_gc_cleanup();
3026         unregister_pernet_subsys(&ip6_route_net_ops);
3027         dst_entries_destroy(&ip6_dst_blackhole_ops);
3028         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3029 }