1138b0a5566d47c16070488a6e29421254119f81
[~shefty/rdma-dev.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; otherwise they expand to
 * nothing / an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(); declared here, defined elsewhere. */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   int ifindex);
#endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject route whose handlers are
 * ip6_pkt_prohibit{,_out}() and whose error is -EACCES.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route: a reject route that hands packets
 * to dst_discard() in both directions and whose error is -EINVAL.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt)
251                 memset(&rt->rt6i_table, 0,
252                        sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (!sprt->rt6i_idev ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the route's neighbour entry is not in a NUD_VALID state and was
 * last updated more than the interface's rtr_probe_interval ago, stamp
 * it with the current time and send a Neighbour Solicitation to the
 * target's solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute; the rtr_probe_interval check is what limits it here.
 *
 * A NULL @rt, or a route without a neighbour, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	int probe_due;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	probe_due = !(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies, neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval);
	if (probe_due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	/* The solicitation itself is sent after the lock is dropped. */
	if (probe_due) {
		struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
		struct in6_addr mcaddr;

		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt points at the option and @len is the length available.  After
 * validation the matching route is looked up; it is deleted when the
 * advertised lifetime is zero, created when it does not exist yet and
 * the lifetime is non-zero, and otherwise has its preference refreshed.
 * The expiry is then set from the lifetime (or cleared for an infinite
 * one).
 *
 * Returns 0 on success, -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* The option carries the full 128-bit prefix. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	} else if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
622
/*
 * BACKTRACK - move up the fib6 tree after a lookup came back empty.
 *
 * This macro relies on the expanding function's scope: it reads and
 * writes the variables "rt" and "fn" and jumps to the labels "out" and
 * "restart".
 *
 * It does nothing unless rt is the null entry.  Otherwise it climbs
 * from fn towards the root: reaching a node flagged RTN_TL_ROOT jumps
 * to "out"; when the parent has a subtree other than fn, the subtree is
 * searched by @saddr; the first node found with RTN_RTINFO jumps to
 * "restart".
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688    It takes new route entry, the addition fails by any reason the
689    route is freed. In any case, if caller does not hold it, it may
690    be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
731                         if (rt->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         rt->rt6i_gateway = *daddr;
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         rt->rt6i_src.addr = *saddr;
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
748                                              rt->rt6i_dev);
749                 if (IS_ERR(neigh)) {
750                         struct net *net = dev_net(rt->rt6i_dev);
751                         int saved_rt_min_interval =
752                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
753                         int saved_rt_elasticity =
754                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
755
756                         if (attempts-- > 0) {
757                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
758                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759
760                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761
762                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
763                                         saved_rt_elasticity;
764                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
765                                         saved_rt_min_interval;
766                                 goto retry;
767                         }
768
769                         if (net_ratelimit())
770                                 printk(KERN_WARNING
771                                        "ipv6: Neighbour table overflow.\n");
772                         dst_free(&rt->dst);
773                         return NULL;
774                 }
775                 dst_set_neighbour(&rt->dst, neigh);
776
777         }
778
779         return rt;
780 }
781
782 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
783                                         const struct in6_addr *daddr)
784 {
785         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
786
787         if (rt) {
788                 rt->rt6i_flags |= RTF_CACHE;
789                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
790         }
791         return rt;
792 }
793
794 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
795                                       struct flowi6 *fl6, int flags)
796 {
797         struct fib6_node *fn;
798         struct rt6_info *rt, *nrt;
799         int strict = 0;
800         int attempts = 3;
801         int err;
802         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
803
804         strict |= flags & RT6_LOOKUP_F_IFACE;
805
806 relookup:
807         read_lock_bh(&table->tb6_lock);
808
809 restart_2:
810         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
811
812 restart:
813         rt = rt6_select(fn, oif, strict | reachable);
814
815         BACKTRACK(net, &fl6->saddr);
816         if (rt == net->ipv6.ip6_null_entry ||
817             rt->rt6i_flags & RTF_CACHE)
818                 goto out;
819
820         dst_hold(&rt->dst);
821         read_unlock_bh(&table->tb6_lock);
822
823         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
824                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
825         else if (!(rt->dst.flags & DST_HOST))
826                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
827         else
828                 goto out2;
829
830         dst_release(&rt->dst);
831         rt = nrt ? : net->ipv6.ip6_null_entry;
832
833         dst_hold(&rt->dst);
834         if (nrt) {
835                 err = ip6_ins_rt(nrt);
836                 if (!err)
837                         goto out2;
838         }
839
840         if (--attempts <= 0)
841                 goto out2;
842
843         /*
844          * Race condition! In the gap, when table->tb6_lock was
845          * released someone could insert this route.  Relookup.
846          */
847         dst_release(&rt->dst);
848         goto relookup;
849
850 out:
851         if (reachable) {
852                 reachable = 0;
853                 goto restart_2;
854         }
855         dst_hold(&rt->dst);
856         read_unlock_bh(&table->tb6_lock);
857 out2:
858         rt->dst.lastuse = jiffies;
859         rt->dst.__use++;
860
861         return rt;
862 }
863
864 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
865                                             struct flowi6 *fl6, int flags)
866 {
867         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
868 }
869
870 void ip6_route_input(struct sk_buff *skb)
871 {
872         const struct ipv6hdr *iph = ipv6_hdr(skb);
873         struct net *net = dev_net(skb->dev);
874         int flags = RT6_LOOKUP_F_HAS_SADDR;
875         struct flowi6 fl6 = {
876                 .flowi6_iif = skb->dev->ifindex,
877                 .daddr = iph->daddr,
878                 .saddr = iph->saddr,
879                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
880                 .flowi6_mark = skb->mark,
881                 .flowi6_proto = iph->nexthdr,
882         };
883
884         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
885                 flags |= RT6_LOOKUP_F_IFACE;
886
887         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
888 }
889
890 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
891                                              struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
894 }
895
896 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
897                                     struct flowi6 *fl6)
898 {
899         int flags = 0;
900
901         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
902                 flags |= RT6_LOOKUP_F_IFACE;
903
904         if (!ipv6_addr_any(&fl6->saddr))
905                 flags |= RT6_LOOKUP_F_HAS_SADDR;
906         else if (sk)
907                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908
909         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
910 }
911
912 EXPORT_SYMBOL(ip6_route_output);
913
914 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915 {
916         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
917         struct dst_entry *new = NULL;
918
919         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
920         if (rt) {
921                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
922
923                 new = &rt->dst;
924
925                 new->__use = 1;
926                 new->input = dst_discard;
927                 new->output = dst_discard;
928
929                 if (dst_metrics_read_only(&ort->dst))
930                         new->_metrics = ort->dst._metrics;
931                 else
932                         dst_copy_metrics(new, &ort->dst);
933                 rt->rt6i_idev = ort->rt6i_idev;
934                 if (rt->rt6i_idev)
935                         in6_dev_hold(rt->rt6i_idev);
936                 rt->rt6i_expires = 0;
937
938                 rt->rt6i_gateway = ort->rt6i_gateway;
939                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
940                 rt->rt6i_metric = 0;
941
942                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
943 #ifdef CONFIG_IPV6_SUBTREES
944                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
945 #endif
946
947                 dst_free(new);
948         }
949
950         dst_release(dst_orig);
951         return new ? new : ERR_PTR(-ENOMEM);
952 }
953
954 /*
955  *      Destination cache support functions
956  */
957
958 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
959 {
960         struct rt6_info *rt;
961
962         rt = (struct rt6_info *) dst;
963
964         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
965                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
966                         if (!rt->rt6i_peer)
967                                 rt6_bind_peer(rt, 0);
968                         rt->rt6i_peer_genid = rt6_peer_genid();
969                 }
970                 return dst;
971         }
972         return NULL;
973 }
974
975 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
976 {
977         struct rt6_info *rt = (struct rt6_info *) dst;
978
979         if (rt) {
980                 if (rt->rt6i_flags & RTF_CACHE) {
981                         if (rt6_check_expired(rt)) {
982                                 ip6_del_rt(rt);
983                                 dst = NULL;
984                         }
985                 } else {
986                         dst_release(dst);
987                         dst = NULL;
988                 }
989         }
990         return dst;
991 }
992
993 static void ip6_link_failure(struct sk_buff *skb)
994 {
995         struct rt6_info *rt;
996
997         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
998
999         rt = (struct rt6_info *) skb_dst(skb);
1000         if (rt) {
1001                 if (rt->rt6i_flags & RTF_CACHE) {
1002                         dst_set_expires(&rt->dst, 0);
1003                         rt->rt6i_flags |= RTF_EXPIRES;
1004                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1005                         rt->rt6i_node->fn_sernum = -1;
1006         }
1007 }
1008
1009 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1010 {
1011         struct rt6_info *rt6 = (struct rt6_info*)dst;
1012
1013         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1014                 rt6->rt6i_flags |= RTF_MODIFIED;
1015                 if (mtu < IPV6_MIN_MTU) {
1016                         u32 features = dst_metric(dst, RTAX_FEATURES);
1017                         mtu = IPV6_MIN_MTU;
1018                         features |= RTAX_FEATURE_ALLFRAG;
1019                         dst_metric_set(dst, RTAX_FEATURES, features);
1020                 }
1021                 dst_metric_set(dst, RTAX_MTU, mtu);
1022         }
1023 }
1024
1025 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1026 {
1027         struct net_device *dev = dst->dev;
1028         unsigned int mtu = dst_mtu(dst);
1029         struct net *net = dev_net(dev);
1030
1031         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1032
1033         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1034                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1035
1036         /*
1037          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1038          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1039          * IPV6_MAXPLEN is also valid and means: "any MSS,
1040          * rely only on pmtu discovery"
1041          */
1042         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1043                 mtu = IPV6_MAXPLEN;
1044         return mtu;
1045 }
1046
1047 static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 {
1049         struct inet6_dev *idev;
1050         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1051
1052         if (mtu)
1053                 return mtu;
1054
1055         mtu = IPV6_MIN_MTU;
1056
1057         rcu_read_lock();
1058         idev = __in6_dev_get(dst->dev);
1059         if (idev)
1060                 mtu = idev->cnf.mtu6;
1061         rcu_read_unlock();
1062
1063         return mtu;
1064 }
1065
/*
 * Singly linked list (chained through dst.next) of the dst entries handed
 * out by icmp6_dst_alloc(); icmp6_dst_gc() and icmp6_clean_all() prune it.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Held (with BHs disabled) around every walk and update of the list above. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1068
1069 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1070                                   struct neighbour *neigh,
1071                                   const struct in6_addr *addr)
1072 {
1073         struct rt6_info *rt;
1074         struct inet6_dev *idev = in6_dev_get(dev);
1075         struct net *net = dev_net(dev);
1076
1077         if (unlikely(!idev))
1078                 return NULL;
1079
1080         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1081         if (unlikely(!rt)) {
1082                 in6_dev_put(idev);
1083                 goto out;
1084         }
1085
1086         if (neigh)
1087                 neigh_hold(neigh);
1088         else {
1089                 neigh = __neigh_lookup_errno(&nd_tbl, addr, dev);
1090                 if (IS_ERR(neigh))
1091                         neigh = NULL;
1092         }
1093
1094         rt->dst.flags |= DST_HOST;
1095         rt->dst.output  = ip6_output;
1096         dst_set_neighbour(&rt->dst, neigh);
1097         atomic_set(&rt->dst.__refcnt, 1);
1098         rt->rt6i_dst.addr = *addr;
1099         rt->rt6i_dst.plen = 128;
1100         rt->rt6i_idev     = idev;
1101         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1102
1103         spin_lock_bh(&icmp6_dst_lock);
1104         rt->dst.next = icmp6_dst_gc_list;
1105         icmp6_dst_gc_list = &rt->dst;
1106         spin_unlock_bh(&icmp6_dst_lock);
1107
1108         fib6_force_start_gc(net);
1109
1110 out:
1111         return &rt->dst;
1112 }
1113
1114 int icmp6_dst_gc(void)
1115 {
1116         struct dst_entry *dst, **pprev;
1117         int more = 0;
1118
1119         spin_lock_bh(&icmp6_dst_lock);
1120         pprev = &icmp6_dst_gc_list;
1121
1122         while ((dst = *pprev) != NULL) {
1123                 if (!atomic_read(&dst->__refcnt)) {
1124                         *pprev = dst->next;
1125                         dst_free(dst);
1126                 } else {
1127                         pprev = &dst->next;
1128                         ++more;
1129                 }
1130         }
1131
1132         spin_unlock_bh(&icmp6_dst_lock);
1133
1134         return more;
1135 }
1136
1137 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1138                             void *arg)
1139 {
1140         struct dst_entry *dst, **pprev;
1141
1142         spin_lock_bh(&icmp6_dst_lock);
1143         pprev = &icmp6_dst_gc_list;
1144         while ((dst = *pprev) != NULL) {
1145                 struct rt6_info *rt = (struct rt6_info *) dst;
1146                 if (func(rt, arg)) {
1147                         *pprev = dst->next;
1148                         dst_free(dst);
1149                 } else {
1150                         pprev = &dst->next;
1151                 }
1152         }
1153         spin_unlock_bh(&icmp6_dst_lock);
1154 }
1155
1156 static int ip6_dst_gc(struct dst_ops *ops)
1157 {
1158         unsigned long now = jiffies;
1159         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1160         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1161         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1162         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1163         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1164         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1165         int entries;
1166
1167         entries = dst_entries_get_fast(ops);
1168         if (time_after(rt_last_gc + rt_min_interval, now) &&
1169             entries <= rt_max_size)
1170                 goto out;
1171
1172         net->ipv6.ip6_rt_gc_expire++;
1173         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1174         net->ipv6.ip6_rt_last_gc = now;
1175         entries = dst_entries_get_slow(ops);
1176         if (entries < ops->gc_thresh)
1177                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1178 out:
1179         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1180         return entries > rt_max_size;
1181 }
1182
1183 /* Clean host part of a prefix. Not necessary in radix tree,
1184    but results in cleaner routing tables.
1185
1186    Remove it only when all the things will work!
1187  */
1188
1189 int ip6_dst_hoplimit(struct dst_entry *dst)
1190 {
1191         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1192         if (hoplimit == 0) {
1193                 struct net_device *dev = dst->dev;
1194                 struct inet6_dev *idev;
1195
1196                 rcu_read_lock();
1197                 idev = __in6_dev_get(dev);
1198                 if (idev)
1199                         hoplimit = idev->cnf.hop_limit;
1200                 else
1201                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1202                 rcu_read_unlock();
1203         }
1204         return hoplimit;
1205 }
1206 EXPORT_SYMBOL(ip6_dst_hoplimit);
1207
1208 /*
1209  *
1210  */
1211
/*
 * Create a route from @cfg and insert it into its fib6 table.
 *
 * Side effects on @cfg: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net is
 * overwritten with the namespace of the device finally chosen.
 *
 * Returns the result of __ip6_ins_rt() once the route has been built, or a
 * negative errno picked at the point of failure.  On failure the device
 * and inet6_dev references taken here are dropped and the partially built
 * route is freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source-specific routes need subtree support. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* Resolve the requested interface, taking dev and idev references. */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        err = -ENOBUFS;
        /*
         * A netlink request without NLM_F_CREATE first tries the existing
         * table; if there is none, a warning is logged and the table is
         * created anyway.
         */
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

        if (!rt) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        /* fc_expires is in clock_t units; 0 means the route never expires. */
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                                0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type / RTF_LOCAL. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        /* Store only the prefix bits of the destination. */
        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags |= DST_HOST;

        /* Non-host routes that carry metrics get a private metrics array. */
        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* Swap any previously held device for the loopback device. */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                rt->rt6i_gateway = *gw_addr;
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;

                        /* The gateway itself must be reachable via an existing route. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (!grt)
                                goto out;
                        if (dev) {
                                /* An explicit interface must agree with the gateway's route. */
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No interface given: inherit it from the gateway's route. */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* Only accept a gateway whose own route is not gatewayed. */
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (!dev || (dev->flags & IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        /* A preferred source address must be checked against the device. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        /* Gatewayed and NONEXTHOP routes get their neighbour bound up front. */
        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(n)) {
                        err = PTR_ERR(n);
                        goto out;
                }
                dst_set_neighbour(&rt->dst, n);
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Apply user-supplied metrics; attribute type 0 is skipped. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev references held here are handed over to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* Failure: drop whatever was acquired so far. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1442
1443 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1444 {
1445         int err;
1446         struct fib6_table *table;
1447         struct net *net = dev_net(rt->rt6i_dev);
1448
1449         if (rt == net->ipv6.ip6_null_entry)
1450                 return -ENOENT;
1451
1452         table = rt->rt6i_table;
1453         write_lock_bh(&table->tb6_lock);
1454
1455         err = fib6_del(rt, info);
1456         dst_release(&rt->dst);
1457
1458         write_unlock_bh(&table->tb6_lock);
1459
1460         return err;
1461 }
1462
1463 int ip6_del_rt(struct rt6_info *rt)
1464 {
1465         struct nl_info info = {
1466                 .nl_net = dev_net(rt->rt6i_dev),
1467         };
1468         return __ip6_del_rt(rt, &info);
1469 }
1470
1471 static int ip6_route_del(struct fib6_config *cfg)
1472 {
1473         struct fib6_table *table;
1474         struct fib6_node *fn;
1475         struct rt6_info *rt;
1476         int err = -ESRCH;
1477
1478         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1479         if (!table)
1480                 return err;
1481
1482         read_lock_bh(&table->tb6_lock);
1483
1484         fn = fib6_locate(&table->tb6_root,
1485                          &cfg->fc_dst, cfg->fc_dst_len,
1486                          &cfg->fc_src, cfg->fc_src_len);
1487
1488         if (fn) {
1489                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1490                         if (cfg->fc_ifindex &&
1491                             (!rt->rt6i_dev ||
1492                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1493                                 continue;
1494                         if (cfg->fc_flags & RTF_GATEWAY &&
1495                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1496                                 continue;
1497                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1498                                 continue;
1499                         dst_hold(&rt->dst);
1500                         read_unlock_bh(&table->tb6_lock);
1501
1502                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1503                 }
1504         }
1505         read_unlock_bh(&table->tb6_lock);
1506
1507         return err;
1508 }
1509
1510 /*
1511  *      Handle redirects
1512  */
/*
 * Flow key extended with the redirecting router's address.
 *
 * fl6 must stay the first member: __ip6_route_redirect() receives a
 * struct flowi6 pointer and casts it back to this type.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;        /* router the redirect claims to come from */
};
1517
1518 static struct rt6_info *__ip6_route_redirect(struct net *net,
1519                                              struct fib6_table *table,
1520                                              struct flowi6 *fl6,
1521                                              int flags)
1522 {
1523         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1524         struct rt6_info *rt;
1525         struct fib6_node *fn;
1526
1527         /*
1528          * Get the "current" route for this destination and
1529          * check if the redirect has come from approriate router.
1530          *
1531          * RFC 2461 specifies that redirects should only be
1532          * accepted if they come from the nexthop to the target.
1533          * Due to the way the routes are chosen, this notion
1534          * is a bit fuzzy and one might need to check all possible
1535          * routes.
1536          */
1537
1538         read_lock_bh(&table->tb6_lock);
1539         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1540 restart:
1541         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1542                 /*
1543                  * Current route is on-link; redirect is always invalid.
1544                  *
1545                  * Seems, previous statement is not true. It could
1546                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1547                  * But then router serving it might decide, that we should
1548                  * know truth 8)8) --ANK (980726).
1549                  */
1550                 if (rt6_check_expired(rt))
1551                         continue;
1552                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1553                         continue;
1554                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1555                         continue;
1556                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1557                         continue;
1558                 break;
1559         }
1560
1561         if (!rt)
1562                 rt = net->ipv6.ip6_null_entry;
1563         BACKTRACK(net, &fl6->saddr);
1564 out:
1565         dst_hold(&rt->dst);
1566
1567         read_unlock_bh(&table->tb6_lock);
1568
1569         return rt;
1570 };
1571
1572 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1573                                            const struct in6_addr *src,
1574                                            const struct in6_addr *gateway,
1575                                            struct net_device *dev)
1576 {
1577         int flags = RT6_LOOKUP_F_HAS_SADDR;
1578         struct net *net = dev_net(dev);
1579         struct ip6rd_flowi rdfl = {
1580                 .fl6 = {
1581                         .flowi6_oif = dev->ifindex,
1582                         .daddr = *dest,
1583                         .saddr = *src,
1584                 },
1585         };
1586
1587         rdfl.gateway = *gateway;
1588
1589         if (rt6_need_strict(dest))
1590                 flags |= RT6_LOOKUP_F_IFACE;
1591
1592         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1593                                                    flags, __ip6_route_redirect);
1594 }
1595
/*
 * Act on a received redirect.
 *
 * @dest:    destination being redirected
 * @src:     source address used for the route lookup
 * @saddr:   address of the router that sent the redirect; it must match
 *           the gateway of the current route to @dest
 * @neigh:   neighbour entry for the new next hop
 * @lladdr:  link-layer address passed on to neigh_update()
 * @on_link: non-zero when the target is on-link (no gateway flag on the
 *           new route, and the neighbour is not marked as a router)
 *
 * If the redirect is acceptable, a host cache route to @dest through
 * @neigh is installed, netevent notifiers are called, and a previous
 * cache route is deleted.
 */
void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
                  const struct in6_addr *saddr,
                  struct neighbour *neigh, u8 *lladdr, int on_link)
{
        struct rt6_info *rt, *nrt = NULL;
        struct netevent_redirect netevent;
        struct net *net = dev_net(neigh->dev);

        /* Returns a held route; the null entry means "no matching nexthop". */
        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

        if (rt == net->ipv6.ip6_null_entry) {
                if (net_ratelimit())
                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
                               "for redirect target\n");
                goto out;
        }

        /*
         *      We have finally decided to accept it.
         */

        neigh_update(neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER))
                     );

        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Duplicate redirect: silently ignore. */
        if (neigh == dst_get_neighbour_raw(&rt->dst))
                goto out;

        /* Build a host route to dest based on the current route. */
        nrt = ip6_rt_copy(rt, dest);
        if (!nrt)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));

        if (ip6_ins_rt(nrt))
                goto out;

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

        /* A superseded cache route is deleted; ip6_del_rt() also drops our reference. */
        if (rt->rt6i_flags & RTF_CACHE) {
                ip6_del_rt(rt);
                return;
        }

out:
        dst_release(&rt->dst);
}
1661
1662 /*
1663  *      Handle ICMP "packet too big" messages
1664  *      i.e. Path MTU discovery
1665  */
1666
1667 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1668                              struct net *net, u32 pmtu, int ifindex)
1669 {
1670         struct rt6_info *rt, *nrt;
1671         int allfrag = 0;
1672 again:
1673         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1674         if (!rt)
1675                 return;
1676
1677         if (rt6_check_expired(rt)) {
1678                 ip6_del_rt(rt);
1679                 goto again;
1680         }
1681
1682         if (pmtu >= dst_mtu(&rt->dst))
1683                 goto out;
1684
1685         if (pmtu < IPV6_MIN_MTU) {
1686                 /*
1687                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1688                  * MTU (1280) and a fragment header should always be included
1689                  * after a node receiving Too Big message reporting PMTU is
1690                  * less than the IPv6 Minimum Link MTU.
1691                  */
1692                 pmtu = IPV6_MIN_MTU;
1693                 allfrag = 1;
1694         }
1695
1696         /* New mtu received -> path was valid.
1697            They are sent only in response to data packets,
1698            so that this nexthop apparently is reachable. --ANK
1699          */
1700         dst_confirm(&rt->dst);
1701
1702         /* Host route. If it is static, it would be better
1703            not to override it, but add new one, so that
1704            when cache entry will expire old pmtu
1705            would return automatically.
1706          */
1707         if (rt->rt6i_flags & RTF_CACHE) {
1708                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1709                 if (allfrag) {
1710                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1711                         features |= RTAX_FEATURE_ALLFRAG;
1712                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1713                 }
1714                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1715                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1716                 goto out;
1717         }
1718
1719         /* Network route.
1720            Two cases are possible:
1721            1. It is connected route. Action: COW
1722            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1723          */
1724         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1725                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1726         else
1727                 nrt = rt6_alloc_clone(rt, daddr);
1728
1729         if (nrt) {
1730                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1731                 if (allfrag) {
1732                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1733                         features |= RTAX_FEATURE_ALLFRAG;
1734                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1735                 }
1736
1737                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1738                  * happened within 5 mins, the recommended timer is 10 mins.
1739                  * Here this route expiration time is set to ip6_rt_mtu_expires
1740                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1741                  * and detecting PMTU increase will be automatically happened.
1742                  */
1743                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1744                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1745
1746                 ip6_ins_rt(nrt);
1747         }
1748 out:
1749         dst_release(&rt->dst);
1750 }
1751
1752 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1753                         struct net_device *dev, u32 pmtu)
1754 {
1755         struct net *net = dev_net(dev);
1756
1757         /*
1758          * RFC 1981 states that a node "MUST reduce the size of the packets it
1759          * is sending along the path" that caused the Packet Too Big message.
1760          * Since it's not possible in the general case to determine which
1761          * interface was used to send the original packet, we update the MTU
1762          * on the interface that will be used to send future packets. We also
1763          * update the MTU on the interface that received the Packet Too Big in
1764          * case the original packet was forced out that interface with
1765          * SO_BINDTODEVICE or similar. This is the next best thing to the
1766          * correct behaviour, which would be to update the MTU on all
1767          * interfaces.
1768          */
1769         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1770         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1771 }
1772
1773 /*
1774  *      Misc support functions
1775  */
1776
1777 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1778                                     const struct in6_addr *dest)
1779 {
1780         struct net *net = dev_net(ort->rt6i_dev);
1781         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1782                                             ort->dst.dev, 0);
1783
1784         if (rt) {
1785                 rt->dst.input = ort->dst.input;
1786                 rt->dst.output = ort->dst.output;
1787                 rt->dst.flags |= DST_HOST;
1788
1789                 rt->rt6i_dst.addr = *dest;
1790                 rt->rt6i_dst.plen = 128;
1791                 dst_copy_metrics(&rt->dst, &ort->dst);
1792                 rt->dst.error = ort->dst.error;
1793                 rt->rt6i_idev = ort->rt6i_idev;
1794                 if (rt->rt6i_idev)
1795                         in6_dev_hold(rt->rt6i_idev);
1796                 rt->dst.lastuse = jiffies;
1797                 rt->rt6i_expires = 0;
1798
1799                 rt->rt6i_gateway = ort->rt6i_gateway;
1800                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1801                 rt->rt6i_metric = 0;
1802
1803 #ifdef CONFIG_IPV6_SUBTREES
1804                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1805 #endif
1806                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1807                 rt->rt6i_table = ort->rt6i_table;
1808         }
1809         return rt;
1810 }
1811
1812 #ifdef CONFIG_IPV6_ROUTE_INFO
1813 static struct rt6_info *rt6_get_route_info(struct net *net,
1814                                            const struct in6_addr *prefix, int prefixlen,
1815                                            const struct in6_addr *gwaddr, int ifindex)
1816 {
1817         struct fib6_node *fn;
1818         struct rt6_info *rt = NULL;
1819         struct fib6_table *table;
1820
1821         table = fib6_get_table(net, RT6_TABLE_INFO);
1822         if (!table)
1823                 return NULL;
1824
1825         write_lock_bh(&table->tb6_lock);
1826         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1827         if (!fn)
1828                 goto out;
1829
1830         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1831                 if (rt->rt6i_dev->ifindex != ifindex)
1832                         continue;
1833                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1834                         continue;
1835                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1836                         continue;
1837                 dst_hold(&rt->dst);
1838                 break;
1839         }
1840 out:
1841         write_unlock_bh(&table->tb6_lock);
1842         return rt;
1843 }
1844
1845 static struct rt6_info *rt6_add_route_info(struct net *net,
1846                                            const struct in6_addr *prefix, int prefixlen,
1847                                            const struct in6_addr *gwaddr, int ifindex,
1848                                            unsigned pref)
1849 {
1850         struct fib6_config cfg = {
1851                 .fc_table       = RT6_TABLE_INFO,
1852                 .fc_metric      = IP6_RT_PRIO_USER,
1853                 .fc_ifindex     = ifindex,
1854                 .fc_dst_len     = prefixlen,
1855                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1856                                   RTF_UP | RTF_PREF(pref),
1857                 .fc_nlinfo.pid = 0,
1858                 .fc_nlinfo.nlh = NULL,
1859                 .fc_nlinfo.nl_net = net,
1860         };
1861
1862         cfg.fc_dst = *prefix;
1863         cfg.fc_gateway = *gwaddr;
1864
1865         /* We should treat it as a default route if prefix length is 0. */
1866         if (!prefixlen)
1867                 cfg.fc_flags |= RTF_DEFAULT;
1868
1869         ip6_route_add(&cfg);
1870
1871         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1872 }
1873 #endif
1874
1875 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1876 {
1877         struct rt6_info *rt;
1878         struct fib6_table *table;
1879
1880         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1881         if (!table)
1882                 return NULL;
1883
1884         write_lock_bh(&table->tb6_lock);
1885         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1886                 if (dev == rt->rt6i_dev &&
1887                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1888                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1889                         break;
1890         }
1891         if (rt)
1892                 dst_hold(&rt->dst);
1893         write_unlock_bh(&table->tb6_lock);
1894         return rt;
1895 }
1896
1897 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1898                                      struct net_device *dev,
1899                                      unsigned int pref)
1900 {
1901         struct fib6_config cfg = {
1902                 .fc_table       = RT6_TABLE_DFLT,
1903                 .fc_metric      = IP6_RT_PRIO_USER,
1904                 .fc_ifindex     = dev->ifindex,
1905                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1906                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1907                 .fc_nlinfo.pid = 0,
1908                 .fc_nlinfo.nlh = NULL,
1909                 .fc_nlinfo.nl_net = dev_net(dev),
1910         };
1911
1912         cfg.fc_gateway = *gwaddr;
1913
1914         ip6_route_add(&cfg);
1915
1916         return rt6_get_dflt_router(gwaddr, dev);
1917 }
1918
1919 void rt6_purge_dflt_routers(struct net *net)
1920 {
1921         struct rt6_info *rt;
1922         struct fib6_table *table;
1923
1924         /* NOTE: Keep consistent with rt6_get_dflt_router */
1925         table = fib6_get_table(net, RT6_TABLE_DFLT);
1926         if (!table)
1927                 return;
1928
1929 restart:
1930         read_lock_bh(&table->tb6_lock);
1931         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1932                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1933                         dst_hold(&rt->dst);
1934                         read_unlock_bh(&table->tb6_lock);
1935                         ip6_del_rt(rt);
1936                         goto restart;
1937                 }
1938         }
1939         read_unlock_bh(&table->tb6_lock);
1940 }
1941
1942 static void rtmsg_to_fib6_config(struct net *net,
1943                                  struct in6_rtmsg *rtmsg,
1944                                  struct fib6_config *cfg)
1945 {
1946         memset(cfg, 0, sizeof(*cfg));
1947
1948         cfg->fc_table = RT6_TABLE_MAIN;
1949         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1950         cfg->fc_metric = rtmsg->rtmsg_metric;
1951         cfg->fc_expires = rtmsg->rtmsg_info;
1952         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1953         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1954         cfg->fc_flags = rtmsg->rtmsg_flags;
1955
1956         cfg->fc_nlinfo.nl_net = net;
1957
1958         cfg->fc_dst = rtmsg->rtmsg_dst;
1959         cfg->fc_src = rtmsg->rtmsg_src;
1960         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1961 }
1962
1963 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1964 {
1965         struct fib6_config cfg;
1966         struct in6_rtmsg rtmsg;
1967         int err;
1968
1969         switch(cmd) {
1970         case SIOCADDRT:         /* Add a route */
1971         case SIOCDELRT:         /* Delete a route */
1972                 if (!capable(CAP_NET_ADMIN))
1973                         return -EPERM;
1974                 err = copy_from_user(&rtmsg, arg,
1975                                      sizeof(struct in6_rtmsg));
1976                 if (err)
1977                         return -EFAULT;
1978
1979                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1980
1981                 rtnl_lock();
1982                 switch (cmd) {
1983                 case SIOCADDRT:
1984                         err = ip6_route_add(&cfg);
1985                         break;
1986                 case SIOCDELRT:
1987                         err = ip6_route_del(&cfg);
1988                         break;
1989                 default:
1990                         err = -EINVAL;
1991                 }
1992                 rtnl_unlock();
1993
1994                 return err;
1995         }
1996
1997         return -EINVAL;
1998 }
1999
2000 /*
2001  *      Drop the packet on the floor
2002  */
2003
2004 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2005 {
2006         int type;
2007         struct dst_entry *dst = skb_dst(skb);
2008         switch (ipstats_mib_noroutes) {
2009         case IPSTATS_MIB_INNOROUTES:
2010                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2011                 if (type == IPV6_ADDR_ANY) {
2012                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2013                                       IPSTATS_MIB_INADDRERRORS);
2014                         break;
2015                 }
2016                 /* FALLTHROUGH */
2017         case IPSTATS_MIB_OUTNOROUTES:
2018                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2019                               ipstats_mib_noroutes);
2020                 break;
2021         }
2022         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2023         kfree_skb(skb);
2024         return 0;
2025 }
2026
2027 static int ip6_pkt_discard(struct sk_buff *skb)
2028 {
2029         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2030 }
2031
2032 static int ip6_pkt_discard_out(struct sk_buff *skb)
2033 {
2034         skb->dev = skb_dst(skb)->dev;
2035         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2036 }
2037
2038 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2039
2040 static int ip6_pkt_prohibit(struct sk_buff *skb)
2041 {
2042         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2043 }
2044
2045 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2046 {
2047         skb->dev = skb_dst(skb)->dev;
2048         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2049 }
2050
2051 #endif
2052
2053 /*
2054  *      Allocate a dst for local (unicast / anycast) address.
2055  */
2056
2057 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2058                                     const struct in6_addr *addr,
2059                                     int anycast)
2060 {
2061         struct net *net = dev_net(idev->dev);
2062         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2063                                             net->loopback_dev, 0);
2064         struct neighbour *neigh;
2065
2066         if (!rt) {
2067                 if (net_ratelimit())
2068                         pr_warning("IPv6:  Maximum number of routes reached,"
2069                                    " consider increasing route/max_size.\n");
2070                 return ERR_PTR(-ENOMEM);
2071         }
2072
2073         in6_dev_hold(idev);
2074
2075         rt->dst.flags |= DST_HOST;
2076         rt->dst.input = ip6_input;
2077         rt->dst.output = ip6_output;
2078         rt->rt6i_idev = idev;
2079         rt->dst.obsolete = -1;
2080
2081         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2082         if (anycast)
2083                 rt->rt6i_flags |= RTF_ANYCAST;
2084         else
2085                 rt->rt6i_flags |= RTF_LOCAL;
2086         neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
2087         if (IS_ERR(neigh)) {
2088                 dst_free(&rt->dst);
2089
2090                 return ERR_CAST(neigh);
2091         }
2092         dst_set_neighbour(&rt->dst, neigh);
2093
2094         rt->rt6i_dst.addr = *addr;
2095         rt->rt6i_dst.plen = 128;
2096         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2097
2098         atomic_set(&rt->dst.__refcnt, 1);
2099
2100         return rt;
2101 }
2102
2103 int ip6_route_get_saddr(struct net *net,
2104                         struct rt6_info *rt,
2105                         const struct in6_addr *daddr,
2106                         unsigned int prefs,
2107                         struct in6_addr *saddr)
2108 {
2109         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2110         int err = 0;
2111         if (rt->rt6i_prefsrc.plen)
2112                 *saddr = rt->rt6i_prefsrc.addr;
2113         else
2114                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2115                                          daddr, prefs, saddr);
2116         return err;
2117 }
2118
/*
 * Argument bundle for fib6_remove_prefsrc(): identifies the address that
 * has been deleted so that it can be dropped from routes' prefsrc entries.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* the address being removed */
};
2125
2126 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2127 {
2128         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2129         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2130         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2131
2132         if (((void *)rt->rt6i_dev == dev || !dev) &&
2133             rt != net->ipv6.ip6_null_entry &&
2134             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2135                 /* remove prefsrc entry */
2136                 rt->rt6i_prefsrc.plen = 0;
2137         }
2138         return 0;
2139 }
2140
2141 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2142 {
2143         struct net *net = dev_net(ifp->idev->dev);
2144         struct arg_dev_net_ip adni = {
2145                 .dev = ifp->idev->dev,
2146                 .net = net,
2147                 .addr = &ifp->addr,
2148         };
2149         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2150 }
2151
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2156
2157 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2158 {
2159         const struct arg_dev_net *adn = arg;
2160         const struct net_device *dev = adn->dev;
2161
2162         if ((rt->rt6i_dev == dev || !dev) &&
2163             rt != adn->net->ipv6.ip6_null_entry) {
2164                 RT6_TRACE("deleted by ifdown %p\n", rt);
2165                 return -1;
2166         }
2167         return 0;
2168 }
2169
2170 void rt6_ifdown(struct net *net, struct net_device *dev)
2171 {
2172         struct arg_dev_net adn = {
2173                 .dev = dev,
2174                 .net = net,
2175         };
2176
2177         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2178         icmp6_clean_all(fib6_ifdown, &adn);
2179 }
2180
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2186
2187 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2188 {
2189         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2190         struct inet6_dev *idev;
2191
2192         /* In IPv6 pmtu discovery is not optional,
2193            so that RTAX_MTU lock cannot disable it.
2194            We still use this lock to block changes
2195            caused by addrconf/ndisc.
2196         */
2197
2198         idev = __in6_dev_get(arg->dev);
2199         if (!idev)
2200                 return 0;
2201
2202         /* For administrative MTU increase, there is no way to discover
2203            IPv6 PMTU increase, so PMTU increase should be updated here.
2204            Since RFC 1981 doesn't include administrative MTU increase
2205            update PMTU increase is a MUST. (i.e. jumbo frame)
2206          */
2207         /*
2208            If new MTU is less than route PMTU, this new MTU will be the
2209            lowest MTU in the path, update the route PMTU to reflect PMTU
2210            decreases; if new MTU is greater than route PMTU, and the
2211            old MTU is the lowest MTU in the path, update the route PMTU
2212            to reflect the increase. In this case if the other nodes' MTU
2213            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2214            PMTU discouvery.
2215          */
2216         if (rt->rt6i_dev == arg->dev &&
2217             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2218             (dst_mtu(&rt->dst) >= arg->mtu ||
2219              (dst_mtu(&rt->dst) < arg->mtu &&
2220               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2221                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2222         }
2223         return 0;
2224 }
2225
2226 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2227 {
2228         struct rt6_mtu_change_arg arg = {
2229                 .dev = dev,
2230                 .mtu = mtu,
2231         };
2232
2233         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2234 }
2235
2236 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2237         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2238         [RTA_OIF]               = { .type = NLA_U32 },
2239         [RTA_IIF]               = { .type = NLA_U32 },
2240         [RTA_PRIORITY]          = { .type = NLA_U32 },
2241         [RTA_METRICS]           = { .type = NLA_NESTED },
2242 };
2243
2244 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2245                               struct fib6_config *cfg)
2246 {
2247         struct rtmsg *rtm;
2248         struct nlattr *tb[RTA_MAX+1];
2249         int err;
2250
2251         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2252         if (err < 0)
2253                 goto errout;
2254
2255         err = -EINVAL;
2256         rtm = nlmsg_data(nlh);
2257         memset(cfg, 0, sizeof(*cfg));
2258
2259         cfg->fc_table = rtm->rtm_table;
2260         cfg->fc_dst_len = rtm->rtm_dst_len;
2261         cfg->fc_src_len = rtm->rtm_src_len;
2262         cfg->fc_flags = RTF_UP;
2263         cfg->fc_protocol = rtm->rtm_protocol;
2264
2265         if (rtm->rtm_type == RTN_UNREACHABLE)
2266                 cfg->fc_flags |= RTF_REJECT;
2267
2268         if (rtm->rtm_type == RTN_LOCAL)
2269                 cfg->fc_flags |= RTF_LOCAL;
2270
2271         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2272         cfg->fc_nlinfo.nlh = nlh;
2273         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2274
2275         if (tb[RTA_GATEWAY]) {
2276                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2277                 cfg->fc_flags |= RTF_GATEWAY;
2278         }
2279
2280         if (tb[RTA_DST]) {
2281                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2282
2283                 if (nla_len(tb[RTA_DST]) < plen)
2284                         goto errout;
2285
2286                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2287         }
2288
2289         if (tb[RTA_SRC]) {
2290                 int plen = (rtm->rtm_src_len + 7) >> 3;
2291
2292                 if (nla_len(tb[RTA_SRC]) < plen)
2293                         goto errout;
2294
2295                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2296         }
2297
2298         if (tb[RTA_PREFSRC])
2299                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2300
2301         if (tb[RTA_OIF])
2302                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2303
2304         if (tb[RTA_PRIORITY])
2305                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2306
2307         if (tb[RTA_METRICS]) {
2308                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2309                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2310         }
2311
2312         if (tb[RTA_TABLE])
2313                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2314
2315         err = 0;
2316 errout:
2317         return err;
2318 }
2319
2320 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2321 {
2322         struct fib6_config cfg;
2323         int err;
2324
2325         err = rtm_to_fib6_config(skb, nlh, &cfg);
2326         if (err < 0)
2327                 return err;
2328
2329         return ip6_route_del(&cfg);
2330 }
2331
2332 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2333 {
2334         struct fib6_config cfg;
2335         int err;
2336
2337         err = rtm_to_fib6_config(skb, nlh, &cfg);
2338         if (err < 0)
2339                 return err;
2340
2341         return ip6_route_add(&cfg);
2342 }
2343
2344 static inline size_t rt6_nlmsg_size(void)
2345 {
2346         return NLMSG_ALIGN(sizeof(struct rtmsg))
2347                + nla_total_size(16) /* RTA_SRC */
2348                + nla_total_size(16) /* RTA_DST */
2349                + nla_total_size(16) /* RTA_GATEWAY */
2350                + nla_total_size(16) /* RTA_PREFSRC */
2351                + nla_total_size(4) /* RTA_TABLE */
2352                + nla_total_size(4) /* RTA_IIF */
2353                + nla_total_size(4) /* RTA_OIF */
2354                + nla_total_size(4) /* RTA_PRIORITY */
2355                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2356                + nla_total_size(sizeof(struct rta_cacheinfo));
2357 }
2358
2359 static int rt6_fill_node(struct net *net,
2360                          struct sk_buff *skb, struct rt6_info *rt,
2361                          struct in6_addr *dst, struct in6_addr *src,
2362                          int iif, int type, u32 pid, u32 seq,
2363                          int prefix, int nowait, unsigned int flags)
2364 {
2365         struct rtmsg *rtm;
2366         struct nlmsghdr *nlh;
2367         long expires;
2368         u32 table;
2369         struct neighbour *n;
2370
2371         if (prefix) {   /* user wants prefix routes only */
2372                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2373                         /* success since this is not a prefix route */
2374                         return 1;
2375                 }
2376         }
2377
2378         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2379         if (!nlh)
2380                 return -EMSGSIZE;
2381
2382         rtm = nlmsg_data(nlh);
2383         rtm->rtm_family = AF_INET6;
2384         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2385         rtm->rtm_src_len = rt->rt6i_src.plen;
2386         rtm->rtm_tos = 0;
2387         if (rt->rt6i_table)
2388                 table = rt->rt6i_table->tb6_id;
2389         else
2390                 table = RT6_TABLE_UNSPEC;
2391         rtm->rtm_table = table;
2392         NLA_PUT_U32(skb, RTA_TABLE, table);
2393         if (rt->rt6i_flags & RTF_REJECT)
2394                 rtm->rtm_type = RTN_UNREACHABLE;
2395         else if (rt->rt6i_flags & RTF_LOCAL)
2396                 rtm->rtm_type = RTN_LOCAL;
2397         else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
2398                 rtm->rtm_type = RTN_LOCAL;
2399         else
2400                 rtm->rtm_type = RTN_UNICAST;
2401         rtm->rtm_flags = 0;
2402         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2403         rtm->rtm_protocol = rt->rt6i_protocol;
2404         if (rt->rt6i_flags & RTF_DYNAMIC)
2405                 rtm->rtm_protocol = RTPROT_REDIRECT;
2406         else if (rt->rt6i_flags & RTF_ADDRCONF)
2407                 rtm->rtm_protocol = RTPROT_KERNEL;
2408         else if (rt->rt6i_flags & RTF_DEFAULT)
2409                 rtm->rtm_protocol = RTPROT_RA;
2410
2411         if (rt->rt6i_flags & RTF_CACHE)
2412                 rtm->rtm_flags |= RTM_F_CLONED;
2413
2414         if (dst) {
2415                 NLA_PUT(skb, RTA_DST, 16, dst);
2416                 rtm->rtm_dst_len = 128;
2417         } else if (rtm->rtm_dst_len)
2418                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2419 #ifdef CONFIG_IPV6_SUBTREES
2420         if (src) {
2421                 NLA_PUT(skb, RTA_SRC, 16, src);
2422                 rtm->rtm_src_len = 128;
2423         } else if (rtm->rtm_src_len)
2424                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2425 #endif
2426         if (iif) {
2427 #ifdef CONFIG_IPV6_MROUTE
2428                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2429                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2430                         if (err <= 0) {
2431                                 if (!nowait) {
2432                                         if (err == 0)
2433                                                 return 0;
2434                                         goto nla_put_failure;
2435                                 } else {
2436                                         if (err == -EMSGSIZE)
2437                                                 goto nla_put_failure;
2438                                 }
2439                         }
2440                 } else
2441 #endif
2442                         NLA_PUT_U32(skb, RTA_IIF, iif);
2443         } else if (dst) {
2444                 struct in6_addr saddr_buf;
2445                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2446                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2447         }
2448
2449         if (rt->rt6i_prefsrc.plen) {
2450                 struct in6_addr saddr_buf;
2451                 saddr_buf = rt->rt6i_prefsrc.addr;
2452                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2453         }
2454
2455         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2456                 goto nla_put_failure;
2457
2458         rcu_read_lock();
2459         n = dst_get_neighbour(&rt->dst);
2460         if (n)
2461                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2462         rcu_read_unlock();
2463
2464         if (rt->dst.dev)
2465                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2466
2467         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2468
2469         if (!(rt->rt6i_flags & RTF_EXPIRES))
2470                 expires = 0;
2471         else if (rt->rt6i_expires - jiffies < INT_MAX)
2472                 expires = rt->rt6i_expires - jiffies;
2473         else
2474                 expires = INT_MAX;
2475
2476         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2477                                expires, rt->dst.error) < 0)
2478                 goto nla_put_failure;
2479
2480         return nlmsg_end(skb, nlh);
2481
2482 nla_put_failure:
2483         nlmsg_cancel(skb, nlh);
2484         return -EMSGSIZE;
2485 }
2486
2487 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2488 {
2489         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2490         int prefix;
2491
2492         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2493                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2494                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2495         } else
2496                 prefix = 0;
2497
2498         return rt6_fill_node(arg->net,
2499                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2500                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2501                      prefix, 0, NLM_F_MULTI);
2502 }
2503
2504 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2505 {
2506         struct net *net = sock_net(in_skb->sk);
2507         struct nlattr *tb[RTA_MAX+1];
2508         struct rt6_info *rt;
2509         struct sk_buff *skb;
2510         struct rtmsg *rtm;
2511         struct flowi6 fl6;
2512         int err, iif = 0;
2513
2514         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2515         if (err < 0)
2516                 goto errout;
2517
2518         err = -EINVAL;
2519         memset(&fl6, 0, sizeof(fl6));
2520
2521         if (tb[RTA_SRC]) {
2522                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2523                         goto errout;
2524
2525                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2526         }
2527
2528         if (tb[RTA_DST]) {
2529                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2530                         goto errout;
2531
2532                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2533         }
2534
2535         if (tb[RTA_IIF])
2536                 iif = nla_get_u32(tb[RTA_IIF]);
2537
2538         if (tb[RTA_OIF])
2539                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2540
2541         if (iif) {
2542                 struct net_device *dev;
2543                 dev = __dev_get_by_index(net, iif);
2544                 if (!dev) {
2545                         err = -ENODEV;
2546                         goto errout;
2547                 }
2548         }
2549
2550         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2551         if (!skb) {
2552                 err = -ENOBUFS;
2553                 goto errout;
2554         }
2555
2556         /* Reserve room for dummy headers, this skb can pass
2557            through good chunk of routing engine.
2558          */
2559         skb_reset_mac_header(skb);
2560         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2561
2562         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2563         skb_dst_set(skb, &rt->dst);
2564
2565         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2566                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2567                             nlh->nlmsg_seq, 0, 0, 0);
2568         if (err < 0) {
2569                 kfree_skb(skb);
2570                 goto errout;
2571         }
2572
2573         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2574 errout:
2575         return err;
2576 }
2577
2578 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2579 {
2580         struct sk_buff *skb;
2581         struct net *net = info->nl_net;
2582         u32 seq;
2583         int err;
2584
2585         err = -ENOBUFS;
2586         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2587
2588         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2589         if (!skb)
2590                 goto errout;
2591
2592         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2593                                 event, info->pid, seq, 0, 0, 0);
2594         if (err < 0) {
2595                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2596                 WARN_ON(err == -EMSGSIZE);
2597                 kfree_skb(skb);
2598                 goto errout;
2599         }
2600         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2601                     info->nlh, gfp_any());
2602         return;
2603 errout:
2604         if (err < 0)
2605                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2606 }
2607
2608 static int ip6_route_dev_notify(struct notifier_block *this,
2609                                 unsigned long event, void *data)
2610 {
2611         struct net_device *dev = (struct net_device *)data;
2612         struct net *net = dev_net(dev);
2613
2614         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2615                 net->ipv6.ip6_null_entry->dst.dev = dev;
2616                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2617 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2618                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2619                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2620                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2621                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2622 #endif
2623         }
2624
2625         return NOTIFY_OK;
2626 }
2627
2628 /*
2629  *      /proc
2630  */
2631
2632 #ifdef CONFIG_PROC_FS
2633
/*
 * NOTE(review): nothing in the /proc code visible in this part of the file
 * references this structure (the handlers below use seq_file); it looks like
 * a leftover from a buffer-based dump -- confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2642
2643 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2644 {
2645         struct seq_file *m = p_arg;
2646         struct neighbour *n;
2647
2648         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2649
2650 #ifdef CONFIG_IPV6_SUBTREES
2651         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2652 #else
2653         seq_puts(m, "00000000000000000000000000000000 00 ");
2654 #endif
2655         rcu_read_lock();
2656         n = dst_get_neighbour(&rt->dst);
2657         if (n) {
2658                 seq_printf(m, "%pi6", n->primary_key);
2659         } else {
2660                 seq_puts(m, "00000000000000000000000000000000");
2661         }
2662         rcu_read_unlock();
2663         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2664                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2665                    rt->dst.__use, rt->rt6i_flags,
2666                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2667         return 0;
2668 }
2669
2670 static int ipv6_route_show(struct seq_file *m, void *v)
2671 {
2672         struct net *net = (struct net *)m->private;
2673         fib6_clean_all(net, rt6_info_route, 0, m);
2674         return 0;
2675 }
2676
2677 static int ipv6_route_open(struct inode *inode, struct file *file)
2678 {
2679         return single_open_net(inode, file, ipv6_route_show);
2680 }
2681
2682 static const struct file_operations ipv6_route_proc_fops = {
2683         .owner          = THIS_MODULE,
2684         .open           = ipv6_route_open,
2685         .read           = seq_read,
2686         .llseek         = seq_lseek,
2687         .release        = single_release_net,
2688 };
2689
2690 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2691 {
2692         struct net *net = (struct net *)seq->private;
2693         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2694                    net->ipv6.rt6_stats->fib_nodes,
2695                    net->ipv6.rt6_stats->fib_route_nodes,
2696                    net->ipv6.rt6_stats->fib_rt_alloc,
2697                    net->ipv6.rt6_stats->fib_rt_entries,
2698                    net->ipv6.rt6_stats->fib_rt_cache,
2699                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2700                    net->ipv6.rt6_stats->fib_discarded_routes);
2701
2702         return 0;
2703 }
2704
2705 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2706 {
2707         return single_open_net(inode, file, rt6_stats_seq_show);
2708 }
2709
2710 static const struct file_operations rt6_stats_seq_fops = {
2711         .owner   = THIS_MODULE,
2712         .open    = rt6_stats_seq_open,
2713         .read    = seq_read,
2714         .llseek  = seq_lseek,
2715         .release = single_release_net,
2716 };
2717 #endif  /* CONFIG_PROC_FS */
2718
2719 #ifdef CONFIG_SYSCTL
2720
2721 static
2722 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2723                               void __user *buffer, size_t *lenp, loff_t *ppos)
2724 {
2725         struct net *net;
2726         int delay;
2727         if (!write)
2728                 return -EINVAL;
2729
2730         net = (struct net *)ctl->extra1;
2731         delay = net->ipv6.sysctl.flush_delay;
2732         proc_dointvec(ctl, write, buffer, lenp, ppos);
2733         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2734         return 0;
2735 }
2736
2737 ctl_table ipv6_route_table_template[] = {
2738         {
2739                 .procname       =       "flush",
2740                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2741                 .maxlen         =       sizeof(int),
2742                 .mode           =       0200,
2743                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2744         },
2745         {
2746                 .procname       =       "gc_thresh",
2747                 .data           =       &ip6_dst_ops_template.gc_thresh,
2748                 .maxlen         =       sizeof(int),
2749                 .mode           =       0644,
2750                 .proc_handler   =       proc_dointvec,
2751         },
2752         {
2753                 .procname       =       "max_size",
2754                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2755                 .maxlen         =       sizeof(int),
2756                 .mode           =       0644,
2757                 .proc_handler   =       proc_dointvec,
2758         },
2759         {
2760                 .procname       =       "gc_min_interval",
2761                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2762                 .maxlen         =       sizeof(int),
2763                 .mode           =       0644,
2764                 .proc_handler   =       proc_dointvec_jiffies,
2765         },
2766         {
2767                 .procname       =       "gc_timeout",
2768                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2769                 .maxlen         =       sizeof(int),
2770                 .mode           =       0644,
2771                 .proc_handler   =       proc_dointvec_jiffies,
2772         },
2773         {
2774                 .procname       =       "gc_interval",
2775                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2776                 .maxlen         =       sizeof(int),
2777                 .mode           =       0644,
2778                 .proc_handler   =       proc_dointvec_jiffies,
2779         },
2780         {
2781                 .procname       =       "gc_elasticity",
2782                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2783                 .maxlen         =       sizeof(int),
2784                 .mode           =       0644,
2785                 .proc_handler   =       proc_dointvec,
2786         },
2787         {
2788                 .procname       =       "mtu_expires",
2789                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2790                 .maxlen         =       sizeof(int),
2791                 .mode           =       0644,
2792                 .proc_handler   =       proc_dointvec_jiffies,
2793         },
2794         {
2795                 .procname       =       "min_adv_mss",
2796                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2797                 .maxlen         =       sizeof(int),
2798                 .mode           =       0644,
2799                 .proc_handler   =       proc_dointvec,
2800         },
2801         {
2802                 .procname       =       "gc_min_interval_ms",
2803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2804                 .maxlen         =       sizeof(int),
2805                 .mode           =       0644,
2806                 .proc_handler   =       proc_dointvec_ms_jiffies,
2807         },
2808         { }
2809 };
2810
2811 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2812 {
2813         struct ctl_table *table;
2814
2815         table = kmemdup(ipv6_route_table_template,
2816                         sizeof(ipv6_route_table_template),
2817                         GFP_KERNEL);
2818
2819         if (table) {
2820                 table[0].data = &net->ipv6.sysctl.flush_delay;
2821                 table[0].extra1 = net;
2822                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2823                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2824                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2825                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2826                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2827                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2828                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2829                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2830                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831         }
2832
2833         return table;
2834 }
2835 #endif
2836
2837 static int __net_init ip6_route_net_init(struct net *net)
2838 {
2839         int ret = -ENOMEM;
2840
2841         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2842                sizeof(net->ipv6.ip6_dst_ops));
2843
2844         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2845                 goto out_ip6_dst_ops;
2846
2847         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2848                                            sizeof(*net->ipv6.ip6_null_entry),
2849                                            GFP_KERNEL);
2850         if (!net->ipv6.ip6_null_entry)
2851                 goto out_ip6_dst_entries;
2852         net->ipv6.ip6_null_entry->dst.path =
2853                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2854         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2855         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2856                          ip6_template_metrics, true);
2857
2858 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2859         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2860                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2861                                                GFP_KERNEL);
2862         if (!net->ipv6.ip6_prohibit_entry)
2863                 goto out_ip6_null_entry;
2864         net->ipv6.ip6_prohibit_entry->dst.path =
2865                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2866         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2867         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2868                          ip6_template_metrics, true);
2869
2870         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2871                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2872                                                GFP_KERNEL);
2873         if (!net->ipv6.ip6_blk_hole_entry)
2874                 goto out_ip6_prohibit_entry;
2875         net->ipv6.ip6_blk_hole_entry->dst.path =
2876                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2877         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2879                          ip6_template_metrics, true);
2880 #endif
2881
2882         net->ipv6.sysctl.flush_delay = 0;
2883         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2884         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2885         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2886         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2887         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2888         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2889         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2890
2891 #ifdef CONFIG_PROC_FS
2892         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2893         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2894 #endif
2895         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2896
2897         ret = 0;
2898 out:
2899         return ret;
2900
2901 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2902 out_ip6_prohibit_entry:
2903         kfree(net->ipv6.ip6_prohibit_entry);
2904 out_ip6_null_entry:
2905         kfree(net->ipv6.ip6_null_entry);
2906 #endif
2907 out_ip6_dst_entries:
2908         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2909 out_ip6_dst_ops:
2910         goto out;
2911 }
2912
2913 static void __net_exit ip6_route_net_exit(struct net *net)
2914 {
2915 #ifdef CONFIG_PROC_FS
2916         proc_net_remove(net, "ipv6_route");
2917         proc_net_remove(net, "rt6_stats");
2918 #endif
2919         kfree(net->ipv6.ip6_null_entry);
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921         kfree(net->ipv6.ip6_prohibit_entry);
2922         kfree(net->ipv6.ip6_blk_hole_entry);
2923 #endif
2924         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2925 }
2926
2927 static struct pernet_operations ip6_route_net_ops = {
2928         .init = ip6_route_net_init,
2929         .exit = ip6_route_net_exit,
2930 };
2931
2932 static struct notifier_block ip6_route_dev_notifier = {
2933         .notifier_call = ip6_route_dev_notify,
2934         .priority = 0,
2935 };
2936
2937 int __init ip6_route_init(void)
2938 {
2939         int ret;
2940
2941         ret = -ENOMEM;
2942         ip6_dst_ops_template.kmem_cachep =
2943                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2944                                   SLAB_HWCACHE_ALIGN, NULL);
2945         if (!ip6_dst_ops_template.kmem_cachep)
2946                 goto out;
2947
2948         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2949         if (ret)
2950                 goto out_kmem_cache;
2951
2952         ret = register_pernet_subsys(&ip6_route_net_ops);
2953         if (ret)
2954                 goto out_dst_entries;
2955
2956         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2957
2958         /* Registering of the loopback is done before this portion of code,
2959          * the loopback reference in rt6_info will not be taken, do it
2960          * manually for init_net */
2961         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2962         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2963   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2964         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2965         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2966         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2967         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2968   #endif
2969         ret = fib6_init();
2970         if (ret)
2971                 goto out_register_subsys;
2972
2973         ret = xfrm6_init();
2974         if (ret)
2975                 goto out_fib6_init;
2976
2977         ret = fib6_rules_init();
2978         if (ret)
2979                 goto xfrm6_init;
2980
2981         ret = -ENOBUFS;
2982         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2983             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2984             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2985                 goto fib6_rules_init;
2986
2987         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2988         if (ret)
2989                 goto fib6_rules_init;
2990
2991 out:
2992         return ret;
2993
2994 fib6_rules_init:
2995         fib6_rules_cleanup();
2996 xfrm6_init:
2997         xfrm6_fini();
2998 out_fib6_init:
2999         fib6_gc_cleanup();
3000 out_register_subsys:
3001         unregister_pernet_subsys(&ip6_route_net_ops);
3002 out_dst_entries:
3003         dst_entries_destroy(&ip6_dst_blackhole_ops);
3004 out_kmem_cache:
3005         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3006         goto out;
3007 }
3008
3009 void ip6_route_cleanup(void)
3010 {
3011         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3012         fib6_rules_cleanup();
3013         xfrm6_fini();
3014         fib6_gc_cleanup();
3015         unregister_pernet_subsys(&ip6_route_net_ops);
3016         dst_entries_destroy(&ip6_dst_blackhole_ops);
3017         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3018 }