6d6146d31f229471aa518c6c050b622c0f1bcf68
[~shefty/rdma-dev.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
/* Reduce a flow's flowi4_tos to its IPTOS_RT_MASK and RTO_ONLINK bits. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* MTU ceiling used by this file: 0xFFF0 == 65520 bytes. */
#define IP_MAX_MTU	0xFFF0

/* Initial value of ip_rt_gc_timeout: 300 seconds, expressed in jiffies. */
#define RT_GC_TIMEOUT	(300 * HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148                                            struct sk_buff *skb, u32 mtu);
149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150                                         struct sk_buff *skb);
151
/* dst_ops->ifdown callback: intentionally a no-op for IPv4 routes. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* nothing to do */
}
156
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159         WARN_ON(1);
160         return NULL;
161 }
162
163 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
164                                            struct sk_buff *skb,
165                                            const void *daddr);
166
167 static struct dst_ops ipv4_dst_ops = {
168         .family =               AF_INET,
169         .protocol =             cpu_to_be16(ETH_P_IP),
170         .check =                ipv4_dst_check,
171         .default_advmss =       ipv4_default_advmss,
172         .mtu =                  ipv4_mtu,
173         .cow_metrics =          ipv4_cow_metrics,
174         .destroy =              ipv4_dst_destroy,
175         .ifdown =               ipv4_dst_ifdown,
176         .negative_advice =      ipv4_negative_advice,
177         .link_failure =         ipv4_link_failure,
178         .update_pmtu =          ip_rt_update_pmtu,
179         .redirect =             ip_do_redirect,
180         .local_out =            __ip_local_out,
181         .neigh_lookup =         ipv4_neigh_lookup,
182 };
183
184 #define ECN_OR_COST(class)      TC_PRIO_##class
185
186 const __u8 ip_tos2prio[16] = {
187         TC_PRIO_BESTEFFORT,
188         ECN_OR_COST(BESTEFFORT),
189         TC_PRIO_BESTEFFORT,
190         ECN_OR_COST(BESTEFFORT),
191         TC_PRIO_BULK,
192         ECN_OR_COST(BULK),
193         TC_PRIO_BULK,
194         ECN_OR_COST(BULK),
195         TC_PRIO_INTERACTIVE,
196         ECN_OR_COST(INTERACTIVE),
197         TC_PRIO_INTERACTIVE,
198         ECN_OR_COST(INTERACTIVE),
199         TC_PRIO_INTERACTIVE_BULK,
200         ECN_OR_COST(INTERACTIVE_BULK),
201         TC_PRIO_INTERACTIVE_BULK,
202         ECN_OR_COST(INTERACTIVE_BULK)
203 };
204 EXPORT_SYMBOL(ip_tos2prio);
205
206 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
207 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
208
209 static inline int rt_genid(struct net *net)
210 {
211         return atomic_read(&net->ipv4.rt_genid);
212 }
213
214 #ifdef CONFIG_PROC_FS
215 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
216 {
217         if (*pos)
218                 return NULL;
219         return SEQ_START_TOKEN;
220 }
221
222 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223 {
224         ++*pos;
225         return NULL;
226 }
227
/* seq_file ->stop: ->start takes no resources, so there is nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to do */
}
231
232 static int rt_cache_seq_show(struct seq_file *seq, void *v)
233 {
234         if (v == SEQ_START_TOKEN)
235                 seq_printf(seq, "%-127s\n",
236                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
237                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
238                            "HHUptod\tSpecDst");
239         return 0;
240 }
241
242 static const struct seq_operations rt_cache_seq_ops = {
243         .start  = rt_cache_seq_start,
244         .next   = rt_cache_seq_next,
245         .stop   = rt_cache_seq_stop,
246         .show   = rt_cache_seq_show,
247 };
248
249 static int rt_cache_seq_open(struct inode *inode, struct file *file)
250 {
251         return seq_open(file, &rt_cache_seq_ops);
252 }
253
254 static const struct file_operations rt_cache_seq_fops = {
255         .owner   = THIS_MODULE,
256         .open    = rt_cache_seq_open,
257         .read    = seq_read,
258         .llseek  = seq_lseek,
259         .release = seq_release,
260 };
261
262
263 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
264 {
265         int cpu;
266
267         if (*pos == 0)
268                 return SEQ_START_TOKEN;
269
270         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277 }
278
279 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
280 {
281         int cpu;
282
283         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
284                 if (!cpu_possible(cpu))
285                         continue;
286                 *pos = cpu+1;
287                 return &per_cpu(rt_cache_stat, cpu);
288         }
289         return NULL;
290
291 }
292
/* seq_file ->stop: the iterator holds no locks or references. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to do */
}
297
298 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
299 {
300         struct rt_cache_stat *st = v;
301
302         if (v == SEQ_START_TOKEN) {
303                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
304                 return 0;
305         }
306
307         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
308                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
309                    dst_entries_get_slow(&ipv4_dst_ops),
310                    st->in_hit,
311                    st->in_slow_tot,
312                    st->in_slow_mc,
313                    st->in_no_route,
314                    st->in_brd,
315                    st->in_martian_dst,
316                    st->in_martian_src,
317
318                    st->out_hit,
319                    st->out_slow_tot,
320                    st->out_slow_mc,
321
322                    st->gc_total,
323                    st->gc_ignored,
324                    st->gc_goal_miss,
325                    st->gc_dst_overflow,
326                    st->in_hlist_search,
327                    st->out_hlist_search
328                 );
329         return 0;
330 }
331
332 static const struct seq_operations rt_cpu_seq_ops = {
333         .start  = rt_cpu_seq_start,
334         .next   = rt_cpu_seq_next,
335         .stop   = rt_cpu_seq_stop,
336         .show   = rt_cpu_seq_show,
337 };
338
339
340 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
341 {
342         return seq_open(file, &rt_cpu_seq_ops);
343 }
344
345 static const struct file_operations rt_cpu_seq_fops = {
346         .owner   = THIS_MODULE,
347         .open    = rt_cpu_seq_open,
348         .read    = seq_read,
349         .llseek  = seq_lseek,
350         .release = seq_release,
351 };
352
353 #ifdef CONFIG_IP_ROUTE_CLASSID
354 static int rt_acct_proc_show(struct seq_file *m, void *v)
355 {
356         struct ip_rt_acct *dst, *src;
357         unsigned int i, j;
358
359         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
360         if (!dst)
361                 return -ENOMEM;
362
363         for_each_possible_cpu(i) {
364                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
365                 for (j = 0; j < 256; j++) {
366                         dst[j].o_bytes   += src[j].o_bytes;
367                         dst[j].o_packets += src[j].o_packets;
368                         dst[j].i_bytes   += src[j].i_bytes;
369                         dst[j].i_packets += src[j].i_packets;
370                 }
371         }
372
373         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
374         kfree(dst);
375         return 0;
376 }
377
378 static int rt_acct_proc_open(struct inode *inode, struct file *file)
379 {
380         return single_open(file, rt_acct_proc_show, NULL);
381 }
382
383 static const struct file_operations rt_acct_proc_fops = {
384         .owner          = THIS_MODULE,
385         .open           = rt_acct_proc_open,
386         .read           = seq_read,
387         .llseek         = seq_lseek,
388         .release        = single_release,
389 };
390 #endif
391
392 static int __net_init ip_rt_do_proc_init(struct net *net)
393 {
394         struct proc_dir_entry *pde;
395
396         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
397                         &rt_cache_seq_fops);
398         if (!pde)
399                 goto err1;
400
401         pde = proc_create("rt_cache", S_IRUGO,
402                           net->proc_net_stat, &rt_cpu_seq_fops);
403         if (!pde)
404                 goto err2;
405
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
408         if (!pde)
409                 goto err3;
410 #endif
411         return 0;
412
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 err3:
415         remove_proc_entry("rt_cache", net->proc_net_stat);
416 #endif
417 err2:
418         remove_proc_entry("rt_cache", net->proc_net);
419 err1:
420         return -ENOMEM;
421 }
422
423 static void __net_exit ip_rt_do_proc_exit(struct net *net)
424 {
425         remove_proc_entry("rt_cache", net->proc_net_stat);
426         remove_proc_entry("rt_cache", net->proc_net);
427 #ifdef CONFIG_IP_ROUTE_CLASSID
428         remove_proc_entry("rt_acct", net->proc_net);
429 #endif
430 }
431
432 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
433         .init = ip_rt_do_proc_init,
434         .exit = ip_rt_do_proc_exit,
435 };
436
437 static int __init ip_rt_proc_init(void)
438 {
439         return register_pernet_subsys(&ip_rt_proc_ops);
440 }
441
442 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
447 #endif /* CONFIG_PROC_FS */
448
449 static inline int rt_is_expired(struct rtable *rth)
450 {
451         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
452 }
453
454 /*
455  * Perturbation of rt_genid by a small quantity [1..256]
456  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
457  * many times (2^24) without giving recent rt_genid.
458  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
459  */
460 static void rt_cache_invalidate(struct net *net)
461 {
462         unsigned char shuffle;
463
464         get_random_bytes(&shuffle, sizeof(shuffle));
465         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
466 }
467
/*
 * Invalidate all cached routes of @net by moving its generation id forward
 * (see rt_cache_invalidate()).
 *
 * @delay is not used by this implementation: every call performs the same
 * invalidation regardless of its value.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
}
476
477 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
478                                            struct sk_buff *skb,
479                                            const void *daddr)
480 {
481         struct net_device *dev = dst->dev;
482         const __be32 *pkey = daddr;
483         const struct rtable *rt;
484         struct neighbour *n;
485
486         rt = (const struct rtable *) dst;
487         if (rt->rt_gateway)
488                 pkey = (const __be32 *) &rt->rt_gateway;
489         else if (skb)
490                 pkey = &ip_hdr(skb)->daddr;
491
492         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
493         if (n)
494                 return n;
495         return neigh_create(&arp_tbl, pkey, dev);
496 }
497
498 /*
499  * Peer allocation may fail only in serious out-of-memory conditions.  However
500  * we still can generate some output.
501  * Random ID selection looks a bit dangerous because we have no chances to
502  * select ID being unique in a reasonable period of time.
503  * But broken packet identifier may be better than no packet at all.
504  */
505 static void ip_select_fb_ident(struct iphdr *iph)
506 {
507         static DEFINE_SPINLOCK(ip_fb_id_lock);
508         static u32 ip_fallback_id;
509         u32 salt;
510
511         spin_lock_bh(&ip_fb_id_lock);
512         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
513         iph->id = htons(salt & 0xFFFF);
514         ip_fallback_id = salt;
515         spin_unlock_bh(&ip_fb_id_lock);
516 }
517
518 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
519 {
520         struct net *net = dev_net(dst->dev);
521         struct inet_peer *peer;
522
523         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
524         if (peer) {
525                 iph->id = htons(inet_getid(peer, more));
526                 inet_putpeer(peer);
527                 return;
528         }
529
530         ip_select_fb_ident(iph);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
533
534 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
535                              const struct iphdr *iph,
536                              int oif, u8 tos,
537                              u8 prot, u32 mark, int flow_flags)
538 {
539         if (sk) {
540                 const struct inet_sock *inet = inet_sk(sk);
541
542                 oif = sk->sk_bound_dev_if;
543                 mark = sk->sk_mark;
544                 tos = RT_CONN_FLAGS(sk);
545                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
546         }
547         flowi4_init_output(fl4, oif, mark, tos,
548                            RT_SCOPE_UNIVERSE, prot,
549                            flow_flags,
550                            iph->daddr, iph->saddr, 0, 0);
551 }
552
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554                                const struct sock *sk)
555 {
556         const struct iphdr *iph = ip_hdr(skb);
557         int oif = skb->dev->ifindex;
558         u8 tos = RT_TOS(iph->tos);
559         u8 prot = iph->protocol;
560         u32 mark = skb->mark;
561
562         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
563 }
564
565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
566 {
567         const struct inet_sock *inet = inet_sk(sk);
568         const struct ip_options_rcu *inet_opt;
569         __be32 daddr = inet->inet_daddr;
570
571         rcu_read_lock();
572         inet_opt = rcu_dereference(inet->inet_opt);
573         if (inet_opt && inet_opt->opt.srr)
574                 daddr = inet_opt->opt.faddr;
575         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578                            inet_sk_flowi_flags(sk),
579                            daddr, inet->inet_saddr, 0, 0);
580         rcu_read_unlock();
581 }
582
/*
 * Build a flow key from @skb when one is supplied, otherwise from @sk
 * alone (which must then be non-NULL, as it is dereferenced).
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
591
/* Write-held (BH-safe) by update_or_create_fnhe() while it edits exceptions. */
static DEFINE_SEQLOCK(fnhe_seqlock);
593
594 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
595 {
596         struct fib_nh_exception *fnhe, *oldest;
597
598         oldest = rcu_dereference(hash->chain);
599         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
600              fnhe = rcu_dereference(fnhe->fnhe_next)) {
601                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
602                         oldest = fnhe;
603         }
604         return oldest;
605 }
606
607 static inline u32 fnhe_hashfun(__be32 daddr)
608 {
609         u32 hval;
610
611         hval = (__force u32) daddr;
612         hval ^= (hval >> 11) ^ (hval >> 22);
613
614         return hval & (FNHE_HASH_SIZE - 1);
615 }
616
617 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
618                                   u32 pmtu, unsigned long expires)
619 {
620         struct fnhe_hash_bucket *hash;
621         struct fib_nh_exception *fnhe;
622         int depth;
623         u32 hval = fnhe_hashfun(daddr);
624
625         write_seqlock_bh(&fnhe_seqlock);
626
627         hash = nh->nh_exceptions;
628         if (!hash) {
629                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
630                 if (!hash)
631                         goto out_unlock;
632                 nh->nh_exceptions = hash;
633         }
634
635         hash += hval;
636
637         depth = 0;
638         for (fnhe = rcu_dereference(hash->chain); fnhe;
639              fnhe = rcu_dereference(fnhe->fnhe_next)) {
640                 if (fnhe->fnhe_daddr == daddr)
641                         break;
642                 depth++;
643         }
644
645         if (fnhe) {
646                 if (gw)
647                         fnhe->fnhe_gw = gw;
648                 if (pmtu) {
649                         fnhe->fnhe_pmtu = pmtu;
650                         fnhe->fnhe_expires = expires;
651                 }
652         } else {
653                 if (depth > FNHE_RECLAIM_DEPTH)
654                         fnhe = fnhe_oldest(hash);
655                 else {
656                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
657                         if (!fnhe)
658                                 goto out_unlock;
659
660                         fnhe->fnhe_next = hash->chain;
661                         rcu_assign_pointer(hash->chain, fnhe);
662                 }
663                 fnhe->fnhe_daddr = daddr;
664                 fnhe->fnhe_gw = gw;
665                 fnhe->fnhe_pmtu = pmtu;
666                 fnhe->fnhe_expires = expires;
667         }
668
669         fnhe->fnhe_stamp = jiffies;
670
671 out_unlock:
672         write_sequnlock_bh(&fnhe_seqlock);
673         return;
674 }
675
676 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
677 {
678         __be32 new_gw = icmp_hdr(skb)->un.gateway;
679         __be32 old_gw = ip_hdr(skb)->saddr;
680         struct net_device *dev = skb->dev;
681         struct in_device *in_dev;
682         struct fib_result res;
683         struct neighbour *n;
684         struct net *net;
685
686         switch (icmp_hdr(skb)->code & 7) {
687         case ICMP_REDIR_NET:
688         case ICMP_REDIR_NETTOS:
689         case ICMP_REDIR_HOST:
690         case ICMP_REDIR_HOSTTOS:
691                 break;
692
693         default:
694                 return;
695         }
696
697         if (rt->rt_gateway != old_gw)
698                 return;
699
700         in_dev = __in_dev_get_rcu(dev);
701         if (!in_dev)
702                 return;
703
704         net = dev_net(dev);
705         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
706             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
707             ipv4_is_zeronet(new_gw))
708                 goto reject_redirect;
709
710         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
711                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
712                         goto reject_redirect;
713                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
714                         goto reject_redirect;
715         } else {
716                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
717                         goto reject_redirect;
718         }
719
720         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
721         if (n) {
722                 if (!(n->nud_state & NUD_VALID)) {
723                         neigh_event_send(n, NULL);
724                 } else {
725                         if (fib_lookup(net, fl4, &res) == 0) {
726                                 struct fib_nh *nh = &FIB_RES_NH(res);
727
728                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
729                                                       0, 0);
730                         }
731                         rt->rt_gateway = new_gw;
732                         rt->rt_flags |= RTCF_REDIRECTED;
733                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
734                 }
735                 neigh_release(n);
736         }
737         return;
738
739 reject_redirect:
740 #ifdef CONFIG_IP_ROUTE_VERBOSE
741         if (IN_DEV_LOG_MARTIANS(in_dev)) {
742                 const struct iphdr *iph = (const struct iphdr *) skb->data;
743                 __be32 daddr = iph->daddr;
744                 __be32 saddr = iph->saddr;
745
746                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
747                                      "  Advised path = %pI4 -> %pI4\n",
748                                      &old_gw, dev->name, &new_gw,
749                                      &saddr, &daddr);
750         }
751 #endif
752         ;
753 }
754
755 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
756 {
757         struct rtable *rt;
758         struct flowi4 fl4;
759
760         rt = (struct rtable *) dst;
761
762         ip_rt_build_flow_key(&fl4, sk, skb);
763         __ip_do_redirect(rt, skb, &fl4);
764 }
765
766 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
767 {
768         struct rtable *rt = (struct rtable *)dst;
769         struct dst_entry *ret = dst;
770
771         if (rt) {
772                 if (dst->obsolete > 0) {
773                         ip_rt_put(rt);
774                         ret = NULL;
775                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
776                            rt->dst.expires) {
777                         ip_rt_put(rt);
778                         ret = NULL;
779                 }
780         }
781         return ret;
782 }
783
784 /*
785  * Algorithm:
786  *      1. The first ip_rt_redirect_number redirects are sent
787  *         with exponential backoff, then we stop sending them at all,
788  *         assuming that the host ignores our redirects.
789  *      2. If we did not see packets requiring redirects
790  *         during ip_rt_redirect_silence, we assume that the host
791  *         forgot redirected route and start to send redirects again.
792  *
793  * This algorithm is much cheaper and more intelligent than dumb load limiting
794  * in icmp.c.
795  *
796  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
797  * and "frag. need" (breaks PMTU discovery) in icmp.c.
798  */
799
800 void ip_rt_send_redirect(struct sk_buff *skb)
801 {
802         struct rtable *rt = skb_rtable(skb);
803         struct in_device *in_dev;
804         struct inet_peer *peer;
805         struct net *net;
806         int log_martians;
807
808         rcu_read_lock();
809         in_dev = __in_dev_get_rcu(rt->dst.dev);
810         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
811                 rcu_read_unlock();
812                 return;
813         }
814         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
815         rcu_read_unlock();
816
817         net = dev_net(rt->dst.dev);
818         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
819         if (!peer) {
820                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
821                 return;
822         }
823
824         /* No redirected packets during ip_rt_redirect_silence;
825          * reset the algorithm.
826          */
827         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
828                 peer->rate_tokens = 0;
829
830         /* Too many ignored redirects; do not send anything
831          * set dst.rate_last to the last seen redirected packet.
832          */
833         if (peer->rate_tokens >= ip_rt_redirect_number) {
834                 peer->rate_last = jiffies;
835                 goto out_put_peer;
836         }
837
838         /* Check for load limit; set rate_last to the latest sent
839          * redirect.
840          */
841         if (peer->rate_tokens == 0 ||
842             time_after(jiffies,
843                        (peer->rate_last +
844                         (ip_rt_redirect_load << peer->rate_tokens)))) {
845                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
846                 peer->rate_last = jiffies;
847                 ++peer->rate_tokens;
848 #ifdef CONFIG_IP_ROUTE_VERBOSE
849                 if (log_martians &&
850                     peer->rate_tokens == ip_rt_redirect_number)
851                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
852                                              &ip_hdr(skb)->saddr, rt->rt_iif,
853                                              &rt->rt_dst, &rt->rt_gateway);
854 #endif
855         }
856 out_put_peer:
857         inet_putpeer(peer);
858 }
859
860 static int ip_error(struct sk_buff *skb)
861 {
862         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
863         struct rtable *rt = skb_rtable(skb);
864         struct inet_peer *peer;
865         unsigned long now;
866         struct net *net;
867         bool send;
868         int code;
869
870         net = dev_net(rt->dst.dev);
871         if (!IN_DEV_FORWARD(in_dev)) {
872                 switch (rt->dst.error) {
873                 case EHOSTUNREACH:
874                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
875                         break;
876
877                 case ENETUNREACH:
878                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
879                         break;
880                 }
881                 goto out;
882         }
883
884         switch (rt->dst.error) {
885         case EINVAL:
886         default:
887                 goto out;
888         case EHOSTUNREACH:
889                 code = ICMP_HOST_UNREACH;
890                 break;
891         case ENETUNREACH:
892                 code = ICMP_NET_UNREACH;
893                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
894                 break;
895         case EACCES:
896                 code = ICMP_PKT_FILTERED;
897                 break;
898         }
899
900         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
901
902         send = true;
903         if (peer) {
904                 now = jiffies;
905                 peer->rate_tokens += now - peer->rate_last;
906                 if (peer->rate_tokens > ip_rt_error_burst)
907                         peer->rate_tokens = ip_rt_error_burst;
908                 peer->rate_last = now;
909                 if (peer->rate_tokens >= ip_rt_error_cost)
910                         peer->rate_tokens -= ip_rt_error_cost;
911                 else
912                         send = false;
913                 inet_putpeer(peer);
914         }
915         if (send)
916                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
917
918 out:    kfree_skb(skb);
919         return 0;
920 }
921
922 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
923 {
924         struct fib_result res;
925
926         if (mtu < ip_rt_min_pmtu)
927                 mtu = ip_rt_min_pmtu;
928
929         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
930                 struct fib_nh *nh = &FIB_RES_NH(res);
931
932                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933                                       jiffies + ip_rt_mtu_expires);
934         }
935         rt->rt_pmtu = mtu;
936         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
937 }
938
939 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
940                               struct sk_buff *skb, u32 mtu)
941 {
942         struct rtable *rt = (struct rtable *) dst;
943         struct flowi4 fl4;
944
945         ip_rt_build_flow_key(&fl4, sk, skb);
946         __ip_rt_update_pmtu(rt, &fl4, mtu);
947 }
948
949 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
950                       int oif, u32 mark, u8 protocol, int flow_flags)
951 {
952         const struct iphdr *iph = (const struct iphdr *) skb->data;
953         struct flowi4 fl4;
954         struct rtable *rt;
955
956         __build_flow_key(&fl4, NULL, iph, oif,
957                          RT_TOS(iph->tos), protocol, mark, flow_flags);
958         rt = __ip_route_output_key(net, &fl4);
959         if (!IS_ERR(rt)) {
960                 __ip_rt_update_pmtu(rt, &fl4, mtu);
961                 ip_rt_put(rt);
962         }
963 }
964 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
965
966 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
967 {
968         const struct iphdr *iph = (const struct iphdr *) skb->data;
969         struct flowi4 fl4;
970         struct rtable *rt;
971
972         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
973         rt = __ip_route_output_key(sock_net(sk), &fl4);
974         if (!IS_ERR(rt)) {
975                 __ip_rt_update_pmtu(rt, &fl4, mtu);
976                 ip_rt_put(rt);
977         }
978 }
979 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
980
981 void ipv4_redirect(struct sk_buff *skb, struct net *net,
982                    int oif, u32 mark, u8 protocol, int flow_flags)
983 {
984         const struct iphdr *iph = (const struct iphdr *) skb->data;
985         struct flowi4 fl4;
986         struct rtable *rt;
987
988         __build_flow_key(&fl4, NULL, iph, oif,
989                          RT_TOS(iph->tos), protocol, mark, flow_flags);
990         rt = __ip_route_output_key(net, &fl4);
991         if (!IS_ERR(rt)) {
992                 __ip_do_redirect(rt, skb, &fl4);
993                 ip_rt_put(rt);
994         }
995 }
996 EXPORT_SYMBOL_GPL(ipv4_redirect);
997
998 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
999 {
1000         const struct iphdr *iph = (const struct iphdr *) skb->data;
1001         struct flowi4 fl4;
1002         struct rtable *rt;
1003
1004         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1005         rt = __ip_route_output_key(sock_net(sk), &fl4);
1006         if (!IS_ERR(rt)) {
1007                 __ip_do_redirect(rt, skb, &fl4);
1008                 ip_rt_put(rt);
1009         }
1010 }
1011 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1012
1013 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1014 {
1015         struct rtable *rt = (struct rtable *) dst;
1016
1017         if (rt_is_expired(rt))
1018                 return NULL;
1019         return dst;
1020 }
1021
1022 static void ipv4_dst_destroy(struct dst_entry *dst)
1023 {
1024         struct rtable *rt = (struct rtable *) dst;
1025
1026         if (rt->fi) {
1027                 fib_info_put(rt->fi);
1028                 rt->fi = NULL;
1029         }
1030 }
1031
1032
1033 static void ipv4_link_failure(struct sk_buff *skb)
1034 {
1035         struct rtable *rt;
1036
1037         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1038
1039         rt = skb_rtable(skb);
1040         if (rt)
1041                 dst_set_expires(&rt->dst, 0);
1042 }
1043
1044 static int ip_rt_bug(struct sk_buff *skb)
1045 {
1046         pr_debug("%s: %pI4 -> %pI4, %s\n",
1047                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1048                  skb->dev ? skb->dev->name : "?");
1049         kfree_skb(skb);
1050         WARN_ON(1);
1051         return 0;
1052 }
1053
1054 /*
1055    We do not cache source address of outgoing interface,
1056    because it is used only by IP RR, TS and SRR options,
1057    so that it out of fast path.
1058
1059    BTW remember: "addr" is allowed to be not aligned
1060    in IP options!
1061  */
1062
1063 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1064 {
1065         __be32 src;
1066
1067         if (rt_is_output_route(rt))
1068                 src = ip_hdr(skb)->saddr;
1069         else {
1070                 struct fib_result res;
1071                 struct flowi4 fl4;
1072                 struct iphdr *iph;
1073
1074                 iph = ip_hdr(skb);
1075
1076                 memset(&fl4, 0, sizeof(fl4));
1077                 fl4.daddr = iph->daddr;
1078                 fl4.saddr = iph->saddr;
1079                 fl4.flowi4_tos = RT_TOS(iph->tos);
1080                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1081                 fl4.flowi4_iif = skb->dev->ifindex;
1082                 fl4.flowi4_mark = skb->mark;
1083
1084                 rcu_read_lock();
1085                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1086                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1087                 else
1088                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1089                                         RT_SCOPE_UNIVERSE);
1090                 rcu_read_unlock();
1091         }
1092         memcpy(addr, &src, 4);
1093 }
1094
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half is
 * taken from @tag only while the route's corresponding half is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
	unsigned int i;

	for (i = 0; i < sizeof(halves) / sizeof(halves[0]); i++) {
		if (rt->dst.tclassid & halves[i])
			continue;
		rt->dst.tclassid |= tag & halves[i];
	}
}
#endif
1104
1105 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1106 {
1107         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1108
1109         if (advmss == 0) {
1110                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1111                                ip_rt_min_advmss);
1112                 if (advmss > 65535 - 40)
1113                         advmss = 65535 - 40;
1114         }
1115         return advmss;
1116 }
1117
1118 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1119 {
1120         const struct rtable *rt = (const struct rtable *) dst;
1121         unsigned int mtu = rt->rt_pmtu;
1122
1123         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1124                 mtu = 0;
1125
1126         if (!mtu)
1127                 mtu = dst_metric_raw(dst, RTAX_MTU);
1128
1129         if (mtu && rt_is_output_route(rt))
1130                 return mtu;
1131
1132         mtu = dst->dev->mtu;
1133
1134         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1135
1136                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1137                         mtu = 576;
1138         }
1139
1140         if (mtu > IP_MAX_MTU)
1141                 mtu = IP_MAX_MTU;
1142
1143         return mtu;
1144 }
1145
1146 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1147                             struct fib_info *fi)
1148 {
1149         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1150                 rt->fi = fi;
1151                 atomic_inc(&fi->fib_clntref);
1152         }
1153         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1154 }
1155
1156 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1157 {
1158         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1159         struct fib_nh_exception *fnhe;
1160         u32 hval;
1161
1162         hval = fnhe_hashfun(daddr);
1163
1164 restart:
1165         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1166              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1167                 __be32 fnhe_daddr, gw;
1168                 unsigned long expires;
1169                 unsigned int seq;
1170                 u32 pmtu;
1171
1172                 seq = read_seqbegin(&fnhe_seqlock);
1173                 fnhe_daddr = fnhe->fnhe_daddr;
1174                 gw = fnhe->fnhe_gw;
1175                 pmtu = fnhe->fnhe_pmtu;
1176                 expires = fnhe->fnhe_expires;
1177                 if (read_seqretry(&fnhe_seqlock, seq))
1178                         goto restart;
1179                 if (daddr != fnhe_daddr)
1180                         continue;
1181                 if (pmtu) {
1182                         unsigned long diff = expires - jiffies;
1183
1184                         if (time_before(jiffies, expires)) {
1185                                 rt->rt_pmtu = pmtu;
1186                                 dst_set_expires(&rt->dst, diff);
1187                         }
1188                 }
1189                 if (gw)
1190                         rt->rt_gateway = gw;
1191                 fnhe->fnhe_stamp = jiffies;
1192                 break;
1193         }
1194 }
1195
1196 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1197                            const struct fib_result *res,
1198                            struct fib_info *fi, u16 type, u32 itag)
1199 {
1200         if (fi) {
1201                 struct fib_nh *nh = &FIB_RES_NH(*res);
1202
1203                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1204                         rt->rt_gateway = nh->nh_gw;
1205                 if (unlikely(nh->nh_exceptions))
1206                         rt_bind_exception(rt, nh, fl4->daddr);
1207                 rt_init_metrics(rt, fl4, fi);
1208 #ifdef CONFIG_IP_ROUTE_CLASSID
1209                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1210 #endif
1211         }
1212
1213 #ifdef CONFIG_IP_ROUTE_CLASSID
1214 #ifdef CONFIG_IP_MULTIPLE_TABLES
1215         set_class_tag(rt, res->tclassid);
1216 #endif
1217         set_class_tag(rt, itag);
1218 #endif
1219 }
1220
1221 static struct rtable *rt_dst_alloc(struct net_device *dev,
1222                                    bool nopolicy, bool noxfrm)
1223 {
1224         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1225                          DST_HOST | DST_NOCACHE |
1226                          (nopolicy ? DST_NOPOLICY : 0) |
1227                          (noxfrm ? DST_NOXFRM : 0));
1228 }
1229
1230 /* called in rcu_read_lock() section */
1231 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1232                                 u8 tos, struct net_device *dev, int our)
1233 {
1234         struct rtable *rth;
1235         struct in_device *in_dev = __in_dev_get_rcu(dev);
1236         u32 itag = 0;
1237         int err;
1238
1239         /* Primary sanity checks. */
1240
1241         if (in_dev == NULL)
1242                 return -EINVAL;
1243
1244         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1245             skb->protocol != htons(ETH_P_IP))
1246                 goto e_inval;
1247
1248         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1249                 if (ipv4_is_loopback(saddr))
1250                         goto e_inval;
1251
1252         if (ipv4_is_zeronet(saddr)) {
1253                 if (!ipv4_is_local_multicast(daddr))
1254                         goto e_inval;
1255         } else {
1256                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1257                                           in_dev, &itag);
1258                 if (err < 0)
1259                         goto e_err;
1260         }
1261         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1262                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1263         if (!rth)
1264                 goto e_nobufs;
1265
1266 #ifdef CONFIG_IP_ROUTE_CLASSID
1267         rth->dst.tclassid = itag;
1268 #endif
1269         rth->dst.output = ip_rt_bug;
1270
1271         rth->rt_key_dst = daddr;
1272         rth->rt_key_src = saddr;
1273         rth->rt_genid   = rt_genid(dev_net(dev));
1274         rth->rt_flags   = RTCF_MULTICAST;
1275         rth->rt_type    = RTN_MULTICAST;
1276         rth->rt_key_tos = tos;
1277         rth->rt_dst     = daddr;
1278         rth->rt_src     = saddr;
1279         rth->rt_route_iif = dev->ifindex;
1280         rth->rt_iif     = dev->ifindex;
1281         rth->rt_oif     = 0;
1282         rth->rt_mark    = skb->mark;
1283         rth->rt_pmtu    = 0;
1284         rth->rt_gateway = daddr;
1285         rth->fi = NULL;
1286         if (our) {
1287                 rth->dst.input= ip_local_deliver;
1288                 rth->rt_flags |= RTCF_LOCAL;
1289         }
1290
1291 #ifdef CONFIG_IP_MROUTE
1292         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1293                 rth->dst.input = ip_mr_input;
1294 #endif
1295         RT_CACHE_STAT_INC(in_slow_mc);
1296
1297         skb_dst_set(skb, &rth->dst);
1298         return 0;
1299
1300 e_nobufs:
1301         return -ENOBUFS;
1302 e_inval:
1303         return -EINVAL;
1304 e_err:
1305         return err;
1306 }
1307
1308
1309 static void ip_handle_martian_source(struct net_device *dev,
1310                                      struct in_device *in_dev,
1311                                      struct sk_buff *skb,
1312                                      __be32 daddr,
1313                                      __be32 saddr)
1314 {
1315         RT_CACHE_STAT_INC(in_martian_src);
1316 #ifdef CONFIG_IP_ROUTE_VERBOSE
1317         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1318                 /*
1319                  *      RFC1812 recommendation, if source is martian,
1320                  *      the only hint is MAC header.
1321                  */
1322                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1323                         &daddr, &saddr, dev->name);
1324                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1325                         print_hex_dump(KERN_WARNING, "ll header: ",
1326                                        DUMP_PREFIX_OFFSET, 16, 1,
1327                                        skb_mac_header(skb),
1328                                        dev->hard_header_len, true);
1329                 }
1330         }
1331 #endif
1332 }
1333
1334 /* called in rcu_read_lock() section */
1335 static int __mkroute_input(struct sk_buff *skb,
1336                            const struct fib_result *res,
1337                            struct in_device *in_dev,
1338                            __be32 daddr, __be32 saddr, u32 tos,
1339                            struct rtable **result)
1340 {
1341         struct rtable *rth;
1342         int err;
1343         struct in_device *out_dev;
1344         unsigned int flags = 0;
1345         u32 itag;
1346
1347         /* get a working reference to the output device */
1348         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1349         if (out_dev == NULL) {
1350                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1351                 return -EINVAL;
1352         }
1353
1354
1355         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1356                                   in_dev->dev, in_dev, &itag);
1357         if (err < 0) {
1358                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1359                                          saddr);
1360
1361                 goto cleanup;
1362         }
1363
1364         if (err)
1365                 flags |= RTCF_DIRECTSRC;
1366
1367         if (out_dev == in_dev && err &&
1368             (IN_DEV_SHARED_MEDIA(out_dev) ||
1369              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1370                 flags |= RTCF_DOREDIRECT;
1371
1372         if (skb->protocol != htons(ETH_P_IP)) {
1373                 /* Not IP (i.e. ARP). Do not create route, if it is
1374                  * invalid for proxy arp. DNAT routes are always valid.
1375                  *
1376                  * Proxy arp feature have been extended to allow, ARP
1377                  * replies back to the same interface, to support
1378                  * Private VLAN switch technologies. See arp.c.
1379                  */
1380                 if (out_dev == in_dev &&
1381                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1382                         err = -EINVAL;
1383                         goto cleanup;
1384                 }
1385         }
1386
1387         rth = rt_dst_alloc(out_dev->dev,
1388                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1389                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1390         if (!rth) {
1391                 err = -ENOBUFS;
1392                 goto cleanup;
1393         }
1394
1395         rth->rt_key_dst = daddr;
1396         rth->rt_key_src = saddr;
1397         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1398         rth->rt_flags = flags;
1399         rth->rt_type = res->type;
1400         rth->rt_key_tos = tos;
1401         rth->rt_dst     = daddr;
1402         rth->rt_src     = saddr;
1403         rth->rt_route_iif = in_dev->dev->ifindex;
1404         rth->rt_iif     = in_dev->dev->ifindex;
1405         rth->rt_oif     = 0;
1406         rth->rt_mark    = skb->mark;
1407         rth->rt_pmtu    = 0;
1408         rth->rt_gateway = daddr;
1409         rth->fi = NULL;
1410
1411         rth->dst.input = ip_forward;
1412         rth->dst.output = ip_output;
1413
1414         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1415
1416         *result = rth;
1417         err = 0;
1418  cleanup:
1419         return err;
1420 }
1421
1422 static int ip_mkroute_input(struct sk_buff *skb,
1423                             struct fib_result *res,
1424                             const struct flowi4 *fl4,
1425                             struct in_device *in_dev,
1426                             __be32 daddr, __be32 saddr, u32 tos)
1427 {
1428         struct rtable *rth = NULL;
1429         int err;
1430
1431 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1432         if (res->fi && res->fi->fib_nhs > 1)
1433                 fib_select_multipath(res);
1434 #endif
1435
1436         /* create a routing cache entry */
1437         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1438         if (err)
1439                 return err;
1440
1441         skb_dst_set(skb, &rth->dst);
1442         return 0;
1443 }
1444
1445 /*
1446  *      NOTE. We drop all packets that have a local source
1447  *      address, because every properly looped-back packet
1448  *      must already have the correct destination attached by the output routine.
1449  *
1450  *      This approach solves two big problems:
1451  *      1. Non-simplex devices are handled properly.
1452  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1453  *      Called with rcu_read_lock() held.
1454  */
1455
     /*
      * Resolve the input route for a non-multicast packet received on @dev
      * and attach it to @skb.
      *
      * Returns 0 once a dst has been attached (this includes "unreachable"
      * routes whose input handler is ip_error), -EINVAL for martian
      * addresses or a device without IP, -ENOBUFS when no route could be
      * allocated, or a negative value from fib_validate_source() /
      * ip_mkroute_input().
      */
1456 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1457                                u8 tos, struct net_device *dev)
1458 {
1459         struct fib_result res;
1460         struct in_device *in_dev = __in_dev_get_rcu(dev);
1461         struct flowi4   fl4;
1462         unsigned int    flags = 0;
1463         u32             itag = 0;
1464         struct rtable   *rth;
1465         int             err = -EINVAL;
1466         struct net    *net = dev_net(dev);
1467
1468         /* IP on this device is disabled. */
1469
1470         if (!in_dev)
1471                 goto out;
1472
1473         /* Check for the most weird martians, which can be not detected
1474            by fib_lookup.
1475          */
1476
1477         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1478                 goto martian_source;
1479
             /* Limited broadcast, or the all-zero source/destination pair,
              * is handled as broadcast input without consulting the FIB.
              */
1480         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1481                 goto brd_input;
1482
1483         /* Accept zero addresses only to limited broadcast;
1484          * I even do not know to fix it or not. Waiting for complains :-)
1485          */
1486         if (ipv4_is_zeronet(saddr))
1487                 goto martian_source;
1488
1489         if (ipv4_is_zeronet(daddr))
1490                 goto martian_destination;
1491
             /* Loopback addresses are martian unless route_localnet is
              * enabled on the receiving device.
              */
1492         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1493                 if (ipv4_is_loopback(daddr))
1494                         goto martian_destination;
1495
1496                 if (ipv4_is_loopback(saddr))
1497                         goto martian_source;
1498         }
1499
1500         /*
1501          *      Now we are ready to route packet.
1502          */
             /* NOTE(review): fl4 is filled member by member with no memset;
              * any member not assigned below is left unset for fib_lookup().
              */
1503         fl4.flowi4_oif = 0;
1504         fl4.flowi4_iif = dev->ifindex;
1505         fl4.flowi4_mark = skb->mark;
1506         fl4.flowi4_tos = tos;
1507         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1508         fl4.daddr = daddr;
1509         fl4.saddr = saddr;
1510         err = fib_lookup(net, &fl4, &res);
1511         if (err != 0)
1512                 goto no_route;
1513
1514         RT_CACHE_STAT_INC(in_slow_tot);
1515
1516         if (res.type == RTN_BROADCAST)
1517                 goto brd_input;
1518
             /* Locally destined: validate the source as if the reply left
              * through the loopback device; a positive result marks the
              * route RTCF_DIRECTSRC.
              */
1519         if (res.type == RTN_LOCAL) {
1520                 err = fib_validate_source(skb, saddr, daddr, tos,
1521                                           net->loopback_dev->ifindex,
1522                                           dev, in_dev, &itag);
1523                 if (err < 0)
1524                         goto martian_source_keep_err;
1525                 if (err)
1526                         flags |= RTCF_DIRECTSRC;
1527                 goto local_input;
1528         }
1529
             /* NOTE(review): err is 0 here (fib_lookup() succeeded), so the
              * unreachable route built via no_route gets dst.error == 0.
              */
1530         if (!IN_DEV_FORWARD(in_dev))
1531                 goto no_route;
1532         if (res.type != RTN_UNICAST)
1533                 goto martian_destination;
1534
1535         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1536 out:    return err;
1537
             /* Broadcast input: only IP frames are accepted; a non-zero
              * source is validated, then the packet is delivered locally.
              */
1538 brd_input:
1539         if (skb->protocol != htons(ETH_P_IP))
1540                 goto e_inval;
1541
1542         if (!ipv4_is_zeronet(saddr)) {
1543                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1544                                           in_dev, &itag);
1545                 if (err < 0)
1546                         goto martian_source_keep_err;
1547                 if (err)
1548                         flags |= RTCF_DIRECTSRC;
1549         }
1550         flags |= RTCF_BROADCAST;
1551         res.type = RTN_BROADCAST;
1552         RT_CACHE_STAT_INC(in_brd);
1553
             /* Shared tail for local, broadcast and unreachable results:
              * the route is allocated on the loopback device and delivers
              * locally unless it is turned into an error route below.
              */
1554 local_input:
1555         rth = rt_dst_alloc(net->loopback_dev,
1556                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1557         if (!rth)
1558                 goto e_nobufs;
1559
1560         rth->dst.input= ip_local_deliver;
1561         rth->dst.output= ip_rt_bug;
1562 #ifdef CONFIG_IP_ROUTE_CLASSID
1563         rth->dst.tclassid = itag;
1564 #endif
1565
1566         rth->rt_key_dst = daddr;
1567         rth->rt_key_src = saddr;
1568         rth->rt_genid = rt_genid(net);
1569         rth->rt_flags   = flags|RTCF_LOCAL;
1570         rth->rt_type    = res.type;
1571         rth->rt_key_tos = tos;
1572         rth->rt_dst     = daddr;
1573         rth->rt_src     = saddr;
1574         rth->rt_route_iif = dev->ifindex;
1575         rth->rt_iif     = dev->ifindex;
1576         rth->rt_oif     = 0;
1577         rth->rt_mark    = skb->mark;
1578         rth->rt_pmtu    = 0;
1579         rth->rt_gateway = daddr;
1580         rth->fi = NULL;
             /* Reached through no_route: hand the packet to ip_error() with
              * the positive errno and drop the local flag again.
              */
1581         if (res.type == RTN_UNREACHABLE) {
1582                 rth->dst.input= ip_error;
1583                 rth->dst.error= -err;
1584                 rth->rt_flags   &= ~RTCF_LOCAL;
1585         }
1586         skb_dst_set(skb, &rth->dst);
1587         err = 0;
1588         goto out;
1589
             /* No usable route: build an unreachable route via local_input,
              * mapping -ESRCH from the lookup to -ENETUNREACH.
              */
1590 no_route:
1591         RT_CACHE_STAT_INC(in_no_route);
1592         res.type = RTN_UNREACHABLE;
1593         if (err == -ESRCH)
1594                 err = -ENETUNREACH;
1595         goto local_input;
1596
1597         /*
1598          *      Do not cache martian addresses: they should be logged (RFC1812)
1599          */
1600 martian_destination:
1601         RT_CACHE_STAT_INC(in_martian_dst);
1602 #ifdef CONFIG_IP_ROUTE_VERBOSE
1603         if (IN_DEV_LOG_MARTIANS(in_dev))
1604                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1605                                      &daddr, &saddr, dev->name);
1606 #endif
1607
             /* martian_destination falls through to here. */
1608 e_inval:
1609         err = -EINVAL;
1610         goto out;
1611
1612 e_nobufs:
1613         err = -ENOBUFS;
1614         goto out;
1615
             /* martian_source forces -EINVAL; martian_source_keep_err keeps
              * the error returned by fib_validate_source().
              */
1616 martian_source:
1617         err = -EINVAL;
1618 martian_source_keep_err:
1619         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1620         goto out;
1621 }
1622
1623 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1624                            u8 tos, struct net_device *dev, bool noref)
1625 {
1626         int res;
1627
1628         rcu_read_lock();
1629
1630         /* Multicast recognition logic is moved from route cache to here.
1631            The problem was that too many Ethernet cards have broken/missing
1632            hardware multicast filters :-( As result the host on multicasting
1633            network acquires a lot of useless route cache entries, sort of
1634            SDR messages from all the world. Now we try to get rid of them.
1635            Really, provided software IP multicast filter is organized
1636            reasonably (at least, hashed), it does not result in a slowdown
1637            comparing with route cache reject entries.
1638            Note, that multicast routers are not affected, because
1639            route cache entry is created eventually.
1640          */
1641         if (ipv4_is_multicast(daddr)) {
1642                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1643
1644                 if (in_dev) {
1645                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1646                                                   ip_hdr(skb)->protocol);
1647                         if (our
1648 #ifdef CONFIG_IP_MROUTE
1649                                 ||
1650                             (!ipv4_is_local_multicast(daddr) &&
1651                              IN_DEV_MFORWARD(in_dev))
1652 #endif
1653                            ) {
1654                                 int res = ip_route_input_mc(skb, daddr, saddr,
1655                                                             tos, dev, our);
1656                                 rcu_read_unlock();
1657                                 return res;
1658                         }
1659                 }
1660                 rcu_read_unlock();
1661                 return -EINVAL;
1662         }
1663         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1664         rcu_read_unlock();
1665         return res;
1666 }
1667 EXPORT_SYMBOL(ip_route_input_common);
1668
1669 /* called with rcu_read_lock() */
1670 static struct rtable *__mkroute_output(const struct fib_result *res,
1671                                        const struct flowi4 *fl4,
1672                                        __be32 orig_daddr, __be32 orig_saddr,
1673                                        int orig_oif, __u8 orig_rtos,
1674                                        struct net_device *dev_out,
1675                                        unsigned int flags)
1676 {
1677         struct fib_info *fi = res->fi;
1678         struct in_device *in_dev;
1679         u16 type = res->type;
1680         struct rtable *rth;
1681
1682         in_dev = __in_dev_get_rcu(dev_out);
1683         if (!in_dev)
1684                 return ERR_PTR(-EINVAL);
1685
1686         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1687                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1688                         return ERR_PTR(-EINVAL);
1689
1690         if (ipv4_is_lbcast(fl4->daddr))
1691                 type = RTN_BROADCAST;
1692         else if (ipv4_is_multicast(fl4->daddr))
1693                 type = RTN_MULTICAST;
1694         else if (ipv4_is_zeronet(fl4->daddr))
1695                 return ERR_PTR(-EINVAL);
1696
1697         if (dev_out->flags & IFF_LOOPBACK)
1698                 flags |= RTCF_LOCAL;
1699
1700         if (type == RTN_BROADCAST) {
1701                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1702                 fi = NULL;
1703         } else if (type == RTN_MULTICAST) {
1704                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1705                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1706                                      fl4->flowi4_proto))
1707                         flags &= ~RTCF_LOCAL;
1708                 /* If multicast route do not exist use
1709                  * default one, but do not gateway in this case.
1710                  * Yes, it is hack.
1711                  */
1712                 if (fi && res->prefixlen < 4)
1713                         fi = NULL;
1714         }
1715
1716         rth = rt_dst_alloc(dev_out,
1717                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1718                            IN_DEV_CONF_GET(in_dev, NOXFRM));
1719         if (!rth)
1720                 return ERR_PTR(-ENOBUFS);
1721
1722         rth->dst.output = ip_output;
1723
1724         rth->rt_key_dst = orig_daddr;
1725         rth->rt_key_src = orig_saddr;
1726         rth->rt_genid = rt_genid(dev_net(dev_out));
1727         rth->rt_flags   = flags;
1728         rth->rt_type    = type;
1729         rth->rt_key_tos = orig_rtos;
1730         rth->rt_dst     = fl4->daddr;
1731         rth->rt_src     = fl4->saddr;
1732         rth->rt_route_iif = 0;
1733         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
1734         rth->rt_oif     = orig_oif;
1735         rth->rt_mark    = fl4->flowi4_mark;
1736         rth->rt_pmtu    = 0;
1737         rth->rt_gateway = fl4->daddr;
1738         rth->fi = NULL;
1739
1740         RT_CACHE_STAT_INC(out_slow_tot);
1741
1742         if (flags & RTCF_LOCAL)
1743                 rth->dst.input = ip_local_deliver;
1744         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1745                 if (flags & RTCF_LOCAL &&
1746                     !(dev_out->flags & IFF_LOOPBACK)) {
1747                         rth->dst.output = ip_mc_output;
1748                         RT_CACHE_STAT_INC(out_slow_mc);
1749                 }
1750 #ifdef CONFIG_IP_MROUTE
1751                 if (type == RTN_MULTICAST) {
1752                         if (IN_DEV_MFORWARD(in_dev) &&
1753                             !ipv4_is_local_multicast(fl4->daddr)) {
1754                                 rth->dst.input = ip_mr_input;
1755                                 rth->dst.output = ip_mc_output;
1756                         }
1757                 }
1758 #endif
1759         }
1760
1761         rt_set_nexthop(rth, fl4, res, fi, type, 0);
1762
1763         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
1764                 rth->dst.flags |= DST_NOCACHE;
1765
1766         return rth;
1767 }
1768
1769 /*
1770  * Major route resolver routine.
1771  */
1772
1773 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1774 {
1775         struct net_device *dev_out = NULL;
1776         __u8 tos = RT_FL_TOS(fl4);
1777         unsigned int flags = 0;
1778         struct fib_result res;
1779         struct rtable *rth;
1780         __be32 orig_daddr;
1781         __be32 orig_saddr;
1782         int orig_oif;
1783
1784         res.tclassid    = 0;
1785         res.fi          = NULL;
1786         res.table       = NULL;
1787
1788         orig_daddr = fl4->daddr;
1789         orig_saddr = fl4->saddr;
1790         orig_oif = fl4->flowi4_oif;
1791
1792         fl4->flowi4_iif = net->loopback_dev->ifindex;
1793         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1794         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1795                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1796
1797         rcu_read_lock();
1798         if (fl4->saddr) {
1799                 rth = ERR_PTR(-EINVAL);
1800                 if (ipv4_is_multicast(fl4->saddr) ||
1801                     ipv4_is_lbcast(fl4->saddr) ||
1802                     ipv4_is_zeronet(fl4->saddr))
1803                         goto out;
1804
1805                 /* I removed check for oif == dev_out->oif here.
1806                    It was wrong for two reasons:
1807                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1808                       is assigned to multiple interfaces.
1809                    2. Moreover, we are allowed to send packets with saddr
1810                       of another iface. --ANK
1811                  */
1812
1813                 if (fl4->flowi4_oif == 0 &&
1814                     (ipv4_is_multicast(fl4->daddr) ||
1815                      ipv4_is_lbcast(fl4->daddr))) {
1816                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1817                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1818                         if (dev_out == NULL)
1819                                 goto out;
1820
1821                         /* Special hack: user can direct multicasts
1822                            and limited broadcast via necessary interface
1823                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1824                            This hack is not just for fun, it allows
1825                            vic,vat and friends to work.
1826                            They bind socket to loopback, set ttl to zero
1827                            and expect that it will work.
1828                            From the viewpoint of routing cache they are broken,
1829                            because we are not allowed to build multicast path
1830                            with loopback source addr (look, routing cache
1831                            cannot know, that ttl is zero, so that packet
1832                            will not leave this host and route is valid).
1833                            Luckily, this hack is good workaround.
1834                          */
1835
1836                         fl4->flowi4_oif = dev_out->ifindex;
1837                         goto make_route;
1838                 }
1839
1840                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1841                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1842                         if (!__ip_dev_find(net, fl4->saddr, false))
1843                                 goto out;
1844                 }
1845         }
1846
1847
1848         if (fl4->flowi4_oif) {
1849                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1850                 rth = ERR_PTR(-ENODEV);
1851                 if (dev_out == NULL)
1852                         goto out;
1853
1854                 /* RACE: Check return value of inet_select_addr instead. */
1855                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1856                         rth = ERR_PTR(-ENETUNREACH);
1857                         goto out;
1858                 }
1859                 if (ipv4_is_local_multicast(fl4->daddr) ||
1860                     ipv4_is_lbcast(fl4->daddr)) {
1861                         if (!fl4->saddr)
1862                                 fl4->saddr = inet_select_addr(dev_out, 0,
1863                                                               RT_SCOPE_LINK);
1864                         goto make_route;
1865                 }
1866                 if (fl4->saddr) {
1867                         if (ipv4_is_multicast(fl4->daddr))
1868                                 fl4->saddr = inet_select_addr(dev_out, 0,
1869                                                               fl4->flowi4_scope);
1870                         else if (!fl4->daddr)
1871                                 fl4->saddr = inet_select_addr(dev_out, 0,
1872                                                               RT_SCOPE_HOST);
1873                 }
1874         }
1875
1876         if (!fl4->daddr) {
1877                 fl4->daddr = fl4->saddr;
1878                 if (!fl4->daddr)
1879                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1880                 dev_out = net->loopback_dev;
1881                 fl4->flowi4_oif = net->loopback_dev->ifindex;
1882                 res.type = RTN_LOCAL;
1883                 flags |= RTCF_LOCAL;
1884                 goto make_route;
1885         }
1886
1887         if (fib_lookup(net, fl4, &res)) {
1888                 res.fi = NULL;
1889                 res.table = NULL;
1890                 if (fl4->flowi4_oif) {
1891                         /* Apparently, routing tables are wrong. Assume,
1892                            that the destination is on link.
1893
1894                            WHY? DW.
1895                            Because we are allowed to send to iface
1896                            even if it has NO routes and NO assigned
1897                            addresses. When oif is specified, routing
1898                            tables are looked up with only one purpose:
1899                            to catch if destination is gatewayed, rather than
1900                            direct. Moreover, if MSG_DONTROUTE is set,
1901                            we send packet, ignoring both routing tables
1902                            and ifaddr state. --ANK
1903
1904
1905                            We could make it even if oif is unknown,
1906                            likely IPv6, but we do not.
1907                          */
1908
1909                         if (fl4->saddr == 0)
1910                                 fl4->saddr = inet_select_addr(dev_out, 0,
1911                                                               RT_SCOPE_LINK);
1912                         res.type = RTN_UNICAST;
1913                         goto make_route;
1914                 }
1915                 rth = ERR_PTR(-ENETUNREACH);
1916                 goto out;
1917         }
1918
1919         if (res.type == RTN_LOCAL) {
1920                 if (!fl4->saddr) {
1921                         if (res.fi->fib_prefsrc)
1922                                 fl4->saddr = res.fi->fib_prefsrc;
1923                         else
1924                                 fl4->saddr = fl4->daddr;
1925                 }
1926                 dev_out = net->loopback_dev;
1927                 fl4->flowi4_oif = dev_out->ifindex;
1928                 res.fi = NULL;
1929                 flags |= RTCF_LOCAL;
1930                 goto make_route;
1931         }
1932
1933 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1934         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1935                 fib_select_multipath(&res);
1936         else
1937 #endif
1938         if (!res.prefixlen &&
1939             res.table->tb_num_default > 1 &&
1940             res.type == RTN_UNICAST && !fl4->flowi4_oif)
1941                 fib_select_default(&res);
1942
1943         if (!fl4->saddr)
1944                 fl4->saddr = FIB_RES_PREFSRC(net, res);
1945
1946         dev_out = FIB_RES_DEV(res);
1947         fl4->flowi4_oif = dev_out->ifindex;
1948
1949
1950 make_route:
1951         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
1952                                tos, dev_out, flags);
1953
1954 out:
1955         rcu_read_unlock();
1956         return rth;
1957 }
1958 EXPORT_SYMBOL_GPL(__ip_route_output_key);
1959
1960 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
1961 {
1962         return NULL;
1963 }
1964
1965 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
1966 {
1967         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1968
1969         return mtu ? : dst->dev->mtu;
1970 }
1971
1972 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
1973                                           struct sk_buff *skb, u32 mtu)
1974 {
1975 }
1976
/* ->redirect hook of the blackhole dst_ops: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
1981
1982 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
1983                                           unsigned long old)
1984 {
1985         return NULL;
1986 }
1987
1988 static struct dst_ops ipv4_dst_blackhole_ops = {
1989         .family                 =       AF_INET,
1990         .protocol               =       cpu_to_be16(ETH_P_IP),
1991         .destroy                =       ipv4_dst_destroy,
1992         .check                  =       ipv4_blackhole_dst_check,
1993         .mtu                    =       ipv4_blackhole_mtu,
1994         .default_advmss         =       ipv4_default_advmss,
1995         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
1996         .redirect               =       ipv4_rt_blackhole_redirect,
1997         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
1998         .neigh_lookup           =       ipv4_neigh_lookup,
1999 };
2000
2001 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2002 {
2003         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2004         struct rtable *ort = (struct rtable *) dst_orig;
2005
2006         if (rt) {
2007                 struct dst_entry *new = &rt->dst;
2008
2009                 new->__use = 1;
2010                 new->input = dst_discard;
2011                 new->output = dst_discard;
2012
2013                 new->dev = ort->dst.dev;
2014                 if (new->dev)
2015                         dev_hold(new->dev);
2016
2017                 rt->rt_key_dst = ort->rt_key_dst;
2018                 rt->rt_key_src = ort->rt_key_src;
2019                 rt->rt_key_tos = ort->rt_key_tos;
2020                 rt->rt_route_iif = ort->rt_route_iif;
2021                 rt->rt_iif = ort->rt_iif;
2022                 rt->rt_oif = ort->rt_oif;
2023                 rt->rt_mark = ort->rt_mark;
2024                 rt->rt_pmtu = ort->rt_pmtu;
2025
2026                 rt->rt_genid = rt_genid(net);
2027                 rt->rt_flags = ort->rt_flags;
2028                 rt->rt_type = ort->rt_type;
2029                 rt->rt_dst = ort->rt_dst;
2030                 rt->rt_src = ort->rt_src;
2031                 rt->rt_gateway = ort->rt_gateway;
2032                 rt->fi = ort->fi;
2033                 if (rt->fi)
2034                         atomic_inc(&rt->fi->fib_clntref);
2035
2036                 dst_free(new);
2037         }
2038
2039         dst_release(dst_orig);
2040
2041         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2042 }
2043
2044 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2045                                     struct sock *sk)
2046 {
2047         struct rtable *rt = __ip_route_output_key(net, flp4);
2048
2049         if (IS_ERR(rt))
2050                 return rt;
2051
2052         if (flp4->flowi4_proto)
2053                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2054                                                    flowi4_to_flowi(flp4),
2055                                                    sk, 0);
2056
2057         return rt;
2058 }
2059 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2060
2061 static int rt_fill_info(struct net *net,
2062                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2063                         int nowait, unsigned int flags)
2064 {
2065         struct rtable *rt = skb_rtable(skb);
2066         struct rtmsg *r;
2067         struct nlmsghdr *nlh;
2068         unsigned long expires = 0;
2069         u32 error;
2070         u32 metrics[RTAX_MAX];
2071
2072         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2073         if (nlh == NULL)
2074                 return -EMSGSIZE;
2075
2076         r = nlmsg_data(nlh);
2077         r->rtm_family    = AF_INET;
2078         r->rtm_dst_len  = 32;
2079         r->rtm_src_len  = 0;
2080         r->rtm_tos      = rt->rt_key_tos;
2081         r->rtm_table    = RT_TABLE_MAIN;
2082         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2083                 goto nla_put_failure;
2084         r->rtm_type     = rt->rt_type;
2085         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2086         r->rtm_protocol = RTPROT_UNSPEC;
2087         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2088         if (rt->rt_flags & RTCF_NOTIFY)
2089                 r->rtm_flags |= RTM_F_NOTIFY;
2090
2091         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2092                 goto nla_put_failure;
2093         if (rt->rt_key_src) {
2094                 r->rtm_src_len = 32;
2095                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2096                         goto nla_put_failure;
2097         }
2098         if (rt->dst.dev &&
2099             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2100                 goto nla_put_failure;
2101 #ifdef CONFIG_IP_ROUTE_CLASSID
2102         if (rt->dst.tclassid &&
2103             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2104                 goto nla_put_failure;
2105 #endif
2106         if (!rt_is_input_route(rt) &&
2107             rt->rt_src != rt->rt_key_src) {
2108                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2109                         goto nla_put_failure;
2110         }
2111         if (rt->rt_dst != rt->rt_gateway &&
2112             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2113                 goto nla_put_failure;
2114
2115         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2116         if (rt->rt_pmtu)
2117                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2118         if (rtnetlink_put_metrics(skb, metrics) < 0)
2119                 goto nla_put_failure;
2120
2121         if (rt->rt_mark &&
2122             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2123                 goto nla_put_failure;
2124
2125         error = rt->dst.error;
2126         expires = rt->dst.expires;
2127         if (expires) {
2128                 if (time_before(jiffies, expires))
2129                         expires -= jiffies;
2130                 else
2131                         expires = 0;
2132         }
2133
2134         if (rt_is_input_route(rt)) {
2135 #ifdef CONFIG_IP_MROUTE
2136                 __be32 dst = rt->rt_dst;
2137
2138                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2139                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2140                         int err = ipmr_get_route(net, skb,
2141                                                  rt->rt_src, rt->rt_dst,
2142                                                  r, nowait);
2143                         if (err <= 0) {
2144                                 if (!nowait) {
2145                                         if (err == 0)
2146                                                 return 0;
2147                                         goto nla_put_failure;
2148                                 } else {
2149                                         if (err == -EMSGSIZE)
2150                                                 goto nla_put_failure;
2151                                         error = err;
2152                                 }
2153                         }
2154                 } else
2155 #endif
2156                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2157                                 goto nla_put_failure;
2158         }
2159
2160         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2161                 goto nla_put_failure;
2162
2163         return nlmsg_end(skb, nlh);
2164
2165 nla_put_failure:
2166         nlmsg_cancel(skb, nlh);
2167         return -EMSGSIZE;
2168 }
2169
2170 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2171 {
2172         struct net *net = sock_net(in_skb->sk);
2173         struct rtmsg *rtm;
2174         struct nlattr *tb[RTA_MAX+1];
2175         struct rtable *rt = NULL;
2176         __be32 dst = 0;
2177         __be32 src = 0;
2178         u32 iif;
2179         int err;
2180         int mark;
2181         struct sk_buff *skb;
2182
2183         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2184         if (err < 0)
2185                 goto errout;
2186
2187         rtm = nlmsg_data(nlh);
2188
2189         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2190         if (skb == NULL) {
2191                 err = -ENOBUFS;
2192                 goto errout;
2193         }
2194
2195         /* Reserve room for dummy headers, this skb can pass
2196            through good chunk of routing engine.
2197          */
2198         skb_reset_mac_header(skb);
2199         skb_reset_network_header(skb);
2200
2201         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2202         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2203         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2204
2205         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2206         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2207         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2208         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2209
2210         if (iif) {
2211                 struct net_device *dev;
2212
2213                 dev = __dev_get_by_index(net, iif);
2214                 if (dev == NULL) {
2215                         err = -ENODEV;
2216                         goto errout_free;
2217                 }
2218
2219                 skb->protocol   = htons(ETH_P_IP);
2220                 skb->dev        = dev;
2221                 skb->mark       = mark;
2222                 local_bh_disable();
2223                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2224                 local_bh_enable();
2225
2226                 rt = skb_rtable(skb);
2227                 if (err == 0 && rt->dst.error)
2228                         err = -rt->dst.error;
2229         } else {
2230                 struct flowi4 fl4 = {
2231                         .daddr = dst,
2232                         .saddr = src,
2233                         .flowi4_tos = rtm->rtm_tos,
2234                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2235                         .flowi4_mark = mark,
2236                 };
2237                 rt = ip_route_output_key(net, &fl4);
2238
2239                 err = 0;
2240                 if (IS_ERR(rt))
2241                         err = PTR_ERR(rt);
2242         }
2243
2244         if (err)
2245                 goto errout_free;
2246
2247         skb_dst_set(skb, &rt->dst);
2248         if (rtm->rtm_flags & RTM_F_NOTIFY)
2249                 rt->rt_flags |= RTCF_NOTIFY;
2250
2251         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2252                            RTM_NEWROUTE, 0, 0);
2253         if (err <= 0)
2254                 goto errout_free;
2255
2256         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2257 errout:
2258         return err;
2259
2260 errout_free:
2261         kfree_skb(skb);
2262         goto errout;
2263 }
2264
2265 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2266 {
2267         return skb->len;
2268 }
2269
2270 void ip_rt_multicast_event(struct in_device *in_dev)
2271 {
2272         rt_cache_flush(dev_net(in_dev->dev), 0);
2273 }
2274
2275 #ifdef CONFIG_SYSCTL
2276 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2277                                         void __user *buffer,
2278                                         size_t *lenp, loff_t *ppos)
2279 {
2280         if (write) {
2281                 int flush_delay;
2282                 ctl_table ctl;
2283                 struct net *net;
2284
2285                 memcpy(&ctl, __ctl, sizeof(ctl));
2286                 ctl.data = &flush_delay;
2287                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2288
2289                 net = (struct net *)__ctl->extra1;
2290                 rt_cache_flush(net, flush_delay);
2291                 return 0;
2292         }
2293
2294         return -EINVAL;
2295 }
2296
2297 static ctl_table ipv4_route_table[] = {
2298         {
2299                 .procname       = "gc_thresh",
2300                 .data           = &ipv4_dst_ops.gc_thresh,
2301                 .maxlen         = sizeof(int),
2302                 .mode           = 0644,
2303                 .proc_handler   = proc_dointvec,
2304         },
2305         {
2306                 .procname       = "max_size",
2307                 .data           = &ip_rt_max_size,
2308                 .maxlen         = sizeof(int),
2309                 .mode           = 0644,
2310                 .proc_handler   = proc_dointvec,
2311         },
2312         {
2313                 /*  Deprecated. Use gc_min_interval_ms */
2314
2315                 .procname       = "gc_min_interval",
2316                 .data           = &ip_rt_gc_min_interval,
2317                 .maxlen         = sizeof(int),
2318                 .mode           = 0644,
2319                 .proc_handler   = proc_dointvec_jiffies,
2320         },
2321         {
2322                 .procname       = "gc_min_interval_ms",
2323                 .data           = &ip_rt_gc_min_interval,
2324                 .maxlen         = sizeof(int),
2325                 .mode           = 0644,
2326                 .proc_handler   = proc_dointvec_ms_jiffies,
2327         },
2328         {
2329                 .procname       = "gc_timeout",
2330                 .data           = &ip_rt_gc_timeout,
2331                 .maxlen         = sizeof(int),
2332                 .mode           = 0644,
2333                 .proc_handler   = proc_dointvec_jiffies,
2334         },
2335         {
2336                 .procname       = "gc_interval",
2337                 .data           = &ip_rt_gc_interval,
2338                 .maxlen         = sizeof(int),
2339                 .mode           = 0644,
2340                 .proc_handler   = proc_dointvec_jiffies,
2341         },
2342         {
2343                 .procname       = "redirect_load",
2344                 .data           = &ip_rt_redirect_load,
2345                 .maxlen         = sizeof(int),
2346                 .mode           = 0644,
2347                 .proc_handler   = proc_dointvec,
2348         },
2349         {
2350                 .procname       = "redirect_number",
2351                 .data           = &ip_rt_redirect_number,
2352                 .maxlen         = sizeof(int),
2353                 .mode           = 0644,
2354                 .proc_handler   = proc_dointvec,
2355         },
2356         {
2357                 .procname       = "redirect_silence",
2358                 .data           = &ip_rt_redirect_silence,
2359                 .maxlen         = sizeof(int),
2360                 .mode           = 0644,
2361                 .proc_handler   = proc_dointvec,
2362         },
2363         {
2364                 .procname       = "error_cost",
2365                 .data           = &ip_rt_error_cost,
2366                 .maxlen         = sizeof(int),
2367                 .mode           = 0644,
2368                 .proc_handler   = proc_dointvec,
2369         },
2370         {
2371                 .procname       = "error_burst",
2372                 .data           = &ip_rt_error_burst,
2373                 .maxlen         = sizeof(int),
2374                 .mode           = 0644,
2375                 .proc_handler   = proc_dointvec,
2376         },
2377         {
2378                 .procname       = "gc_elasticity",
2379                 .data           = &ip_rt_gc_elasticity,
2380                 .maxlen         = sizeof(int),
2381                 .mode           = 0644,
2382                 .proc_handler   = proc_dointvec,
2383         },
2384         {
2385                 .procname       = "mtu_expires",
2386                 .data           = &ip_rt_mtu_expires,
2387                 .maxlen         = sizeof(int),
2388                 .mode           = 0644,
2389                 .proc_handler   = proc_dointvec_jiffies,
2390         },
2391         {
2392                 .procname       = "min_pmtu",
2393                 .data           = &ip_rt_min_pmtu,
2394                 .maxlen         = sizeof(int),
2395                 .mode           = 0644,
2396                 .proc_handler   = proc_dointvec,
2397         },
2398         {
2399                 .procname       = "min_adv_mss",
2400                 .data           = &ip_rt_min_advmss,
2401                 .maxlen         = sizeof(int),
2402                 .mode           = 0644,
2403                 .proc_handler   = proc_dointvec,
2404         },
2405         { }
2406 };
2407
2408 static struct ctl_table ipv4_route_flush_table[] = {
2409         {
2410                 .procname       = "flush",
2411                 .maxlen         = sizeof(int),
2412                 .mode           = 0200,
2413                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2414         },
2415         { },
2416 };
2417
2418 static __net_init int sysctl_route_net_init(struct net *net)
2419 {
2420         struct ctl_table *tbl;
2421
2422         tbl = ipv4_route_flush_table;
2423         if (!net_eq(net, &init_net)) {
2424                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2425                 if (tbl == NULL)
2426                         goto err_dup;
2427         }
2428         tbl[0].extra1 = net;
2429
2430         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2431         if (net->ipv4.route_hdr == NULL)
2432                 goto err_reg;
2433         return 0;
2434
2435 err_reg:
2436         if (tbl != ipv4_route_flush_table)
2437                 kfree(tbl);
2438 err_dup:
2439         return -ENOMEM;
2440 }
2441
2442 static __net_exit void sysctl_route_net_exit(struct net *net)
2443 {
2444         struct ctl_table *tbl;
2445
2446         tbl = net->ipv4.route_hdr->ctl_table_arg;
2447         unregister_net_sysctl_table(net->ipv4.route_hdr);
2448         BUG_ON(tbl == ipv4_route_flush_table);
2449         kfree(tbl);
2450 }
2451
2452 static __net_initdata struct pernet_operations sysctl_route_ops = {
2453         .init = sysctl_route_net_init,
2454         .exit = sysctl_route_net_exit,
2455 };
2456 #endif
2457
2458 static __net_init int rt_genid_init(struct net *net)
2459 {
2460         get_random_bytes(&net->ipv4.rt_genid,
2461                          sizeof(net->ipv4.rt_genid));
2462         get_random_bytes(&net->ipv4.dev_addr_genid,
2463                          sizeof(net->ipv4.dev_addr_genid));
2464         return 0;
2465 }
2466
2467 static __net_initdata struct pernet_operations rt_genid_ops = {
2468         .init = rt_genid_init,
2469 };
2470
2471 static int __net_init ipv4_inetpeer_init(struct net *net)
2472 {
2473         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2474
2475         if (!bp)
2476                 return -ENOMEM;
2477         inet_peer_base_init(bp);
2478         net->ipv4.peers = bp;
2479         return 0;
2480 }
2481
2482 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2483 {
2484         struct inet_peer_base *bp = net->ipv4.peers;
2485
2486         net->ipv4.peers = NULL;
2487         inetpeer_invalidate_tree(bp);
2488         kfree(bp);
2489 }
2490
2491 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2492         .init   =       ipv4_inetpeer_init,
2493         .exit   =       ipv4_inetpeer_exit,
2494 };
2495
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting array; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2499
2500 int __init ip_rt_init(void)
2501 {
2502         int rc = 0;
2503
2504 #ifdef CONFIG_IP_ROUTE_CLASSID
2505         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2506         if (!ip_rt_acct)
2507                 panic("IP: failed to allocate ip_rt_acct\n");
2508 #endif
2509
2510         ipv4_dst_ops.kmem_cachep =
2511                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2512                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2513
2514         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2515
2516         if (dst_entries_init(&ipv4_dst_ops) < 0)
2517                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2518
2519         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2520                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2521
2522         ipv4_dst_ops.gc_thresh = ~0;
2523         ip_rt_max_size = INT_MAX;
2524
2525         devinet_init();
2526         ip_fib_init();
2527
2528         if (ip_rt_proc_init())
2529                 pr_err("Unable to create route proc files\n");
2530 #ifdef CONFIG_XFRM
2531         xfrm_init();
2532         xfrm4_init(ip_rt_max_size);
2533 #endif
2534         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2535
2536 #ifdef CONFIG_SYSCTL
2537         register_pernet_subsys(&sysctl_route_ops);
2538 #endif
2539         register_pernet_subsys(&rt_genid_ops);
2540         register_pernet_subsys(&ipv4_inetpeer_ops);
2541         return rc;
2542 }
2543
2544 #ifdef CONFIG_SYSCTL
2545 /*
2546  * We really need to sanitize the damn ipv4 init order, then all
2547  * this nonsense will go away.
2548  */
2549 void __init ip_static_sysctl_init(void)
2550 {
2551         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2552 }
2553 #endif