atm: clip: Use device neigh support on top of "arp_tbl".
[~shefty/rdma-dev.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/secure_seq.h>
112
/*
 * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
 * flag) from a flowi4 pointer.  The argument is parenthesized so that any
 * pointer expression (e.g. "p + 1") may be passed safely.
 */
#define RT_FL_TOS(oldflp4) \
        ((u32)((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 /*
135  *      Interface to generic destination cache.
136  */
137
138 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
139 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
140 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
141 static void              ipv4_dst_destroy(struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
145 static int rt_garbage_collect(struct dst_ops *ops);
146
/* No-op implementation of the .ifdown callback installed in ipv4_dst_ops. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        /* nothing to do */
}
151
152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153 {
154         struct rtable *rt = (struct rtable *) dst;
155         struct inet_peer *peer;
156         u32 *p = NULL;
157
158         if (!rt->peer)
159                 rt_bind_peer(rt, rt->rt_dst, 1);
160
161         peer = rt->peer;
162         if (peer) {
163                 u32 *old_p = __DST_METRICS_PTR(old);
164                 unsigned long prev, new;
165
166                 p = peer->metrics;
167                 if (inet_metrics_new(peer))
168                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170                 new = (unsigned long) p;
171                 prev = cmpxchg(&dst->_metrics, old, new);
172
173                 if (prev != old) {
174                         p = __DST_METRICS_PTR(prev);
175                         if (prev & DST_METRICS_READ_ONLY)
176                                 p = NULL;
177                 } else {
178                         if (rt->fi) {
179                                 fib_info_put(rt->fi);
180                                 rt->fi = NULL;
181                         }
182                 }
183         }
184         return p;
185 }
186
187 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
188
189 static struct dst_ops ipv4_dst_ops = {
190         .family =               AF_INET,
191         .protocol =             cpu_to_be16(ETH_P_IP),
192         .gc =                   rt_garbage_collect,
193         .check =                ipv4_dst_check,
194         .default_advmss =       ipv4_default_advmss,
195         .mtu =                  ipv4_mtu,
196         .cow_metrics =          ipv4_cow_metrics,
197         .destroy =              ipv4_dst_destroy,
198         .ifdown =               ipv4_dst_ifdown,
199         .negative_advice =      ipv4_negative_advice,
200         .link_failure =         ipv4_link_failure,
201         .update_pmtu =          ip_rt_update_pmtu,
202         .local_out =            __ip_local_out,
203         .neigh_lookup =         ipv4_neigh_lookup,
204 };
205
206 #define ECN_OR_COST(class)      TC_PRIO_##class
207
208 const __u8 ip_tos2prio[16] = {
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(BESTEFFORT),
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK)
225 };
226
227
228 /*
229  * Route cache.
230  */
231
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
241
242 struct rt_hash_bucket {
243         struct rtable __rcu     *chain;
244 };
245
246 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247         defined(CONFIG_PROVE_LOCKING)
/*
 * Rather than one spinlock per rt_hash_bucket, a shared table of spinlocks
 * is used.  Its size is a power of two chosen from NR_CPUS; with lockdep
 * enabled spinlock_t is quite big, so the smallest size is used there.
 */
#if defined(CONFIG_LOCKDEP)
# define RT_HASH_LOCK_SZ        256
#elif NR_CPUS >= 32
# define RT_HASH_LOCK_SZ        4096
#elif NR_CPUS >= 16
# define RT_HASH_LOCK_SZ        2048
#elif NR_CPUS >= 8
# define RT_HASH_LOCK_SZ        1024
#elif NR_CPUS >= 4
# define RT_HASH_LOCK_SZ        512
#else
# define RT_HASH_LOCK_SZ        256
#endif
268
269 static spinlock_t       *rt_hash_locks;
270 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272 static __init void rt_hash_lock_init(void)
273 {
274         int i;
275
276         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277                         GFP_KERNEL);
278         if (!rt_hash_locks)
279                 panic("IP: failed to allocate rt_hash_locks\n");
280
281         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282                 spin_lock_init(&rt_hash_locks[i]);
283 }
284 #else
/* No lock table in this configuration: every bucket maps to a NULL lock. */
# define rt_hash_lock_addr(slot) NULL

/* Nothing to allocate or initialise when the lock table is compiled out. */
static inline void rt_hash_lock_init(void)
{
        /* intentionally empty */
}
290 #endif
291
292 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
293 static unsigned                 rt_hash_mask __read_mostly;
294 static unsigned int             rt_hash_log  __read_mostly;
295
296 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
299 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300                                    int genid)
301 {
302         return jhash_3words((__force u32)daddr, (__force u32)saddr,
303                             idx, genid)
304                 & rt_hash_mask;
305 }
306
307 static inline int rt_genid(struct net *net)
308 {
309         return atomic_read(&net->ipv4.rt_genid);
310 }
311
312 #ifdef CONFIG_PROC_FS
313 struct rt_cache_iter_state {
314         struct seq_net_private p;
315         int bucket;
316         int genid;
317 };
318
319 static struct rtable *rt_cache_get_first(struct seq_file *seq)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         struct rtable *r = NULL;
323
324         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
326                         continue;
327                 rcu_read_lock_bh();
328                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329                 while (r) {
330                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331                             r->rt_genid == st->genid)
332                                 return r;
333                         r = rcu_dereference_bh(r->dst.rt_next);
334                 }
335                 rcu_read_unlock_bh();
336         }
337         return r;
338 }
339
340 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341                                           struct rtable *r)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344
345         r = rcu_dereference_bh(r->dst.rt_next);
346         while (!r) {
347                 rcu_read_unlock_bh();
348                 do {
349                         if (--st->bucket < 0)
350                                 return NULL;
351                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
352                 rcu_read_lock_bh();
353                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354         }
355         return r;
356 }
357
358 static struct rtable *rt_cache_get_next(struct seq_file *seq,
359                                         struct rtable *r)
360 {
361         struct rt_cache_iter_state *st = seq->private;
362         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363                 if (dev_net(r->dst.dev) != seq_file_net(seq))
364                         continue;
365                 if (r->rt_genid == st->genid)
366                         break;
367         }
368         return r;
369 }
370
371 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372 {
373         struct rtable *r = rt_cache_get_first(seq);
374
375         if (r)
376                 while (pos && (r = rt_cache_get_next(seq, r)))
377                         --pos;
378         return pos ? NULL : r;
379 }
380
381 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382 {
383         struct rt_cache_iter_state *st = seq->private;
384         if (*pos)
385                 return rt_cache_get_idx(seq, *pos - 1);
386         st->genid = rt_genid(seq_file_net(seq));
387         return SEQ_START_TOKEN;
388 }
389
390 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391 {
392         struct rtable *r;
393
394         if (v == SEQ_START_TOKEN)
395                 r = rt_cache_get_first(seq);
396         else
397                 r = rt_cache_get_next(seq, v);
398         ++*pos;
399         return r;
400 }
401
402 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403 {
404         if (v && v != SEQ_START_TOKEN)
405                 rcu_read_unlock_bh();
406 }
407
408 static int rt_cache_seq_show(struct seq_file *seq, void *v)
409 {
410         if (v == SEQ_START_TOKEN)
411                 seq_printf(seq, "%-127s\n",
412                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414                            "HHUptod\tSpecDst");
415         else {
416                 struct rtable *r = v;
417                 struct neighbour *n;
418                 int len;
419
420                 n = dst_get_neighbour(&r->dst);
421                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
422                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
423                         r->dst.dev ? r->dst.dev->name : "*",
424                         (__force u32)r->rt_dst,
425                         (__force u32)r->rt_gateway,
426                         r->rt_flags, atomic_read(&r->dst.__refcnt),
427                         r->dst.__use, 0, (__force u32)r->rt_src,
428                         dst_metric_advmss(&r->dst) + 40,
429                         dst_metric(&r->dst, RTAX_WINDOW),
430                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
431                               dst_metric(&r->dst, RTAX_RTTVAR)),
432                         r->rt_key_tos,
433                         -1,
434                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
435                         r->rt_spec_dst, &len);
436
437                 seq_printf(seq, "%*s\n", 127 - len, "");
438         }
439         return 0;
440 }
441
442 static const struct seq_operations rt_cache_seq_ops = {
443         .start  = rt_cache_seq_start,
444         .next   = rt_cache_seq_next,
445         .stop   = rt_cache_seq_stop,
446         .show   = rt_cache_seq_show,
447 };
448
449 static int rt_cache_seq_open(struct inode *inode, struct file *file)
450 {
451         return seq_open_net(inode, file, &rt_cache_seq_ops,
452                         sizeof(struct rt_cache_iter_state));
453 }
454
455 static const struct file_operations rt_cache_seq_fops = {
456         .owner   = THIS_MODULE,
457         .open    = rt_cache_seq_open,
458         .read    = seq_read,
459         .llseek  = seq_lseek,
460         .release = seq_release_net,
461 };
462
463
464 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
465 {
466         int cpu;
467
468         if (*pos == 0)
469                 return SEQ_START_TOKEN;
470
471         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
472                 if (!cpu_possible(cpu))
473                         continue;
474                 *pos = cpu+1;
475                 return &per_cpu(rt_cache_stat, cpu);
476         }
477         return NULL;
478 }
479
480 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
481 {
482         int cpu;
483
484         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
485                 if (!cpu_possible(cpu))
486                         continue;
487                 *pos = cpu+1;
488                 return &per_cpu(rt_cache_stat, cpu);
489         }
490         return NULL;
491
492 }
493
/* seq_file .stop: the per-CPU iterator takes no locks, so nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
        /* nothing to release */
}
498
499 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
500 {
501         struct rt_cache_stat *st = v;
502
503         if (v == SEQ_START_TOKEN) {
504                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
505                 return 0;
506         }
507
508         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
509                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
510                    dst_entries_get_slow(&ipv4_dst_ops),
511                    st->in_hit,
512                    st->in_slow_tot,
513                    st->in_slow_mc,
514                    st->in_no_route,
515                    st->in_brd,
516                    st->in_martian_dst,
517                    st->in_martian_src,
518
519                    st->out_hit,
520                    st->out_slow_tot,
521                    st->out_slow_mc,
522
523                    st->gc_total,
524                    st->gc_ignored,
525                    st->gc_goal_miss,
526                    st->gc_dst_overflow,
527                    st->in_hlist_search,
528                    st->out_hlist_search
529                 );
530         return 0;
531 }
532
533 static const struct seq_operations rt_cpu_seq_ops = {
534         .start  = rt_cpu_seq_start,
535         .next   = rt_cpu_seq_next,
536         .stop   = rt_cpu_seq_stop,
537         .show   = rt_cpu_seq_show,
538 };
539
540
541 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
542 {
543         return seq_open(file, &rt_cpu_seq_ops);
544 }
545
546 static const struct file_operations rt_cpu_seq_fops = {
547         .owner   = THIS_MODULE,
548         .open    = rt_cpu_seq_open,
549         .read    = seq_read,
550         .llseek  = seq_lseek,
551         .release = seq_release,
552 };
553
554 #ifdef CONFIG_IP_ROUTE_CLASSID
555 static int rt_acct_proc_show(struct seq_file *m, void *v)
556 {
557         struct ip_rt_acct *dst, *src;
558         unsigned int i, j;
559
560         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
561         if (!dst)
562                 return -ENOMEM;
563
564         for_each_possible_cpu(i) {
565                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
566                 for (j = 0; j < 256; j++) {
567                         dst[j].o_bytes   += src[j].o_bytes;
568                         dst[j].o_packets += src[j].o_packets;
569                         dst[j].i_bytes   += src[j].i_bytes;
570                         dst[j].i_packets += src[j].i_packets;
571                 }
572         }
573
574         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
575         kfree(dst);
576         return 0;
577 }
578
579 static int rt_acct_proc_open(struct inode *inode, struct file *file)
580 {
581         return single_open(file, rt_acct_proc_show, NULL);
582 }
583
584 static const struct file_operations rt_acct_proc_fops = {
585         .owner          = THIS_MODULE,
586         .open           = rt_acct_proc_open,
587         .read           = seq_read,
588         .llseek         = seq_lseek,
589         .release        = single_release,
590 };
591 #endif
592
593 static int __net_init ip_rt_do_proc_init(struct net *net)
594 {
595         struct proc_dir_entry *pde;
596
597         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
598                         &rt_cache_seq_fops);
599         if (!pde)
600                 goto err1;
601
602         pde = proc_create("rt_cache", S_IRUGO,
603                           net->proc_net_stat, &rt_cpu_seq_fops);
604         if (!pde)
605                 goto err2;
606
607 #ifdef CONFIG_IP_ROUTE_CLASSID
608         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
609         if (!pde)
610                 goto err3;
611 #endif
612         return 0;
613
614 #ifdef CONFIG_IP_ROUTE_CLASSID
615 err3:
616         remove_proc_entry("rt_cache", net->proc_net_stat);
617 #endif
618 err2:
619         remove_proc_entry("rt_cache", net->proc_net);
620 err1:
621         return -ENOMEM;
622 }
623
624 static void __net_exit ip_rt_do_proc_exit(struct net *net)
625 {
626         remove_proc_entry("rt_cache", net->proc_net_stat);
627         remove_proc_entry("rt_cache", net->proc_net);
628 #ifdef CONFIG_IP_ROUTE_CLASSID
629         remove_proc_entry("rt_acct", net->proc_net);
630 #endif
631 }
632
633 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
634         .init = ip_rt_do_proc_init,
635         .exit = ip_rt_do_proc_exit,
636 };
637
638 static int __init ip_rt_proc_init(void)
639 {
640         return register_pernet_subsys(&ip_rt_proc_ops);
641 }
642
643 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
        const int ok = 0;

        return ok;
}
648 #endif /* CONFIG_PROC_FS */
649
650 static inline void rt_free(struct rtable *rt)
651 {
652         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
653 }
654
655 static inline void rt_drop(struct rtable *rt)
656 {
657         ip_rt_put(rt);
658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
659 }
660
661 static inline int rt_fast_clean(struct rtable *rth)
662 {
663         /* Kill broadcast/multicast entries very aggresively, if they
664            collide in hash table with more useful entries */
665         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
666                 rt_is_input_route(rth) && rth->dst.rt_next;
667 }
668
669 static inline int rt_valuable(struct rtable *rth)
670 {
671         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
672                 (rth->peer && rth->peer->pmtu_expires);
673 }
674
675 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
676 {
677         unsigned long age;
678         int ret = 0;
679
680         if (atomic_read(&rth->dst.__refcnt))
681                 goto out;
682
683         age = jiffies - rth->dst.lastuse;
684         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
685             (age <= tmo2 && rt_valuable(rth)))
686                 goto out;
687         ret = 1;
688 out:    return ret;
689 }
690
691 /* Bits of score are:
692  * 31: very valuable
693  * 30: not quite useless
694  * 29..0: usage counter
695  */
696 static inline u32 rt_score(struct rtable *rt)
697 {
698         u32 score = jiffies - rt->dst.lastuse;
699
700         score = ~score & ~(3<<30);
701
702         if (rt_valuable(rt))
703                 score |= (1<<31);
704
705         if (rt_is_output_route(rt) ||
706             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
707                 score |= (1<<30);
708
709         return score;
710 }
711
712 static inline bool rt_caching(const struct net *net)
713 {
714         return net->ipv4.current_rt_cache_rebuild_count <=
715                 net->ipv4.sysctl_rt_cache_rebuild_count;
716 }
717
718 static inline bool compare_hash_inputs(const struct rtable *rt1,
719                                        const struct rtable *rt2)
720 {
721         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
722                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
723                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
724 }
725
726 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
727 {
728         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
729                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
730                 (rt1->rt_mark ^ rt2->rt_mark) |
731                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
732                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
733                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
734 }
735
736 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
737 {
738         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
739 }
740
741 static inline int rt_is_expired(struct rtable *rth)
742 {
743         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
744 }
745
746 /*
747  * Perform a full scan of hash table and free all entries.
748  * Can be called by a softirq or a process.
749  * In the later case, we want to be reschedule if necessary
750  */
751 static void rt_do_flush(struct net *net, int process_context)
752 {
753         unsigned int i;
754         struct rtable *rth, *next;
755
756         for (i = 0; i <= rt_hash_mask; i++) {
757                 struct rtable __rcu **pprev;
758                 struct rtable *list;
759
760                 if (process_context && need_resched())
761                         cond_resched();
762                 rth = rcu_access_pointer(rt_hash_table[i].chain);
763                 if (!rth)
764                         continue;
765
766                 spin_lock_bh(rt_hash_lock_addr(i));
767
768                 list = NULL;
769                 pprev = &rt_hash_table[i].chain;
770                 rth = rcu_dereference_protected(*pprev,
771                         lockdep_is_held(rt_hash_lock_addr(i)));
772
773                 while (rth) {
774                         next = rcu_dereference_protected(rth->dst.rt_next,
775                                 lockdep_is_held(rt_hash_lock_addr(i)));
776
777                         if (!net ||
778                             net_eq(dev_net(rth->dst.dev), net)) {
779                                 rcu_assign_pointer(*pprev, next);
780                                 rcu_assign_pointer(rth->dst.rt_next, list);
781                                 list = rth;
782                         } else {
783                                 pprev = &rth->dst.rt_next;
784                         }
785                         rth = next;
786                 }
787
788                 spin_unlock_bh(rt_hash_lock_addr(i));
789
790                 for (; list; list = next) {
791                         next = rcu_dereference_protected(list->dst.rt_next, 1);
792                         rt_free(list);
793                 }
794         }
795 }
796
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This is used to estimate rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */
804
805 #define FRACT_BITS 3
806 #define ONE (1UL << FRACT_BITS)
807
808 /*
809  * Given a hash chain and an item in this hash chain,
810  * find if a previous entry has the same hash_inputs
811  * (but differs on tos, mark or oif)
812  * Returns 0 if an alias is found.
813  * Returns ONE if rth has no alias before itself.
814  */
815 static int has_noalias(const struct rtable *head, const struct rtable *rth)
816 {
817         const struct rtable *aux = head;
818
819         while (aux != rth) {
820                 if (compare_hash_inputs(aux, rth))
821                         return 0;
822                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
823         }
824         return ONE;
825 }
826
827 /*
828  * Perturbation of rt_genid by a small quantity [1..256]
829  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
830  * many times (2^24) without giving recent rt_genid.
831  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
832  */
833 static void rt_cache_invalidate(struct net *net)
834 {
835         unsigned char shuffle;
836
837         get_random_bytes(&shuffle, sizeof(shuffle));
838         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
839 }
840
/*
 * Invalidate the route cache of @net.
 *
 * delay < 0  : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate and flush the cache right away (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
        rt_cache_invalidate(net);

        if (delay < 0)
                return;

        rt_do_flush(net, !in_softirq());
}
851
/*
 * Flush the entries of @net from the cache without bumping the generation
 * id again; meant for entries invalidated by an earlier call.
 */
void rt_cache_flush_batch(struct net *net)
{
        int process_context = !in_softirq();

        rt_do_flush(net, process_context);
}
857
858 static void rt_emergency_hash_rebuild(struct net *net)
859 {
860         if (net_ratelimit())
861                 printk(KERN_WARNING "Route hash chain too long!\n");
862         rt_cache_invalidate(net);
863 }
864
865 /*
866    Short description of GC goals.
867
868    We want to build an algorithm which will keep the routing cache
869    at some equilibrium point, where the number of aged-off entries
870    is kept approximately equal to the number of newly generated ones.
871
872    Current expiration strength is the variable "expire".
873    We try to adjust it dynamically, so that if networking
874    is idle, expire is large enough to keep enough warm entries,
875    and when load increases it shrinks to limit the cache size.

       Return value: 1 when the scan gives up and both the fast and the
       slow entry counts are still not below ip_rt_max_size (reported as
       "dst cache overflow"); 0 in every other case.  The ops argument is
       not referenced in the body; all counters are read from ipv4_dst_ops.
876  */
877
878 static int rt_garbage_collect(struct dst_ops *ops)
879 {
880         static unsigned long expire = RT_GC_TIMEOUT;    /* current expiration strength */
881         static unsigned long last_gc;                   /* "now" of the last run that passed the interval check */
882         static int rover;                               /* last bucket scanned; the next scan starts after it */
883         static int equilibrium;                         /* entry count that "goal" is measured against */
884         struct rtable *rth;
885         struct rtable __rcu **rthp;
886         unsigned long now = jiffies;
887         int goal;
888         int entries = dst_entries_get_fast(&ipv4_dst_ops);
889
890         /*
891          * Garbage collection is pretty expensive,
892          * do not run it too frequently.
893          */
894
895         RT_CACHE_STAT_INC(gc_total);
896
            /* Skip this run only if it is both too soon and the cache is below its size limit. */
897         if (now - last_gc < ip_rt_gc_min_interval &&
898             entries < ip_rt_max_size) {
899                 RT_CACHE_STAT_INC(gc_ignored);
900                 goto out;
901         }
902
903         entries = dst_entries_get_slow(&ipv4_dst_ops);
904         /* Calculate the number of entries which we want to expire now. */
905         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
906         if (goal <= 0) {
                    /* Below the elasticity limit: aim at "equilibrium", never less than gc_thresh. */
907                 if (equilibrium < ipv4_dst_ops.gc_thresh)
908                         equilibrium = ipv4_dst_ops.gc_thresh;
909                 goal = entries - equilibrium;
910                 if (goal > 0) {
                            /* Raise equilibrium by half the excess, capped at the bucket count. */
911                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
912                         goal = entries - equilibrium;
913                 }
914         } else {
915                 /* We are in a dangerous area. Try to reduce the cache really
916                  * aggressively.
917                  */
918                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
919                 equilibrium = entries - goal;
920         }
921
922         if (now - last_gc >= ip_rt_gc_min_interval)
923                 last_gc = now;
924
            /* Nothing to free: fold the (non-positive) goal into equilibrium and finish. */
925         if (goal <= 0) {
926                 equilibrium += goal;
927                 goto work_done;
928         }
929
930         do {
931                 int i, k;
932
                    /* One pass visits at most rt_hash_mask + 1 buckets, starting after rover. */
933                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
934                         unsigned long tmo = expire;
935
936                         k = (k + 1) & rt_hash_mask;
937                         rthp = &rt_hash_table[k].chain;
938                         spin_lock_bh(rt_hash_lock_addr(k));
939                         while ((rth = rcu_dereference_protected(*rthp,
940                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
941                                 if (!rt_is_expired(rth) &&
942                                         !rt_may_expire(rth, tmo, expire)) {
                                            /* Entry is kept; tmo is halved for the entries behind it in this chain. */
943                                         tmo >>= 1;
944                                         rthp = &rth->dst.rt_next;
945                                         continue;
946                                 }
                                    /* Unlink and free; rthp stays put and now refers to the successor. */
947                                 *rthp = rth->dst.rt_next;
948                                 rt_free(rth);
949                                 goal--;
950                         }
951                         spin_unlock_bh(rt_hash_lock_addr(k));
952                         if (goal <= 0)
953                                 break;
954                 }
955                 rover = k;
956
957                 if (goal <= 0)
958                         goto work_done;
959
960                 /* Goal is not achieved. We stop the process if:
961
962                    - expire has been reduced to zero. Otherwise, expire is halved.
963                    - the table is not full.
964                    - we are called from interrupt.
965                    - the jiffies check is just a fallback/debug loop breaker.
966                      We will not spin here for a long time in any case.
967                  */
968
969                 RT_CACHE_STAT_INC(gc_goal_miss);
970
971                 if (expire == 0)
972                         break;
973
974                 expire >>= 1;
975
976                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
977                         goto out;
978         } while (!in_softirq() && time_before_eq(jiffies, now));
979
            /* Gave up: report overflow only if both the fast and the slow count agree. */
980         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
981                 goto out;
982         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
983                 goto out;
984         if (net_ratelimit())
985                 printk(KERN_WARNING "dst cache overflow\n");
986         RT_CACHE_STAT_INC(gc_dst_overflow);
987         return 1;
988
989 work_done:
            /* Goal reached: relax expire again, resetting it to ip_rt_gc_timeout
             * when it grows past that or the cache is below gc_thresh.
             */
990         expire += ip_rt_gc_min_interval;
991         if (expire > ip_rt_gc_timeout ||
992             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
993             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
994                 expire = ip_rt_gc_timeout;
995 out:    return 0;
996 }
997
998 /*
999  * Returns number of entries in a hash chain that have different hash_inputs
1000  */
1001 static int slow_chain_length(const struct rtable *head)
1002 {
1003         int length = 0;
1004         const struct rtable *rth = head;
1005
1006         while (rth) {
1007                 length += has_noalias(head, rth);
1008                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1009         }
1010         return length >> FRACT_BITS;
1011 }
1012
1013 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1014 {
1015         static const __be32 inaddr_any = 0;
1016         struct net_device *dev = dst->dev;
1017         const __be32 *pkey = daddr;
1018         struct neighbour *n;
1019
1020         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1021                 pkey = &inaddr_any;
1022
1023         n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1024         if (n)
1025                 return n;
1026         return neigh_create(&arp_tbl, pkey, dev);
1027 }
1028
1029 static int rt_bind_neighbour(struct rtable *rt)
1030 {
1031         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1032         if (IS_ERR(n))
1033                 return PTR_ERR(n);
1034         dst_set_neighbour(&rt->dst, n);
1035
1036         return 0;
1037 }
1038
/*
 * rt_intern_hash - insert the route rt into route cache bucket "hash".
 *
 * When rt_caching() is false for the route's namespace, rt is flagged
 * DST_NOCACHE, bound to a neighbour (unicast or output routes only) and
 * returned without touching the hash table.
 *
 * Otherwise the bucket is walked under its lock:
 *  - expired entries are unlinked and freed;
 *  - if an entry with equal keys in the same netns exists, it is moved to
 *    the head of the chain, rt is dropped and the existing entry is returned;
 *  - the unreferenced entry with the lowest rt_score() is remembered and
 *    freed if the chain is longer than ip_rt_gc_elasticity;
 *  - with no such candidate and a chain longer than rt_chain_length_max,
 *    the cache is invalidated and the insertion restarts with a new hash.
 *
 * If binding the neighbour fails with -ENOBUFS, one garbage collection
 * pass plus retry is attempted, and only when not running in softirq.
 *
 * Returns the route that ended up in use (also stored in skb when skb is
 * non-NULL) or an ERR_PTR() value.
 */
1039 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1040                                      struct sk_buff *skb, int ifindex)
1041 {
1042         struct rtable   *rth, *cand;
1043         struct rtable __rcu **rthp, **candp;
1044         unsigned long   now;
1045         u32             min_score;
1046         int             chain_length;
1047         int attempts = !in_softirq();   /* 1 outside softirq, 0 inside: budget of GC retries */
1048
1049 restart:
1050         chain_length = 0;
1051         min_score = ~(u32)0;
1052         cand = NULL;
1053         candp = NULL;
1054         now = jiffies;
1055
1056         if (!rt_caching(dev_net(rt->dst.dev))) {
1057                 /*
1058                  * If we're not caching, just tell the caller we
1059                  * were successful and don't touch the route.  The
1060                  * caller holds the sole reference to the cache entry, and
1061                  * it will be released when the caller is done with it.
1062                  * If we drop it here, the callers have no way to resolve routes
1063                  * when we're not caching.  Instead, just point *rp at rt, so
1064                  * the caller gets a single use out of the route
1065                  * Note that we do rt_free on this new route entry, so that
1066                  * once its refcount hits zero, we are still able to reap it
1067                  * (Thanks Alexey)
1068                  * Note: To avoid expensive rcu stuff for this uncached dst,
1069                  * we set DST_NOCACHE so that dst_release() can free dst without
1070                  * waiting a grace period.
1071                  */
1072
1073                 rt->dst.flags |= DST_NOCACHE;
1074                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1075                         int err = rt_bind_neighbour(rt);
1076                         if (err) {
1077                                 if (net_ratelimit())
1078                                         printk(KERN_WARNING
1079                                             "Neighbour table failure & not caching routes.\n");
1080                                 ip_rt_put(rt);
1081                                 return ERR_PTR(err);
1082                         }
1083                 }
1084
1085                 goto skip_hashing;
1086         }
1087
1088         rthp = &rt_hash_table[hash].chain;
1089
1090         spin_lock_bh(rt_hash_lock_addr(hash));
1091         while ((rth = rcu_dereference_protected(*rthp,
1092                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1093                 if (rt_is_expired(rth)) {
                             /* Reap expired entries on the way; rthp now refers to the successor. */
1094                         *rthp = rth->dst.rt_next;
1095                         rt_free(rth);
1096                         continue;
1097                 }
1098                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1099                         /* Put it first */
1100                         *rthp = rth->dst.rt_next;
1101                         /*
1102                          * Since lookup is lockfree, the deletion
1103                          * must be visible to another weakly ordered CPU before
1104                          * the insertion at the start of the hash chain.
1105                          */
1106                         rcu_assign_pointer(rth->dst.rt_next,
1107                                            rt_hash_table[hash].chain);
1108                         /*
1109                          * Since lookup is lockfree, the update writes
1110                          * must be ordered for consistency on SMP.
1111                          */
1112                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1113
1114                         dst_use(&rth->dst, now);
1115                         spin_unlock_bh(rt_hash_lock_addr(hash));
1116
                             /* The cached entry wins; the caller's new route is discarded. */
1117                         rt_drop(rt);
1118                         if (skb)
1119                                 skb_dst_set(skb, &rth->dst);
1120                         return rth;
1121                 }
1122
                     /* Track the unreferenced entry with the lowest score (the last one on ties). */
1123                 if (!atomic_read(&rth->dst.__refcnt)) {
1124                         u32 score = rt_score(rth);
1125
1126                         if (score <= min_score) {
1127                                 cand = rth;
1128                                 candp = rthp;
1129                                 min_score = score;
1130                         }
1131                 }
1132
1133                 chain_length++;
1134
1135                 rthp = &rth->dst.rt_next;
1136         }
1137
1138         if (cand) {
1139                 /* ip_rt_gc_elasticity used to be average length of chain
1140                  * length, when exceeded gc becomes really aggressive.
1141                  *
1142                  * The second limit is less certain. At the moment it allows
1143                  * only 2 entries per bucket. We will see.
1144                  */
1145                 if (chain_length > ip_rt_gc_elasticity) {
1146                         *candp = cand->dst.rt_next;
1147                         rt_free(cand);
1148                 }
1149         } else {
                     /* Nothing evictable: check the chain length counted without aliases. */
1150                 if (chain_length > rt_chain_length_max &&
1151                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1152                         struct net *net = dev_net(rt->dst.dev);
1153                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1154                         if (!rt_caching(net)) {
1155                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1156                                         rt->dst.dev->name, num);
1157                         }
1158                         rt_emergency_hash_rebuild(net);
1159                         spin_unlock_bh(rt_hash_lock_addr(hash));
1160
                             /* rt_genid changed, so the bucket has to be recomputed before retrying. */
1161                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1162                                         ifindex, rt_genid(net));
1163                         goto restart;
1164                 }
1165         }
1166
1167         /* Try to bind route to arp only if it is output
1168            route or unicast forwarding path.
1169          */
1170         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1171                 int err = rt_bind_neighbour(rt);
1172                 if (err) {
1173                         spin_unlock_bh(rt_hash_lock_addr(hash));
1174
1175                         if (err != -ENOBUFS) {
1176                                 rt_drop(rt);
1177                                 return ERR_PTR(err);
1178                         }
1179
1180                         /* Neighbour tables are full and nothing
1181                            can be released. Try to shrink route cache,
1182                            it is most likely it holds some neighbour records.
1183                          */
1184                         if (attempts-- > 0) {
                                     /* Temporarily force the most aggressive GC settings, then restore them. */
1185                                 int saved_elasticity = ip_rt_gc_elasticity;
1186                                 int saved_int = ip_rt_gc_min_interval;
1187                                 ip_rt_gc_elasticity     = 1;
1188                                 ip_rt_gc_min_interval   = 0;
1189                                 rt_garbage_collect(&ipv4_dst_ops);
1190                                 ip_rt_gc_min_interval   = saved_int;
1191                                 ip_rt_gc_elasticity     = saved_elasticity;
1192                                 goto restart;
1193                         }
1194
1195                         if (net_ratelimit())
1196                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1197                         rt_drop(rt);
1198                         return ERR_PTR(-ENOBUFS);
1199                 }
1200         }
1201
             /* Link rt in front of the current chain head. */
1202         rt->dst.rt_next = rt_hash_table[hash].chain;
1203
1204         /*
1205          * Since lookup is lockfree, we must make sure
1206          * previous writes to rt are committed to memory
1207          * before making rt visible to other CPUS.
1208          */
1209         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1210
1211         spin_unlock_bh(rt_hash_lock_addr(hash));
1212
1213 skip_hashing:
1214         if (skb)
1215                 skb_dst_set(skb, &rt->dst);
1216         return rt;
1217 }
1218
1219 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1220
1221 static u32 rt_peer_genid(void)
1222 {
1223         return atomic_read(&__rt_peer_genid);
1224 }
1225
1226 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1227 {
1228         struct inet_peer *peer;
1229
1230         peer = inet_getpeer_v4(daddr, create);
1231
1232         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1233                 inet_putpeer(peer);
1234         else
1235                 rt->rt_peer_genid = rt_peer_genid();
1236 }
1237
1238 /*
1239  * Peer allocation may fail only in serious out-of-memory conditions.  However
1240  * we still can generate some output.
1241  * Random ID selection looks a bit dangerous because we have no chances to
1242  * select ID being unique in a reasonable period of time.
1243  * But broken packet identifier may be better than no packet at all.
1244  */
1245 static void ip_select_fb_ident(struct iphdr *iph)
1246 {
1247         static DEFINE_SPINLOCK(ip_fb_id_lock);
1248         static u32 ip_fallback_id;
1249         u32 salt;
1250
1251         spin_lock_bh(&ip_fb_id_lock);
1252         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1253         iph->id = htons(salt & 0xFFFF);
1254         ip_fallback_id = salt;
1255         spin_unlock_bh(&ip_fb_id_lock);
1256 }
1257
1258 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1259 {
1260         struct rtable *rt = (struct rtable *) dst;
1261
1262         if (rt) {
1263                 if (rt->peer == NULL)
1264                         rt_bind_peer(rt, rt->rt_dst, 1);
1265
1266                 /* If peer is attached to destination, it is never detached,
1267                    so that we need not to grab a lock to dereference it.
1268                  */
1269                 if (rt->peer) {
1270                         iph->id = htons(inet_getid(rt->peer, more));
1271                         return;
1272                 }
1273         } else
1274                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1275                        __builtin_return_address(0));
1276
1277         ip_select_fb_ident(iph);
1278 }
1279 EXPORT_SYMBOL(__ip_select_ident);
1280
1281 static void rt_del(unsigned hash, struct rtable *rt)
1282 {
1283         struct rtable __rcu **rthp;
1284         struct rtable *aux;
1285
1286         rthp = &rt_hash_table[hash].chain;
1287         spin_lock_bh(rt_hash_lock_addr(hash));
1288         ip_rt_put(rt);
1289         while ((aux = rcu_dereference_protected(*rthp,
1290                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1291                 if (aux == rt || rt_is_expired(aux)) {
1292                         *rthp = aux->dst.rt_next;
1293                         rt_free(aux);
1294                         continue;
1295                 }
1296                 rthp = &aux->dst.rt_next;
1297         }
1298         spin_unlock_bh(rt_hash_lock_addr(hash));
1299 }
1300
1301 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1302 {
1303         struct rtable *rt = (struct rtable *) dst;
1304         __be32 orig_gw = rt->rt_gateway;
1305         struct neighbour *n, *old_n;
1306
1307         dst_confirm(&rt->dst);
1308
1309         rt->rt_gateway = peer->redirect_learned.a4;
1310
1311         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1312         if (IS_ERR(n))
1313                 return PTR_ERR(n);
1314         old_n = xchg(&rt->dst._neighbour, n);
1315         if (old_n)
1316                 neigh_release(old_n);
1317         if (!n || !(n->nud_state & NUD_VALID)) {
1318                 if (n)
1319                         neigh_event_send(n, NULL);
1320                 rt->rt_gateway = orig_gw;
1321                 return -EAGAIN;
1322         } else {
1323                 rt->rt_flags |= RTCF_REDIRECTED;
1324                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1325         }
1326         return 0;
1327 }
1328
1329 /* called in rcu_read_lock() section */
1330 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1331                     __be32 saddr, struct net_device *dev)
1332 {
1333         int s, i;
1334         struct in_device *in_dev = __in_dev_get_rcu(dev);
1335         __be32 skeys[2] = { saddr, 0 };
1336         int    ikeys[2] = { dev->ifindex, 0 };
1337         struct inet_peer *peer;
1338         struct net *net;
1339
1340         if (!in_dev)
1341                 return;
1342
1343         net = dev_net(dev);
1344         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1345             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1346             ipv4_is_zeronet(new_gw))
1347                 goto reject_redirect;
1348
1349         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1350                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1351                         goto reject_redirect;
1352                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1353                         goto reject_redirect;
1354         } else {
1355                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1356                         goto reject_redirect;
1357         }
1358
1359         for (s = 0; s < 2; s++) {
1360                 for (i = 0; i < 2; i++) {
1361                         unsigned int hash;
1362                         struct rtable __rcu **rthp;
1363                         struct rtable *rt;
1364
1365                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1366
1367                         rthp = &rt_hash_table[hash].chain;
1368
1369                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1370                                 rthp = &rt->dst.rt_next;
1371
1372                                 if (rt->rt_key_dst != daddr ||
1373                                     rt->rt_key_src != skeys[s] ||
1374                                     rt->rt_oif != ikeys[i] ||
1375                                     rt_is_input_route(rt) ||
1376                                     rt_is_expired(rt) ||
1377                                     !net_eq(dev_net(rt->dst.dev), net) ||
1378                                     rt->dst.error ||
1379                                     rt->dst.dev != dev ||
1380                                     rt->rt_gateway != old_gw)
1381                                         continue;
1382
1383                                 if (!rt->peer)
1384                                         rt_bind_peer(rt, rt->rt_dst, 1);
1385
1386                                 peer = rt->peer;
1387                                 if (peer) {
1388                                         if (peer->redirect_learned.a4 != new_gw) {
1389                                                 peer->redirect_learned.a4 = new_gw;
1390                                                 atomic_inc(&__rt_peer_genid);
1391                                         }
1392                                         check_peer_redir(&rt->dst, peer);
1393                                 }
1394                         }
1395                 }
1396         }
1397         return;
1398
1399 reject_redirect:
1400 #ifdef CONFIG_IP_ROUTE_VERBOSE
1401         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1402                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1403                         "  Advised path = %pI4 -> %pI4\n",
1404                        &old_gw, dev->name, &new_gw,
1405                        &saddr, &daddr);
1406 #endif
1407         ;
1408 }
1409
1410 static bool peer_pmtu_expired(struct inet_peer *peer)
1411 {
1412         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1413
1414         return orig &&
1415                time_after_eq(jiffies, orig) &&
1416                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1417 }
1418
1419 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1420 {
1421         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1422
1423         return orig &&
1424                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1425 }
1426
1427 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1428 {
1429         struct rtable *rt = (struct rtable *)dst;
1430         struct dst_entry *ret = dst;
1431
1432         if (rt) {
1433                 if (dst->obsolete > 0) {
1434                         ip_rt_put(rt);
1435                         ret = NULL;
1436                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1437                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1438                                                 rt->rt_oif,
1439                                                 rt_genid(dev_net(dst->dev)));
1440                         rt_del(hash, rt);
1441                         ret = NULL;
1442                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1443                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1444                 }
1445         }
1446         return ret;
1447 }
1448
1449 /*
1450  * Algorithm:
1451  *      1. The first ip_rt_redirect_number redirects are sent
1452  *         with exponential backoff, then we stop sending them at all,
1453  *         assuming that the host ignores our redirects.
1454  *      2. If we did not see packets requiring redirects
1455  *         during ip_rt_redirect_silence, we assume that the host
1456  *         forgot redirected route and start to send redirects again.
1457  *
1458  * This algorithm is much cheaper and more intelligent than dumb load limiting
1459  * in icmp.c.
1460  *
1461  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1462  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1463  */
1464
1465 void ip_rt_send_redirect(struct sk_buff *skb)
1466 {
1467         struct rtable *rt = skb_rtable(skb);
1468         struct in_device *in_dev;
1469         struct inet_peer *peer;
1470         int log_martians;
1471
1472         rcu_read_lock();
1473         in_dev = __in_dev_get_rcu(rt->dst.dev);
1474         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1475                 rcu_read_unlock();
1476                 return;
1477         }
1478         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1479         rcu_read_unlock();
1480
1481         if (!rt->peer)
1482                 rt_bind_peer(rt, rt->rt_dst, 1);
1483         peer = rt->peer;
1484         if (!peer) {
1485                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1486                 return;
1487         }
1488
1489         /* No redirected packets during ip_rt_redirect_silence;
1490          * reset the algorithm.
1491          */
1492         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1493                 peer->rate_tokens = 0;
1494
1495         /* Too many ignored redirects; do not send anything
1496          * set dst.rate_last to the last seen redirected packet.
1497          */
1498         if (peer->rate_tokens >= ip_rt_redirect_number) {
1499                 peer->rate_last = jiffies;
1500                 return;
1501         }
1502
1503         /* Check for load limit; set rate_last to the latest sent
1504          * redirect.
1505          */
1506         if (peer->rate_tokens == 0 ||
1507             time_after(jiffies,
1508                        (peer->rate_last +
1509                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1510                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1511                 peer->rate_last = jiffies;
1512                 ++peer->rate_tokens;
1513 #ifdef CONFIG_IP_ROUTE_VERBOSE
1514                 if (log_martians &&
1515                     peer->rate_tokens == ip_rt_redirect_number &&
1516                     net_ratelimit())
1517                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1518                                &ip_hdr(skb)->saddr, rt->rt_iif,
1519                                 &rt->rt_dst, &rt->rt_gateway);
1520 #endif
1521         }
1522 }
1523
1524 static int ip_error(struct sk_buff *skb)
1525 {
1526         struct rtable *rt = skb_rtable(skb);
1527         struct inet_peer *peer;
1528         unsigned long now;
1529         bool send;
1530         int code;
1531
1532         switch (rt->dst.error) {
1533         case EINVAL:
1534         default:
1535                 goto out;
1536         case EHOSTUNREACH:
1537                 code = ICMP_HOST_UNREACH;
1538                 break;
1539         case ENETUNREACH:
1540                 code = ICMP_NET_UNREACH;
1541                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1542                                 IPSTATS_MIB_INNOROUTES);
1543                 break;
1544         case EACCES:
1545                 code = ICMP_PKT_FILTERED;
1546                 break;
1547         }
1548
1549         if (!rt->peer)
1550                 rt_bind_peer(rt, rt->rt_dst, 1);
1551         peer = rt->peer;
1552
1553         send = true;
1554         if (peer) {
1555                 now = jiffies;
1556                 peer->rate_tokens += now - peer->rate_last;
1557                 if (peer->rate_tokens > ip_rt_error_burst)
1558                         peer->rate_tokens = ip_rt_error_burst;
1559                 peer->rate_last = now;
1560                 if (peer->rate_tokens >= ip_rt_error_cost)
1561                         peer->rate_tokens -= ip_rt_error_cost;
1562                 else
1563                         send = false;
1564         }
1565         if (send)
1566                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1567
1568 out:    kfree_skb(skb);
1569         return 0;
1570 }
1571
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau that is strictly smaller than old_mtu,
 * or 68 when old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);
	const unsigned short *step;

	for (step = mtu_plateau; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}

	return 68;
}
1589
1590 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1591                                  unsigned short new_mtu,
1592                                  struct net_device *dev)
1593 {
1594         unsigned short old_mtu = ntohs(iph->tot_len);
1595         unsigned short est_mtu = 0;
1596         struct inet_peer *peer;
1597
1598         peer = inet_getpeer_v4(iph->daddr, 1);
1599         if (peer) {
1600                 unsigned short mtu = new_mtu;
1601
1602                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1603                         /* BSD 4.2 derived systems incorrectly adjust
1604                          * tot_len by the IP header length, and report
1605                          * a zero MTU in the ICMP message.
1606                          */
1607                         if (mtu == 0 &&
1608                             old_mtu >= 68 + (iph->ihl << 2))
1609                                 old_mtu -= iph->ihl << 2;
1610                         mtu = guess_mtu(old_mtu);
1611                 }
1612
1613                 if (mtu < ip_rt_min_pmtu)
1614                         mtu = ip_rt_min_pmtu;
1615                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1616                         unsigned long pmtu_expires;
1617
1618                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1619                         if (!pmtu_expires)
1620                                 pmtu_expires = 1UL;
1621
1622                         est_mtu = mtu;
1623                         peer->pmtu_learned = mtu;
1624                         peer->pmtu_expires = pmtu_expires;
1625                         atomic_inc(&__rt_peer_genid);
1626                 }
1627
1628                 inet_putpeer(peer);
1629         }
1630         return est_mtu ? : new_mtu;
1631 }
1632
1633 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1634 {
1635         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1636
1637         if (!expires)
1638                 return;
1639         if (time_before(jiffies, expires)) {
1640                 u32 orig_dst_mtu = dst_mtu(dst);
1641                 if (peer->pmtu_learned < orig_dst_mtu) {
1642                         if (!peer->pmtu_orig)
1643                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1644                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1645                 }
1646         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1647                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1648 }
1649
1650 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1651 {
1652         struct rtable *rt = (struct rtable *) dst;
1653         struct inet_peer *peer;
1654
1655         dst_confirm(dst);
1656
1657         if (!rt->peer)
1658                 rt_bind_peer(rt, rt->rt_dst, 1);
1659         peer = rt->peer;
1660         if (peer) {
1661                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1662
1663                 if (mtu < ip_rt_min_pmtu)
1664                         mtu = ip_rt_min_pmtu;
1665                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1666
1667                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1668                         if (!pmtu_expires)
1669                                 pmtu_expires = 1UL;
1670
1671                         peer->pmtu_learned = mtu;
1672                         peer->pmtu_expires = pmtu_expires;
1673
1674                         atomic_inc(&__rt_peer_genid);
1675                         rt->rt_peer_genid = rt_peer_genid();
1676                 }
1677                 check_peer_pmtu(dst, peer);
1678         }
1679 }
1680
1681
1682 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1683 {
1684         struct rtable *rt = (struct rtable *) dst;
1685
1686         if (rt_is_expired(rt))
1687                 return NULL;
1688         if (rt->rt_peer_genid != rt_peer_genid()) {
1689                 struct inet_peer *peer;
1690
1691                 if (!rt->peer)
1692                         rt_bind_peer(rt, rt->rt_dst, 0);
1693
1694                 peer = rt->peer;
1695                 if (peer) {
1696                         check_peer_pmtu(dst, peer);
1697
1698                         if (peer->redirect_learned.a4 &&
1699                             peer->redirect_learned.a4 != rt->rt_gateway) {
1700                                 if (check_peer_redir(dst, peer))
1701                                         return NULL;
1702                         }
1703                 }
1704
1705                 rt->rt_peer_genid = rt_peer_genid();
1706         }
1707         return dst;
1708 }
1709
1710 static void ipv4_dst_destroy(struct dst_entry *dst)
1711 {
1712         struct rtable *rt = (struct rtable *) dst;
1713         struct inet_peer *peer = rt->peer;
1714
1715         if (rt->fi) {
1716                 fib_info_put(rt->fi);
1717                 rt->fi = NULL;
1718         }
1719         if (peer) {
1720                 rt->peer = NULL;
1721                 inet_putpeer(peer);
1722         }
1723 }
1724
1725
1726 static void ipv4_link_failure(struct sk_buff *skb)
1727 {
1728         struct rtable *rt;
1729
1730         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1731
1732         rt = skb_rtable(skb);
1733         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1734                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1735 }
1736
1737 static int ip_rt_bug(struct sk_buff *skb)
1738 {
1739         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1740                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1741                 skb->dev ? skb->dev->name : "?");
1742         kfree_skb(skb);
1743         WARN_ON(1);
1744         return 0;
1745 }
1746
1747 /*
1748    We do not cache source address of outgoing interface,
1749    because it is used only by IP RR, TS and SRR options,
1750    so that it out of fast path.
1751
1752    BTW remember: "addr" is allowed to be not aligned
1753    in IP options!
1754  */
1755
1756 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1757 {
1758         __be32 src;
1759
1760         if (rt_is_output_route(rt))
1761                 src = ip_hdr(skb)->saddr;
1762         else {
1763                 struct fib_result res;
1764                 struct flowi4 fl4;
1765                 struct iphdr *iph;
1766
1767                 iph = ip_hdr(skb);
1768
1769                 memset(&fl4, 0, sizeof(fl4));
1770                 fl4.daddr = iph->daddr;
1771                 fl4.saddr = iph->saddr;
1772                 fl4.flowi4_tos = RT_TOS(iph->tos);
1773                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1774                 fl4.flowi4_iif = skb->dev->ifindex;
1775                 fl4.flowi4_mark = skb->mark;
1776
1777                 rcu_read_lock();
1778                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1779                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1780                 else
1781                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1782                                         RT_SCOPE_UNIVERSE);
1783                 rcu_read_unlock();
1784         }
1785         memcpy(addr, &src, 4);
1786 }
1787
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently: each half is taken from @tag only when that half
 * of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half  = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1797
1798 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1799 {
1800         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1801
1802         if (advmss == 0) {
1803                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1804                                ip_rt_min_advmss);
1805                 if (advmss > 65535 - 40)
1806                         advmss = 65535 - 40;
1807         }
1808         return advmss;
1809 }
1810
1811 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1812 {
1813         const struct rtable *rt = (const struct rtable *) dst;
1814         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1815
1816         if (mtu && rt_is_output_route(rt))
1817                 return mtu;
1818
1819         mtu = dst->dev->mtu;
1820
1821         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1822
1823                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1824                         mtu = 576;
1825         }
1826
1827         if (mtu > IP_MAX_MTU)
1828                 mtu = IP_MAX_MTU;
1829
1830         return mtu;
1831 }
1832
1833 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1834                             struct fib_info *fi)
1835 {
1836         struct inet_peer *peer;
1837         int create = 0;
1838
1839         /* If a peer entry exists for this destination, we must hook
1840          * it up in order to get at cached metrics.
1841          */
1842         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1843                 create = 1;
1844
1845         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1846         if (peer) {
1847                 rt->rt_peer_genid = rt_peer_genid();
1848                 if (inet_metrics_new(peer))
1849                         memcpy(peer->metrics, fi->fib_metrics,
1850                                sizeof(u32) * RTAX_MAX);
1851                 dst_init_metrics(&rt->dst, peer->metrics, false);
1852
1853                 check_peer_pmtu(&rt->dst, peer);
1854                 if (peer->redirect_learned.a4 &&
1855                     peer->redirect_learned.a4 != rt->rt_gateway) {
1856                         rt->rt_gateway = peer->redirect_learned.a4;
1857                         rt->rt_flags |= RTCF_REDIRECTED;
1858                 }
1859         } else {
1860                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1861                         rt->fi = fi;
1862                         atomic_inc(&fi->fib_clntref);
1863                 }
1864                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1865         }
1866 }
1867
1868 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1869                            const struct fib_result *res,
1870                            struct fib_info *fi, u16 type, u32 itag)
1871 {
1872         struct dst_entry *dst = &rt->dst;
1873
1874         if (fi) {
1875                 if (FIB_RES_GW(*res) &&
1876                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1877                         rt->rt_gateway = FIB_RES_GW(*res);
1878                 rt_init_metrics(rt, fl4, fi);
1879 #ifdef CONFIG_IP_ROUTE_CLASSID
1880                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1881 #endif
1882         }
1883
1884         if (dst_mtu(dst) > IP_MAX_MTU)
1885                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1886         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1887                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1888
1889 #ifdef CONFIG_IP_ROUTE_CLASSID
1890 #ifdef CONFIG_IP_MULTIPLE_TABLES
1891         set_class_tag(rt, fib_rules_tclass(res));
1892 #endif
1893         set_class_tag(rt, itag);
1894 #endif
1895 }
1896
1897 static struct rtable *rt_dst_alloc(struct net_device *dev,
1898                                    bool nopolicy, bool noxfrm)
1899 {
1900         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1901                          DST_HOST |
1902                          (nopolicy ? DST_NOPOLICY : 0) |
1903                          (noxfrm ? DST_NOXFRM : 0));
1904 }
1905
1906 /* called in rcu_read_lock() section */
1907 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908                                 u8 tos, struct net_device *dev, int our)
1909 {
1910         unsigned int hash;
1911         struct rtable *rth;
1912         __be32 spec_dst;
1913         struct in_device *in_dev = __in_dev_get_rcu(dev);
1914         u32 itag = 0;
1915         int err;
1916
1917         /* Primary sanity checks. */
1918
1919         if (in_dev == NULL)
1920                 return -EINVAL;
1921
1922         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1923             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1924                 goto e_inval;
1925
1926         if (ipv4_is_zeronet(saddr)) {
1927                 if (!ipv4_is_local_multicast(daddr))
1928                         goto e_inval;
1929                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1930         } else {
1931                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1932                                           &itag);
1933                 if (err < 0)
1934                         goto e_err;
1935         }
1936         rth = rt_dst_alloc(init_net.loopback_dev,
1937                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1938         if (!rth)
1939                 goto e_nobufs;
1940
1941 #ifdef CONFIG_IP_ROUTE_CLASSID
1942         rth->dst.tclassid = itag;
1943 #endif
1944         rth->dst.output = ip_rt_bug;
1945
1946         rth->rt_key_dst = daddr;
1947         rth->rt_key_src = saddr;
1948         rth->rt_genid   = rt_genid(dev_net(dev));
1949         rth->rt_flags   = RTCF_MULTICAST;
1950         rth->rt_type    = RTN_MULTICAST;
1951         rth->rt_key_tos = tos;
1952         rth->rt_dst     = daddr;
1953         rth->rt_src     = saddr;
1954         rth->rt_route_iif = dev->ifindex;
1955         rth->rt_iif     = dev->ifindex;
1956         rth->rt_oif     = 0;
1957         rth->rt_mark    = skb->mark;
1958         rth->rt_gateway = daddr;
1959         rth->rt_spec_dst= spec_dst;
1960         rth->rt_peer_genid = 0;
1961         rth->peer = NULL;
1962         rth->fi = NULL;
1963         if (our) {
1964                 rth->dst.input= ip_local_deliver;
1965                 rth->rt_flags |= RTCF_LOCAL;
1966         }
1967
1968 #ifdef CONFIG_IP_MROUTE
1969         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1970                 rth->dst.input = ip_mr_input;
1971 #endif
1972         RT_CACHE_STAT_INC(in_slow_mc);
1973
1974         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1975         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1976         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1977
1978 e_nobufs:
1979         return -ENOBUFS;
1980 e_inval:
1981         return -EINVAL;
1982 e_err:
1983         return err;
1984 }
1985
1986
1987 static void ip_handle_martian_source(struct net_device *dev,
1988                                      struct in_device *in_dev,
1989                                      struct sk_buff *skb,
1990                                      __be32 daddr,
1991                                      __be32 saddr)
1992 {
1993         RT_CACHE_STAT_INC(in_martian_src);
1994 #ifdef CONFIG_IP_ROUTE_VERBOSE
1995         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1996                 /*
1997                  *      RFC1812 recommendation, if source is martian,
1998                  *      the only hint is MAC header.
1999                  */
2000                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2001                         &daddr, &saddr, dev->name);
2002                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2003                         int i;
2004                         const unsigned char *p = skb_mac_header(skb);
2005                         printk(KERN_WARNING "ll header: ");
2006                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2007                                 printk("%02x", *p);
2008                                 if (i < (dev->hard_header_len - 1))
2009                                         printk(":");
2010                         }
2011                         printk("\n");
2012                 }
2013         }
2014 #endif
2015 }
2016
2017 /* called in rcu_read_lock() section */
2018 static int __mkroute_input(struct sk_buff *skb,
2019                            const struct fib_result *res,
2020                            struct in_device *in_dev,
2021                            __be32 daddr, __be32 saddr, u32 tos,
2022                            struct rtable **result)
2023 {
2024         struct rtable *rth;
2025         int err;
2026         struct in_device *out_dev;
2027         unsigned int flags = 0;
2028         __be32 spec_dst;
2029         u32 itag;
2030
2031         /* get a working reference to the output device */
2032         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2033         if (out_dev == NULL) {
2034                 if (net_ratelimit())
2035                         printk(KERN_CRIT "Bug in ip_route_input" \
2036                                "_slow(). Please, report\n");
2037                 return -EINVAL;
2038         }
2039
2040
2041         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2042                                   in_dev->dev, &spec_dst, &itag);
2043         if (err < 0) {
2044                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2045                                          saddr);
2046
2047                 goto cleanup;
2048         }
2049
2050         if (err)
2051                 flags |= RTCF_DIRECTSRC;
2052
2053         if (out_dev == in_dev && err &&
2054             (IN_DEV_SHARED_MEDIA(out_dev) ||
2055              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2056                 flags |= RTCF_DOREDIRECT;
2057
2058         if (skb->protocol != htons(ETH_P_IP)) {
2059                 /* Not IP (i.e. ARP). Do not create route, if it is
2060                  * invalid for proxy arp. DNAT routes are always valid.
2061                  *
2062                  * Proxy arp feature have been extended to allow, ARP
2063                  * replies back to the same interface, to support
2064                  * Private VLAN switch technologies. See arp.c.
2065                  */
2066                 if (out_dev == in_dev &&
2067                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2068                         err = -EINVAL;
2069                         goto cleanup;
2070                 }
2071         }
2072
2073         rth = rt_dst_alloc(out_dev->dev,
2074                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2075                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2076         if (!rth) {
2077                 err = -ENOBUFS;
2078                 goto cleanup;
2079         }
2080
2081         rth->rt_key_dst = daddr;
2082         rth->rt_key_src = saddr;
2083         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2084         rth->rt_flags = flags;
2085         rth->rt_type = res->type;
2086         rth->rt_key_tos = tos;
2087         rth->rt_dst     = daddr;
2088         rth->rt_src     = saddr;
2089         rth->rt_route_iif = in_dev->dev->ifindex;
2090         rth->rt_iif     = in_dev->dev->ifindex;
2091         rth->rt_oif     = 0;
2092         rth->rt_mark    = skb->mark;
2093         rth->rt_gateway = daddr;
2094         rth->rt_spec_dst= spec_dst;
2095         rth->rt_peer_genid = 0;
2096         rth->peer = NULL;
2097         rth->fi = NULL;
2098
2099         rth->dst.input = ip_forward;
2100         rth->dst.output = ip_output;
2101
2102         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2103
2104         *result = rth;
2105         err = 0;
2106  cleanup:
2107         return err;
2108 }
2109
2110 static int ip_mkroute_input(struct sk_buff *skb,
2111                             struct fib_result *res,
2112                             const struct flowi4 *fl4,
2113                             struct in_device *in_dev,
2114                             __be32 daddr, __be32 saddr, u32 tos)
2115 {
2116         struct rtable* rth = NULL;
2117         int err;
2118         unsigned hash;
2119
2120 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2121         if (res->fi && res->fi->fib_nhs > 1)
2122                 fib_select_multipath(res);
2123 #endif
2124
2125         /* create a routing cache entry */
2126         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2127         if (err)
2128                 return err;
2129
2130         /* put it into the cache */
2131         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2132                        rt_genid(dev_net(rth->dst.dev)));
2133         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2134         if (IS_ERR(rth))
2135                 return PTR_ERR(rth);
2136         return 0;
2137 }
2138
2139 /*
2140  *      NOTE. We drop all the packets that has local source
2141  *      addresses, because every properly looped back packet
2142  *      must have correct destination already attached by output routine.
2143  *
2144  *      Such approach solves two big problems:
2145  *      1. Not simplex devices are handled properly.
2146  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2147  *      called with rcu_read_lock()
2148  */
2149
2150 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2151                                u8 tos, struct net_device *dev)
2152 {
2153         struct fib_result res;
2154         struct in_device *in_dev = __in_dev_get_rcu(dev);
2155         struct flowi4   fl4;
2156         unsigned        flags = 0;
2157         u32             itag = 0;
2158         struct rtable * rth;
2159         unsigned        hash;
2160         __be32          spec_dst;
2161         int             err = -EINVAL;
2162         struct net    * net = dev_net(dev);
2163
2164         /* IP on this device is disabled. */
2165
2166         if (!in_dev)
2167                 goto out;
2168
2169         /* Check for the most weird martians, which can be not detected
2170            by fib_lookup.
2171          */
2172
2173         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2174             ipv4_is_loopback(saddr))
2175                 goto martian_source;
2176
2177         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2178                 goto brd_input;
2179
2180         /* Accept zero addresses only to limited broadcast;
2181          * I even do not know to fix it or not. Waiting for complains :-)
2182          */
2183         if (ipv4_is_zeronet(saddr))
2184                 goto martian_source;
2185
2186         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2187                 goto martian_destination;
2188
2189         /*
2190          *      Now we are ready to route packet.
2191          */
2192         fl4.flowi4_oif = 0;
2193         fl4.flowi4_iif = dev->ifindex;
2194         fl4.flowi4_mark = skb->mark;
2195         fl4.flowi4_tos = tos;
2196         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2197         fl4.daddr = daddr;
2198         fl4.saddr = saddr;
2199         err = fib_lookup(net, &fl4, &res);
2200         if (err != 0) {
2201                 if (!IN_DEV_FORWARD(in_dev))
2202                         goto e_hostunreach;
2203                 goto no_route;
2204         }
2205
2206         RT_CACHE_STAT_INC(in_slow_tot);
2207
2208         if (res.type == RTN_BROADCAST)
2209                 goto brd_input;
2210
2211         if (res.type == RTN_LOCAL) {
2212                 err = fib_validate_source(skb, saddr, daddr, tos,
2213                                           net->loopback_dev->ifindex,
2214                                           dev, &spec_dst, &itag);
2215                 if (err < 0)
2216                         goto martian_source_keep_err;
2217                 if (err)
2218                         flags |= RTCF_DIRECTSRC;
2219                 spec_dst = daddr;
2220                 goto local_input;
2221         }
2222
2223         if (!IN_DEV_FORWARD(in_dev))
2224                 goto e_hostunreach;
2225         if (res.type != RTN_UNICAST)
2226                 goto martian_destination;
2227
2228         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2229 out:    return err;
2230
2231 brd_input:
2232         if (skb->protocol != htons(ETH_P_IP))
2233                 goto e_inval;
2234
2235         if (ipv4_is_zeronet(saddr))
2236                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2237         else {
2238                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2239                                           &itag);
2240                 if (err < 0)
2241                         goto martian_source_keep_err;
2242                 if (err)
2243                         flags |= RTCF_DIRECTSRC;
2244         }
2245         flags |= RTCF_BROADCAST;
2246         res.type = RTN_BROADCAST;
2247         RT_CACHE_STAT_INC(in_brd);
2248
2249 local_input:
2250         rth = rt_dst_alloc(net->loopback_dev,
2251                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2252         if (!rth)
2253                 goto e_nobufs;
2254
2255         rth->dst.input= ip_local_deliver;
2256         rth->dst.output= ip_rt_bug;
2257 #ifdef CONFIG_IP_ROUTE_CLASSID
2258         rth->dst.tclassid = itag;
2259 #endif
2260
2261         rth->rt_key_dst = daddr;
2262         rth->rt_key_src = saddr;
2263         rth->rt_genid = rt_genid(net);
2264         rth->rt_flags   = flags|RTCF_LOCAL;
2265         rth->rt_type    = res.type;
2266         rth->rt_key_tos = tos;
2267         rth->rt_dst     = daddr;
2268         rth->rt_src     = saddr;
2269 #ifdef CONFIG_IP_ROUTE_CLASSID
2270         rth->dst.tclassid = itag;
2271 #endif
2272         rth->rt_route_iif = dev->ifindex;
2273         rth->rt_iif     = dev->ifindex;
2274         rth->rt_oif     = 0;
2275         rth->rt_mark    = skb->mark;
2276         rth->rt_gateway = daddr;
2277         rth->rt_spec_dst= spec_dst;
2278         rth->rt_peer_genid = 0;
2279         rth->peer = NULL;
2280         rth->fi = NULL;
2281         if (res.type == RTN_UNREACHABLE) {
2282                 rth->dst.input= ip_error;
2283                 rth->dst.error= -err;
2284                 rth->rt_flags   &= ~RTCF_LOCAL;
2285         }
2286         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2287         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2288         err = 0;
2289         if (IS_ERR(rth))
2290                 err = PTR_ERR(rth);
2291         goto out;
2292
2293 no_route:
2294         RT_CACHE_STAT_INC(in_no_route);
2295         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2296         res.type = RTN_UNREACHABLE;
2297         if (err == -ESRCH)
2298                 err = -ENETUNREACH;
2299         goto local_input;
2300
2301         /*
2302          *      Do not cache martian addresses: they should be logged (RFC1812)
2303          */
2304 martian_destination:
2305         RT_CACHE_STAT_INC(in_martian_dst);
2306 #ifdef CONFIG_IP_ROUTE_VERBOSE
2307         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2308                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2309                         &daddr, &saddr, dev->name);
2310 #endif
2311
2312 e_hostunreach:
2313         err = -EHOSTUNREACH;
2314         goto out;
2315
2316 e_inval:
2317         err = -EINVAL;
2318         goto out;
2319
2320 e_nobufs:
2321         err = -ENOBUFS;
2322         goto out;
2323
2324 martian_source:
2325         err = -EINVAL;
2326 martian_source_keep_err:
2327         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2328         goto out;
2329 }
2330
2331 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2332                            u8 tos, struct net_device *dev, bool noref)
2333 {
2334         struct rtable * rth;
2335         unsigned        hash;
2336         int iif = dev->ifindex;
2337         struct net *net;
2338         int res;
2339
2340         net = dev_net(dev);
2341
2342         rcu_read_lock();
2343
2344         if (!rt_caching(net))
2345                 goto skip_cache;
2346
2347         tos &= IPTOS_RT_MASK;
2348         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2349
2350         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2351              rth = rcu_dereference(rth->dst.rt_next)) {
2352                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2353                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2354                      (rth->rt_route_iif ^ iif) |
2355                      (rth->rt_key_tos ^ tos)) == 0 &&
2356                     rth->rt_mark == skb->mark &&
2357                     net_eq(dev_net(rth->dst.dev), net) &&
2358                     !rt_is_expired(rth)) {
2359                         if (noref) {
2360                                 dst_use_noref(&rth->dst, jiffies);
2361                                 skb_dst_set_noref(skb, &rth->dst);
2362                         } else {
2363                                 dst_use(&rth->dst, jiffies);
2364                                 skb_dst_set(skb, &rth->dst);
2365                         }
2366                         RT_CACHE_STAT_INC(in_hit);
2367                         rcu_read_unlock();
2368                         return 0;
2369                 }
2370                 RT_CACHE_STAT_INC(in_hlist_search);
2371         }
2372
2373 skip_cache:
2374         /* Multicast recognition logic is moved from route cache to here.
2375            The problem was that too many Ethernet cards have broken/missing
2376            hardware multicast filters :-( As result the host on multicasting
2377            network acquires a lot of useless route cache entries, sort of
2378            SDR messages from all the world. Now we try to get rid of them.
2379            Really, provided software IP multicast filter is organized
2380            reasonably (at least, hashed), it does not result in a slowdown
2381            comparing with route cache reject entries.
2382            Note, that multicast routers are not affected, because
2383            route cache entry is created eventually.
2384          */
2385         if (ipv4_is_multicast(daddr)) {
2386                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2387
2388                 if (in_dev) {
2389                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2390                                                   ip_hdr(skb)->protocol);
2391                         if (our
2392 #ifdef CONFIG_IP_MROUTE
2393                                 ||
2394                             (!ipv4_is_local_multicast(daddr) &&
2395                              IN_DEV_MFORWARD(in_dev))
2396 #endif
2397                            ) {
2398                                 int res = ip_route_input_mc(skb, daddr, saddr,
2399                                                             tos, dev, our);
2400                                 rcu_read_unlock();
2401                                 return res;
2402                         }
2403                 }
2404                 rcu_read_unlock();
2405                 return -EINVAL;
2406         }
2407         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2408         rcu_read_unlock();
2409         return res;
2410 }
2411 EXPORT_SYMBOL(ip_route_input_common);
2412
2413 /* called with rcu_read_lock() */
2414 static struct rtable *__mkroute_output(const struct fib_result *res,
2415                                        const struct flowi4 *fl4,
2416                                        __be32 orig_daddr, __be32 orig_saddr,
2417                                        int orig_oif, struct net_device *dev_out,
2418                                        unsigned int flags)
2419 {
2420         struct fib_info *fi = res->fi;
2421         u32 tos = RT_FL_TOS(fl4);
2422         struct in_device *in_dev;
2423         u16 type = res->type;
2424         struct rtable *rth;
2425
2426         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2427                 return ERR_PTR(-EINVAL);
2428
2429         if (ipv4_is_lbcast(fl4->daddr))
2430                 type = RTN_BROADCAST;
2431         else if (ipv4_is_multicast(fl4->daddr))
2432                 type = RTN_MULTICAST;
2433         else if (ipv4_is_zeronet(fl4->daddr))
2434                 return ERR_PTR(-EINVAL);
2435
2436         if (dev_out->flags & IFF_LOOPBACK)
2437                 flags |= RTCF_LOCAL;
2438
2439         in_dev = __in_dev_get_rcu(dev_out);
2440         if (!in_dev)
2441                 return ERR_PTR(-EINVAL);
2442
2443         if (type == RTN_BROADCAST) {
2444                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2445                 fi = NULL;
2446         } else if (type == RTN_MULTICAST) {
2447                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2448                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2449                                      fl4->flowi4_proto))
2450                         flags &= ~RTCF_LOCAL;
2451                 /* If multicast route do not exist use
2452                  * default one, but do not gateway in this case.
2453                  * Yes, it is hack.
2454                  */
2455                 if (fi && res->prefixlen < 4)
2456                         fi = NULL;
2457         }
2458
2459         rth = rt_dst_alloc(dev_out,
2460                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2461                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2462         if (!rth)
2463                 return ERR_PTR(-ENOBUFS);
2464
2465         rth->dst.output = ip_output;
2466
2467         rth->rt_key_dst = orig_daddr;
2468         rth->rt_key_src = orig_saddr;
2469         rth->rt_genid = rt_genid(dev_net(dev_out));
2470         rth->rt_flags   = flags;
2471         rth->rt_type    = type;
2472         rth->rt_key_tos = tos;
2473         rth->rt_dst     = fl4->daddr;
2474         rth->rt_src     = fl4->saddr;
2475         rth->rt_route_iif = 0;
2476         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2477         rth->rt_oif     = orig_oif;
2478         rth->rt_mark    = fl4->flowi4_mark;
2479         rth->rt_gateway = fl4->daddr;
2480         rth->rt_spec_dst= fl4->saddr;
2481         rth->rt_peer_genid = 0;
2482         rth->peer = NULL;
2483         rth->fi = NULL;
2484
2485         RT_CACHE_STAT_INC(out_slow_tot);
2486
2487         if (flags & RTCF_LOCAL) {
2488                 rth->dst.input = ip_local_deliver;
2489                 rth->rt_spec_dst = fl4->daddr;
2490         }
2491         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2492                 rth->rt_spec_dst = fl4->saddr;
2493                 if (flags & RTCF_LOCAL &&
2494                     !(dev_out->flags & IFF_LOOPBACK)) {
2495                         rth->dst.output = ip_mc_output;
2496                         RT_CACHE_STAT_INC(out_slow_mc);
2497                 }
2498 #ifdef CONFIG_IP_MROUTE
2499                 if (type == RTN_MULTICAST) {
2500                         if (IN_DEV_MFORWARD(in_dev) &&
2501                             !ipv4_is_local_multicast(fl4->daddr)) {
2502                                 rth->dst.input = ip_mr_input;
2503                                 rth->dst.output = ip_mc_output;
2504                         }
2505                 }
2506 #endif
2507         }
2508
2509         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2510
2511         return rth;
2512 }
2513
2514 /*
2515  * Major route resolver routine.
2516  * called with rcu_read_lock();
2517  */
2518
2519 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2520 {
2521         struct net_device *dev_out = NULL;
2522         u32 tos = RT_FL_TOS(fl4);
2523         unsigned int flags = 0;
2524         struct fib_result res;
2525         struct rtable *rth;
2526         __be32 orig_daddr;
2527         __be32 orig_saddr;
2528         int orig_oif;
2529
2530         res.fi          = NULL;
2531 #ifdef CONFIG_IP_MULTIPLE_TABLES
2532         res.r           = NULL;
2533 #endif
2534
2535         orig_daddr = fl4->daddr;
2536         orig_saddr = fl4->saddr;
2537         orig_oif = fl4->flowi4_oif;
2538
2539         fl4->flowi4_iif = net->loopback_dev->ifindex;
2540         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2541         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2542                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2543
2544         rcu_read_lock();
2545         if (fl4->saddr) {
2546                 rth = ERR_PTR(-EINVAL);
2547                 if (ipv4_is_multicast(fl4->saddr) ||
2548                     ipv4_is_lbcast(fl4->saddr) ||
2549                     ipv4_is_zeronet(fl4->saddr))
2550                         goto out;
2551
2552                 /* I removed check for oif == dev_out->oif here.
2553                    It was wrong for two reasons:
2554                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2555                       is assigned to multiple interfaces.
2556                    2. Moreover, we are allowed to send packets with saddr
2557                       of another iface. --ANK
2558                  */
2559
2560                 if (fl4->flowi4_oif == 0 &&
2561                     (ipv4_is_multicast(fl4->daddr) ||
2562                      ipv4_is_lbcast(fl4->daddr))) {
2563                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2564                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2565                         if (dev_out == NULL)
2566                                 goto out;
2567
2568                         /* Special hack: user can direct multicasts
2569                            and limited broadcast via necessary interface
2570                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2571                            This hack is not just for fun, it allows
2572                            vic,vat and friends to work.
2573                            They bind socket to loopback, set ttl to zero
2574                            and expect that it will work.
2575                            From the viewpoint of routing cache they are broken,
2576                            because we are not allowed to build multicast path
2577                            with loopback source addr (look, routing cache
2578                            cannot know, that ttl is zero, so that packet
2579                            will not leave this host and route is valid).
2580                            Luckily, this hack is good workaround.
2581                          */
2582
2583                         fl4->flowi4_oif = dev_out->ifindex;
2584                         goto make_route;
2585                 }
2586
2587                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2588                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2589                         if (!__ip_dev_find(net, fl4->saddr, false))
2590                                 goto out;
2591                 }
2592         }
2593
2594
2595         if (fl4->flowi4_oif) {
2596                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2597                 rth = ERR_PTR(-ENODEV);
2598                 if (dev_out == NULL)
2599                         goto out;
2600
2601                 /* RACE: Check return value of inet_select_addr instead. */
2602                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2603                         rth = ERR_PTR(-ENETUNREACH);
2604                         goto out;
2605                 }
2606                 if (ipv4_is_local_multicast(fl4->daddr) ||
2607                     ipv4_is_lbcast(fl4->daddr)) {
2608                         if (!fl4->saddr)
2609                                 fl4->saddr = inet_select_addr(dev_out, 0,
2610                                                               RT_SCOPE_LINK);
2611                         goto make_route;
2612                 }
2613                 if (fl4->saddr) {
2614                         if (ipv4_is_multicast(fl4->daddr))
2615                                 fl4->saddr = inet_select_addr(dev_out, 0,
2616                                                               fl4->flowi4_scope);
2617                         else if (!fl4->daddr)
2618                                 fl4->saddr = inet_select_addr(dev_out, 0,
2619                                                               RT_SCOPE_HOST);
2620                 }
2621         }
2622
2623         if (!fl4->daddr) {
2624                 fl4->daddr = fl4->saddr;
2625                 if (!fl4->daddr)
2626                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2627                 dev_out = net->loopback_dev;
2628                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2629                 res.type = RTN_LOCAL;
2630                 flags |= RTCF_LOCAL;
2631                 goto make_route;
2632         }
2633
2634         if (fib_lookup(net, fl4, &res)) {
2635                 res.fi = NULL;
2636                 if (fl4->flowi4_oif) {
2637                         /* Apparently, routing tables are wrong. Assume,
2638                            that the destination is on link.
2639
2640                            WHY? DW.
2641                            Because we are allowed to send to iface
2642                            even if it has NO routes and NO assigned
2643                            addresses. When oif is specified, routing
2644                            tables are looked up with only one purpose:
2645                            to catch if destination is gatewayed, rather than
2646                            direct. Moreover, if MSG_DONTROUTE is set,
2647                            we send packet, ignoring both routing tables
2648                            and ifaddr state. --ANK
2649
2650
2651                            We could make it even if oif is unknown,
2652                            likely IPv6, but we do not.
2653                          */
2654
2655                         if (fl4->saddr == 0)
2656                                 fl4->saddr = inet_select_addr(dev_out, 0,
2657                                                               RT_SCOPE_LINK);
2658                         res.type = RTN_UNICAST;
2659                         goto make_route;
2660                 }
2661                 rth = ERR_PTR(-ENETUNREACH);
2662                 goto out;
2663         }
2664
2665         if (res.type == RTN_LOCAL) {
2666                 if (!fl4->saddr) {
2667                         if (res.fi->fib_prefsrc)
2668                                 fl4->saddr = res.fi->fib_prefsrc;
2669                         else
2670                                 fl4->saddr = fl4->daddr;
2671                 }
2672                 dev_out = net->loopback_dev;
2673                 fl4->flowi4_oif = dev_out->ifindex;
2674                 res.fi = NULL;
2675                 flags |= RTCF_LOCAL;
2676                 goto make_route;
2677         }
2678
2679 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2680         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2681                 fib_select_multipath(&res);
2682         else
2683 #endif
2684         if (!res.prefixlen &&
2685             res.table->tb_num_default > 1 &&
2686             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2687                 fib_select_default(&res);
2688
2689         if (!fl4->saddr)
2690                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2691
2692         dev_out = FIB_RES_DEV(res);
2693         fl4->flowi4_oif = dev_out->ifindex;
2694
2695
2696 make_route:
2697         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2698                                dev_out, flags);
2699         if (!IS_ERR(rth)) {
2700                 unsigned int hash;
2701
2702                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2703                                rt_genid(dev_net(dev_out)));
2704                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2705         }
2706
2707 out:
2708         rcu_read_unlock();
2709         return rth;
2710 }
2711
2712 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2713 {
2714         struct rtable *rth;
2715         unsigned int hash;
2716
2717         if (!rt_caching(net))
2718                 goto slow_output;
2719
2720         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2721
2722         rcu_read_lock_bh();
2723         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2724                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2725                 if (rth->rt_key_dst == flp4->daddr &&
2726                     rth->rt_key_src == flp4->saddr &&
2727                     rt_is_output_route(rth) &&
2728                     rth->rt_oif == flp4->flowi4_oif &&
2729                     rth->rt_mark == flp4->flowi4_mark &&
2730                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2731                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2732                     net_eq(dev_net(rth->dst.dev), net) &&
2733                     !rt_is_expired(rth)) {
2734                         dst_use(&rth->dst, jiffies);
2735                         RT_CACHE_STAT_INC(out_hit);
2736                         rcu_read_unlock_bh();
2737                         if (!flp4->saddr)
2738                                 flp4->saddr = rth->rt_src;
2739                         if (!flp4->daddr)
2740                                 flp4->daddr = rth->rt_dst;
2741                         return rth;
2742                 }
2743                 RT_CACHE_STAT_INC(out_hlist_search);
2744         }
2745         rcu_read_unlock_bh();
2746
2747 slow_output:
2748         return ip_route_output_slow(net, flp4);
2749 }
2750 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2751
2752 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2753 {
2754         return NULL;
2755 }
2756
2757 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2758 {
2759         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2760
2761         return mtu ? : dst->dev->mtu;
2762 }
2763
2764 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2765 {
2766 }
2767
2768 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2769                                           unsigned long old)
2770 {
2771         return NULL;
2772 }
2773
2774 static struct dst_ops ipv4_dst_blackhole_ops = {
2775         .family                 =       AF_INET,
2776         .protocol               =       cpu_to_be16(ETH_P_IP),
2777         .destroy                =       ipv4_dst_destroy,
2778         .check                  =       ipv4_blackhole_dst_check,
2779         .mtu                    =       ipv4_blackhole_mtu,
2780         .default_advmss         =       ipv4_default_advmss,
2781         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2782         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2783         .neigh_lookup           =       ipv4_neigh_lookup,
2784 };
2785
2786 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2787 {
2788         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2789         struct rtable *ort = (struct rtable *) dst_orig;
2790
2791         if (rt) {
2792                 struct dst_entry *new = &rt->dst;
2793
2794                 new->__use = 1;
2795                 new->input = dst_discard;
2796                 new->output = dst_discard;
2797                 dst_copy_metrics(new, &ort->dst);
2798
2799                 new->dev = ort->dst.dev;
2800                 if (new->dev)
2801                         dev_hold(new->dev);
2802
2803                 rt->rt_key_dst = ort->rt_key_dst;
2804                 rt->rt_key_src = ort->rt_key_src;
2805                 rt->rt_key_tos = ort->rt_key_tos;
2806                 rt->rt_route_iif = ort->rt_route_iif;
2807                 rt->rt_iif = ort->rt_iif;
2808                 rt->rt_oif = ort->rt_oif;
2809                 rt->rt_mark = ort->rt_mark;
2810
2811                 rt->rt_genid = rt_genid(net);
2812                 rt->rt_flags = ort->rt_flags;
2813                 rt->rt_type = ort->rt_type;
2814                 rt->rt_dst = ort->rt_dst;
2815                 rt->rt_src = ort->rt_src;
2816                 rt->rt_gateway = ort->rt_gateway;
2817                 rt->rt_spec_dst = ort->rt_spec_dst;
2818                 rt->peer = ort->peer;
2819                 if (rt->peer)
2820                         atomic_inc(&rt->peer->refcnt);
2821                 rt->fi = ort->fi;
2822                 if (rt->fi)
2823                         atomic_inc(&rt->fi->fib_clntref);
2824
2825                 dst_free(new);
2826         }
2827
2828         dst_release(dst_orig);
2829
2830         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2831 }
2832
2833 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2834                                     struct sock *sk)
2835 {
2836         struct rtable *rt = __ip_route_output_key(net, flp4);
2837
2838         if (IS_ERR(rt))
2839                 return rt;
2840
2841         if (flp4->flowi4_proto)
2842                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2843                                                    flowi4_to_flowi(flp4),
2844                                                    sk, 0);
2845
2846         return rt;
2847 }
2848 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2849
2850 static int rt_fill_info(struct net *net,
2851                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2852                         int nowait, unsigned int flags)
2853 {
2854         struct rtable *rt = skb_rtable(skb);
2855         struct rtmsg *r;
2856         struct nlmsghdr *nlh;
2857         unsigned long expires = 0;
2858         const struct inet_peer *peer = rt->peer;
2859         u32 id = 0, ts = 0, tsage = 0, error;
2860
2861         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2862         if (nlh == NULL)
2863                 return -EMSGSIZE;
2864
2865         r = nlmsg_data(nlh);
2866         r->rtm_family    = AF_INET;
2867         r->rtm_dst_len  = 32;
2868         r->rtm_src_len  = 0;
2869         r->rtm_tos      = rt->rt_key_tos;
2870         r->rtm_table    = RT_TABLE_MAIN;
2871         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2872         r->rtm_type     = rt->rt_type;
2873         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2874         r->rtm_protocol = RTPROT_UNSPEC;
2875         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2876         if (rt->rt_flags & RTCF_NOTIFY)
2877                 r->rtm_flags |= RTM_F_NOTIFY;
2878
2879         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2880
2881         if (rt->rt_key_src) {
2882                 r->rtm_src_len = 32;
2883                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2884         }
2885         if (rt->dst.dev)
2886                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2887 #ifdef CONFIG_IP_ROUTE_CLASSID
2888         if (rt->dst.tclassid)
2889                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2890 #endif
2891         if (rt_is_input_route(rt))
2892                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2893         else if (rt->rt_src != rt->rt_key_src)
2894                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2895
2896         if (rt->rt_dst != rt->rt_gateway)
2897                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2898
2899         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2900                 goto nla_put_failure;
2901
2902         if (rt->rt_mark)
2903                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2904
2905         error = rt->dst.error;
2906         if (peer) {
2907                 inet_peer_refcheck(rt->peer);
2908                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2909                 if (peer->tcp_ts_stamp) {
2910                         ts = peer->tcp_ts;
2911                         tsage = get_seconds() - peer->tcp_ts_stamp;
2912                 }
2913                 expires = ACCESS_ONCE(peer->pmtu_expires);
2914                 if (expires) {
2915                         if (time_before(jiffies, expires))
2916                                 expires -= jiffies;
2917                         else
2918                                 expires = 0;
2919                 }
2920         }
2921
2922         if (rt_is_input_route(rt)) {
2923 #ifdef CONFIG_IP_MROUTE
2924                 __be32 dst = rt->rt_dst;
2925
2926                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2927                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2928                         int err = ipmr_get_route(net, skb,
2929                                                  rt->rt_src, rt->rt_dst,
2930                                                  r, nowait);
2931                         if (err <= 0) {
2932                                 if (!nowait) {
2933                                         if (err == 0)
2934                                                 return 0;
2935                                         goto nla_put_failure;
2936                                 } else {
2937                                         if (err == -EMSGSIZE)
2938                                                 goto nla_put_failure;
2939                                         error = err;
2940                                 }
2941                         }
2942                 } else
2943 #endif
2944                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2945         }
2946
2947         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2948                                expires, error) < 0)
2949                 goto nla_put_failure;
2950
2951         return nlmsg_end(skb, nlh);
2952
2953 nla_put_failure:
2954         nlmsg_cancel(skb, nlh);
2955         return -EMSGSIZE;
2956 }
2957
2958 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2959 {
2960         struct net *net = sock_net(in_skb->sk);
2961         struct rtmsg *rtm;
2962         struct nlattr *tb[RTA_MAX+1];
2963         struct rtable *rt = NULL;
2964         __be32 dst = 0;
2965         __be32 src = 0;
2966         u32 iif;
2967         int err;
2968         int mark;
2969         struct sk_buff *skb;
2970
2971         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2972         if (err < 0)
2973                 goto errout;
2974
2975         rtm = nlmsg_data(nlh);
2976
2977         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2978         if (skb == NULL) {
2979                 err = -ENOBUFS;
2980                 goto errout;
2981         }
2982
2983         /* Reserve room for dummy headers, this skb can pass
2984            through good chunk of routing engine.
2985          */
2986         skb_reset_mac_header(skb);
2987         skb_reset_network_header(skb);
2988
2989         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2990         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2991         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2992
2993         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2994         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2995         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2996         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2997
2998         if (iif) {
2999                 struct net_device *dev;
3000
3001                 dev = __dev_get_by_index(net, iif);
3002                 if (dev == NULL) {
3003                         err = -ENODEV;
3004                         goto errout_free;
3005                 }
3006
3007                 skb->protocol   = htons(ETH_P_IP);
3008                 skb->dev        = dev;
3009                 skb->mark       = mark;
3010                 local_bh_disable();
3011                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3012                 local_bh_enable();
3013
3014                 rt = skb_rtable(skb);
3015                 if (err == 0 && rt->dst.error)
3016                         err = -rt->dst.error;
3017         } else {
3018                 struct flowi4 fl4 = {
3019                         .daddr = dst,
3020                         .saddr = src,
3021                         .flowi4_tos = rtm->rtm_tos,
3022                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3023                         .flowi4_mark = mark,
3024                 };
3025                 rt = ip_route_output_key(net, &fl4);
3026
3027                 err = 0;
3028                 if (IS_ERR(rt))
3029                         err = PTR_ERR(rt);
3030         }
3031
3032         if (err)
3033                 goto errout_free;
3034
3035         skb_dst_set(skb, &rt->dst);
3036         if (rtm->rtm_flags & RTM_F_NOTIFY)
3037                 rt->rt_flags |= RTCF_NOTIFY;
3038
3039         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3040                            RTM_NEWROUTE, 0, 0);
3041         if (err <= 0)
3042                 goto errout_free;
3043
3044         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3045 errout:
3046         return err;
3047
3048 errout_free:
3049         kfree_skb(skb);
3050         goto errout;
3051 }
3052
3053 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3054 {
3055         struct rtable *rt;
3056         int h, s_h;
3057         int idx, s_idx;
3058         struct net *net;
3059
3060         net = sock_net(skb->sk);
3061
3062         s_h = cb->args[0];
3063         if (s_h < 0)
3064                 s_h = 0;
3065         s_idx = idx = cb->args[1];
3066         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3067                 if (!rt_hash_table[h].chain)
3068                         continue;
3069                 rcu_read_lock_bh();
3070                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3071                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3072                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3073                                 continue;
3074                         if (rt_is_expired(rt))
3075                                 continue;
3076                         skb_dst_set_noref(skb, &rt->dst);
3077                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3078                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3079                                          1, NLM_F_MULTI) <= 0) {
3080                                 skb_dst_drop(skb);
3081                                 rcu_read_unlock_bh();
3082                                 goto done;
3083                         }
3084                         skb_dst_drop(skb);
3085                 }
3086                 rcu_read_unlock_bh();
3087         }
3088
3089 done:
3090         cb->args[0] = h;
3091         cb->args[1] = idx;
3092         return skb->len;
3093 }
3094
3095 void ip_rt_multicast_event(struct in_device *in_dev)
3096 {
3097         rt_cache_flush(dev_net(in_dev->dev), 0);
3098 }
3099
3100 #ifdef CONFIG_SYSCTL
3101 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3102                                         void __user *buffer,
3103                                         size_t *lenp, loff_t *ppos)
3104 {
3105         if (write) {
3106                 int flush_delay;
3107                 ctl_table ctl;
3108                 struct net *net;
3109
3110                 memcpy(&ctl, __ctl, sizeof(ctl));
3111                 ctl.data = &flush_delay;
3112                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3113
3114                 net = (struct net *)__ctl->extra1;
3115                 rt_cache_flush(net, flush_delay);
3116                 return 0;
3117         }
3118
3119         return -EINVAL;
3120 }
3121
3122 static ctl_table ipv4_route_table[] = {
3123         {
3124                 .procname       = "gc_thresh",
3125                 .data           = &ipv4_dst_ops.gc_thresh,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec,
3129         },
3130         {
3131                 .procname       = "max_size",
3132                 .data           = &ip_rt_max_size,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec,
3136         },
3137         {
3138                 /*  Deprecated. Use gc_min_interval_ms */
3139
3140                 .procname       = "gc_min_interval",
3141                 .data           = &ip_rt_gc_min_interval,
3142                 .maxlen         = sizeof(int),
3143                 .mode           = 0644,
3144                 .proc_handler   = proc_dointvec_jiffies,
3145         },
3146         {
3147                 .procname       = "gc_min_interval_ms",
3148                 .data           = &ip_rt_gc_min_interval,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = proc_dointvec_ms_jiffies,
3152         },
3153         {
3154                 .procname       = "gc_timeout",
3155                 .data           = &ip_rt_gc_timeout,
3156                 .maxlen         = sizeof(int),
3157                 .mode           = 0644,
3158                 .proc_handler   = proc_dointvec_jiffies,
3159         },
3160         {
3161                 .procname       = "redirect_load",
3162                 .data           = &ip_rt_redirect_load,
3163                 .maxlen         = sizeof(int),
3164                 .mode           = 0644,
3165                 .proc_handler   = proc_dointvec,
3166         },
3167         {
3168                 .procname       = "redirect_number",
3169                 .data           = &ip_rt_redirect_number,
3170                 .maxlen         = sizeof(int),
3171                 .mode           = 0644,
3172                 .proc_handler   = proc_dointvec,
3173         },
3174         {
3175                 .procname       = "redirect_silence",
3176                 .data           = &ip_rt_redirect_silence,
3177                 .maxlen         = sizeof(int),
3178                 .mode           = 0644,
3179                 .proc_handler   = proc_dointvec,
3180         },
3181         {
3182                 .procname       = "error_cost",
3183                 .data           = &ip_rt_error_cost,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "error_burst",
3190                 .data           = &ip_rt_error_burst,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 .procname       = "gc_elasticity",
3197                 .data           = &ip_rt_gc_elasticity,
3198                 .maxlen         = sizeof(int),
3199                 .mode           = 0644,
3200                 .proc_handler   = proc_dointvec,
3201         },
3202         {
3203                 .procname       = "mtu_expires",
3204                 .data           = &ip_rt_mtu_expires,
3205                 .maxlen         = sizeof(int),
3206                 .mode           = 0644,
3207                 .proc_handler   = proc_dointvec_jiffies,
3208         },
3209         {
3210                 .procname       = "min_pmtu",
3211                 .data           = &ip_rt_min_pmtu,
3212                 .maxlen         = sizeof(int),
3213                 .mode           = 0644,
3214                 .proc_handler   = proc_dointvec,
3215         },
3216         {
3217                 .procname       = "min_adv_mss",
3218                 .data           = &ip_rt_min_advmss,
3219                 .maxlen         = sizeof(int),
3220                 .mode           = 0644,
3221                 .proc_handler   = proc_dointvec,
3222         },
3223         { }
3224 };
3225
3226 static struct ctl_table empty[1];
3227
3228 static struct ctl_table ipv4_skeleton[] =
3229 {
3230         { .procname = "route", 
3231           .mode = 0555, .child = ipv4_route_table},
3232         { .procname = "neigh", 
3233           .mode = 0555, .child = empty},
3234         { }
3235 };
3236
3237 static __net_initdata struct ctl_path ipv4_path[] = {
3238         { .procname = "net", },
3239         { .procname = "ipv4", },
3240         { },
3241 };
3242
3243 static struct ctl_table ipv4_route_flush_table[] = {
3244         {
3245                 .procname       = "flush",
3246                 .maxlen         = sizeof(int),
3247                 .mode           = 0200,
3248                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3249         },
3250         { },
3251 };
3252
3253 static __net_initdata struct ctl_path ipv4_route_path[] = {
3254         { .procname = "net", },
3255         { .procname = "ipv4", },
3256         { .procname = "route", },
3257         { },
3258 };
3259
3260 static __net_init int sysctl_route_net_init(struct net *net)
3261 {
3262         struct ctl_table *tbl;
3263
3264         tbl = ipv4_route_flush_table;
3265         if (!net_eq(net, &init_net)) {
3266                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3267                 if (tbl == NULL)
3268                         goto err_dup;
3269         }
3270         tbl[0].extra1 = net;
3271
3272         net->ipv4.route_hdr =
3273                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3274         if (net->ipv4.route_hdr == NULL)
3275                 goto err_reg;
3276         return 0;
3277
3278 err_reg:
3279         if (tbl != ipv4_route_flush_table)
3280                 kfree(tbl);
3281 err_dup:
3282         return -ENOMEM;
3283 }
3284
3285 static __net_exit void sysctl_route_net_exit(struct net *net)
3286 {
3287         struct ctl_table *tbl;
3288
3289         tbl = net->ipv4.route_hdr->ctl_table_arg;
3290         unregister_net_sysctl_table(net->ipv4.route_hdr);
3291         BUG_ON(tbl == ipv4_route_flush_table);
3292         kfree(tbl);
3293 }
3294
3295 static __net_initdata struct pernet_operations sysctl_route_ops = {
3296         .init = sysctl_route_net_init,
3297         .exit = sysctl_route_net_exit,
3298 };
3299 #endif
3300
3301 static __net_init int rt_genid_init(struct net *net)
3302 {
3303         get_random_bytes(&net->ipv4.rt_genid,
3304                          sizeof(net->ipv4.rt_genid));
3305         get_random_bytes(&net->ipv4.dev_addr_genid,
3306                          sizeof(net->ipv4.dev_addr_genid));
3307         return 0;
3308 }
3309
3310 static __net_initdata struct pernet_operations rt_genid_ops = {
3311         .init = rt_genid_init,
3312 };
3313
3314
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting array; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3318
3319 static __initdata unsigned long rhash_entries;
3320 static int __init set_rhash_entries(char *str)
3321 {
3322         if (!str)
3323                 return 0;
3324         rhash_entries = simple_strtoul(str, &str, 0);
3325         return 1;
3326 }
3327 __setup("rhash_entries=", set_rhash_entries);
3328
3329 int __init ip_rt_init(void)
3330 {
3331         int rc = 0;
3332
3333 #ifdef CONFIG_IP_ROUTE_CLASSID
3334         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3335         if (!ip_rt_acct)
3336                 panic("IP: failed to allocate ip_rt_acct\n");
3337 #endif
3338
3339         ipv4_dst_ops.kmem_cachep =
3340                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3341                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3342
3343         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3344
3345         if (dst_entries_init(&ipv4_dst_ops) < 0)
3346                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3347
3348         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3349                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3350
3351         rt_hash_table = (struct rt_hash_bucket *)
3352                 alloc_large_system_hash("IP route ca